Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse
to "cpuid" assembler module and gain 2x.

author Andy Polyakov <appro@openssl.org>
Mon, 14 May 2007 21:35:25 +0000 (21:35 +0000)
committer Andy Polyakov <appro@openssl.org>
Mon, 14 May 2007 21:35:25 +0000 (21:35 +0000)

Configure
crypto/Makefile
crypto/ia64cpuid.S
crypto/mem.c
crypto/sparccpuid.S
crypto/x86_64cpuid.pl
crypto/x86cpuid.pl

index 9b210953a5879bfaebedea9a574af99821640738..b072bfbc7d6851d87d1c246ca488eda11ac0fbd6 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -1209,6 +1209,7 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/);
 
 $cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
 
 
 $cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
 
+$cpuid_obj="mem_clr.o" unless ($cpuid_obj =~ /\.o$/);
 $des_obj=$des_enc      unless ($des_obj =~ /\.o$/);
 $bf_obj=$bf_enc                unless ($bf_obj =~ /\.o$/);
 $cast_obj=$cast_enc    unless ($cast_obj =~ /\.o$/);
 $des_obj=$des_enc      unless ($des_obj =~ /\.o$/);
 $bf_obj=$bf_enc                unless ($bf_obj =~ /\.o$/);
 $cast_obj=$cast_enc    unless ($cast_obj =~ /\.o$/);
@@ -1481,7 +1482,7 @@ print OUT "#ifdef OPENSSL_ALGORITHM_DEFINES\n";
 print OUT $openssl_algorithm_defines_trans;
 print OUT "#endif\n\n";
 
 print OUT $openssl_algorithm_defines_trans;
 print OUT "#endif\n\n";
 
-print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
+print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");
 
 while (<IN>)
        {
 
 while (<IN>)
        {
index efe6a79d87c35befbad8a997750e58b76b498dbc..1b8c7c25918d14edd6e616d8eda3a09d64f7d1a8 100644 (file)
@@ -34,7 +34,7 @@ GENERAL=Makefile README crypto-lib.com install.com
 LIB= $(TOP)/libcrypto.a
 SHARED_LIB= libcrypto$(SHLIB_EXT)
 LIBSRC=        cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
 LIB= $(TOP)/libcrypto.a
 SHARED_LIB= libcrypto$(SHLIB_EXT)
 LIBSRC=        cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
-LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
+LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
 
 SRC= $(LIBSRC)
 
 
 SRC= $(LIBSRC)
 
index 5836565abb310f5c67ecdc126038a719b4dc2fc1..818e2d1e1d537fb3484ed14f535d8087a05ea9a5 100644 (file)
@@ -1,11 +1,13 @@
 // Works on all IA-64 platforms: Linux, HP-UX, Win64i...
 // On Win64i compile with ias.exe.
 .text
 // Works on all IA-64 platforms: Linux, HP-UX, Win64i...
 // On Win64i compile with ias.exe.
 .text
+
 .global        OPENSSL_cpuid_setup#
 .proc  OPENSSL_cpuid_setup#
 OPENSSL_cpuid_setup:
 { .mib;        br.ret.sptk.many        b0              };;
 .endp  OPENSSL_cpuid_setup#
 .global        OPENSSL_cpuid_setup#
 .proc  OPENSSL_cpuid_setup#
 OPENSSL_cpuid_setup:
 { .mib;        br.ret.sptk.many        b0              };;
 .endp  OPENSSL_cpuid_setup#
+
 .global        OPENSSL_rdtsc#
 .proc  OPENSSL_rdtsc#
 OPENSSL_rdtsc:
 .global        OPENSSL_rdtsc#
 .proc  OPENSSL_rdtsc#
 OPENSSL_rdtsc:
@@ -124,3 +126,37 @@ OPENSSL_wipe_cpu:
        mov             ar.lc=r3
        br.ret.sptk     b0              };;
 .endp  OPENSSL_wipe_cpu#
        mov             ar.lc=r3
        br.ret.sptk     b0              };;
 .endp  OPENSSL_wipe_cpu#
+
+// OPENSSL_cleanse: store zero bytes to memory. r32 is the store address
+// and r33 the remaining byte count (len). Requests shorter than 15 bytes
+// are cleared one byte at a time in .Little. Longer requests are
+// byte-stored up to an 8-byte boundary (.Lot), then cleared 8 bytes per
+// iteration (.Laligned); a tail shorter than 8 bytes goes back to .Little.
+.global        OPENSSL_cleanse#
+.proc  OPENSSL_cleanse#
+OPENSSL_cleanse:
+// .Little stores a byte before it tests the count, so a zero-length
+// request must return here without touching memory.
+{ .mib;        cmp.eq          p6,p0=0,r33         // len==0
+(p6)   br.ret.spnt     b0              };;
+{ .mib;        and             r2=7,r32            // r2 = misalignment of address
+       cmp.leu         p6,p0=15,r33        // len>=15
+(p6)   br.cond.dptk    .Lot            };;
+
+// Byte loop: entered only with len>=1.
+.Little:
+{ .mib;        st1             [r32]=r0,1
+       cmp.ltu         p6,p7=1,r33     }  // len>1
+{ .mbb;        add             r33=-1,r33         // len--
+(p6)   br.cond.dptk    .Little
+(p7)   br.ret.sptk.many        b0      };;
+
+// Alignment loop: store single bytes until the address is 8-byte aligned.
+.Lot:
+{ .mib;        cmp.eq          p6,p0=0,r2
+(p6)   br.cond.dptk    .Laligned       };;
+{ .mmi;        st1             [r32]=r0,1;;
+       and             r2=7,r32        }
+{ .mib;        add             r33=-1,r33
+       br              .Lot            };;
+
+// Bulk loop: 8 bytes per iteration while at least 8 bytes remain.
+.Laligned:
+{ .mmi;        st8             [r32]=r0,8
+       and             r2=-8,r33           // len&~7
+       add             r33=-8,r33      };; // len-=8
+{ .mib;        cmp.ltu         p6,p0=8,r2          // ((len+8)&~7)>8
+(p6)   br.cond.dptk    .Laligned       };;
+
+// Finish any tail of fewer than 8 bytes in the byte loop.
+{ .mbb;        cmp.eq          p6,p7=r0,r33
+(p7)   br.cond.dpnt    .Little
+(p6)   br.ret.sptk.many        b0      };;
+.endp  OPENSSL_cleanse#
index 6635167228da1c9eb4ee6be3b4a147a1c3bc2af4..43d48ab425707d2d08c289e47f597fc32e9bb955 100644 (file)
@@ -250,7 +250,6 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
 void *CRYPTO_malloc_locked(int num, const char *file, int line)
        {
        void *ret = NULL;
 void *CRYPTO_malloc_locked(int num, const char *file, int line)
        {
        void *ret = NULL;
-       extern unsigned char cleanse_ctr;
 
        if (num <= 0) return NULL;
 
 
        if (num <= 0) return NULL;
 
@@ -267,11 +266,15 @@ void *CRYPTO_malloc_locked(int num, const char *file, int line)
        if (malloc_debug_func != NULL)
                malloc_debug_func(ret, num, file, line, 1);
 
        if (malloc_debug_func != NULL)
                malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+       {       extern unsigned char cleanse_ctr;
                ((unsigned char *)ret)[0] = cleanse_ctr;
                ((unsigned char *)ret)[0] = cleanse_ctr;
+       }
+#endif
 
        return ret;
        }
 
        return ret;
        }
@@ -291,7 +294,6 @@ void CRYPTO_free_locked(void *str)
 void *CRYPTO_malloc(int num, const char *file, int line)
        {
        void *ret = NULL;
 void *CRYPTO_malloc(int num, const char *file, int line)
        {
        void *ret = NULL;
-       extern unsigned char cleanse_ctr;
 
        if (num <= 0) return NULL;
 
 
        if (num <= 0) return NULL;
 
@@ -308,11 +310,15 @@ void *CRYPTO_malloc(int num, const char *file, int line)
        if (malloc_debug_func != NULL)
                malloc_debug_func(ret, num, file, line, 1);
 
        if (malloc_debug_func != NULL)
                malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+       {       extern unsigned char cleanse_ctr;
                 ((unsigned char *)ret)[0] = cleanse_ctr;
                 ((unsigned char *)ret)[0] = cleanse_ctr;
+       }
+#endif
 
        return ret;
        }
 
        return ret;
        }
index 52308abca64657b4652d15eba6dfdbc3a322f9e9..f691abc57fdc3bb547ed39202c462dd977d0c3b2 100644 (file)
@@ -232,6 +232,54 @@ _sparcv9_rdtick:
 .type  _sparcv9_rdtick,#function
 .size  _sparcv9_rdtick,.-_sparcv9_rdtick
 
 .type  _sparcv9_rdtick,#function
 .size  _sparcv9_rdtick,.-_sparcv9_rdtick
 
+! OPENSSL_cleanse: store zero bytes to memory. %o0 is the store address
+! and %o1 the remaining byte count. Requests of 6 bytes or fewer are
+! cleared one byte at a time in .Little. Longer requests are byte-stored
+! up to a 4-byte boundary (.Lot), then cleared 4 bytes per iteration
+! (.Laligned); a tail shorter than 4 bytes goes back to .Little.
+.global        OPENSSL_cleanse
+.align 32
+OPENSSL_cleanse:
+       cmp     %o1,6
+       nop
+#ifdef ABI64
+       bgu     %xcc,.Lot
+#else
+       bgu     .Lot
+#endif
+       nop
+       ! .Little stores a byte and decrements before testing, so a zero
+       ! count would wrap around; return early for a zero-length request.
+       cmp     %o1,0
+       bne     .Little
+       nop
+       retl
+       nop
+
+! Byte loop: entered only with a non-zero count.
+.Little:
+       stb     %g0,[%o0]
+       subcc   %o1,1,%o1
+       bnz     .Little
+       add     %o0,1,%o0       ! delay slot: advance the address
+       retl
+       nop
+.align 32
+! Alignment loop: store single bytes until the address is 4-byte aligned.
+.Lot:
+       andcc   %o0,3,%g0
+       bz      .Laligned
+       nop
+       stb     %g0,[%o0]
+       sub     %o1,1,%o1
+       ba      .Lot
+       add     %o0,1,%o0       ! delay slot: advance the address
+       nop
+! Bulk loop: 4 bytes per iteration while at least 4 bytes remain.
+.Laligned:
+       st      %g0,[%o0]
+       sub     %o1,4,%o1
+       andcc   %o1,-4,%g0
+#ifdef ABI64
+       bnz     %xcc,.Laligned
+#else
+       bnz     .Laligned
+#endif
+       add     %o0,4,%o0       ! delay slot: advance the address
+
+       ! Finish any tail of fewer than 4 bytes in the byte loop.
+       cmp     %o1,0
+       bne     .Little
+       nop
+       retl
+       nop
+.type  OPENSSL_cleanse,#function
+.size  OPENSSL_cleanse,.-OPENSSL_cleanse
+
 .section       ".init",#alloc,#execinstr
        call    OPENSSL_cpuid_setup
        nop
 .section       ".init",#alloc,#execinstr
        call    OPENSSL_cpuid_setup
        nop
index bc06e99cfb732280f0fbab87a9fa3d49d90d0872..2f657ca9d8d7de7b1e8303673b5013b9572820f5 100644 (file)
@@ -155,4 +155,36 @@ OPENSSL_ia32_cpuid:
        or      %rcx,%rax
        ret
 .size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
        or      %rcx,%rax
        ret
 .size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.globl  OPENSSL_cleanse
+.type   OPENSSL_cleanse,\@function,2
+.align  16
+OPENSSL_cleanse:
+       # Store zero bytes to memory: rdi is the store address, rsi the
+       # remaining byte count. Fewer than 15 bytes are cleared one at a
+       # time in .Little. Longer requests are byte-stored up to an 8-byte
+       # boundary (.Lot), then cleared 8 bytes per iteration (.Laligned).
+       xor     %rax,%rax
+       cmp     \$15,%rsi
+       jae     .Lot
+       # .Little stores and decrements before testing, so a zero count
+       # would wrap around; skip the loop for a zero-length request.
+       cmp     \$0,%rsi
+       je      .Lcleanse_ret
+.Little:
+       mov     %al,(%rdi)
+       sub     \$1,%rsi
+       lea     1(%rdi),%rdi            # lea leaves the flags from sub intact
+       jnz     .Little
+.Lcleanse_ret:
+       ret
+.align 16
+.Lot:
+       test    \$7,%rdi                # already 8-byte aligned?
+       jz      .Laligned
+       mov     %al,(%rdi)
+       lea     -1(%rsi),%rsi
+       lea     1(%rdi),%rdi
+       jmp     .Lot
+.Laligned:
+       mov     %rax,(%rdi)
+       lea     -8(%rsi),%rsi
+       test    \$-8,%rsi               # at least 8 bytes left?
+       lea     8(%rdi),%rdi
+       jnz     .Laligned
+       cmp     \$0,%rsi                # tail of fewer than 8 bytes?
+       jne     .Little
+       ret
+.size  OPENSSL_cleanse,.-OPENSSL_cleanse
 ___
 ___
index 7d924a60b77d78c9352c50bbe89795332a78ee56..13828d5633160b34460311303b4fbfc60ff92707 100644 (file)
@@ -216,6 +216,37 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
        }
 &function_end_B("OPENSSL_indirect_call");
 
        }
 &function_end_B("OPENSSL_indirect_call");
 
+# OPENSSL_cleanse: store zero bytes to memory. The first argument is
+# loaded into edx and used as the store address; the second is loaded
+# into ecx and used as the remaining byte count. Fewer than 7 bytes are
+# cleared one at a time in "little". Longer requests are byte-stored up
+# to a 4-byte boundary ("lot"), then cleared 4 bytes per iteration
+# ("aligned"); a tail shorter than 4 bytes goes back to "little".
+&function_begin_B("OPENSSL_cleanse");
+       &mov    ("edx",&wparam(0));
+       &mov    ("ecx",&wparam(1));
+       &xor    ("eax","eax");
+       &cmp    ("ecx",7);
+       &jae    (&label("lot"));
+       # "little" stores and decrements before testing, so a zero count
+       # would wrap around; skip the loop for a zero-length request.
+       &cmp    ("ecx",0);
+       &je     (&label("ret"));
+&set_label("little");
+       &mov    (&BP(0,"edx"),"al");
+       &sub    ("ecx",1);
+       &lea    ("edx",&DWP(1,"edx"));  # lea leaves the flags from sub intact
+       &jnz    (&label("little"));
+&set_label("ret");
+       &ret    ();
+
+&set_label("lot",16);
+       &test   ("edx",3);              # already 4-byte aligned?
+       &jz     (&label("aligned"));
+       &mov    (&BP(0,"edx"),"al");
+       &lea    ("ecx",&DWP(-1,"ecx"));
+       &lea    ("edx",&DWP(1,"edx"));
+       &jmp    (&label("lot"));
+&set_label("aligned");
+       &mov    (&DWP(0,"edx"),"eax");
+       &lea    ("ecx",&DWP(-4,"ecx"));
+       &test   ("ecx",-4);             # at least 4 bytes left?
+       &lea    ("edx",&DWP(4,"edx"));
+       &jnz    (&label("aligned"));
+       &cmp    ("ecx",0);              # tail of fewer than 4 bytes?
+       &jne    (&label("little"));
+       &ret    ();
+&function_end_B("OPENSSL_cleanse");
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();