e_padlock-x86[_64].pl: protection against prefetch errata.
author: Andy Polyakov <appro@openssl.org>
Tue, 11 Oct 2011 21:07:53 +0000 (21:07 +0000)
committer: Andy Polyakov <appro@openssl.org>
Tue, 11 Oct 2011 21:07:53 +0000 (21:07 +0000)
engines/asm/e_padlock-x86.pl
engines/asm/e_padlock-x86_64.pl

index e211706..1b2ba52 100644 (file)
@@ -37,6 +37,7 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
 
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
 $PADLOCK_CHUNK=512;    # Must be a power of 2 larger than 16
 
 $ctx="edx";
@@ -187,6 +188,10 @@ my ($mode,$opcode) = @_;
        &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
                                        } else {
        &xor    ("ebx","ebx");
+    if ($PADLOCK_MARGIN{$mode}) {
+       &cmp    ($len,$PADLOCK_MARGIN{$mode});
+       &jbe    (&label("${mode}_short"));
+    }
        &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
        &jnz    (&label("${mode}_aligned"));
        &test   ($out,0x0f);
@@ -285,20 +290,39 @@ my ($mode,$opcode) = @_;
        &mov    ($chunk,$PADLOCK_CHUNK);
        &jnz    (&label("${mode}_loop"));
                                                if ($mode ne "ctr32") {
-       &test   ($out,0x0f);                    # out_misaligned
-       &jz     (&label("${mode}_done"));
+       &cmp    ("esp","ebp");
+       &je     (&label("${mode}_done"));
                                                }
-       &mov    ($len,"ebp");
-       &mov    ($out,"esp");
-       &sub    ($len,"esp");
-       &xor    ("eax","eax");
-       &shr    ($len,2);
-       &data_byte(0xf3,0xab);                  # rep stosl
+       &pxor   ("xmm0","xmm0");
+       &lea    ("eax",&DWP(0,"esp"));
+&set_label("${mode}_bzero");
+       &movaps (&QWP(0,"eax"),"xmm0");
+       &lea    ("eax",&DWP(16,"eax"));
+       &cmp    ("ebp","eax");
+       &ja     (&label("${mode}_bzero"));
+
 &set_label("${mode}_done");
        &lea    ("esp",&DWP(24,"ebp"));
                                                if ($mode ne "ctr32") {
        &jmp    (&label("${mode}_exit"));
 
+&set_label("${mode}_short",16);
+       &xor    ("eax","eax");
+       &lea    ("ebp",&DWP(-24,"esp"));
+       &sub    ("eax",$len);
+       &lea    ("esp",&DWP(0,"eax","ebp"));
+       &and    ("esp",-16);
+       &xor    ($chunk,$chunk);
+&set_label("${mode}_short_copy");
+       &movups ("xmm0",&QWP(0,$inp,$chunk));
+       &lea    ($chunk,&DWP(16,$chunk));
+       &cmp    ($len,$chunk);
+       &movaps (&QWP(-16,"esp",$chunk),"xmm0");
+       &ja     (&label("${mode}_short_copy"));
+       &mov    ($inp,"esp");
+       &mov    ($chunk,$len);
+       &jmp    (&label("${mode}_loop"));
+
 &set_label("${mode}_aligned",16);
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
index db79a62..5091c7a 100644 (file)
@@ -27,6 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";
 
 $code=".text\n";
 
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64);        # prefetch errata
 $PADLOCK_CHUNK=512;    # Must be a power of 2 between 32 and 2^20
 
 $ctx="%rdx";
@@ -284,6 +285,17 @@ padlock_${mode}_encrypt:
        lea     16($ctx),$ctx           # control word
        xor     %eax,%eax
        xor     %ebx,%ebx
+___
+# Formally speaking, the correct condition is $len<=$margin and $inp+$margin
+# crosses a page boundary [and the next page is unreadable]. But $inp can
+# be unaligned, in which case data can be copied to $out if the latter is
+# aligned, in which case $out+$margin has to be checked. Covering all
+# cases appears more complicated than just copying short input...
+$code.=<<___   if ($PADLOCK_MARGIN{$mode});
+       cmp     \$$PADLOCK_MARGIN{$mode},$len
+       jbe     .L${mode}_short
+___
+$code.=<<___;
        testl   \$`1<<5`,($ctx)         # align bit in control word
        jnz     .L${mode}_aligned
        test    \$0x0f,$out
@@ -305,6 +317,7 @@ padlock_${mode}_encrypt:
        lea     (%rax,%rbp),%rsp
 ___
 $code.=<<___                           if ($mode eq "ctr32");
+.L${mode}_reenter:
        mov     -4($ctx),%eax           # pull 32-bit counter
        bswap   %eax
        neg     %eax
@@ -373,19 +386,38 @@ $code.=<<___;
        mov     \$$PADLOCK_CHUNK,$chunk
        jnz     .L${mode}_loop
 
-       test    \$0x0f,$out
-       jz      .L${mode}_done
+       cmp     %rsp,%rbp
+       je      .L${mode}_done
+
+       pxor    %xmm0,%xmm0
+       lea     (%rsp),%rax
+.L${mode}_bzero:
+       movaps  %xmm0,(%rax)
+       lea     16(%rax),%rax
+       cmp     %rax,%rbp
+       ja      .L${mode}_bzero
 
-       mov     %rbp,$len
-       mov     %rsp,$out
-       sub     %rsp,$len
-       xor     %rax,%rax
-       shr     \$3,$len
-       .byte   0xf3,0x48,0xab          # rep stosq
 .L${mode}_done:
        lea     (%rbp),%rsp
        jmp     .L${mode}_exit
-
+___
+$code.=<<___ if ($PADLOCK_MARGIN{$mode});
+.align 16
+.L${mode}_short:
+       mov     %rsp,%rbp
+       sub     $len,%rsp
+       xor     $chunk,$chunk
+.L${mode}_short_copy:
+       movups  ($inp,$chunk),%xmm0
+       lea     16($chunk),$chunk
+       cmp     $chunk,$len
+       movaps  %xmm0,-16(%rsp,$chunk)
+       ja      .L${mode}_short_copy
+       mov     %rsp,$inp
+       mov     $len,$chunk
+       jmp     .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
+___
+$code.=<<___;
 .align 16
 .L${mode}_aligned:
 ___