e_padlock-x86[_64].pl: better understanding of prefetch errata and proper workaround.

author		Andy Polyakov <appro@openssl.org>
		Mon, 19 Mar 2012 20:23:32 +0000 (20:23 +0000)
committer	Andy Polyakov <appro@openssl.org>
		Mon, 19 Mar 2012 20:23:32 +0000 (20:23 +0000)

engines/asm/e_padlock-x86.pl
engines/asm/e_padlock-x86_64.pl
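
The errata being worked around: the PadLock xcrypt unit prefetches a fixed amount of input beyond what it is asked to process (per the renamed %PADLOCK_PREFETCH table: 128 bytes for ECB, 64 for CBC, 32 for 64-bit ctr32), which can fault when the data ends just before an unreadable page. Rather than bouncing every short input through the stack as the removed ${mode}_short path did, the new code only intervenes when the prefetch would actually cross the next 4K boundary: it rounds the last in-place chunk down to a multiple of the prefetch distance and routes whatever remains through a stack buffer. A rough C sketch of that check follows; the names are illustrative and not taken from the patch.

	#include <stddef.h>
	#include <stdint.h>

	/* prefetch comes from %PADLOCK_PREFETCH (e.g. 128 for ECB, 64 for CBC).
	 * Returns how many bytes may still be processed in place; 0 means the
	 * whole tail has to go through the stack bounce buffer. */
	static size_t safe_in_place(const unsigned char *p, size_t len, size_t prefetch)
	{
	    uintptr_t end  = (uintptr_t)p + len;
	    size_t    dist = (size_t)(-end) & 0xfff;  /* distance to next 4K page */

	    if (dist >= prefetch)
	        return len;                   /* prefetch cannot leave this page */
	    return len & ~(prefetch - 1);     /* "and chunk, -prefetch" in the asm */
	}
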

index 1b2ba522537cef9e183e6ee74707dee946a09d89..4148468c41de695751e8731369a948dff171c1ca 100644 (file)
@@ -37,7 +37,7 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
 
-%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
+%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata
 $PADLOCK_CHUNK=512;    # Must be a power of 2 larger than 16
 
 $ctx="edx";
@@ -188,10 +188,6 @@ my ($mode,$opcode) = @_;
        &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
                                        } else {
        &xor    ("ebx","ebx");
-    if ($PADLOCK_MARGIN{$mode}) {
-       &cmp    ($len,$PADLOCK_MARGIN{$mode});
-       &jbe    (&label("${mode}_short"));
-    }
        &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
        &jnz    (&label("${mode}_aligned"));
        &test   ($out,0x0f);
@@ -212,7 +208,27 @@ my ($mode,$opcode) = @_;
        &neg    ("eax");
        &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
        &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
+       &mov    ("eax",$PADLOCK_CHUNK);
+       &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
+       &mov    ("eax","ebp");
+       &and    ("ebp",-16);
        &and    ("esp",-16);
+       &mov    (&DWP(16,"ebp"),"eax");
+    if ($PADLOCK_PREFETCH{$mode}) {
+       &cmp    ($len,$chunk);
+       &ja     (&label("${mode}_loop"));
+       &mov    ("eax",$inp);           # check if prefetch crosses page
+       &cmp    ("ebp","esp");
+       &cmove  ("eax",$out);
+       &add    ("eax",$len);
+       &neg    ("eax");
+       &and    ("eax",0xfff);          # distance to page boundary
+       &cmp    ("eax",$PADLOCK_PREFETCH{$mode});
+       &mov    ("eax",-$PADLOCK_PREFETCH{$mode});
+       &cmovae ("eax",$chunk);         # mask=distance<prefetch?-prefetch:-1
+       &and    ($chunk,"eax");
+       &jz     (&label("${mode}_unaligned_tail"));
+    }
        &jmp    (&label("${mode}_loop"));
 
 &set_label("${mode}_loop",16);
@@ -276,8 +292,8 @@ my ($mode,$opcode) = @_;
        &test   ($out,0x0f);
        &jz     (&label("${mode}_out_aligned"));
        &mov    ($len,$chunk);
-       &shr    ($len,2);
        &lea    ($inp,&DWP(0,"esp"));
+       &shr    ($len,2);
        &data_byte(0xf3,0xa5);                  # rep movsl
        &sub    ($out,$chunk);
 &set_label("${mode}_out_aligned");
@@ -288,7 +304,30 @@ my ($mode,$opcode) = @_;
        &add    ($inp,$chunk);
        &sub    ($len,$chunk);
        &mov    ($chunk,$PADLOCK_CHUNK);
+    if (!$PADLOCK_PREFETCH{$mode}) {
        &jnz    (&label("${mode}_loop"));
+    } else {
+       &jz     (&label("${mode}_break"));
+       &cmp    ($len,$chunk);
+       &jae    (&label("${mode}_loop"));
+
+&set_label("${mode}_unaligned_tail");
+       &xor    ("eax","eax");
+       &cmp    ("esp","ebp");
+       &cmove  ("eax",$len);
+       &sub    ("esp","eax");                  # alloca
+       &mov    ("eax", $out);                  # save parameters
+       &mov    ($chunk,$len);
+       &shr    ($len,2);
+       &lea    ($out,&DWP(0,"esp"));
+       &data_byte(0xf3,0xa5);                  # rep movsl
+       &mov    ($inp,"esp");
+       &mov    ($out,"eax");                   # restore parameters
+       &mov    ($len,$chunk);
+       &jmp    (&label("${mode}_loop"));
+
+&set_label("${mode}_break",16);
+    }
                                                if ($mode ne "ctr32") {
        &cmp    ("esp","ebp");
        &je     (&label("${mode}_done"));
@@ -302,28 +341,24 @@ my ($mode,$opcode) = @_;
        &ja     (&label("${mode}_bzero"));
 
 &set_label("${mode}_done");
+       &mov    ("ebp",&DWP(16,"ebp"));
        &lea    ("esp",&DWP(24,"ebp"));
                                                if ($mode ne "ctr32") {
        &jmp    (&label("${mode}_exit"));
 
-&set_label("${mode}_short",16);
-       &xor    ("eax","eax");
-       &lea    ("ebp",&DWP(-24,"esp"));
-       &sub    ("eax",$len);
-       &lea    ("esp",&DWP(0,"eax","ebp"));
-       &and    ("esp",-16);
-       &xor    ($chunk,$chunk);
-&set_label("${mode}_short_copy");
-       &movups ("xmm0",&QWP(0,$inp,$chunk));
-       &lea    ($chunk,&DWP(16,$chunk));
-       &cmp    ($len,$chunk);
-       &movaps (&QWP(-16,"esp",$chunk),"xmm0");
-       &ja     (&label("${mode}_short_copy"));
-       &mov    ($inp,"esp");
-       &mov    ($chunk,$len);
-       &jmp    (&label("${mode}_loop"));
-
 &set_label("${mode}_aligned",16);
+    if ($PADLOCK_PREFETCH{$mode}) {
+       &lea    ("ebp",&DWP(0,$inp,$len));
+       &neg    ("ebp");
+       &and    ("ebp",0xfff);                  # distance to page boundary
+       &xor    ("eax","eax");
+       &cmp    ("ebp",$PADLOCK_PREFETCH{$mode});
+       &mov    ("ebp",$PADLOCK_PREFETCH{$mode}-1);
+       &cmovae ("ebp","eax");
+       &and    ("ebp",$len);                   # remainder
+       &sub    ($len,"ebp");
+       &jz     (&label("${mode}_aligned_tail"));
+    }
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
        &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
@@ -332,6 +367,29 @@ my ($mode,$opcode) = @_;
        &movaps ("xmm0",&QWP(0,"eax"));
        &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
                                                }
+    if ($PADLOCK_PREFETCH{$mode}) {
+       &test   ("ebp","ebp");
+       &jz     (&label("${mode}_exit"));
+
+&set_label("${mode}_aligned_tail");
+       &mov    ($len,"ebp");
+       &lea    ("ebp",&DWP(-24,"esp"));
+       &mov    ("esp","ebp");
+       &mov    ("eax","ebp");
+       &sub    ("esp",$len);
+       &and    ("ebp",-16);
+       &and    ("esp",-16);
+       &mov    (&DWP(16,"ebp"),"eax");
+       &mov    ("eax", $out);                  # save parameters
+       &mov    ($chunk,$len);
+       &shr    ($len,2);
+       &lea    ($out,&DWP(0,"esp"));
+       &data_byte(0xf3,0xa5);                  # rep movsl
+       &mov    ($inp,"esp");
+       &mov    ($out,"eax");                   # restore parameters
+       &mov    ($len,$chunk);
+       &jmp    (&label("${mode}_loop"));
+    }
 &set_label("${mode}_exit");                    }
        &mov    ("eax",1);
        &lea    ("esp",&DWP(4,"esp"));          # popf
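
The x86_64 changes below follow the same pattern. For readers less fluent in the perlasm above, the ${mode}_unaligned_tail idea added in both files roughly amounts to the following; this is an illustrative C approximation only (the real code re-enters the rep-xcrypt loop on the stack copy and handles output alignment separately).

	#include <alloca.h>
	#include <string.h>

	/* tail: last few bytes whose hardware prefetch would cross into an
	 * unreadable page if they were processed in place. */
	static void process_tail(const unsigned char *inp, unsigned char *out,
	                         size_t tail,
	                         void (*xcrypt)(unsigned char *dst,
	                                        const unsigned char *src, size_t n))
	{
	    /* reserve scratch space on the stack (the asm adjusts %rsp directly) */
	    unsigned char *bounce = alloca((tail + 15) & ~(size_t)15);

	    memcpy(bounce, inp, tail);     /* rep movsl / rep movsq in the asm */
	    xcrypt(bounce, bounce, tail);  /* prefetch now stays within the stack */
	    memcpy(out, bounce, tail);     /* hand the result back to the caller */
	}
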
index 5091c7aaca7805568d4cc16ceb746541d250fbfe..297561a61b6c2584743652881d032c6ce203ccd7 100644 (file)
@@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";
 
 $code=".text\n";
 
-%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64);        # prefetch errata
+%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);      # prefetch errata
 $PADLOCK_CHUNK=512;    # Must be a power of 2 between 32 and 2^20
 
 $ctx="%rdx";
@@ -285,17 +285,6 @@ padlock_${mode}_encrypt:
        lea     16($ctx),$ctx           # control word
        xor     %eax,%eax
        xor     %ebx,%ebx
-___
-# Formally speaking correct condtion is $len<=$margin and $inp+$margin
-# crosses page boundary [and next page is unreadable]. But $inp can
-# be unaligned in which case data can be copied to $out if latter is
-# aligned, in which case $out+$margin has to be checked. Covering all
-# cases appears more complicated than just copying short input...
-$code.=<<___   if ($PADLOCK_MARGIN{$mode});
-       cmp     \$$PADLOCK_MARGIN{$mode},$len
-       jbe     .L${mode}_short
-___
-$code.=<<___;
        testl   \$`1<<5`,($ctx)         # align bit in control word
        jnz     .L${mode}_aligned
        test    \$0x0f,$out
@@ -315,6 +304,8 @@ $code.=<<___;
        neg     %rax
        and     \$$PADLOCK_CHUNK-1,$chunk       # chunk%=PADLOCK_CHUNK
        lea     (%rax,%rbp),%rsp
+       mov     \$$PADLOCK_CHUNK,%rax
+       cmovz   %rax,$chunk                     # chunk=chunk?:PADLOCK_CHUNK
 ___
 $code.=<<___                           if ($mode eq "ctr32");
 .L${mode}_reenter:
@@ -322,10 +313,27 @@ $code.=<<___                              if ($mode eq "ctr32");
        bswap   %eax
        neg     %eax
        and     \$`$PADLOCK_CHUNK/16-1`,%eax
-       jz      .L${mode}_loop
+       mov     \$$PADLOCK_CHUNK,$chunk
        shl     \$4,%eax
+       cmovz   $chunk,%rax
        cmp     %rax,$len
        cmova   %rax,$chunk             # don't let counter cross PADLOCK_CHUNK
+       cmovbe  $len,$chunk
+___
+$code.=<<___                           if ($PADLOCK_PREFETCH{$mode});
+       cmp     $chunk,$len
+       ja      .L${mode}_loop
+       mov     $inp,%rax               # check if prefetch crosses page
+       cmp     %rsp,%rbp
+       cmove   $out,%rax
+       add     $len,%rax
+       neg     %rax
+       and     \$0xfff,%rax            # distance to page boundary
+       cmp     \$$PADLOCK_PREFETCH{$mode},%rax
+       mov     \$-$PADLOCK_PREFETCH{$mode},%rax
+       cmovae  $chunk,%rax             # mask=distance<prefetch?-prefetch:-1
+       and     %rax,$chunk
+       jz      .L${mode}_unaligned_tail
 ___
 $code.=<<___;
        jmp     .L${mode}_loop
@@ -360,12 +368,12 @@ ___
 $code.=<<___                           if ($mode eq "ctr32");
        mov     -4($ctx),%eax           # pull 32-bit counter
        test    \$0xffff0000,%eax
-       jnz     .L${mode}_no_corr
+       jnz     .L${mode}_no_carry
        bswap   %eax
        add     \$0x10000,%eax
        bswap   %eax
        mov     %eax,-4($ctx)
-.L${mode}_no_corr:
+.L${mode}_no_carry:
 ___
 $code.=<<___;
        mov     %r8,$out                # restore paramters
@@ -373,8 +381,8 @@ $code.=<<___;
        test    \$0x0f,$out
        jz      .L${mode}_out_aligned
        mov     $chunk,$len
-       shr     \$3,$len
        lea     (%rsp),$inp
+       shr     \$3,$len
        .byte   0xf3,0x48,0xa5          # rep movsq
        sub     $chunk,$out
 .L${mode}_out_aligned:
@@ -384,9 +392,52 @@ $code.=<<___;
        add     $chunk,$inp
        sub     $chunk,$len
        mov     \$$PADLOCK_CHUNK,$chunk
+___
+                                       if (!$PADLOCK_PREFETCH{$mode}) {
+$code.=<<___;
        jnz     .L${mode}_loop
-
+___
+                                       } else {
+$code.=<<___;
+       jz      .L${mode}_break
+       cmp     $chunk,$len
+       jae     .L${mode}_loop
+___
+$code.=<<___                           if ($mode eq "ctr32");
+       mov     $len,$chunk
+       mov     $inp,%rax               # check if prefetch crosses page
        cmp     %rsp,%rbp
+       cmove   $out,%rax
+       add     $len,%rax
+       neg     %rax
+       and     \$0xfff,%rax            # distance to page boundary
+       cmp     \$$PADLOCK_PREFETCH{$mode},%rax
+       mov     \$-$PADLOCK_PREFETCH{$mode},%rax
+       cmovae  $chunk,%rax
+       and     %rax,$chunk
+       jnz     .L${mode}_loop
+___
+$code.=<<___;
+.L${mode}_unaligned_tail:
+       xor     %eax,%eax
+       cmp     %rsp,%rbp
+       cmove   $len,%rax
+       mov     $out,%r8                # save parameters
+       mov     $len,$chunk
+       sub     %rax,%rsp               # alloca
+       shr     \$3,$len
+       lea     (%rsp),$out
+       .byte   0xf3,0x48,0xa5          # rep movsq
+       mov     %rsp,$inp
+       mov     %r8, $out               # restore parameters
+       mov     $chunk,$len
+       jmp     .L${mode}_loop
+.align 16
+.L${mode}_break:
+___
+                                       }
+$code.=<<___;
+       cmp     %rbp,%rsp
        je      .L${mode}_done
 
        pxor    %xmm0,%xmm0
@@ -400,70 +451,87 @@ $code.=<<___;
 .L${mode}_done:
        lea     (%rbp),%rsp
        jmp     .L${mode}_exit
-___
-$code.=<<___ if ($PADLOCK_MARGIN{$mode});
-.align 16
-.L${mode}_short:
-       mov     %rsp,%rbp
-       sub     $len,%rsp
-       xor     $chunk,$chunk
-.L${mode}_short_copy:
-       movups  ($inp,$chunk),%xmm0
-       lea     16($chunk),$chunk
-       cmp     $chunk,$len
-       movaps  %xmm0,-16(%rsp,$chunk)
-       ja      .L${mode}_short_copy
-       mov     %rsp,$inp
-       mov     $len,$chunk
-       jmp     .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
-___
-$code.=<<___;
+
 .align 16
 .L${mode}_aligned:
 ___
 $code.=<<___                           if ($mode eq "ctr32");
        mov     -4($ctx),%eax           # pull 32-bit counter
-       mov     \$`16*0x10000`,$chunk
        bswap   %eax
-       cmp     $len,$chunk
-       cmova   $len,$chunk
        neg     %eax
        and     \$0xffff,%eax
-       jz      .L${mode}_aligned_loop
+       mov     \$`16*0x10000`,$chunk
        shl     \$4,%eax
+       cmovz   $chunk,%rax
        cmp     %rax,$len
        cmova   %rax,$chunk             # don't let counter cross 2^16
-       jmp     .L${mode}_aligned_loop
-.align 16
+       cmovbe  $len,$chunk
+       jbe     .L${mode}_aligned_skip
+
 .L${mode}_aligned_loop:
-       cmp     $len,$chunk
-       cmova   $len,$chunk
        mov     $len,%r10               # save parameters
        mov     $chunk,$len
        mov     $chunk,%r11
-___
-$code.=<<___;
+
        lea     -16($ctx),%rax          # ivp
        lea     16($ctx),%rbx           # key
        shr     \$4,$len                # len/=AES_BLOCK_SIZE
        .byte   0xf3,0x0f,0xa7,$opcode  # rep xcrypt*
-___
-$code.=<<___                           if ($mode !~ /ecb|ctr/);
-       movdqa  (%rax),%xmm0
-       movdqa  %xmm0,-16($ctx)         # copy [or refresh] iv
-___
-$code.=<<___                           if ($mode eq "ctr32");
+
        mov     -4($ctx),%eax           # pull 32-bit counter
        bswap   %eax
        add     \$0x10000,%eax
        bswap   %eax
        mov     %eax,-4($ctx)
 
-       mov     %r11,$chunk             # restore paramters
-       mov     %r10,$len
-       sub     $chunk,$len
+       mov     %r10,$len               # restore paramters
+       sub     %r11,$len
        mov     \$`16*0x10000`,$chunk
-       jnz     .L${mode}_aligned_loop
+       jz      .L${mode}_exit
+       cmp     $chunk,$len
+       jae     .L${mode}_aligned_loop
+
+.L${mode}_aligned_skip:
+___
+$code.=<<___                           if ($PADLOCK_PREFETCH{$mode});
+       lea     ($inp,$len),%rbp
+       neg     %rbp
+       and     \$0xfff,%rbp            # distance to page boundary
+       xor     %eax,%eax
+       cmp     \$$PADLOCK_PREFETCH{$mode},%rbp
+       mov     \$$PADLOCK_PREFETCH{$mode}-1,%rbp
+       cmovae  %rax,%rbp
+       and     $len,%rbp               # remainder
+       sub     %rbp,$len
+       jz      .L${mode}_aligned_tail
+___
+$code.=<<___;
+       lea     -16($ctx),%rax          # ivp
+       lea     16($ctx),%rbx           # key
+       shr     \$4,$len                # len/=AES_BLOCK_SIZE
+       .byte   0xf3,0x0f,0xa7,$opcode  # rep xcrypt*
+___
+$code.=<<___                           if ($mode !~ /ecb|ctr/);
+       movdqa  (%rax),%xmm0
+       movdqa  %xmm0,-16($ctx)         # copy [or refresh] iv
+___
+$code.=<<___                           if ($PADLOCK_PREFETCH{$mode});
+       test    %rbp,%rbp               # check remainder
+       jz      .L${mode}_exit
+
+.L${mode}_aligned_tail:
+       mov     $out,%r8
+       mov     %rbp,$chunk
+       mov     %rbp,$len
+       lea     (%rsp),%rbp
+       sub     $len,%rsp
+       shr     \$3,$len
+       lea     (%rsp),$out
+       .byte   0xf3,0x48,0xa5          # rep movsq     
+       lea     (%r8),$out
+       lea     (%rsp),$inp
+       mov     $chunk,$len
+       jmp     .L${mode}_loop
 ___
 $code.=<<___;
 .L${mode}_exit: