x86_64 assembly pack: make Windows build more robust.

[openssl.git] / crypto / aes / asm / aesni-x86_64.pl
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index ea9cf9404d7bde451b210b72bc105ce1ae0d578e..8a3090491657e9882ee30e21eb0c659666a0e377 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -130,7 +130,7 @@
  # Further data for other parallelizable modes:
  #
  # CBC decrypt                          1.16    0.93    0.93
-# CTR                                  1.14    0.91    n/a
+# CTR                                  1.14    0.91    0.90
  #
  # Well, given 3x column it's probably inappropriate to call the limit
  # asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -157,6 +157,13 @@
  # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
  # in CTR mode AES instruction interleave factor was chosen to be 6x.
  
+######################################################################
+# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# with 128-bit key in CBC encrypt, 0.76 in CBC decrypt, 0.70 in ECB,
+# 0.76 in CTR and 0.95 in XTS. CBC encrypt is latency-bound, so
+# aes[enc|dec] latency is ~9 cycles (5.77*16/10 rounds), while the
+# parallelizable modes show the instructions can issue every cycle.
+
  $PREFIX="aesni";       # if $PREFIX is set to "AES", the script
                         # generates drop-in replacement for
                         # crypto/aes/asm/aes-x86_64.pl:-)
@@ -172,9 +179,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";
  
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
  
-$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
  @_4args=$win64?        ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                 ("%rdi","%rsi","%rdx","%rcx");  # Unix order
  
@@ -821,8 +829,8 @@ ___
  {
  my $cmac="%r9";        # 6th argument
  
-my $increment="%xmm8";
-my $bswap_mask="%xmm9";
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
  
  $code.=<<___;
  .globl aesni_ccm64_encrypt_blocks
@@ -839,30 +847,29 @@ $code.=<<___ if ($win64);
  .Lccm64_enc_body:
  ___
  $code.=<<___;
+       mov     240($key),$rounds               # key->rounds
         movdqu  ($ivp),$iv
-       movdqu  ($cmac),$inout1
         movdqa  .Lincrement64(%rip),$increment
         movdqa  .Lbswap_mask(%rip),$bswap_mask
-       pshufb  $bswap_mask,$iv                 # keep iv in reverse order
  
-       mov     240($key),$rounds               # key->rounds
-       mov     $key,$key_
-       mov     $rounds,$rnds_
+       shr     \$1,$rounds
+       lea     0($key),$key_
+       movdqu  ($cmac),$inout1
         movdqa  $iv,$inout0
-
+       mov     $rounds,$rnds_
+       pshufb  $bswap_mask,$iv
+       jmp     .Lccm64_enc_outer
+.align 16
  .Lccm64_enc_outer:
-       movups  ($inp),$in0                     # load inp
-       pshufb  $bswap_mask,$inout0
-       mov     $key_,$key
+       $movkey ($key_),$rndkey0
         mov     $rnds_,$rounds
+       movups  ($inp),$in0                     # load inp
  
-       $movkey ($key),$rndkey0
-       shr     \$1,$rounds
-       $movkey 16($key),$rndkey1
-       xorps   $rndkey0,$in0
-       lea     32($key),$key
-       xorps   $rndkey0,$inout0
-       xorps   $inout1,$in0                    # cmac^=inp
+       xorps   $rndkey0,$inout0                # counter
+       $movkey 16($key_),$rndkey1
+       xorps   $in0,$rndkey0
+       lea     32($key_),$key
+       xorps   $rndkey0,$inout1                # cmac^=inp
         $movkey ($key),$rndkey0
  
  .Lccm64_enc2_loop:
@@ -877,16 +884,17 @@ $code.=<<___;
         jnz     .Lccm64_enc2_loop
         aesenc  $rndkey1,$inout0
         aesenc  $rndkey1,$inout1
+       paddq   $increment,$iv
         aesenclast      $rndkey0,$inout0
         aesenclast      $rndkey0,$inout1
  
-       paddq   $increment,$iv
         dec     $len
         lea     16($inp),$inp
         xorps   $inout0,$in0                    # inp ^= E(iv)
         movdqa  $iv,$inout0
         movups  $in0,($out)                     # save output
         lea     16($out),$out
+       pshufb  $bswap_mask,$inout0
         jnz     .Lccm64_enc_outer
  
         movups  $inout1,($cmac)
@@ -919,39 +927,40 @@ $code.=<<___ if ($win64);
  .Lccm64_dec_body:
  ___
  $code.=<<___;
-       movdqu  ($ivp),$iv
+       mov     240($key),$rounds               # key->rounds
+       movups  ($ivp),$iv
         movdqu  ($cmac),$inout1
         movdqa  .Lincrement64(%rip),$increment
         movdqa  .Lbswap_mask(%rip),$bswap_mask
  
-       mov     240($key),$rounds               # key->rounds
-       movdqa  $iv,$inout0
-       pshufb  $bswap_mask,$iv                 # keep iv in reverse order
+       movaps  $iv,$inout0
         mov     $rounds,$rnds_
         mov     $key,$key_
+       pshufb  $bswap_mask,$iv
  ___
         &aesni_generate1("enc",$key,$rounds);
  $code.=<<___;
-.Lccm64_dec_outer:
-       paddq   $increment,$iv
         movups  ($inp),$in0                     # load inp
-       xorps   $inout0,$in0
-       movdqa  $iv,$inout0
+       paddq   $increment,$iv
         lea     16($inp),$inp
-       pshufb  $bswap_mask,$inout0
-       mov     $key_,$key
+       jmp     .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+       xorps   $inout0,$in0                    # inp ^= E(iv)
+       movdqa  $iv,$inout0
         mov     $rnds_,$rounds
-       movups  $in0,($out)
+       movups  $in0,($out)                     # save output
         lea     16($out),$out
+       pshufb  $bswap_mask,$inout0
  
         sub     \$1,$len
         jz      .Lccm64_dec_break
  
-       $movkey ($key),$rndkey0
+       $movkey ($key_),$rndkey0
         shr     \$1,$rounds
-       $movkey 16($key),$rndkey1
+       $movkey 16($key_),$rndkey1
         xorps   $rndkey0,$in0
-       lea     32($key),$key
+       lea     32($key_),$key
         xorps   $rndkey0,$inout0
         xorps   $in0,$inout1                    # cmac^=out
         $movkey ($key),$rndkey0
@@ -966,15 +975,20 @@ $code.=<<___;
         aesenc  $rndkey0,$inout1
         $movkey 0($key),$rndkey0
         jnz     .Lccm64_dec2_loop
+       movups  ($inp),$in0                     # load inp
+       paddq   $increment,$iv
         aesenc  $rndkey1,$inout0
         aesenc  $rndkey1,$inout1
+       lea     16($inp),$inp
         aesenclast      $rndkey0,$inout0
+       aesenclast      $rndkey0,$inout1
         jmp     .Lccm64_dec_outer
  
  .align 16
  .Lccm64_dec_break:
+       #xorps  $in0,$inout1                    # cmac^=out
  ___
-       &aesni_generate1("enc",$key,$rounds,$inout1);
+       &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
  $code.=<<___;
         movups  $inout1,($cmac)
  ___
@@ -1000,10 +1014,8 @@ ___
  # does not update *ivec! (see engine/eng_aesni.c for details)
  #
  {
-my $reserved = $win64?0:-0x28;
-my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
-my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
-my $bswap_mask="%xmm15";
+my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15));
+my $len_="%r9";
  
  $code.=<<___;
  .globl aesni_ctr32_encrypt_blocks
@@ -1012,270 +1024,312 @@ $code.=<<___;
  aesni_ctr32_encrypt_blocks:
  ___
  $code.=<<___ if ($win64);
-       lea     -0xc8(%rsp),%rsp
-       movaps  %xmm6,0x20(%rsp)
-       movaps  %xmm7,0x30(%rsp)
-       movaps  %xmm8,0x40(%rsp)
-       movaps  %xmm9,0x50(%rsp)
-       movaps  %xmm10,0x60(%rsp)
-       movaps  %xmm11,0x70(%rsp)
-       movaps  %xmm12,0x80(%rsp)
-       movaps  %xmm13,0x90(%rsp)
-       movaps  %xmm14,0xa0(%rsp)
-       movaps  %xmm15,0xb0(%rsp)
+       lea     -0xa8(%rsp),%rsp
+       movaps  %xmm6,0x00(%rsp)
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+       movaps  %xmm10,0x40(%rsp)
+       movaps  %xmm11,0x50(%rsp)
+       movaps  %xmm12,0x60(%rsp)
+       movaps  %xmm13,0x70(%rsp)
+       movaps  %xmm14,0x80(%rsp)
+       movaps  %xmm15,0x90(%rsp)
  .Lctr32_body:
  ___
  $code.=<<___;
         cmp     \$1,$len
         je      .Lctr32_one_shortcut
  
+       movzb   15($ivp),%rax                   # counter LSB
+       mov     $len,$len_                      # backup $len
+       mov     240($key),$rnds_                # key->rounds
+       mov     $key,$key_                      # backup $key
         movdqu  ($ivp),$ivec
-       movdqa  .Lbswap_mask(%rip),$bswap_mask
-       xor     $rounds,$rounds
-       pextrd  \$3,$ivec,$rnds_                # pull 32-bit counter
-       pinsrd  \$3,$rounds,$ivec               # wipe 32-bit counter
+       neg     %rax
+       movdqa  .Lincrement1(%rip),$one
+       add     \$256,%rax                      # steps to closest overflow
  
-       mov     240($key),$rounds               # key->rounds
-       bswap   $rnds_
-       pxor    $iv0,$iv0                       # vector of 3 32-bit counters
-       pxor    $iv1,$iv1                       # vector of 3 32-bit counters
-       pinsrd  \$0,$rnds_,$iv0
-       lea     3($rnds_),$key_
-       pinsrd  \$0,$key_,$iv1
-       inc     $rnds_
-       pinsrd  \$1,$rnds_,$iv0
-       inc     $key_
-       pinsrd  \$1,$key_,$iv1
-       inc     $rnds_
-       pinsrd  \$2,$rnds_,$iv0
-       inc     $key_
-       pinsrd  \$2,$key_,$iv1
-       movdqa  $iv0,$reserved(%rsp)
-       pshufb  $bswap_mask,$iv0
-       movdqa  $iv1,`$reserved+0x10`(%rsp)
-       pshufb  $bswap_mask,$iv1
-
-       pshufd  \$`3<<6`,$iv0,$inout0           # place counter to upper dword
-       pshufd  \$`2<<6`,$iv0,$inout1
-       pshufd  \$`1<<6`,$iv0,$inout2
-       cmp     \$6,$len
+.Lctr32_grandloop:
+       cmp     %rax,$len
+       cmova   %rax,$len
+       mov     $rnds_,$rounds                  # restore $rounds
+       sub     $len,$len_
+
+       cmp     \$8,$len
         jb      .Lctr32_tail
+
+       $movkey ($key_),$rndkey0
         shr     \$1,$rounds
-       mov     $key,$key_                      # backup $key
-       mov     $rounds,$rnds_                  # backup $rounds
-       sub     \$6,$len
-       jmp     .Lctr32_loop6
+       shr     \$1,$rnds_
+       sub     \$8,$len
+       jmp     .Lctr32_loop8
  
  .align 16
-.Lctr32_loop6:
-       pshufd  \$`3<<6`,$iv1,$inout3
-       por     $ivec,$inout0                   # merge counter-less ivec
-        $movkey        ($key_),$rndkey0
-       pshufd  \$`2<<6`,$iv1,$inout4
-       por     $ivec,$inout1
+.Lctr32_loop8:
          $movkey        16($key_),$rndkey1
-       pshufd  \$`1<<6`,$iv1,$inout5
-       por     $ivec,$inout2
-       por     $ivec,$inout3
-        xorps          $rndkey0,$inout0
-       por     $ivec,$inout4
-       por     $ivec,$inout5
-
-       # inline _aesni_encrypt6 and interleave last rounds
-       # with own code...
-
-       pxor            $rndkey0,$inout1
-       aesenc          $rndkey1,$inout0
-       lea             32($key_),$key
-       pxor            $rndkey0,$inout2
-       aesenc          $rndkey1,$inout1
-        movdqa         .Lincrement32(%rip),$iv1
-       pxor            $rndkey0,$inout3
-       aesenc          $rndkey1,$inout2
-        movdqa         $reserved(%rsp),$iv0
-       pxor            $rndkey0,$inout4
-       aesenc          $rndkey1,$inout3
-       pxor            $rndkey0,$inout5
-       $movkey         ($key),$rndkey0
-       dec             $rounds
-       aesenc          $rndkey1,$inout4
-       aesenc          $rndkey1,$inout5
-       jmp             .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
-       aesenc          $rndkey1,$inout0
-       aesenc          $rndkey1,$inout1
-       dec             $rounds
-       aesenc          $rndkey1,$inout2
-       aesenc          $rndkey1,$inout3
-       aesenc          $rndkey1,$inout4
-       aesenc          $rndkey1,$inout5
-.Lctr32_enc_loop6_enter:
-       $movkey         16($key),$rndkey1
-       aesenc          $rndkey0,$inout0
-       aesenc          $rndkey0,$inout1
-       lea             32($key),$key
-       aesenc          $rndkey0,$inout2
-       aesenc          $rndkey0,$inout3
-       aesenc          $rndkey0,$inout4
-       aesenc          $rndkey0,$inout5
-       $movkey         ($key),$rndkey0
-       jnz             .Lctr32_enc_loop6
-
-       aesenc          $rndkey1,$inout0
-        paddd          $iv1,$iv0               # increment counter vector
-       aesenc          $rndkey1,$inout1
-        paddd          `$reserved+0x10`(%rsp),$iv1
-       aesenc          $rndkey1,$inout2
-        movdqa         $iv0,$reserved(%rsp)    # save counter vector
-       aesenc          $rndkey1,$inout3
-        movdqa         $iv1,`$reserved+0x10`(%rsp)
-       aesenc          $rndkey1,$inout4
-        pshufb         $bswap_mask,$iv0        # byte swap
-       aesenc          $rndkey1,$inout5
-        pshufb         $bswap_mask,$iv1
-
-       aesenclast      $rndkey0,$inout0
-        movups         ($inp),$in0             # load input
-       aesenclast      $rndkey0,$inout1
-        movups         0x10($inp),$in1
-       aesenclast      $rndkey0,$inout2
-        movups         0x20($inp),$in2
-       aesenclast      $rndkey0,$inout3
-        movups         0x30($inp),$in3
-       aesenclast      $rndkey0,$inout4
-        movups         0x40($inp),$rndkey1
-       aesenclast      $rndkey0,$inout5
-        movups         0x50($inp),$rndkey0
-        lea    0x60($inp),$inp
-
-       xorps   $inout0,$in0                    # xor
-        pshufd \$`3<<6`,$iv0,$inout0
-       xorps   $inout1,$in1
-        pshufd \$`2<<6`,$iv0,$inout1
-       movups  $in0,($out)                     # store output
-       xorps   $inout2,$in2
-        pshufd \$`1<<6`,$iv0,$inout2
-       movups  $in1,0x10($out)
-       xorps   $inout3,$in3
-       movups  $in2,0x20($out)
-       xorps   $inout4,$rndkey1
-       movups  $in3,0x30($out)
-       xorps   $inout5,$rndkey0
-       movups  $rndkey1,0x40($out)
-       movups  $rndkey0,0x50($out)
-       lea     0x60($out),$out
+       movdqa          $rndkey0,$inout0
+       movdqa          $rndkey0,$inout1
+       pxor            $ivec,$inout0
+       paddb           $one,$ivec
+       movdqa          $rndkey0,$inout2
+        aesenc         $rndkey1,$inout0
+       pxor            $ivec,$inout1
+       paddb           $one,$ivec
+        lea            32($key_),$key
+       movdqa          $rndkey0,$inout3
+        aesenc         $rndkey1,$inout1
+       pxor            $ivec,$inout2
+       paddb           $one,$ivec
+       movdqa          $rndkey0,$inout4
+        aesenc         $rndkey1,$inout2
+       pxor            $ivec,$inout3
+       paddb           $one,$ivec
+       movdqa          $rndkey0,$inout5
+        aesenc         $rndkey1,$inout3
+       pxor            $ivec,$inout4
+       paddb           $one,$ivec
+       movdqa          $rndkey0,$inout6
+        aesenc         $rndkey1,$inout4
+       pxor            $ivec,$inout5
+       paddb           $one,$ivec
+       movdqa          $rndkey0,$inout7
+        aesenc         $rndkey1,$inout5
+       pxor            $ivec,$inout6
+       paddb           $one,$ivec
+        $movkey        ($key),$rndkey0
+        aesenc         $rndkey1,$inout6
+       pxor            $ivec,$inout7
+       paddb           $one,$ivec
+        dec            $rounds
+        aesenc         $rndkey1,$inout7
+        $movkey        16($key),$rndkey1
+         movups        ($inp),$in0             # load input
+         movups        0x10($inp),$in1
+         movups        0x20($inp),$in2
+         movups        0x30($inp),$in3
+
+       call            .Lenc_loop8_enter
+
+       xorps           $in0,$inout0            # xor
+       movups          0x40($inp),$in0
+       xorps           $in1,$inout1
+       movups          0x50($inp),$in1
+       xorps           $in2,$inout2
+       movups          0x60($inp),$in2
+       xorps           $in3,$inout3
+       movups          0x70($inp),$in3
+       lea             0x80($inp),$inp
+       xorps           $in0,$inout4
+       movups          $inout0,($out)          # store output
+       xorps           $in1,$inout5
+       movups          $inout1,0x10($out)
+       xorps           $in2,$inout6
+       movups          $inout2,0x20($out)
+       xorps           $in3,$inout7
+       movups          $inout3,0x30($out)
+       movups          $inout4,0x40($out)
+       movups          $inout5,0x50($out)
+       movups          $inout6,0x60($out)
+       movups          $inout7,0x70($out)
+       lea             0x80($out),$out
+       
+       $movkey ($key_),$rndkey0
         mov     $rnds_,$rounds
-       sub     \$6,$len
-       jnc     .Lctr32_loop6
+       sub     \$8,$len
+       jnc     .Lctr32_loop8
  
-       add     \$6,$len
-       jz      .Lctr32_done
-       mov     $key_,$key                      # restore $key
         lea     1($rounds,$rounds),$rounds      # restore original value
+       lea     1($rnds_,$rnds_),$rnds_         # restore original value
+       add     \$8,$len
+       jz      .Lctr32_done
  
  .Lctr32_tail:
-       por     $ivec,$inout0
+       mov     $key_,$key                      # restore $key
+       movdqa  $ivec,$inout0
+       paddb   $one,$ivec
         movups  ($inp),$in0
         cmp     \$2,$len
         jb      .Lctr32_one
  
-       por     $ivec,$inout1
+       movdqa  $ivec,$inout1
+       paddb   $one,$ivec
         movups  0x10($inp),$in1
         je      .Lctr32_two
  
-       pshufd  \$`3<<6`,$iv1,$inout3
-       por     $ivec,$inout2
+       movdqa  $ivec,$inout2
+       paddb   $one,$ivec
         movups  0x20($inp),$in2
         cmp     \$4,$len
         jb      .Lctr32_three
  
-       pshufd  \$`2<<6`,$iv1,$inout4
-       por     $ivec,$inout3
+       movdqa  $ivec,$inout3
+       paddb   $one,$ivec
         movups  0x30($inp),$in3
         je      .Lctr32_four
  
-       por     $ivec,$inout4
-       xorps   $inout5,$inout5
+       movdqa  $ivec,$inout4
+       paddb   $one,$ivec
+       cmp     \$6,$len
+       jb      .Lctr32_five
  
-       call    _aesni_encrypt6
+       movdqa  $ivec,$inout5
+       paddb   $one,$ivec
+       je      .Lctr32_six
  
-       movups  0x40($inp),$rndkey1
-       xorps   $inout0,$in0
-       xorps   $inout1,$in1
-       movups  $in0,($out)
-       xorps   $inout2,$in2
-       movups  $in1,0x10($out)
-       xorps   $inout3,$in3
-       movups  $in2,0x20($out)
-       xorps   $inout4,$rndkey1
-       movups  $in3,0x30($out)
-       movups  $rndkey1,0x40($out)
+       movdqa  $ivec,$inout6
+       paddb   $one,$ivec
+       xorps   $inout7,$inout7
+
+       call    _aesni_encrypt8
+
+       xorps           $in0,$inout0            # xor
+       movups          0x40($inp),$in0
+       xorps           $in1,$inout1
+       movups          0x50($inp),$in1
+       xorps           $in2,$inout2
+       movups          0x60($inp),$in2
+       lea             0x70($inp),$inp
+       xorps           $in3,$inout3
+       movups          $inout0,($out)          # store output
+       xorps           $in0,$inout4
+       movups          $inout1,0x10($out)
+       xorps           $in1,$inout5
+       movups          $inout2,0x20($out)
+       xorps           $in2,$inout6
+       movups          $inout3,0x30($out)
+       movups          $inout4,0x40($out)
+       movups          $inout5,0x50($out)
+       movups          $inout6,0x60($out)
+       lea             0x70($out),$out
         jmp     .Lctr32_done
  
  .align 16
  .Lctr32_one_shortcut:
         movups  ($ivp),$inout0
+       xor     $len_,$len_
         movups  ($inp),$in0
         mov     240($key),$rounds               # key->rounds
  .Lctr32_one:
  ___
         &aesni_generate1("enc",$key,$rounds);
  $code.=<<___;
-       xorps   $inout0,$in0
-       movups  $in0,($out)
+       xorps   $in0,$inout0
+       lea     0x10($inp),$inp
+       movups  $inout0,($out)
+       lea     0x10($out),$out
         jmp     .Lctr32_done
  
  .align 16
  .Lctr32_two:
         xorps   $inout2,$inout2
         call    _aesni_encrypt3
-       xorps   $inout0,$in0
-       xorps   $inout1,$in1
-       movups  $in0,($out)
-       movups  $in1,0x10($out)
+       xorps   $in0,$inout0            # xor
+       lea     0x20($inp),$inp
+       xorps   $in1,$inout1
+       movups  $inout0,($out)          # store output
+       movups  $inout1,0x10($out)
+       lea     0x20($out),$out
         jmp     .Lctr32_done
  
  .align 16
  .Lctr32_three:
         call    _aesni_encrypt3
-       xorps   $inout0,$in0
-       xorps   $inout1,$in1
-       movups  $in0,($out)
-       xorps   $inout2,$in2
-       movups  $in1,0x10($out)
-       movups  $in2,0x20($out)
+       xorps   $in0,$inout0            # xor
+       lea     0x30($inp),$inp
+       xorps   $in1,$inout1
+       movups  $inout0,($out)          # store output
+       xorps   $in2,$inout2
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       lea     0x30($out),$out
         jmp     .Lctr32_done
  
  .align 16
  .Lctr32_four:
         call    _aesni_encrypt4
-       xorps   $inout0,$in0
-       xorps   $inout1,$in1
-       movups  $in0,($out)
-       xorps   $inout2,$in2
-       movups  $in1,0x10($out)
-       xorps   $inout3,$in3
-       movups  $in2,0x20($out)
-       movups  $in3,0x30($out)
+       xorps   $in0,$inout0            # xor
+       lea     0x40($inp),$inp
+       xorps   $in1,$inout1
+       movups  $inout0,($out)          # store output
+       xorps   $in2,$inout2
+       movups  $inout1,0x10($out)
+       xorps   $in3,$inout3
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       lea     0x40($out),$out
+       jmp     .Lctr32_done
+
+.align 16
+.Lctr32_five:
+       xorps   $inout5,$inout5
+       call    _aesni_encrypt6
+       xorps   $in0,$inout0            # xor
+       movups  0x40($inp),$in0
+       lea     0x50($inp),$inp
+       xorps   $in1,$inout1
+       movups  $inout0,($out)          # store output
+       xorps   $in2,$inout2
+       movups  $inout1,0x10($out)
+       xorps   $in3,$inout3
+       movups  $inout2,0x20($out)
+       xorps   $in0,$inout4
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       lea     0x50($out),$out
+       jmp     .Lctr32_done
+
+.align 16
+.Lctr32_six:
+       call    _aesni_encrypt6
+       xorps   $in0,$inout0            # xor
+       movups  0x40($inp),$in0
+       xorps   $in1,$inout1
+       movups  0x50($inp),$in1
+       lea     0x60($inp),$inp
+       xorps   $in2,$inout2
+       movups  $inout0,($out)          # store output
+       xorps   $in3,$inout3
+       movups  $inout1,0x10($out)
+       xorps   $in0,$inout4
+       movups  $inout2,0x20($out)
+       xorps   $in1,$inout5
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       lea     0x60($out),$out
  
  .Lctr32_done:
+       test    $len_,$len_
+       jz      .Lctr32_really_done
+
+       movdqa  .Lbswap_mask(%rip),$rndkey1
+       pshufb  $rndkey1,$ivec
+       psrldq  \$14,$one               # 256
+       paddd   $one,$ivec
+       pslldq  \$14,$one
+       pshufb  $rndkey1,$ivec
+       mov     $len_,$len
+       mov     \$256,%rax
+       jmp     .Lctr32_grandloop
+
+.Lctr32_really_done:
  ___
  $code.=<<___ if ($win64);
-       movaps  0x20(%rsp),%xmm6
-       movaps  0x30(%rsp),%xmm7
-       movaps  0x40(%rsp),%xmm8
-       movaps  0x50(%rsp),%xmm9
-       movaps  0x60(%rsp),%xmm10
-       movaps  0x70(%rsp),%xmm11
-       movaps  0x80(%rsp),%xmm12
-       movaps  0x90(%rsp),%xmm13
-       movaps  0xa0(%rsp),%xmm14
-       movaps  0xb0(%rsp),%xmm15
-       lea     0xc8(%rsp),%rsp
-.Lctr32_ret:
+       movaps  0x00(%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       movaps  0x40(%rsp),%xmm10
+       movaps  0x50(%rsp),%xmm11
+       movaps  0x60(%rsp),%xmm12
+       movaps  0x70(%rsp),%xmm13
+       movaps  0x80(%rsp),%xmm14
+       movaps  0x90(%rsp),%xmm15
+       lea     0xa8(%rsp),%rsp
  ___
  $code.=<<___;
+.Lctr32_ret:
         ret
  .size  aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
  ___
@@ -1290,14 +1344,17 @@ ___
  my @tweak=map("%xmm$_",(10..15));
  my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
  my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
-my $frame_size = 0x68 + ($win64?160:0);
+my $frame_size = 0x60 + ($win64?160:0);
  
  $code.=<<___;
  .globl aesni_xts_encrypt
  .type  aesni_xts_encrypt,\@function,6
  .align 16
  aesni_xts_encrypt:
-       lea     -$frame_size(%rsp),%rsp
+       lea     (%rsp),%rax
+       push    %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
  ___
  $code.=<<___ if ($win64);
         movaps  %xmm6,0x60(%rsp)
@@ -1313,6 +1370,7 @@ $code.=<<___ if ($win64);
  .Lxts_enc_body:
  ___
  $code.=<<___;
+       lea     -8(%rax),%rbp
         movups  ($ivp),@tweak[5]                # load clear-text tweak
         mov     240(%r8),$rounds                # key2->rounds
         mov     240($key),$rnds_                # key1->rounds
@@ -1676,7 +1734,8 @@ $code.=<<___ if ($win64);
         movaps  0xf0(%rsp),%xmm15
  ___
  $code.=<<___;
-       lea     $frame_size(%rsp),%rsp
+       lea     (%rbp),%rsp
+       pop     %rbp
  .Lxts_enc_epilogue:
         ret
  .size  aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1687,7 +1746,10 @@ $code.=<<___;
  .type  aesni_xts_decrypt,\@function,6
  .align 16
  aesni_xts_decrypt:
-       lea     -$frame_size(%rsp),%rsp
+       lea     (%rsp),%rax
+       push    %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
  ___
  $code.=<<___ if ($win64);
         movaps  %xmm6,0x60(%rsp)
@@ -1703,6 +1765,7 @@ $code.=<<___ if ($win64);
  .Lxts_dec_body:
  ___
  $code.=<<___;
+       lea     -8(%rax),%rbp
         movups  ($ivp),@tweak[5]                # load clear-text tweak
         mov     240($key2),$rounds              # key2->rounds
         mov     240($key),$rnds_                # key1->rounds
@@ -2102,7 +2165,8 @@ $code.=<<___ if ($win64);
         movaps  0xf0(%rsp),%xmm15
  ___
  $code.=<<___;
-       lea     $frame_size(%rsp),%rsp
+       lea     (%rbp),%rsp
+       pop     %rbp
  .Lxts_dec_epilogue:
         ret
  .size  aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2114,7 +2178,7 @@ ___
  #                          size_t length, const AES_KEY *key,
  #                          unsigned char *ivp,const int enc);
  {
-my $reserved = $win64?0x40:-0x18;      # used in decrypt
+my $frame_size = 0x10 + ($win64?0x40:0);       # used in decrypt
  $code.=<<___;
  .globl ${PREFIX}_cbc_encrypt
  .type  ${PREFIX}_cbc_encrypt,\@function,6
@@ -2170,16 +2234,20 @@ $code.=<<___;
  \f#--------------------------- CBC DECRYPT ------------------------------#
  .align 16
  .Lcbc_decrypt:
+       lea     (%rsp),%rax
+       push    %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
  ___
  $code.=<<___ if ($win64);
-       lea     -0x58(%rsp),%rsp
-       movaps  %xmm6,(%rsp)
-       movaps  %xmm7,0x10(%rsp)
-       movaps  %xmm8,0x20(%rsp)
-       movaps  %xmm9,0x30(%rsp)
+       movaps  %xmm6,0x10(%rsp)
+       movaps  %xmm7,0x20(%rsp)
+       movaps  %xmm8,0x30(%rsp)
+       movaps  %xmm9,0x40(%rsp)
  .Lcbc_decrypt_body:
  ___
  $code.=<<___;
+       lea     -8(%rax),%rbp
         movups  ($ivp),$iv
         mov     $rnds_,$rounds
         cmp     \$0x70,$len
@@ -2187,11 +2255,11 @@ $code.=<<___;
         shr     \$1,$rnds_
         sub     \$0x70,$len
         mov     $rnds_,$rounds
-       movaps  $iv,$reserved(%rsp)
+       movaps  $iv,(%rsp)
         jmp     .Lcbc_dec_loop8_enter
  .align 16
  .Lcbc_dec_loop8:
-       movaps  $rndkey0,$reserved(%rsp)        # save IV
+       movaps  $rndkey0,(%rsp)                 # save IV
         movups  $inout7,($out)
         lea     0x10($out),$out
  .Lcbc_dec_loop8_enter:
@@ -2231,7 +2299,7 @@ $code.=<<___;
  
         movups  ($inp),$rndkey1         # re-load input
         movups  0x10($inp),$rndkey0
-       xorps   $reserved(%rsp),$inout0 # ^= IV
+       xorps   (%rsp),$inout0          # ^= IV
         xorps   $rndkey1,$inout1
         movups  0x20($inp),$rndkey1
         xorps   $rndkey0,$inout2
@@ -2295,11 +2363,11 @@ $code.=<<___;
         jbe     .Lcbc_dec_six
  
         movups  0x60($inp),$inout6
-       movaps  $iv,$reserved(%rsp)     # save IV
+       movaps  $iv,(%rsp)              # save IV
         call    _aesni_decrypt8
         movups  ($inp),$rndkey1
         movups  0x10($inp),$rndkey0
-       xorps   $reserved(%rsp),$inout0 # ^= IV
+       xorps   (%rsp),$inout0          # ^= IV
         xorps   $rndkey1,$inout1
         movups  0x20($inp),$rndkey1
         xorps   $rndkey0,$inout2
@@ -2423,23 +2491,24 @@ $code.=<<___;
         jmp     .Lcbc_dec_ret
  .align 16
  .Lcbc_dec_tail_partial:
-       movaps  $inout0,$reserved(%rsp)
+       movaps  $inout0,(%rsp)
         mov     \$16,%rcx
         mov     $out,%rdi
         sub     $len,%rcx
-       lea     $reserved(%rsp),%rsi
+       lea     (%rsp),%rsi
         .long   0x9066A4F3      # rep movsb
  
  .Lcbc_dec_ret:
  ___
  $code.=<<___ if ($win64);
-       movaps  (%rsp),%xmm6
-       movaps  0x10(%rsp),%xmm7
-       movaps  0x20(%rsp),%xmm8
-       movaps  0x30(%rsp),%xmm9
-       lea     0x58(%rsp),%rsp
+       movaps  0x10(%rsp),%xmm6
+       movaps  0x20(%rsp),%xmm7
+       movaps  0x30(%rsp),%xmm8
+       movaps  0x40(%rsp),%xmm9
  ___
  $code.=<<___;
+       lea     (%rbp),%rsp
+       pop     %rbp
  .Lcbc_ret:
         ret
  .size  ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
@@ -2706,6 +2775,8 @@ $code.=<<___;
         .long   1,0,0,0
  .Lxts_magic:
         .long   0x87,0,1,0
+.Lincrement1:
+       .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
  
  .asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
  .align 64
@@ -2810,11 +2881,11 @@ ctr32_se_handler:
         cmp     %r10,%rbx
         jae     .Lcommon_seh_tail
  
-       lea     0x20(%rax),%rsi         # %xmm save area
+       lea     (%rax),%rsi             # %xmm save area
         lea     512($context),%rdi      # &context.Xmm6
         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
         .long   0xa548f3fc              # cld; rep movsq
-       lea     0xc8(%rax),%rax         # adjust stack pointer
+       lea     0xa8(%rax),%rax         # adjust stack pointer
  
         jmp     .Lcommon_seh_tail
  .size  ctr32_se_handler,.-ctr32_se_handler
@@ -2855,9 +2926,8 @@ xts_se_handler:
         lea     512($context),%rdi      # & context.Xmm6
         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
         .long   0xa548f3fc              # cld; rep movsq
-       lea     0x68+160(%rax),%rax     # adjust stack pointer
  
-       jmp     .Lcommon_seh_tail
+       jmp     .Lcommon_rbp_tail
  .size  xts_se_handler,.-xts_se_handler
  ___
  $code.=<<___;
@@ -2890,11 +2960,16 @@ cbc_se_handler:
         cmp     %r10,%rbx               # context->Rip>="epilogue" label
         jae     .Lcommon_seh_tail
  
-       lea     0(%rax),%rsi            # top of stack
+       lea     16(%rax),%rsi           # %xmm save area
         lea     512($context),%rdi      # &context.Xmm6
         mov     \$8,%ecx                # 4*sizeof(%xmm0)/sizeof(%rax)
         .long   0xa548f3fc              # cld; rep movsq
-       lea     0x58(%rax),%rax         # adjust stack pointer
+
+.Lcommon_rbp_tail:
+       mov     160($context),%rax      # pull context->Rbp
+       mov     (%rax),%rbp             # restore saved %rbp
+       lea     8(%rax),%rax            # adjust stack pointer
+       mov     %rbp,160($context)      # restore context->Rbp
         jmp     .Lcommon_seh_tail
  
  .Lrestore_cbc_rax:
@@ -3018,15 +3093,13 @@ ___
  }
  
  sub rex {
- local *opcode=shift;
- my ($dst,$src)=@_;
-
-   if ($dst>=8 || $src>=8) {
-       $rex=0x40;
-       $rex|=0x04 if($dst>=8);
-       $rex|=0x01 if($src>=8);
-       push @opcode,$rex;
-   }
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04                 if($dst>=8);
+    $rex|=0x01                 if($src>=8);
+    push @opcode,$rex|0x40     if($rex);
  }
  
  sub aesni {