aesni-x86_64.pl: optimize CTR even further.

author Andy Polyakov <appro@openssl.org>
Tue, 19 Mar 2013 19:03:02 +0000 (20:03 +0100)
committer Andy Polyakov <appro@openssl.org>
Tue, 19 Mar 2013 19:03:02 +0000 (20:03 +0100)
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl

index 8a3090491657e9882ee30e21eb0c659666a0e377..1f3c7f848b81db7a2d7a51eeda9f65c49130fcd4 100644 (file)
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -130,7 +130,7 @@
  # Further data for other parallelizable modes:
  #
  # CBC decrypt                          1.16    0.93    0.93
-# CTR                                  1.14    0.91    0.90
+# CTR                                  1.14    0.91    0.86
  #
  # Well, given 3x column it's probably inappropriate to call the limit
  # asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -160,7 +160,7 @@
  ######################################################################
  # For reference, AMD Bulldozer spends 5.77 cycles per byte processed
  # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
-# in ECB, 0.76 in CTR, 0.95 in XTS... This means that aes[enc|dec]
+# in ECB, 0.73 in CTR, 0.95 in XTS... This means that aes[enc|dec]
  # instruction latency is 9 cycles and that they can be issued every
  # cycle.
  
@@ -1062,38 +1062,38 @@ $code.=<<___;
         $movkey ($key_),$rndkey0
         shr     \$1,$rounds
         shr     \$1,$rnds_
+       movdqa  $rndkey0,$inout0
+       movdqa  $rndkey0,$inout1
+       movdqa  $rndkey0,$inout2
+       movdqa  $rndkey0,$inout3
+       movdqa  $rndkey0,$inout4
+       movdqa  $rndkey0,$inout5
+       movdqa  $rndkey0,$inout6
+       movdqa  $rndkey0,$inout7
+       $movkey 16($key_),$rndkey1
         sub     \$8,$len
         jmp     .Lctr32_loop8
  
  .align 16
  .Lctr32_loop8:
-        $movkey        16($key_),$rndkey1
-       movdqa          $rndkey0,$inout0
-       movdqa          $rndkey0,$inout1
         pxor            $ivec,$inout0
         paddb           $one,$ivec
-       movdqa          $rndkey0,$inout2
          aesenc         $rndkey1,$inout0
         pxor            $ivec,$inout1
         paddb           $one,$ivec
          lea            32($key_),$key
-       movdqa          $rndkey0,$inout3
          aesenc         $rndkey1,$inout1
         pxor            $ivec,$inout2
         paddb           $one,$ivec
-       movdqa          $rndkey0,$inout4
          aesenc         $rndkey1,$inout2
         pxor            $ivec,$inout3
         paddb           $one,$ivec
-       movdqa          $rndkey0,$inout5
          aesenc         $rndkey1,$inout3
         pxor            $ivec,$inout4
         paddb           $one,$ivec
-       movdqa          $rndkey0,$inout6
          aesenc         $rndkey1,$inout4
         pxor            $ivec,$inout5
         paddb           $one,$ivec
-       movdqa          $rndkey0,$inout7
          aesenc         $rndkey1,$inout5
         pxor            $ivec,$inout6
         paddb           $one,$ivec
@@ -1104,37 +1104,97 @@ $code.=<<___;
          dec            $rounds
          aesenc         $rndkey1,$inout7
          $movkey        16($key),$rndkey1
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       lea             32($key),$key
+       aesenc          $rndkey0,$inout2
           movups        ($inp),$in0             # load input
+       aesenc          $rndkey0,$inout3
           movups        0x10($inp),$in1
+       aesenc          $rndkey0,$inout4
           movups        0x20($inp),$in2
+       aesenc          $rndkey0,$inout5
           movups        0x30($inp),$in3
+       aesenc          $rndkey0,$inout6
+         movups        0x40($inp),$one
+       aesenc          $rndkey0,$inout7
+       $movkey         ($key),$rndkey0
  
-       call            .Lenc_loop8_enter
+.Lctr32_enc_loop8:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       dec             $rounds
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+       aesenc          $rndkey1,$inout7
+       $movkey         16($key),$rndkey1
  
-       xorps           $in0,$inout0            # xor
-       movups          0x40($inp),$in0
-       xorps           $in1,$inout1
-       movups          0x50($inp),$in1
-       xorps           $in2,$inout2
-       movups          0x60($inp),$in2
-       xorps           $in3,$inout3
-       movups          0x70($inp),$in3
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       lea             32($key),$key
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       aesenc          $rndkey0,$inout6
+       aesenc          $rndkey0,$inout7
+       $movkey         ($key),$rndkey0
+       jnz             .Lctr32_enc_loop8
+
+       aesenc          $rndkey1,$inout0
+       pxor            $rndkey0,$in0
+       aesenc          $rndkey1,$inout1
+       pxor            $rndkey0,$in1
+       aesenc          $rndkey1,$inout2
+       pxor            $rndkey0,$in2
+       aesenc          $rndkey1,$inout3
+       pxor            $rndkey0,$in3
+       aesenc          $rndkey1,$inout4
+       pxor            $rndkey0,$one
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+       aesenc          $rndkey1,$inout7
+       movdqu          0x50($inp),$rndkey1
+       aesenclast      $in0,$inout0
+       movdqu          0x60($inp),$in0
+       pxor            $rndkey0,$rndkey1
+       aesenclast      $in1,$inout1
+       movdqu          0x70($inp),$in1
+       pxor            $rndkey0,$in0
+       aesenclast      $in2,$inout2
+       pxor            $rndkey0,$in1
+       $movkey         ($key_),$rndkey0
+       aesenclast      $in3,$inout3
         lea             0x80($inp),$inp
-       xorps           $in0,$inout4
+       aesenclast      $one,$inout4
+       movdqa          .Lincrement1(%rip),$one
+       aesenclast      $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1
+       aesenclast      $in0,$inout6
+       aesenclast      $in1,$inout7
+
         movups          $inout0,($out)          # store output
-       xorps           $in1,$inout5
+       movdqa          $rndkey0,$inout0
         movups          $inout1,0x10($out)
-       xorps           $in2,$inout6
+       movdqa          $rndkey0,$inout1
         movups          $inout2,0x20($out)
-       xorps           $in3,$inout7
+       movdqa          $rndkey0,$inout2
         movups          $inout3,0x30($out)
+       movdqa          $rndkey0,$inout3
         movups          $inout4,0x40($out)
+       movdqa          $rndkey0,$inout4
         movups          $inout5,0x50($out)
+       movdqa          $rndkey0,$inout5
         movups          $inout6,0x60($out)
+       movdqa          $rndkey0,$inout6
         movups          $inout7,0x70($out)
+       movdqa          $rndkey0,$inout7
         lea             0x80($out),$out
         
-       $movkey ($key_),$rndkey0
         mov     $rnds_,$rounds
         sub     \$8,$len
         jnc     .Lctr32_loop8
author	Andy Polyakov <appro@openssl.org>
	Tue, 19 Mar 2013 19:03:02 +0000 (20:03 +0100)
committer	Andy Polyakov <appro@openssl.org>
	Tue, 19 Mar 2013 19:03:02 +0000 (20:03 +0100)