ghash-x86_64.pl: optimize for upcoming Atom.
author Andy Polyakov <appro@openssl.org>
Wed, 11 Jun 2014 09:34:18 +0000 (11:34 +0200)
committer Andy Polyakov <appro@openssl.org>
Wed, 11 Jun 2014 09:35:23 +0000 (11:35 +0200)
(cherry picked from commit 1cf8f57b43a99275ac0938e9e230bd0434699d87)

crypto/modes/asm/ghash-x86_64.pl

index 04001e6aae793ad4d16bd6dce5c0b30743920da5..f1aca3e5343157e4e67792b64b3a303a2e0b175c 100644 (file)
 # longer. A CPU with higher pclmulqdq issue rate would also benefit
 # from higher aggregate factor...
 #
-# Westmere     1.76(+14%)
-# Sandy Bridge 1.79(+9%)
-# Ivy Bridge   1.79(+8%)
+# Westmere     1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge   1.80(+7%)
 # Haswell      0.55(+93%) (if system doesn't support AVX)
-# Bulldozer    1.52(+25%)
+# Bulldozer    1.49(+27%)
 
 # March 2013
 #
@@ -673,8 +673,8 @@ $code.=<<___;
        pxor            $Xl,$Xm
        pclmulqdq       \$0x00,$Hkey2,$Xl
        pclmulqdq       \$0x11,$Hkey2,$Xh
-       xorps           $Xl,$Xln
        pclmulqdq       \$0x10,$HK,$Xm
+       xorps           $Xl,$Xln
        xorps           $Xh,$Xhn
        movups          0x50($Htbl),$HK
        xorps           $Xm,$Xmn
@@ -692,8 +692,8 @@ $code.=<<___;
         pshufd         \$0b01001110,$Xi,$T1
         pxor           $Xi,$T1
        pclmulqdq       \$0x11,$Hkey3,$Xh
-       xorps           $Xl,$Xln
        pclmulqdq       \$0x00,$HK,$Xm
+       xorps           $Xl,$Xln
        xorps           $Xh,$Xhn
 
        lea     0x40($inp),$inp
@@ -711,23 +711,23 @@ $code.=<<___;
        xorps           $Xln,$Xi
         movdqu         0x20($inp),$Xln
         movdqa         $Xl,$Xh
-        pshufd         \$0b01001110,$Xl,$Xm
        pclmulqdq       \$0x10,$HK,$T1
+        pshufd         \$0b01001110,$Xl,$Xm
        xorps           $Xhn,$Xhi
         pxor           $Xl,$Xm
         pshufb         $T3,$Xln
        movups          0x20($Htbl),$HK
-        pclmulqdq      \$0x00,$Hkey,$Xl
        xorps           $Xmn,$T1
-        movdqa         $Xln,$Xhn
+        pclmulqdq      \$0x00,$Hkey,$Xl
         pshufd         \$0b01001110,$Xln,$Xmn
 
        pxor            $Xi,$T1                 # aggregated Karatsuba post-processing
-        pxor           $Xln,$Xmn
+        movdqa         $Xln,$Xhn
        pxor            $Xhi,$T1                #
+        pxor           $Xln,$Xmn
        movdqa          $T1,$T2                 #
-       pslldq          \$8,$T1
         pclmulqdq      \$0x11,$Hkey,$Xh
+       pslldq          \$8,$T1
        psrldq          \$8,$T2                 #
        pxor            $T1,$Xi
        movdqa          .L7_mask(%rip),$T1
@@ -736,8 +736,8 @@ $code.=<<___;
 
        pand            $Xi,$T1                 # 1st phase
        pshufb          $T1,$T2                 #
-        pclmulqdq      \$0x00,$HK,$Xm
        pxor            $Xi,$T2                 #
+        pclmulqdq      \$0x00,$HK,$Xm
        psllq           \$57,$T2                #
        movdqa          $T2,$T1                 #
        pslldq          \$8,$T2
@@ -764,32 +764,31 @@ $code.=<<___;
         movdqa         $Xl,$Xh
         pxor           $Xm,$Xmn
         pshufd         \$0b01001110,$Xl,$Xm
-        pxor           $Xl,$Xm
-        pclmulqdq      \$0x00,$Hkey3,$Xl
        pxor            $T2,$Xi                 #
        pxor            $T1,$Xhi
+        pxor           $Xl,$Xm
+        pclmulqdq      \$0x00,$Hkey3,$Xl
        psrlq           \$1,$Xi                 #
+       pxor            $Xhi,$Xi                #
+       movdqa          $Xi,$Xhi
         pclmulqdq      \$0x11,$Hkey3,$Xh
         xorps          $Xl,$Xln
-       pxor            $Xhi,$Xi                #
+       pshufd          \$0b01001110,$Xi,$T1
+       pxor            $Xi,$T1
 
         pclmulqdq      \$0x00,$HK,$Xm
         xorps          $Xh,$Xhn
 
-       movdqa          $Xi,$Xhi
-       pshufd          \$0b01001110,$Xi,$T1
-       pxor            $Xi,$T1
-
        lea     0x40($inp),$inp
        sub     \$0x40,$len
        jnc     .Lmod4_loop
 
 .Ltail4x:
        pclmulqdq       \$0x00,$Hkey4,$Xi
-       xorps           $Xm,$Xmn
        pclmulqdq       \$0x11,$Hkey4,$Xhi
-       xorps           $Xln,$Xi
        pclmulqdq       \$0x10,$HK,$T1
+       xorps           $Xm,$Xmn
+       xorps           $Xln,$Xi
        xorps           $Xhn,$Xhi
        pxor            $Xi,$Xhi                # aggregated Karatsuba post-processing
        pxor            $Xmn,$T1
@@ -852,13 +851,13 @@ $code.=<<___;
 
        pxor            $Xln,$Xi                # (H*Ii+1) + H^2*(Ii+Xi)
        pxor            $Xhn,$Xhi
-         movdqu        ($inp),$Xhn             # Ii
+         movdqu        ($inp),$T2              # Ii
        pxor            $Xi,$T1                 # aggregated Karatsuba post-processing
-         pshufb        $T3,$Xhn
+         pshufb        $T3,$T2
          movdqu        16($inp),$Xln           # Ii+1
 
        pxor            $Xhi,$T1
-         pxor          $Xhn,$Xhi               # "Ii+Xi", consume early
+         pxor          $T2,$Xhi                # "Ii+Xi", consume early
        pxor            $T1,$Xmn
         pshufb         $T3,$Xln
        movdqa          $Xmn,$T1                #
@@ -885,9 +884,9 @@ $code.=<<___;
          pxor          $T1,$Xhi                #
        pxor            $Xhn,$Xmn               #
 
-       pclmulqdq       \$0x11,$Hkey,$Xhn       #######
          movdqa        $Xi,$T2                 # 2nd phase
          psrlq         \$1,$Xi
+       pclmulqdq       \$0x11,$Hkey,$Xhn       #######
          pxor          $T2,$Xhi                #
          pxor          $Xi,$T2
          psrlq         \$5,$Xi
@@ -896,7 +895,6 @@ $code.=<<___;
          psrlq         \$1,$Xi                 #
        pclmulqdq       \$0x00,$HK,$Xmn         #######
          pxor          $Xhi,$Xi                #
-         .byte         0x66,0x90
 
        sub             \$0x20,$len
        ja              .Lmod_loop