Add Broadwell performance results.
[openssl.git] crypto/modes/asm/ghash-x86_64.pl
index f4af85d21b10fbf1e31a8954ed1371b0341bf9dd..6e656ca13b8029cdf105cdd92df58b4e21d0d938 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
 # longer. A CPU with higher pclmulqdq issue rate would also benefit
 # from higher aggregate factor...
 #
-# Westmere     1.76(+14%)
-# Sandy Bridge 1.79(+9%)
-# Ivy Bridge   1.79(+8%)
-# Bulldozer    1.52(+25%)
+# Westmere     1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge   1.80(+7%)
+# Haswell      0.55(+93%) (if system doesn't support AVX)
+# Broadwell    0.45(+110%)(if system doesn't support AVX)
+# Bulldozer    1.49(+27%)
+# Silvermont   2.88(+13%)
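For context, the "aggregate factor" referred to above is the number of input blocks folded into a single reduction. A sketch of the underlying GHASH identity (not taken verbatim from this file; + denotes XOR and · multiplication in GF(2^128)):

    X[i+4] = ((((X[i] + I[i+1])·H + I[i+2])·H + I[i+3])·H + I[i+4])·H
           = (X[i] + I[i+1])·H^4 + I[i+2]·H^3 + I[i+3]·H^2 + I[i+4]·H

The second form lets the four carry-less multiplications by the precomputed powers H..H^4 be issued back to back, with one reduction at the end, which is why a CPU with a higher pclmulqdq issue rate rewards a larger factor.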
 
 # March 2013
 #
@@ -70,9 +73,9 @@
 # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
 # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
 # sub-optimally in comparison to above mentioned version. But thanks
-# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
-# it will perform better on upcoming Haswell processor. [Exact
-# performance numbers to be added at launch.]
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs at 0.41 cycles per byte on the Haswell processor, and
+# at 0.29 on Broadwell.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
@@ -102,6 +105,10 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
        $avx = ($1>=10) + ($1>=11);
 }
 
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+       $avx = ($2>=3.0) + ($2>3.0);
+}
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
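As a side note on the compiler probe added above, a minimal standalone Perl sketch of how that regex maps clang/LLVM version banners to the $avx level; the banner strings are made-up examples, not taken from any build log:

    use strict;
    use warnings;

    for my $banner ("clang version 3.0 (tags/RELEASE_30 final)",
                    "Apple LLVM version 5.1 (based on LLVM 3.4svn)") {
        my $avx = 0;
        if ($banner =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
            $avx = ($2>=3.0) + ($2>3.0);  # 1 enables AVX, 2 additionally AVX2 paths
        }
        print "$banner -> avx=$avx\n";    # prints avx=1, then avx=2
    }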
 
@@ -214,6 +221,7 @@ ___
 
 $code=<<___;
 .text
+.extern        OPENSSL_ia32cap_P
 
 .globl gcm_gmult_4bit
 .type  gcm_gmult_4bit,\@function,2
@@ -597,7 +605,8 @@ ___
 }
 \f
 { my ($Xip,$Htbl,$inp,$len)=@_4args;
-  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
+  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
 
 $code.=<<___;
 .globl gcm_ghash_clmul
@@ -624,7 +633,6 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        movdqa          .Lbswap_mask(%rip),$T3
-       mov             \$0xA040608020C0E000,%rax       # ((7..0)·0xE0)&0xff
 
        movdqu          ($Xip),$Xi
        movdqu          ($Htbl),$Hkey
@@ -640,10 +648,16 @@ if ($do4xaggr) {
 my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
 
 $code.=<<___;
+       mov             OPENSSL_ia32cap_P+4(%rip),%eax
        cmp             \$0x30,$len
        jb              .Lskip4x
 
+       and             \$`1<<26|1<<22`,%eax    # isolate MOVBE+XSAVE
+       cmp             \$`1<<22`,%eax          # check for MOVBE without XSAVE
+       je              .Lskip4x
+
        sub             \$0x30,$len
+       mov             \$0xA040608020C0E000,%rax       # ((7..0)·0xE0)&0xff
        movdqu          0x30($Htbl),$Hkey3
        movdqu          0x40($Htbl),$Hkey4
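The capability guard added in this hunk can be read as the following hedged Perl sketch (subroutine and variable names are illustrative, not part of the patch): it takes the second word of OPENSSL_ia32cap_P (CPUID leaf 1, ECX), isolates MOVBE (bit 22) and XSAVE (bit 26), and branches to .Lskip4x either when fewer than 0x30 bytes remain or when MOVBE is reported without XSAVE:

    # Returns true when the 4x-aggregated loop should be entered.
    sub want_4x_path {
        my ($ecx_caps, $len) = @_;
        return 0 if $len < 0x30;                    # cmp $0x30,$len / jb .Lskip4x
        my $bits = $ecx_caps & ((1<<26) | (1<<22)); # isolate MOVBE+XSAVE
        return 0 if $bits == (1<<22);               # MOVBE without XSAVE
        return 1;
    }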
 
@@ -666,8 +680,8 @@ $code.=<<___;
        pxor            $Xl,$Xm
        pclmulqdq       \$0x00,$Hkey2,$Xl
        pclmulqdq       \$0x11,$Hkey2,$Xh
-       xorps           $Xl,$Xln
        pclmulqdq       \$0x10,$HK,$Xm
+       xorps           $Xl,$Xln
        xorps           $Xh,$Xhn
        movups          0x50($Htbl),$HK
        xorps           $Xm,$Xmn
@@ -685,8 +699,8 @@ $code.=<<___;
         pshufd         \$0b01001110,$Xi,$T1
         pxor           $Xi,$T1
        pclmulqdq       \$0x11,$Hkey3,$Xh
-       xorps           $Xl,$Xln
        pclmulqdq       \$0x00,$HK,$Xm
+       xorps           $Xl,$Xln
        xorps           $Xh,$Xhn
 
        lea     0x40($inp),$inp
@@ -704,23 +718,23 @@ $code.=<<___;
        xorps           $Xln,$Xi
         movdqu         0x20($inp),$Xln
         movdqa         $Xl,$Xh
-        pshufd         \$0b01001110,$Xl,$Xm
        pclmulqdq       \$0x10,$HK,$T1
+        pshufd         \$0b01001110,$Xl,$Xm
        xorps           $Xhn,$Xhi
         pxor           $Xl,$Xm
         pshufb         $T3,$Xln
        movups          0x20($Htbl),$HK
-        pclmulqdq      \$0x00,$Hkey,$Xl
        xorps           $Xmn,$T1
-        movdqa         $Xln,$Xhn
+        pclmulqdq      \$0x00,$Hkey,$Xl
         pshufd         \$0b01001110,$Xln,$Xmn
 
        pxor            $Xi,$T1                 # aggregated Karatsuba post-processing
-        pxor           $Xln,$Xmn
+        movdqa         $Xln,$Xhn
        pxor            $Xhi,$T1                #
+        pxor           $Xln,$Xmn
        movdqa          $T1,$T2                 #
-       pslldq          \$8,$T1
         pclmulqdq      \$0x11,$Hkey,$Xh
+       pslldq          \$8,$T1
        psrldq          \$8,$T2                 #
        pxor            $T1,$Xi
        movdqa          .L7_mask(%rip),$T1
@@ -729,8 +743,8 @@ $code.=<<___;
 
        pand            $Xi,$T1                 # 1st phase
        pshufb          $T1,$T2                 #
-        pclmulqdq      \$0x00,$HK,$Xm
        pxor            $Xi,$T2                 #
+        pclmulqdq      \$0x00,$HK,$Xm
        psllq           \$57,$T2                #
        movdqa          $T2,$T1                 #
        pslldq          \$8,$T2
@@ -757,32 +771,31 @@ $code.=<<___;
         movdqa         $Xl,$Xh
         pxor           $Xm,$Xmn
         pshufd         \$0b01001110,$Xl,$Xm
-        pxor           $Xl,$Xm
-        pclmulqdq      \$0x00,$Hkey3,$Xl
        pxor            $T2,$Xi                 #
        pxor            $T1,$Xhi
+        pxor           $Xl,$Xm
+        pclmulqdq      \$0x00,$Hkey3,$Xl
        psrlq           \$1,$Xi                 #
+       pxor            $Xhi,$Xi                #
+       movdqa          $Xi,$Xhi
         pclmulqdq      \$0x11,$Hkey3,$Xh
         xorps          $Xl,$Xln
-       pxor            $Xhi,$Xi                #
+       pshufd          \$0b01001110,$Xi,$T1
+       pxor            $Xi,$T1
 
         pclmulqdq      \$0x00,$HK,$Xm
         xorps          $Xh,$Xhn
 
-       movdqa          $Xi,$Xhi
-       pshufd          \$0b01001110,$Xi,$T1
-       pxor            $Xi,$T1
-
        lea     0x40($inp),$inp
        sub     \$0x40,$len
        jnc     .Lmod4_loop
 
 .Ltail4x:
        pclmulqdq       \$0x00,$Hkey4,$Xi
-       xorps           $Xm,$Xmn
        pclmulqdq       \$0x11,$Hkey4,$Xhi
-       xorps           $Xln,$Xi
        pclmulqdq       \$0x10,$HK,$T1
+       xorps           $Xm,$Xmn
+       xorps           $Xln,$Xi
        xorps           $Xhn,$Xhi
        pxor            $Xi,$Xhi                # aggregated Karatsuba post-processing
        pxor            $Xmn,$T1
@@ -819,51 +832,54 @@ $code.=<<___;
        pxor            $T1,$Xi                 # Ii+Xi
 
        movdqa          $Xln,$Xhn
-       pshufd          \$0b01001110,$Xln,$T1
-       pxor            $Xln,$T1
+       pshufd          \$0b01001110,$Xln,$Xmn
+       pxor            $Xln,$Xmn
        pclmulqdq       \$0x00,$Hkey,$Xln
        pclmulqdq       \$0x11,$Hkey,$Xhn
-       pclmulqdq       \$0x00,$HK,$T1
+       pclmulqdq       \$0x00,$HK,$Xmn
 
        lea             32($inp),$inp           # i+=2
+       nop
        sub             \$0x20,$len
        jbe             .Leven_tail
+       nop
        jmp             .Lmod_loop
 
 .align 32
 .Lmod_loop:
        movdqa          $Xi,$Xhi
-       pshufd          \$0b01001110,$Xi,$T2    #
-       pxor            $Xi,$T2                 #
+       movdqa          $Xmn,$T1
+       pshufd          \$0b01001110,$Xi,$Xmn   #
+       pxor            $Xi,$Xmn                #
 
        pclmulqdq       \$0x00,$Hkey2,$Xi
        pclmulqdq       \$0x11,$Hkey2,$Xhi
-       pclmulqdq       \$0x10,$HK,$T2
+       pclmulqdq       \$0x10,$HK,$Xmn
 
        pxor            $Xln,$Xi                # (H*Ii+1) + H^2*(Ii+Xi)
        pxor            $Xhn,$Xhi
-         movdqu        ($inp),$Xhn             # Ii
-         pshufb        $T3,$Xhn
+         movdqu        ($inp),$T2              # Ii
+       pxor            $Xi,$T1                 # aggregated Karatsuba post-processing
+         pshufb        $T3,$T2
          movdqu        16($inp),$Xln           # Ii+1
 
-       pxor            $Xi,$T1                 # aggregated Karatsuba post-processing
        pxor            $Xhi,$T1
-         pxor          $Xhn,$Xhi               # "Ii+Xi", consume early
-       pxor            $T1,$T2
+         pxor          $T2,$Xhi                # "Ii+Xi", consume early
+       pxor            $T1,$Xmn
         pshufb         $T3,$Xln
-       movdqa          $T2,$T1                 #
+       movdqa          $Xmn,$T1                #
        psrldq          \$8,$T1
-       pslldq          \$8,$T2                 #
+       pslldq          \$8,$Xmn                #
        pxor            $T1,$Xhi
-       pxor            $T2,$Xi                 #
+       pxor            $Xmn,$Xi                #
 
        movdqa          $Xln,$Xhn               #
 
          movdqa        $Xi,$T2                 # 1st phase
          movdqa        $Xi,$T1
          psllq         \$5,$Xi
-       pclmulqdq       \$0x00,$Hkey,$Xln       #######
          pxor          $Xi,$T1                 #
+       pclmulqdq       \$0x00,$Hkey,$Xln       #######
          psllq         \$1,$Xi
          pxor          $T1,$Xi                 #
          psllq         \$57,$Xi                #
@@ -871,44 +887,45 @@ $code.=<<___;
          pslldq        \$8,$Xi
          psrldq        \$8,$T1                 #       
          pxor          $T2,$Xi
+       pshufd          \$0b01001110,$Xhn,$Xmn
          pxor          $T1,$Xhi                #
-       pshufd          \$0b01001110,$Xhn,$T1
-       pxor            $Xhn,$T1                #
+       pxor            $Xhn,$Xmn               #
 
-       pclmulqdq       \$0x11,$Hkey,$Xhn       #######
          movdqa        $Xi,$T2                 # 2nd phase
          psrlq         \$1,$Xi
+       pclmulqdq       \$0x11,$Hkey,$Xhn       #######
          pxor          $T2,$Xhi                #
          pxor          $Xi,$T2
          psrlq         \$5,$Xi
          pxor          $T2,$Xi                 #
+       lea             32($inp),$inp
          psrlq         \$1,$Xi                 #
-       pclmulqdq       \$0x00,$HK,$T1          #######
+       pclmulqdq       \$0x00,$HK,$Xmn         #######
          pxor          $Xhi,$Xi                #
 
-       lea             32($inp),$inp
        sub             \$0x20,$len
        ja              .Lmod_loop
 
 .Leven_tail:
         movdqa         $Xi,$Xhi
-        pshufd         \$0b01001110,$Xi,$T2    #
-        pxor           $Xi,$T2                 #
+        movdqa         $Xmn,$T1
+        pshufd         \$0b01001110,$Xi,$Xmn   #
+        pxor           $Xi,$Xmn                #
 
        pclmulqdq       \$0x00,$Hkey2,$Xi
        pclmulqdq       \$0x11,$Hkey2,$Xhi
-       pclmulqdq       \$0x10,$HK,$T2
+       pclmulqdq       \$0x10,$HK,$Xmn
 
        pxor            $Xln,$Xi                # (H*Ii+1) + H^2*(Ii+Xi)
        pxor            $Xhn,$Xhi
        pxor            $Xi,$T1
        pxor            $Xhi,$T1
-       pxor            $T1,$T2
-       movdqa          $T2,$T1                 #
+       pxor            $T1,$Xmn
+       movdqa          $Xmn,$T1                #
        psrldq          \$8,$T1
-       pslldq          \$8,$T2                 #
+       pslldq          \$8,$Xmn                #
        pxor            $T1,$Xhi
-       pxor            $T2,$Xi                 #
+       pxor            $Xmn,$Xi                #
 ___
        &reduction_alg9 ($Xhi,$Xi);
 $code.=<<___;