Skylake performance results.

[openssl.git] / crypto / modes / asm / ghash-x86_64.pl
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl

index 6e656ca13b8029cdf105cdd92df58b4e21d0d938..a63c923429f32e1794c482b53445636b3dfcd948 100644 (file)
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -64,6 +64,7 @@
  # Ivy Bridge   1.80(+7%)
  # Haswell      0.55(+93%) (if system doesn't support AVX)
  # Broadwell    0.45(+110%)(if system doesn't support AVX)
+# Skylake      0.44(+110%)(if system doesn't support AVX)
  # Bulldozer    1.49(+27%)
  # Silvermont   2.88(+13%)
  
@@ -74,8 +75,8 @@
  # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
  # sub-optimally in comparison to above mentioned version. But thanks
  # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
  #
  # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  
@@ -576,15 +577,15 @@ $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
         # experimental alternative. special thing about is that there
         # no dependency between the two multiplications... 
         mov             \$`0xE1<<1`,%eax
-       mov             \$0xA040608020C0E000,%r10       # ((7..0)·0xE0)&0xff
+       mov             \$0xA040608020C0E000,%r10       # ((7..0)·0xE0)&0xff
         mov             \$0x07,%r11d
         movq            %rax,$T1
         movq            %r10,$T2
         movq            %r11,$T3                # borrow $T3
         pand            $Xi,$T3
-       pshufb          $T3,$T2                 # ($Xi&7)·0xE0
+       pshufb          $T3,$T2                 # ($Xi&7)·0xE0
         movq            %rax,$T3
-       pclmulqdq       \$0x00,$Xi,$T1          # ·(0xE1<<1)
+       pclmulqdq       \$0x00,$Xi,$T1          # ·(0xE1<<1)
         pxor            $Xi,$T2
         pslldq          \$15,$T2
         paddd           $T2,$T2                 # <<(64+56+1)
@@ -657,7 +658,7 @@ $code.=<<___;
         je              .Lskip4x
  
         sub             \$0x30,$len
-       mov             \$0xA040608020C0E000,%rax       # ((7..0)·0xE0)&0xff
+       mov             \$0xA040608020C0E000,%rax       # ((7..0)·0xE0)&0xff
         movdqu          0x30($Htbl),$Hkey3
         movdqu          0x40($Htbl),$Hkey4