Skylake performance results.
[openssl.git] / crypto / modes / asm / ghash-x86_64.pl
index 6e656ca13b8029cdf105cdd92df58b4e21d0d938..a63c923429f32e1794c482b53445636b3dfcd948 100644 (file)
@@ -64,6 +64,7 @@
 # Ivy Bridge   1.80(+7%)
 # Haswell      0.55(+93%) (if system doesn't support AVX)
 # Broadwell    0.45(+110%)(if system doesn't support AVX)
+# Skylake      0.44(+110%)(if system doesn't support AVX)
 # Bulldozer    1.49(+27%)
 # Silvermont   2.88(+13%)
 
@@ -74,8 +75,8 @@
 # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
 # sub-optimally in comparison to above mentioned version. But thanks
 # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
@@ -576,15 +577,15 @@ $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
        # experimental alternative. special thing about it is that there
        # is no dependency between the two multiplications...
        mov             \$`0xE1<<1`,%eax
-       mov             \$0xA040608020C0E000,%r10       # ((7..0)·0xE0)&0xff
+       mov             \$0xA040608020C0E000,%r10       # ((7..0)·0xE0)&0xff
        mov             \$0x07,%r11d
        movq            %rax,$T1
        movq            %r10,$T2
        movq            %r11,$T3                # borrow $T3
        pand            $Xi,$T3
-       pshufb          $T3,$T2                 # ($Xi&7)·0xE0
+       pshufb          $T3,$T2                 # ($Xi&7)·0xE0
        movq            %rax,$T3
-       pclmulqdq       \$0x00,$Xi,$T1          # ·(0xE1<<1)
+       pclmulqdq       \$0x00,$Xi,$T1          # ·(0xE1<<1)
        pxor            $Xi,$T2
        pslldq          \$15,$T2
        paddd           $T2,$T2                 # <<(64+56+1)
@@ -657,7 +658,7 @@ $code.=<<___;
        je              .Lskip4x
 
        sub             \$0x30,$len
-       mov             \$0xA040608020C0E000,%rax       # ((7..0)·0xE0)&0xff
+       mov             \$0xA040608020C0E000,%rax       # ((7..0)·0xE0)&0xff
        movdqu          0x30($Htbl),$Hkey3
        movdqu          0x40($Htbl),$Hkey4