Skylake performance results.

[openssl.git] / crypto / sha / asm / sha1-mb-x86_64.pl
diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl

index 3a19029e5ee0fac30adb2705be31e682a9815342..099c803ebc41e76ef6c90aeb209d31fd1687e6c7 100644 (file)
--- a/crypto/sha/asm/sha1-mb-x86_64.pl
+++ b/crypto/sha/asm/sha1-mb-x86_64.pl
@@ -19,6 +19,7 @@
  # Sandy Bridge (8.16   +5.15=13.3)/n   4.99    5.98            +80%
  # Ivy Bridge   (8.08   +5.14=13.2)/n   4.60    5.54            +68%
  # Haswell(iii) (8.96   +5.00=14.0)/n   3.57    4.55            +160%
+# Skylake      (8.70   +5.00=13.7)/n   3.64    4.20            +145%
  # Bulldozer    (9.76   +5.76=15.5)/n   5.95    6.37            +64%
  #
  # (i)  multi-block CBC encrypt with 128-bit key;
@@ -58,6 +59,10 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
         $avx = ($1>=10) + ($1>=11);
  }
  
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+       $avx = ($2>=3.0) + ($2>3.0);
+}
+
  open OUT,"| \"$^X\" $xlate $flavour $output";
  *STDOUT=*OUT;
  
@@ -103,6 +108,19 @@ my ($i,$a,$b,$c,$d,$e)=@_;
  my $j=$i+1;
  my $k=$i+2;
  
+# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
+# of 4 words you would expect to be loaded per given iteration, one is
+# spilled to the next iteration. In other words, indices in the four
+# input streams are distributed as follows:
+#
+# $i==0:       0,0,0,0,1,1,1,1,2,2,2,
+# $i==1:       2,3,3,3,
+# $i==2:       3,4,4,4,
+# ...
+# $i==13:      14,15,15,15,
+# $i==14:      15
+# 
+# Then at $i==15 Xupdate is applied one iteration in advance...
  $code.=<<___ if ($i==0);
         movd            (@ptr[0]),@Xi[0]
          lea            `16*4`(@ptr[0]),@ptr[0]
@@ -149,7 +167,7 @@ $code.=<<___ if ($i<14);                    # just load input
         psrld   \$2,$b
         paddd   $t2,$e                          # e+=rol(a,5)
          pshufb $tx,@Xi[1]
-        movd           `4*$j-16*4`(@ptr[2]),$t2
+        movd           `4*$k-16*4`(@ptr[2]),$t2
         por     $t1,$b                          # b=rol(b,30)
  ___
  $code.=<<___ if ($i==14);                      # just load input