Implement new multi-threading API

[openssl.git] / crypto / sha / asm / sha1-586.pl
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl

index 9d08a4cad434aff37fa20fa707a3cb44cd82413c..e0b5d83b62018a86a0204b230a56197afcda4e21 100644 (file)
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -66,9 +66,9 @@
  # switch to AVX alone improves performance by as little as 4% in
  # comparison to SSSE3 code path. But below result doesn't look like
  # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
-# pair of µ-ops, and it's the additional µ-ops, two per round, that
+# pair of µ-ops, and it's the additional µ-ops, two per round, that
  # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
-# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
+# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
  # equivalent 'sh[rl]d' that is responsible for the impressive 5.1
  # cycles per processed byte. But 'sh[rl]d' is not something that used
  # to be fast, nor does it appear to be fast in upcoming Bulldozer
@@ -93,16 +93,19 @@
  # P4           10.6            -
  # AMD K8       7.1             -
  # Core2                7.3             6.0/+22%        -
-# Atom         12.5            9.3(*)/+35%     -
  # Westmere     7.3             5.5/+33%        -
  # Sandy Bridge 8.8             6.2/+40%        5.1(**)/+73%
  # Ivy Bridge   7.2             4.8/+51%        4.7(**)/+53%
  # Haswell      6.5             4.3/+51%        4.1(**)/+58%
  # Bulldozer    11.6            6.0/+92%
  # VIA Nano     10.6            7.5/+41%
+# Atom         12.5            9.3(*)/+35%
+# Silvermont   14.5            9.9(*)/+46%
  #
  # (*)  Loop is 1056 instructions long and expected result is ~8.25.
-#      It remains mystery [to me] why ILP is limited to 1.7.
+#      The discrepancy is due to front-end limitations, the
+#      so-called MS-ROM penalties, and on Silvermont also to
+#      rotate's limited parallelism.
  #
  # (**) As per above comment, the result is for AVX *plus* sh[rl]d.
  
@@ -128,8 +131,8 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
                 `ml 2>&1` =~ /Version ([0-9]+)\./ &&
                 $1>=10);        # first version supporting AVX
  
-$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v` =~ /LLVM ([3-9]\.[0-9]+)/ &&
-               $1>=3.0);       # first version supporting AVX
+$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
+               $2>=3.0);       # first version supporting AVX
  
  $shaext=$xmm;  ### set to zero if compiling for 1.0.1
  
@@ -447,7 +450,7 @@ sub sha1msg2        { sha1op38(0xca,@_); }
         &sub    ("esp",32);
  
         &movdqu ($ABCD,&QWP(0,$ctx));
-       &movd   ($E,&QWP(16,$ctx));
+       &movd   ($E,&DWP(16,$ctx));
         &and    ("esp",-32);
         &movdqa ($BSWAP,&QWP(0x50,$tmp1));      # byte-n-word swap