# switch to AVX alone improves performance by as little as 4% in
# comparison to SSSE3 code path. But below result doesn't look like
# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
-# pair of µ-ops, and it's the additional µ-ops, two per round, that
+# pair of µ-ops, and it's the additional µ-ops, two per round, that
# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
-# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
+# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
# cycles per processed byte. But 'sh[rl]d' is not something that used
# to be fast, nor does it appear to be fast in upcoming Bulldozer
# P4 10.6 -
# AMD K8 7.1 -
# Core2 7.3 6.0/+22% -
-# Atom 12.5 9.3(*)/+35% -
# Westmere 7.3 5.5/+33% -
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
# Haswell 6.5 4.3/+51% 4.1(**)/+58%
# Bulldozer 11.6 6.0/+92%
# VIA Nano 10.6 7.5/+41%
+# Atom 12.5 9.3(*)/+35%
+# Silvermont 14.5 9.9(*)/+46%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
-# It remains mystery [to me] why ILP is limited to 1.7.
+# The discrepancy is due to front-end limitations, the so-called
+# MS-ROM penalties, and on Silvermont also to the limited
+# parallelism of rotate instructions.
#
# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
`ml 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10); # first version supporting AVX
-$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v` =~ /LLVM ([3-9]\.[0-9]+)/ &&
- $1>=3.0); # first version supporting AVX
+# Probe clang/LLVM via "$CC -v" (which prints to stderr, hence 2>&1).
+# Accept any numeric version, e.g. "clang version 10.0.0", rather than only
+# single-digit majors 3-9; the numeric test on $2 still rejects < 3.0.
+$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
+ $2>=3.0); # first version supporting AVX
$shaext=$xmm; ### set to zero if compiling for 1.0.1
&sub ("esp",32);
&movdqu ($ABCD,&QWP(0,$ctx));
- &movd ($E,&QWP(16,$ctx));
+ &movd ($E,&DWP(16,$ctx));
&and ("esp",-32);
&movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap