# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
+# Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145%
# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
#
# (i) multi-block CBC encrypt with 128-bit key;
$avx = ($1>=10) + ($1>=11);
}
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
my $j=$i+1;
my $k=$i+2;
+# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
+# of 4 words you would expect to be loaded per given iteration one is
+# spilled to next iteration. In other words indices in four input
+# streams are distributed as following:
+#
+# $i==0: 0,0,0,0,1,1,1,1,2,2,2,
+# $i==1: 2,3,3,3,
+# $i==2: 3,4,4,4,
+# ...
+# $i==13: 14,15,15,15,
+# $i==14: 15
+#
+# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
movd (@ptr[0]),@Xi[0]
lea `16*4`(@ptr[0]),@ptr[0]
psrld \$2,$b
paddd $t2,$e # e+=rol(a,5)
pshufb $tx,@Xi[1]
- movd `4*$j-16*4`(@ptr[2]),$t2
+ movd `4*$k-16*4`(@ptr[2]),$t2
por $t1,$b # b=rol(b,30)
___
$code.=<<___ if ($i==14); # just load input