#!/usr/bin/env perl
# ====================================================================
-# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# PIII 11.5 -
# P4 10.6 -
# AMD K8 7.1 -
-# Core2 7.3 6.1/+20% -
-# Atom 12.5 9.5(*)/+32% -
-# Westmere 7.3 5.6/+30% -
-# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
+# Core2 7.3 6.0/+22% -
+# Atom 12.5 9.3(*)/+35% -
+# Westmere 7.3 5.5/+33% -
+# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
+# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
+# Haswell 6.5 4.3/+51% 4.1(**)/+58%
+# Bulldozer 11.6 6.0/+92%
+# VIA Nano 10.6 7.5/+41%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# It remains a mystery [to me] why ILP is limited to 1.7.
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
$ymm=1 if ($xmm &&
- `$ENV{CC} -Wa,-V -c -o /dev/null -x assembler /dev/null 2>&1`
+ `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
$1>=2.19); # first version supporting AVX
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.03); # first version supporting AVX
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
+ `ml 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10); # first version supporting AVX
+
&external_label("OPENSSL_ia32cap_P") if ($xmm);
my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0; # hash round
+my $rx=0;
my @T=($T,$tmp1);
my $inp;
&movdqa (&QWP(0+16,"esp"),@X[-3&7]);
&psubd (@X[-3&7],@X[3]);
&movdqa (&QWP(0+32,"esp"),@X[-2&7]);
+ &mov (@T[1],$C);
&psubd (@X[-2&7],@X[3]);
- &movdqa (@X[0],@X[-3&7]);
+ &xor (@T[1],$D);
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ &and (@T[0],@T[1]);
&jmp (&label("loop"));
######################################################################
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
+ eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns));
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
&movdqa (@X[2],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@X[2],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
&pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@X[4],@X[0]);
- &movdqa (@X[2],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@X[2],@X[0]);
eval(shift(@insns));
&pslldq (@X[4],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@X[2],31);
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (@X[3],@X[4]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&psrld (@X[4],30);
- &por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
&movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
&pslld (@X[3],2);
- &pxor (@X[0],@X[4]);
- eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ &pxor (@X[0],@X[4]);
&movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
- &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
+ &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7])
+ &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7);
eval(shift(@insns));
eval(shift(@insns));
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
eval(shift(@insns)); # body_20_39
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
+ &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
if ($Xi%5) {
&movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
}
- &paddd (@X[3],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&pslld (@X[0],2);
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&por (@X[0],@X[2]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- &movdqa (@X[3],@X[0]) if ($Xi<19);
+ &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0])
eval(shift(@insns));
foreach (@insns) { eval; } # remaining instructions
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
my ($a,$b,$c,$d,$e);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@X[3]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@X[3]);
foreach (@insns) { eval; }
foreach (@insns) { eval; }
}
# body_00_19: integer-unit code for one SHA-1 round using the round function
# F(b,c,d) = ((c^d)&b)^d.
# Returns a list of Perl source strings; each string is meant to be eval()'d
# separately so the caller can interleave them with other instructions
# (presumably this sub is handed to the Xupdate_* routines as $body --
# confirm against the call sites).  The last string also advances $j and
# rotates @V and @T so the next round sees the renamed registers.
# When $rx has reached 19 the call is delegated to body_20_39(), which
# special-cases $j==19; otherwise $rx is incremented.
-sub body_00_19 () {
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
 (
 '($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
- '&xor ($c,$d);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2; rotate count is 2 when $j==0, 7 otherwise
+ '&xor (@T[0],$d);', # @T[0]=((c^d)&b)^d
 '&mov (@T[1],$a);', # $b in next round
+
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&xor ($b,$c);', # $c^$d for next round
+
 '&$_rol ($a,5);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&xor ($c,$d);', # restore $c
- '&xor (@T[0],$d);',
- '&add ($e,$a);',
- '&$_ror ($b,$j?7:2);', # $b>>>2
- '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,@T[0]);',
+ '&and (@T[1],$b);', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c);', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 );
}
# body_20_39: integer-unit code for one SHA-1 round using the round function
# F(b,c,d) = b^d^c.
# Returns a list of Perl source strings, each meant to be eval()'d
# separately; the last string also advances $j and rotates @V and @T.
# The conditions inside the strings are evaluated at eval() time against
# the current $j: at $j==19 the pending @T[0] is completed with ^$d,
# for later rounds with ^$c, and the look-ahead xor is skipped once $j
# reaches 79.
# When $rx has reached 39 the call is delegated to body_40_59();
# otherwise $rx is incremented.
-sub body_20_39 () {
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
 (
 '($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
- '&xor (@T[0],$d);', # ($b^$d)
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
 '&mov (@T[1],$a);', # $b in next round
+
 '&$_rol ($a,5);',
- '&xor (@T[0],$c);', # ($b^$d^$c)
- '&add ($e,$a);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round
+
 '&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 );
}
# body_40_59: integer-unit code for one SHA-1 round using the round function
# F(b,c,d) = ((b^c)&(c^d))^c.
# Returns a list of Perl source strings, each meant to be eval()'d
# separately; the last string also advances $j and rotates @V and @T.
# $rx is incremented on every call; this sub never delegates.
# The "if ($j>=40)" guards are evaluated at eval() time, so a round with
# $j<40 skips the and/restore pair and only applies @T[0]^=$c.
-sub body_40_59 () {
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
 (
 '($a,$b,$c,$d,$e)=@V;'.
- '&mov (@T[1],$c);',
- '&xor ($c,$d);',
- '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
- '&and (@T[1],$d);',
- '&and (@T[0],$c);', # ($b&($c^$d))
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40);', # restore $c
+
 '&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[1]);',
- '&mov (@T[1],$a);', # $b in next round
+ '&mov (@T[1],$a);', # $b for next round
+ '&xor (@T[0],$c);',
+
 '&$_rol ($a,5);',
 '&add ($e,@T[0]);',
- '&xor ($c,$d);', # restore $c
- '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&xor (@T[1],$c) if ($j==59);'. # at $j==59: b^d for next round instead
+ '&xor (@T[1],$b) if ($j< 59);', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59);', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+######
# bodyx_00_19: variant of the ((c^d)&b)^d round body written with the
# rorx/andn instructions.
# Returns a list of Perl source strings, each meant to be eval()'d
# separately; the last string also advances $j and rotates @V and @T.
# Unlike the non-x body, X[]+K for the *following* round is added into $d
# here (index ($j+1)&15); after the @V rotation that register is the next
# round's $e.
# When $rx has reached 19 the call is delegated to bodyx_20_39();
# otherwise $rx is incremented.
+sub bodyx_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
+ return &bodyx_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2
+ '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2
+ '&lea ($e,&DWP(0,$e,@T[0]));', # $e+=@T[0]
+ '&rorx (@T[0],$a,5);',
+
+ '&andn (@T[1],$a,$c);',
+ '&and ($a,$b)',
+ '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer
+
+ '&xor (@T[1],$a)',
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
# bodyx_20_39: variant of the b^d^c round body written with rorx.
# Returns a list of Perl source strings, each meant to be eval()'d
# separately; the last string also advances $j and rotates @V and @T.
# The first add picks @T[0] at $j==19 and $b for every other round; the
# look-ahead work for the following round (both xors into $a and the
# X[]+K add into $d) is skipped once $j reaches 79.
# When $rx has reached 39 the call is delegated to bodyx_40_59();
# otherwise $rx is incremented.
+sub bodyx_20_39 () { # b^d^c
+ # on start $b=b^c^d
+ return &bodyx_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&add ($e,($j==19?@T[0]:$b))',
+ '&rorx ($b,@T[1],7);', # $b>>>2
+ '&rorx (@T[0],$a,5);',
+
+ '&xor ($a,$b) if ($j<79);',
+ '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer
+ '&xor ($a,$c) if ($j<79);',
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
# bodyx_40_59: variant of the ((b^c)&(c^d))^c round body written with rorx.
# Returns a list of Perl source strings, each meant to be eval()'d
# separately.  X[]+K for the *following* round is added into $d (index
# ($j+1)&15), and $a is turned into the next round's pre-computed
# round-function value.
# The last list element is the final xor concatenated with the per-round
# bookkeeping ($j++ and the @V/@T rotation).  The ';' after '&xor ($a,$b)'
# is required: without it the concatenated string is not valid Perl, so the
# eval would fail and neither the xor nor the bookkeeping would take effect.
# When $rx has reached 59 the call is delegated to bodyx_20_39();
# otherwise $rx is incremented.
+sub bodyx_40_59 () { # ((b^c)&(c^d))^c
+ # on start $b=((b^c)&(c^d))^c
+ return &bodyx_20_39() if ($rx==59); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&rorx (@T[0],$a,5)',
+ '&lea ($e,&DWP(0,$e,$b))', # $e+=$b
+ '&rorx ($b,@T[1],7)', # $b>>>2
+ '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer
+
+ '&mov (@T[1],$c)',
+ '&xor ($a,$b)', # b^c for next round
+ '&xor (@T[1],$b)', # c^d for next round
+
+ '&and ($a,@T[1])',
+ '&add ($e,@T[0])',
+ '&xor ($a,$b);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 );
}
&mov (&DWP(4,@T[1]),@T[0]);
&add ($E,&DWP(16,@T[1]));
&mov (&DWP(8,@T[1]),$C);
- &mov ($B,@T[0]);
+ &mov ($B,$C);
&mov (&DWP(12,@T[1]),$D);
+ &xor ($B,$D);
&mov (&DWP(16,@T[1]),$E);
- &movdqa (@X[0],@X[-3&7]);
+ &mov (@T[1],@T[0]);
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ &and (@T[0],$B);
+ &mov ($B,$T[1]);
&jmp (&label("loop"));
&function_end("_sha1_block_data_order_ssse3");
+$rx=0; # reset
+
if ($ymm) {
my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
&vpaddd (@X[1],@X[-3&7],@X[3]);
&vpaddd (@X[2],@X[-2&7],@X[3]);
&vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
+ &mov (@T[1],$C);
&vmovdqa(&QWP(0+16,"esp"),@X[1]);
+ &xor (@T[1],$D);
&vmovdqa(&QWP(0+32,"esp"),@X[2]);
+ &and (@T[0],@T[1]);
&jmp (&label("loop"));
sub Xupdate_avx_16_31() # recall that $Xi starts with 4
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
&add ($D,&DWP(12,@T[1]));
&mov (&DWP(4,@T[1]),@T[0]);
&add ($E,&DWP(16,@T[1]));
+ &mov ($B,$C);
&mov (&DWP(8,@T[1]),$C);
- &mov ($B,@T[0]);
+ &xor ($B,$D);
&mov (&DWP(12,@T[1]),$D);
&mov (&DWP(16,@T[1]),$E);
+ &mov (@T[1],@T[0]);
+ &and (@T[0],$B);
+ &mov ($B,@T[1]);
&jmp (&label("loop"));