projects
/
openssl.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Minor sha[256|512]-586 performance tweaks.
[openssl.git]
/
crypto
/
sha
/
asm
/
sha512-586.pl
diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl
index 49a981454494072b7e73e9b8e17d306c43fedaea..5554f022e99b777cfadeb739cee708e0096ddcbd 100644
(file)
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@@ -68,6 +68,8 @@
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on on-demand basis...
sub BODY_00_15_sse2 {
# mm5-mm7, but it's done on on-demand basis...
sub BODY_00_15_sse2 {
+ my $prefetch=shift;
+
&movq ("mm5",$Fsse2); # load f
&movq ("mm6",$Gsse2); # load g
&movq ("mm7",$Hsse2); # load h
&movq ("mm5",$Fsse2); # load f
&movq ("mm6",$Gsse2); # load g
&movq ("mm7",$Hsse2); # load h
@@ -96,7 +98,7 @@
sub BODY_00_15_sse2 {
&pxor ("mm5","mm6"); # f^=g
&movq ($E,$Dsse2); # e = load d
&paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
&pxor ("mm5","mm6"); # f^=g
&movq ($E,$Dsse2); # e = load d
&paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
-
+ &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
&paddq ("mm3","mm7"); # T1+=h
&movq ("mm5",$A); # %mm5 is sliding right
&paddq ("mm3","mm7"); # T1+=h
&movq ("mm5",$A); # %mm5 is sliding right
@@ -114,15 +116,16 @@
sub BODY_00_15_sse2 {
&pxor ("mm7","mm6");
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&pxor ("mm7","mm6");
&psllq ("mm6",6);
&pxor ("mm7","mm5");
- &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
+ &sub ("esp",8);
&pxor ("mm7","mm6"); # T2=Sigma0_512(a)
&movq ("mm5",$A); # %mm5=a
&por ($A,"mm2"); # a=a|c
&pxor ("mm7","mm6"); # T2=Sigma0_512(a)
&movq ("mm5",$A); # %mm5=a
&por ($A,"mm2"); # a=a|c
+ &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
&pand ("mm5","mm2"); # %mm5=a&c
&pand ($A,"mm1"); # a=(a|c)&b
&pand ("mm5","mm2"); # %mm5=a&c
&pand ($A,"mm1"); # a=(a|c)&b
+ &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
&por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
&por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
- &sub ("esp",8);
&paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
&movq ($A,"mm3"); # a=T1
&paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
&movq ($A,"mm3"); # a=T1
@@ -327,48 +330,48 @@
if ($sse2) {
&cmp (&LB("edx"),0x35);
&jne (&label("00_14_sse2"));
&cmp (&LB("edx"),0x35);
&jne (&label("00_14_sse2"));
- &BODY_00_15_sse2();
+ &BODY_00_15_sse2(1);
&set_label("16_79_sse2",16);
&set_label("16_79_sse2",16);
- &movq ("mm3",&QWP(8*(9+16-1),"esp"));
- &movq ("mm6",&QWP(8*(9+16-14),"esp"));
- &movq ("mm1","mm3");
+ #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
+ #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
+ &movq ("mm1","mm2");
- &psrlq ("mm3",1);
+ &psrlq ("mm2",1);
&movq ("mm7","mm6");
&psrlq ("mm6",6);
&movq ("mm7","mm6");
&psrlq ("mm6",6);
- &movq ("mm2","mm3");
+ &movq ("mm3","mm2");
- &psrlq ("mm3",7-1);
+ &psrlq ("mm2",7-1);
&movq ("mm5","mm6");
&psrlq ("mm6",19-6);
&movq ("mm5","mm6");
&psrlq ("mm6",19-6);
- &pxor ("mm2","mm3");
+ &pxor ("mm3","mm2");
- &psrlq ("mm3",8-7);
+ &psrlq ("mm2",8-7);
&pxor ("mm5","mm6");
&psrlq ("mm6",61-19);
&pxor ("mm5","mm6");
&psrlq ("mm6",61-19);
- &pxor ("mm2","mm3");
+ &pxor ("mm3","mm2");
- &movq ("mm3",&QWP(8*(9+16),"esp"));
+ &movq ("mm2",&QWP(8*(9+16),"esp"));
&psllq ("mm1",56);
&pxor ("mm5","mm6");
&psllq ("mm7",3);
&psllq ("mm1",56);
&pxor ("mm5","mm6");
&psllq ("mm7",3);
- &pxor ("mm2","mm1");
+ &pxor ("mm3","mm1");
- &paddq ("mm3",&QWP(8*(9+16-9),"esp"));
+ &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
&psllq ("mm1",63-56);
&pxor ("mm5","mm7");
&psllq ("mm7",45-3);
&psllq ("mm1",63-56);
&pxor ("mm5","mm7");
&psllq ("mm7",45-3);
- &pxor ("mm2","mm1");
+ &pxor ("mm3","mm1");
&pxor ("mm5","mm7");
&pxor ("mm5","mm7");
- &paddq ("mm2","mm5");
- &paddq ("mm2","mm3");
- &movq (&QWP(8*9,"esp"),"mm2");
+ &paddq ("mm3","mm5");
+ &paddq ("mm3","mm2");
+ &movq (&QWP(8*9,"esp"),"mm3");
- &BODY_00_15_sse2();
+ &BODY_00_15_sse2(1);
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_sse2"));
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_sse2"));