From 8dc899dee4605b02ae33a82821a712739442eaa7 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 16 Sep 2007 18:47:24 +0000 Subject: [PATCH 1/1] Minor sha[256|512]-586 performance tweaks. --- crypto/sha/asm/sha256-586.pl | 19 +++++++++------ crypto/sha/asm/sha512-586.pl | 45 +++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl index 8fe13bc533..4cc1576b89 100644 --- a/crypto/sha/asm/sha256-586.pl +++ b/crypto/sha/asm/sha256-586.pl @@ -45,13 +45,17 @@ $Xoff=&DWP(32,"esp"); $K256="ebp"; sub BODY_00_15() { + my $in_16_64=shift; + &mov ("ecx",$E); + &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_64); # T += X[-7] &ror ("ecx",6); &mov ("edi",$E); &ror ("edi",11); &mov ("esi",$Foff); &xor ("ecx","edi"); &ror ("edi",25-11); + &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_64); # save X[0] &xor ("ecx","edi"); # Sigma1(e) &mov ("edi",$Goff); &add ($T,"ecx"); # T += Sigma1(e) @@ -88,6 +92,7 @@ sub BODY_00_15() { &add ($K256,4); &add ($A,$T); # h += T + &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_64); # preload T &add ($E,"esi"); # d += K256[i] &add ($A,"esi"); # h += K256[i] } @@ -159,10 +164,10 @@ sub BODY_00_15() { &cmp ("esi",0xc19bf174); &jne (&label("00_15")); + &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1) &set_label("16_63",16); - &mov ($T,&DWP(4*(8+15+16-1),"esp")); - &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); &mov ("esi",$T); + &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); &shr ($T,3); &ror ("esi",7); &xor ($T,"esi"); @@ -176,13 +181,13 @@ sub BODY_00_15() { &xor ("ecx","edi"); &ror ("edi",19-17); &add ($T,"esi"); # T += X[-16] - &xor ("ecx","edi") # sigma1(X[-2]) + &xor ("edi","ecx") # sigma1(X[-2]) - &add ($T,"ecx"); # T += sigma1(X[-2]) - &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] - &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] + &add ($T,"edi"); # T += sigma1(X[-2]) + # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) + # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] - &BODY_00_15(); + &BODY_00_15(1); &cmp ("esi",0xc67178f2); &jne (&label("16_63")); diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl index 49a9814544..5554f022e9 100644 --- a/crypto/sha/asm/sha512-586.pl +++ b/crypto/sha/asm/sha512-586.pl @@ -68,6 +68,8 @@ $E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and # mm5-mm7, but it's done on on-demand basis... sub BODY_00_15_sse2 { + my $prefetch=shift; + &movq ("mm5",$Fsse2); # load f &movq ("mm6",$Gsse2); # load g &movq ("mm7",$Hsse2); # load h @@ -96,7 +98,7 @@ sub BODY_00_15_sse2 { &pxor ("mm5","mm6"); # f^=g &movq ($E,$Dsse2); # e = load d &paddq ("mm3","mm5"); # T1+=Ch(e,f,g) - + &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a &paddq ("mm3","mm7"); # T1+=h &movq ("mm5",$A); # %mm5 is sliding right @@ -114,15 +116,16 @@ sub BODY_00_15_sse2 { &pxor ("mm7","mm6"); &psllq ("mm6",6); &pxor ("mm7","mm5"); - &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a + &sub ("esp",8); &pxor ("mm7","mm6"); # T2=Sigma0_512(a) &movq ("mm5",$A); # %mm5=a &por ($A,"mm2"); # a=a|c + &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch); &pand ("mm5","mm2"); # %mm5=a&c &pand ($A,"mm1"); # a=(a|c)&b + &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch); &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b) - &sub ("esp",8); &paddq ("mm7","mm5"); # T2+=Maj(a,b,c) &movq ($A,"mm3"); # a=T1 @@ -327,48 +330,48 @@ if ($sse2) { &cmp (&LB("edx"),0x35); &jne (&label("00_14_sse2")); - &BODY_00_15_sse2(); + &BODY_00_15_sse2(1); &set_label("16_79_sse2",16); - &movq ("mm3",&QWP(8*(9+16-1),"esp")); - &movq ("mm6",&QWP(8*(9+16-14),"esp")); - &movq ("mm1","mm3"); + #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15 + #&movq ("mm6",&QWP(8*(9+16-14),"esp")); + &movq ("mm1","mm2"); - &psrlq ("mm3",1); + &psrlq ("mm2",1); &movq ("mm7","mm6"); &psrlq ("mm6",6); - &movq ("mm2","mm3"); + &movq ("mm3","mm2"); - &psrlq ("mm3",7-1); + &psrlq ("mm2",7-1); &movq ("mm5","mm6"); &psrlq ("mm6",19-6); - &pxor ("mm2","mm3"); + &pxor ("mm3","mm2"); - &psrlq ("mm3",8-7); + &psrlq ("mm2",8-7); &pxor ("mm5","mm6"); &psrlq ("mm6",61-19); - &pxor ("mm2","mm3"); + &pxor ("mm3","mm2"); - &movq ("mm3",&QWP(8*(9+16),"esp")); + &movq ("mm2",&QWP(8*(9+16),"esp")); &psllq ("mm1",56); &pxor ("mm5","mm6"); &psllq ("mm7",3); - &pxor ("mm2","mm1"); + &pxor ("mm3","mm1"); - &paddq ("mm3",&QWP(8*(9+16-9),"esp")); + &paddq ("mm2",&QWP(8*(9+16-9),"esp")); &psllq ("mm1",63-56); &pxor ("mm5","mm7"); &psllq ("mm7",45-3); - &pxor ("mm2","mm1"); + &pxor ("mm3","mm1"); &pxor ("mm5","mm7"); - &paddq ("mm2","mm5"); - &paddq ("mm2","mm3"); - &movq (&QWP(8*9,"esp"),"mm2"); + &paddq ("mm3","mm5"); + &paddq ("mm3","mm2"); + &movq (&QWP(8*9,"esp"),"mm3"); - &BODY_00_15_sse2(); + &BODY_00_15_sse2(1); &cmp (&LB("edx"),0x17); &jne (&label("16_79_sse2")); -- 2.34.1