Minor sha[256|512]-586 performance tweaks.
author Andy Polyakov <appro@openssl.org>
Sun, 16 Sep 2007 18:47:24 +0000 (18:47 +0000)
committer Andy Polyakov <appro@openssl.org>
Sun, 16 Sep 2007 18:47:24 +0000 (18:47 +0000)
crypto/sha/asm/sha256-586.pl
crypto/sha/asm/sha512-586.pl

index 8fe13bc..4cc1576 100644 (file)
@@ -45,13 +45,17 @@ $Xoff=&DWP(32,"esp");
 $K256="ebp";
 
 sub BODY_00_15() {
+    my $in_16_64=shift;
+
        &mov    ("ecx",$E);
+        &add   ($T,&DWP(4*(8+15+16-9),"esp"))  if ($in_16_64); # T += X[-7]
        &ror    ("ecx",6);
        &mov    ("edi",$E);
        &ror    ("edi",11);
         &mov   ("esi",$Foff);
        &xor    ("ecx","edi");
        &ror    ("edi",25-11);
+        &mov   (&DWP(4*(8+15),"esp"),$T)       if ($in_16_64); # save X[0]
        &xor    ("ecx","edi");  # Sigma1(e)
         &mov   ("edi",$Goff);
        &add    ($T,"ecx");     # T += Sigma1(e)
@@ -88,6 +92,7 @@ sub BODY_00_15() {
 
        &add    ($K256,4);
        &add    ($A,$T);        # h += T
+        &mov   ($T,&DWP(4*(8+15+16-1),"esp"))  if ($in_16_64); # preload T
        &add    ($E,"esi");     # d += K256[i]
        &add    ($A,"esi");     # h += K256[i]
 }
@@ -159,10 +164,10 @@ sub BODY_00_15() {
        &cmp    ("esi",0xc19bf174);
        &jne    (&label("00_15"));
 
+       &mov    ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
 &set_label("16_63",16);
-       &mov    ($T,&DWP(4*(8+15+16-1),"esp"));
-        &mov   ("ecx",&DWP(4*(8+15+16-14),"esp"));
        &mov    ("esi",$T);
+        &mov   ("ecx",&DWP(4*(8+15+16-14),"esp"));
        &shr    ($T,3);
        &ror    ("esi",7);
        &xor    ($T,"esi");
@@ -176,13 +181,13 @@ sub BODY_00_15() {
        &xor    ("ecx","edi");
        &ror    ("edi",19-17);
         &add   ($T,"esi");                     # T += X[-16]
-       &xor    ("ecx","edi")                   # sigma1(X[-2])
+       &xor    ("edi","ecx")                   # sigma1(X[-2])
 
-       &add    ($T,"ecx");                     # T += sigma1(X[-2])
-       &add    ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
-       &mov    (&DWP(4*(8+15),"esp"),$T);      # save X[0]
+       &add    ($T,"edi");                     # T += sigma1(X[-2])
+       # &add  ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
+       # &mov  (&DWP(4*(8+15),"esp"),$T);      # save X[0]
 
-       &BODY_00_15();
+       &BODY_00_15(1);
 
        &cmp    ("esi",0xc67178f2);
        &jne    (&label("16_63"));
index 49a9814..5554f02 100644 (file)
@@ -68,6 +68,8 @@ $E="mm4";     # F-H are commonly loaded to respectively mm1-mm3 and
                # mm5-mm7, but it's done on on-demand basis...
 
 sub BODY_00_15_sse2 {
+    my $prefetch=shift;
+
        &movq   ("mm5",$Fsse2);                 # load f
        &movq   ("mm6",$Gsse2);                 # load g
        &movq   ("mm7",$Hsse2);                 # load h
@@ -96,7 +98,7 @@ sub BODY_00_15_sse2 {
        &pxor   ("mm5","mm6");                  # f^=g
        &movq   ($E,$Dsse2);                    # e = load d
        &paddq  ("mm3","mm5");                  # T1+=Ch(e,f,g)
-
+       &movq   (&QWP(0,"esp"),$A);             # modulo-scheduled save a
        &paddq  ("mm3","mm7");                  # T1+=h
 
        &movq   ("mm5",$A);                     # %mm5 is sliding right
@@ -114,15 +116,16 @@ sub BODY_00_15_sse2 {
        &pxor   ("mm7","mm6");
        &psllq  ("mm6",6);
        &pxor   ("mm7","mm5");
-       &movq   (&QWP(0,"esp"),$A);             # modulo-scheduled save a
+       &sub    ("esp",8);
        &pxor   ("mm7","mm6");                  # T2=Sigma0_512(a)
 
        &movq   ("mm5",$A);                     # %mm5=a
        &por    ($A,"mm2");                     # a=a|c
+       &movq   ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
        &pand   ("mm5","mm2");                  # %mm5=a&c
        &pand   ($A,"mm1");                     # a=(a|c)&b
+       &movq   ("mm2",&QWP(8*(9+16-1),"esp"))  if ($prefetch);
        &por    ("mm5",$A);                     # %mm5=(a&c)|((a|c)&b)
-       &sub    ("esp",8);
        &paddq  ("mm7","mm5");                  # T2+=Maj(a,b,c)
        &movq   ($A,"mm3");                     # a=T1
 
@@ -327,48 +330,48 @@ if ($sse2) {
        &cmp    (&LB("edx"),0x35);
        &jne    (&label("00_14_sse2"));
 
-       &BODY_00_15_sse2();
+       &BODY_00_15_sse2(1);
 
 &set_label("16_79_sse2",16);
-       &movq   ("mm3",&QWP(8*(9+16-1),"esp"));
-       &movq   ("mm6",&QWP(8*(9+16-14),"esp"));
-       &movq   ("mm1","mm3");
+       #&movq  ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15 
+       #&movq  ("mm6",&QWP(8*(9+16-14),"esp"));
+       &movq   ("mm1","mm2");
 
-       &psrlq  ("mm3",1);
+       &psrlq  ("mm2",1);
        &movq   ("mm7","mm6");
        &psrlq  ("mm6",6);
-       &movq   ("mm2","mm3");
+       &movq   ("mm3","mm2");
 
-       &psrlq  ("mm3",7-1);
+       &psrlq  ("mm2",7-1);
        &movq   ("mm5","mm6");
        &psrlq  ("mm6",19-6);
-       &pxor   ("mm2","mm3");
+       &pxor   ("mm3","mm2");
 
-       &psrlq  ("mm3",8-7);
+       &psrlq  ("mm2",8-7);
        &pxor   ("mm5","mm6");
        &psrlq  ("mm6",61-19);
-       &pxor   ("mm2","mm3");
+       &pxor   ("mm3","mm2");
 
-       &movq   ("mm3",&QWP(8*(9+16),"esp"));
+       &movq   ("mm2",&QWP(8*(9+16),"esp"));
 
        &psllq  ("mm1",56);
        &pxor   ("mm5","mm6");
        &psllq  ("mm7",3);
-       &pxor   ("mm2","mm1");
+       &pxor   ("mm3","mm1");
 
-       &paddq  ("mm3",&QWP(8*(9+16-9),"esp"));
+       &paddq  ("mm2",&QWP(8*(9+16-9),"esp"));
 
        &psllq  ("mm1",63-56);
        &pxor   ("mm5","mm7");
        &psllq  ("mm7",45-3);
-       &pxor   ("mm2","mm1");
+       &pxor   ("mm3","mm1");
        &pxor   ("mm5","mm7");
 
-       &paddq  ("mm2","mm5");
-       &paddq  ("mm2","mm3");
-       &movq   (&QWP(8*9,"esp"),"mm2");
+       &paddq  ("mm3","mm5");
+       &paddq  ("mm3","mm2");
+       &movq   (&QWP(8*9,"esp"),"mm3");
 
-       &BODY_00_15_sse2();
+       &BODY_00_15_sse2(1);
 
        &cmp    (&LB("edx"),0x17);
        &jne    (&label("16_79_sse2"));