perlasm: fix symptom-less bugs, missing semicolons and 'my' declarations.
[openssl.git] / crypto / sha / asm / sha512-586.pl
index 8f215ac2475bde50da488ee166ff203df84bad4c..7eab6a5b88b245e85b96e08a4e85bec7ea1220ad 100644 (file)
@@ -38,6 +38,11 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
 
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
 $Tlo=&DWP(0,"esp");    $Thi=&DWP(4,"esp");
 $Alo=&DWP(8,"esp");    $Ahi=&DWP(8+4,"esp");
 $Blo=&DWP(16,"esp");   $Bhi=&DWP(16+4,"esp");
@@ -63,6 +68,8 @@ $E="mm4";     # F-H are commonly loaded to respectively mm1-mm3 and
                # mm5-mm7, but it's done on on-demand basis...
 
 sub BODY_00_15_sse2 {
+    my $prefetch=shift;
+
        &movq   ("mm5",$Fsse2);                 # load f
        &movq   ("mm6",$Gsse2);                 # load g
        &movq   ("mm7",$Hsse2);                 # load h
@@ -70,7 +77,7 @@ sub BODY_00_15_sse2 {
        &movq   ("mm1",$E);                     # %mm1 is sliding right
        &movq   ("mm2",$E);                     # %mm2 is sliding left
        &psrlq  ("mm1",14);
-       &movq   ($Esse2,$E);                    # module-scheduled save e
+       &movq   ($Esse2,$E);                    # modulo-scheduled save e
        &psllq  ("mm2",23);
        &movq   ("mm3","mm1");                  # %mm3 is T1
        &psrlq  ("mm1",4);
@@ -91,7 +98,7 @@ sub BODY_00_15_sse2 {
        &pxor   ("mm5","mm6");                  # f^=g
        &movq   ($E,$Dsse2);                    # e = load d
        &paddq  ("mm3","mm5");                  # T1+=Ch(e,f,g)
-
+       &movq   (&QWP(0,"esp"),$A);             # modulo-scheduled save a
        &paddq  ("mm3","mm7");                  # T1+=h
 
        &movq   ("mm5",$A);                     # %mm5 is sliding right
@@ -109,15 +116,16 @@ sub BODY_00_15_sse2 {
        &pxor   ("mm7","mm6");
        &psllq  ("mm6",6);
        &pxor   ("mm7","mm5");
-       &movq   (&QWP(0,"esp"),$A);             # module-scheduled save a
+       &sub    ("esp",8);
        &pxor   ("mm7","mm6");                  # T2=Sigma0_512(a)
 
        &movq   ("mm5",$A);                     # %mm5=a
        &por    ($A,"mm2");                     # a=a|c
+       &movq   ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
        &pand   ("mm5","mm2");                  # %mm5=a&c
        &pand   ($A,"mm1");                     # a=(a|c)&b
+       &movq   ("mm2",&QWP(8*(9+16-1),"esp"))  if ($prefetch);
        &por    ("mm5",$A);                     # %mm5=(a&c)|((a|c)&b)
-       &sub    ("esp",8);
        &paddq  ("mm7","mm5");                  # T2+=Maj(a,b,c)
        &movq   ($A,"mm3");                     # a=T1
 
@@ -134,9 +142,9 @@ sub BODY_00_15_x86 {
        &mov    ("edx",$Ehi);
        &mov    ("esi","ecx");
 
-       &shr    ("ecx",9)       # lo>>9
+       &shr    ("ecx",9);      # lo>>9
        &mov    ("edi","edx");
-       &shr    ("edx",9)       # hi>>9
+       &shr    ("edx",9);      # hi>>9
        &mov    ("ebx","ecx");
        &shl    ("esi",14);     # lo<<14
        &mov    ("eax","edx");
@@ -199,9 +207,9 @@ sub BODY_00_15_x86 {
        &mov    ($Dhi,"ebx");
        &mov    ("esi","ecx");
 
-       &shr    ("ecx",2)       # lo>>2
+       &shr    ("ecx",2);      # lo>>2
        &mov    ("edi","edx");
-       &shr    ("edx",2)       # hi>>2
+       &shr    ("edx",2);      # hi>>2
        &mov    ("ebx","ecx");
        &shl    ("esi",4);      # lo<<4
        &mov    ("eax","edx");
@@ -253,7 +261,7 @@ sub BODY_00_15_x86 {
 }
 
 
-&function_begin("sha512_block_data_order",16);
+&function_begin("sha512_block_data_order");
        &mov    ("esi",wparam(0));      # ctx
        &mov    ("edi",wparam(1));      # inp
        &mov    ("eax",wparam(2));      # num
@@ -274,7 +282,8 @@ sub BODY_00_15_x86 {
        &mov    (&DWP(8,"esp"),"eax");  # inp+num*128
        &mov    (&DWP(12,"esp"),"ebx"); # saved sp
 
-       &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("pic_point"));
+if ($sse2) {
+       &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
        &bt     (&DWP(0,"edx"),26);
        &jnc    (&label("loop_x86"));
 
@@ -321,48 +330,48 @@ sub BODY_00_15_x86 {
        &cmp    (&LB("edx"),0x35);
        &jne    (&label("00_14_sse2"));
 
-       &BODY_00_15_sse2();
+       &BODY_00_15_sse2(1);
 
 &set_label("16_79_sse2",16);
-       &movq   ("mm3",&QWP(8*(9+16-1),"esp"));
-       &movq   ("mm6",&QWP(8*(9+16-14),"esp"));
-       &movq   ("mm1","mm3");
+       #&movq  ("mm2",&QWP(8*(9+16-1),"esp")); # prefetched by BODY_00_15_sse2(1)
+       #&movq  ("mm6",&QWP(8*(9+16-14),"esp"));# prefetched by BODY_00_15_sse2(1)
+       &movq   ("mm1","mm2");
 
-       &psrlq  ("mm3",1);
+       &psrlq  ("mm2",1);
        &movq   ("mm7","mm6");
        &psrlq  ("mm6",6);
-       &movq   ("mm2","mm3");
+       &movq   ("mm3","mm2");
 
-       &psrlq  ("mm3",7-1);
+       &psrlq  ("mm2",7-1);
        &movq   ("mm5","mm6");
        &psrlq  ("mm6",19-6);
-       &pxor   ("mm2","mm3");
+       &pxor   ("mm3","mm2");
 
-       &psrlq  ("mm3",8-7);
+       &psrlq  ("mm2",8-7);
        &pxor   ("mm5","mm6");
        &psrlq  ("mm6",61-19);
-       &pxor   ("mm2","mm3");
+       &pxor   ("mm3","mm2");
 
-       &movq   ("mm3",&QWP(8*(9+16),"esp"));
+       &movq   ("mm2",&QWP(8*(9+16),"esp"));
 
        &psllq  ("mm1",56);
        &pxor   ("mm5","mm6");
        &psllq  ("mm7",3);
-       &pxor   ("mm2","mm1");
+       &pxor   ("mm3","mm1");
 
-       &paddq  ("mm3",&QWP(8*(9+16-9),"esp"));
+       &paddq  ("mm2",&QWP(8*(9+16-9),"esp"));
 
        &psllq  ("mm1",63-56);
        &pxor   ("mm5","mm7");
        &psllq  ("mm7",45-3);
-       &pxor   ("mm2","mm1");
+       &pxor   ("mm3","mm1");
        &pxor   ("mm5","mm7");
 
-       &paddq  ("mm2","mm5");
-       &paddq  ("mm2","mm3");
-       &movq   (&QWP(8*9,"esp"),"mm2");
+       &paddq  ("mm3","mm5");
+       &paddq  ("mm3","mm2");
+       &movq   (&QWP(8*9,"esp"),"mm3");
 
-       &BODY_00_15_sse2();
+       &BODY_00_15_sse2(1);
 
        &cmp    (&LB("edx"),0x17);
        &jne    (&label("16_79_sse2"));
@@ -403,7 +412,7 @@ sub BODY_00_15_x86 {
        &emms   ();
        &mov    ("esp",&DWP(8*10+12,"esp"));    # restore sp
 &function_end_A();
-
+}
 &set_label("loop_x86",16);
     # copy input block to stack reversing byte and qword order
     for ($i=0;$i<8;$i++) {
@@ -443,9 +452,9 @@ sub BODY_00_15_x86 {
        &mov    ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
        &mov    ("esi","ecx");
 
-       &shr    ("ecx",1)       # lo>>1
+       &shr    ("ecx",1);      # lo>>1
        &mov    ("edi","edx");
-       &shr    ("edx",1)       # hi>>1
+       &shr    ("edx",1);      # hi>>1
        &mov    ("eax","ecx");
        &shl    ("esi",24);     # lo<<24
        &mov    ("ebx","edx");
@@ -479,9 +488,9 @@ sub BODY_00_15_x86 {
        &mov    ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
        &mov    ("esi","ecx");
 
-       &shr    ("ecx",6)       # lo>>6
+       &shr    ("ecx",6);      # lo>>6
        &mov    ("edi","edx");
-       &shr    ("edx",6)       # hi>>6
+       &shr    ("edx",6);      # hi>>6
        &mov    ("eax","ecx");
        &shl    ("esi",3);      # lo<<3
        &mov    ("ebx","edx");