Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
author Andy Polyakov <appro@openssl.org>
Mon, 27 Nov 2006 14:59:35 +0000 (14:59 +0000)
committer Andy Polyakov <appro@openssl.org>
Mon, 27 Nov 2006 14:59:35 +0000 (14:59 +0000)
doesn't give a performance improvement.

crypto/bn/asm/x86-mont.pl

index 8d01b7a87f9897f6214b9bd17ed127ab97462576..01eb1473d51b9233f79fa0b45147d697829d4f56 100755 (executable)
@@ -2,8 +2,9 @@
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
 # October 2005
@@ -31,12 +32,12 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 
 &function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 
-$i="ebx";
+$i="edx";
 $j="ecx";
 $ap="esi";
 $rp="edi";     $bp="edi";              # overlapping variables!!!
-$np="edx";
-$num="ebp";
+$np="ebp";
+$num="ebx";
 
 $_rp=&DWP(4*0,"esp");                  # stack top layout
 $_ap=&DWP(4*1,"esp");
@@ -45,21 +46,13 @@ $_np=&DWP(4*3,"esp");
 $_n0=&DWP(4*4,"esp");
 $_num=&DWP(4*5,"esp");
 $_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
 $frame=32;                             # size of above frame rounded up to 16n
 
-$acc0="mm0";                           # mmx register bank layout
-$acc1="mm1";
-$car0="mm2";
-$car1="mm3";
-$mul0="mm4";
-$mul1="mm5";
-$temp="mm6";
-$mask="mm7";
-
-if($sse2) {
-       &picmeup("eax","OPENSSL_ia32cap_P");
-       &bt     (&DWP(0,"eax"),26);
-       &jnc    (&label("non_sse2"));
+       &xor    ("eax","eax");
+       &mov    ("edi",&wparam(5));     # int num
+       &cmp    ("edi",3);
+       &jb     (&label("just_leave"));
 
        ################################# load argument block...
        &mov    ("eax",&wparam(0));     # BN_ULONG *rp
@@ -67,16 +60,14 @@ if($sse2) {
        &mov    ("ecx",&wparam(2));     # const BN_ULONG *bp
        &mov    ("edx",&wparam(3));     # const BN_ULONG *np
        &mov    ("esi",&wparam(4));     # const BN_ULONG *n0
-       &mov    ($num,&wparam(5));      # int num
+       #&mov   ("edi",&wparam(5));     # int num
 
-       &mov    ("edi","esp");          # saved stack pointer!
-       &add    ($num,1);               # extra word on top of tp
-       &neg    ($num);
-       &lea    ("esp",&DWP(-$frame,"esp",$num,4));     # alloca($frame+8*($num+1))
-       &neg    ($num);
-       &and    ("esp",-1024);          # minimize TLB utilization
-       &sub    ($num,1);               # num is restored to its original value
-                                       # and will remain constant from now...
+       &mov    ("ebp","esp");          # saved stack pointer!
+       &add    ("edi",2);              # extra two words on top of tp
+       &neg    ("edi");
+       &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
+       &neg    ("edi");
+       &and    ("esp",-4096);          # minimize TLB utilization
 
        &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
        &mov    ($_rp,"eax");           # ... save a copy of argument block
@@ -84,8 +75,23 @@ if($sse2) {
        &mov    ($_bp,"ecx");
        &mov    ($_np,"edx");
        &mov    ($_n0,"esi");
-       #&mov   ($_num,$num);           # redundant in sse2 context
-       &mov    ($_sp,"edi");           # saved stack pointer!
+       &lea    ($num,&DWP(-2,"edi"));  # num is restored to its original value
+       #&mov   ($_num,$num);           # redundant as $num is not reused
+       &mov    ($_sp,"ebp");           # saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";   # mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+       &picmeup("eax","OPENSSL_ia32cap_P");
+       &bt     (&DWP(0,"eax"),26);
+       &jnc    (&label("non_sse2"));
 
        &mov    ("eax",-1);
        &movd   ($mask,"eax");          # mask 32 lower bits
@@ -195,7 +201,153 @@ if($sse2) {
        &jl     (&label("outer"));
 
        &emms   ();                             # done with mmx bank
+       &jmp    (&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
 
+if (1) {
+       &mov    ("esp",$_sp);
+       &xor    ("eax","eax");  # signal "not fast enough [yet]"
+       &jmp    (&label("just_leave"));
+       # The code below gives a ~15% improvement on the 512-bit benchmark
+       # *only* :-( For all other key lengths it's slower by up to 20%.
+       # This is because the original code path reduces the overall
+       # number of multiplications by ~25% by deploying bn_sqr_words.
+       # In other words, for the code below to be competitive, a
+       # dedicated squaring procedure is a must...
+} else {
+$inp="esi";    # integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+       &sub    ($num,1);               # non-SSE2 path uses num-1
+
+       &mov    ($inp,$_ap);
+       &mov    ($word,$_bp);
+       &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
+       &mov    ($word,&DWP(0,$word));                  # bp[0]
+       &mov    ($_bpend,"eax");
+       &xor    ($j,$j);
+       &xor    ("edx","edx");
+
+&set_label("mull",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*bp[0]
+       &lea    ($j,&DWP(1,$j));
+       &add    ("eax",$carry);
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$j,4),"eax");      # tp[j]=
+       &cmp    ($j,$num);
+       &jb     (&label("mull"));
+
+       &mov    ("eax",&DWP(0,$inp,$num,4));            # ap[num-1]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*bp[0]
+       &add    ("eax",$carry);
+       &adc    ("edx",0);
+
+       &mov    ($word,$_n0);
+       &mov    ($inp,$_np);
+       &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
+       &xor    ($j,$j);
+       &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+
+       &mov    ("eax",&DWP(0,$inp));                   # np[0]
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &adc    ("edx",0);
+       &mov    ($j,1);
+
+       &jmp    (&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*bp[i]
+       &lea    ($j,&DWP(1,$j));
+       &add    ("eax",&DWP($frame-4,"esp",$j,4));      # +=tp[j]
+       &adc    ("edx",0);
+       &add    ("eax",$carry);
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$j,4),"eax");      # tp[j]=
+       &cmp    ($j,$num);
+       &jb     (&label("1stmadd"));
+
+       &mov    ("eax",&DWP(0,$inp,$num,4));            # ap[num-1]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*bp[i]
+       &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
+       &adc    ("edx",0);
+       &add    ("eax",$carry);
+       &adc    ("edx",0);
+
+       &mov    ($word,$_n0);
+       &mov    ($inp,$_np);
+       &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &xor    ($j,$j);
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
+       &adc    ($j,0);
+       &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+
+       &mov    ("eax",&DWP(0,$inp));                   # np[0]
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &adc    ("edx",0);
+       &mov    ($j,1);
+
+&set_label("2ndmadd",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &lea    ($j,&DWP(1,$j));
+       &add    ("eax",&DWP($frame-4,"esp",$j,4));      # +=tp[j]
+       &adc    ("edx",0);
+       &add    ("eax",$carry);
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-8,"esp",$j,4),"eax");      # tp[j-1]=
+       &cmp    ($j,$num);
+       &jb     (&label("2ndmadd"));
+
+       &mov    ("eax",&DWP(0,$inp,$num,4));            # np[num-1]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[num-1]*m
+       &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
+       &adc    ("edx",0);
+       &add    ("eax",$carry);
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$num,4),"eax");    # tp[num-2]=
+
+       &xor    ("eax","eax");
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+       &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+       &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+
+       &mov    ($carry,$_bp);                          # &bp[i]
+       &add    ($carry,4);
+       &cmp    ($carry,$_bpend);
+       &je     (&label("x86done"));
+       &mov    ($word,&DWP(0,$carry));                 # bp[i]
+       &mov    ($inp,$_ap);
+       &mov    ($_bp,$carry);                          # &bp[++i]
+       &xor    ($j,$j);
+       &xor    ("edx","edx");
+       &jmp    (&label("1stmadd"));
+
+&set_label("x86done",16);
+       &mov    ($np,$_np);     # make adjustments for tail processing
+       &add    ($num,1);
+}
+
+&set_label("common_tail",16);
        &mov    ("esi",&DWP($frame,"esp",$num,4));# load upmost overflow bit
        &mov    ($rp,$_rp);                     # load result pointer
                                                # [$ap and $bp are zapped]
@@ -206,15 +358,15 @@ if($sse2) {
        &mov    ("eax",&DWP($frame,"esp",$j,4));
        &cmp    ("eax",&DWP(0,$np,$j,4));       # tp[num-1]-np[num-1]?
        &jae    (&label("sub"));                # if taken CF is cleared
-&set_label("copy");
+&set_label("copy",16);
        &mov    ("eax",&DWP($frame,"esp",$j,4));
        &mov    (&DWP(0,$rp,$j,4),"eax");       # rp[i]=tp[i]
        &mov    (&DWP($frame,"esp",$j,4),$j);   # zap temporary vector
        &dec    ($j);
        &jge    (&label("copy"));
-       &jmp    (&label("exit_sse2"));
+       &jmp    (&label("exit"));
 
-&set_label("sub",4);
+&set_label("sub",16);
        &mov    ("eax",&DWP($frame,"esp",$i,4));
        &sbb    ("eax",&DWP(0,$np,$i,4));
        &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
@@ -224,21 +376,15 @@ if($sse2) {
        &lea    ($j,&DWP(-1,$num));             # j=num-1
        &sbb    ("esi",0);                      # esi holds upmost overflow bit
        &jc     (&label("copy"));
-&set_label("zap");
+&set_label("zap",16);
        &mov    (&DWP($frame,"esp",$j,4),$i);   # zap temporary vector
        &dec    ($j);
        &jge    (&label("zap"));
 
-&set_label("exit_sse2");
+&set_label("exit",4);
        &mov    ("esp",$_sp);           # pull saved stack pointer
        &mov    ("eax",1);
-       &jmp    (&label("leave"));
-&set_label("non_sse2");
-}
-
-       &xor    ("eax","eax");  # zero signals "not implemented [yet]"
-
-&set_label("leave");
+&set_label("just_leave");
 &function_end("bn_mul_mont");
 
 &asm_finish();