+if (1) { # generation-time switch: condition is constant-true, so only this bail-out is emitted
+ &mov ("esp",$_sp); # reload esp from $_sp (presumably the saved caller esp -- confirm)
+ &xor ("eax","eax"); # eax=0: signal "not fast enough [yet]"
+ &jmp (&label("just_leave")); # label is defined outside this chunk
+ # The code in the else branch gives a ~15% improvement on the
+ # 512-bit benchmark *only* :-( For all other key lengths it is up
+ # to 20% slower. This is because the original code path cuts the
+ # overall number of multiplications by ~25% by deploying
+ # bn_sqr_words. In other words, for the code below to be
+ # competitive, a dedicated squaring procedure is a must...
+} else { # integer-only (non-SSE2) path; never generated while the condition above is 1
+$inp="esi"; # integer path uses these registers differently
+$word="edi"; # current multiplier word: bp[i], later m
+$carry="ebp"; # carry between iterations; also scratch for &bp[i] in the outer loop
+
+ &sub ($num,1); # non-SSE2 path uses num-1 (undone at x86done)
+
+ &mov ($inp,$_ap); # inp=ap
+ &mov ($word,$_bp); # word=bp (still a pointer here)
+ &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] ($num already holds num-1, hence the +4)
+ &mov ($word,&DWP(0,$word)); # bp[0]
+ &mov ($_bpend,"eax"); # end marker compared against &bp[i] in the outer loop
+ &xor ($j,$j); # j=0
+ &xor ("edx","edx"); # incoming carry=0
+
+&set_label("mull",16); # first pass: tp[j]=ap[j]*bp[0], nothing to accumulate yet
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
+ &mov ($carry,"edx"); # carry=high word of the previous product
+ &mul ($word); # ap[j]*bp[0]
+ &lea ($j,&DWP(1,$j)); # j++
+ &add ("eax",$carry); # +=carry
+ &adc ("edx",0); # fold carry-out into the high word
+ &mov (&DWP($frame-4,"esp",$j,4),"eax"); # tp[j]= (-4 compensates for j being incremented already)
+ &cmp ($j,$num);
+ &jb (&label("mull")); # loop while j<num-1 (unsigned compare)
+
+ &mov ("eax",&DWP(0,$inp,$num,4)); # ap[num-1]
+ &mov ($carry,"edx"); # carry out of the loop
+ &mul ($word); # ap[num-1]*bp[0]
+ &add ("eax",$carry); # +=carry
+ &adc ("edx",0); # edx:eax is the top product; stored below
+
+ &mov ($word,$_n0); # word=n0
+ &mov ($inp,$_np); # inp=np
+ &imul ($word,&DWP($frame,"esp")); # m=n0*tp[0], low 32 bits only
+
+ &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
+ &xor ($j,$j); # j=0, used as the zero stored below
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=0
+
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &adc ("edx",0); # only the carry in edx is kept; the low word is not stored
+ &mov ($j,1); # reduction loop continues from j=1
+
+ &jmp (&label("2ndmadd")); # skip 1stmadd: mull already did the multiply for bp[0]
+
+&set_label("1stmadd",16); # tp[j]+=ap[j]*bp[i]
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
+ &mov ($carry,"edx"); # carry=high word of the previous step
+ &mul ($word); # ap[j]*bp[i]
+ &lea ($j,&DWP(1,$j)); # j++
+ &add ("eax",&DWP($frame-4,"esp",$j,4)); # +=tp[j]
+ &adc ("edx",0); # carry-out of the tp[j] addition
+ &add ("eax",$carry); # +=carry
+ &adc ("edx",0); # carry-out of the carry addition
+ &mov (&DWP($frame-4,"esp",$j,4),"eax"); # tp[j]=
+ &cmp ($j,$num);
+ &jb (&label("1stmadd")); # loop while j<num-1 (unsigned compare)
+
+ &mov ("eax",&DWP(0,$inp,$num,4)); # ap[num-1]
+ &mov ($carry,"edx"); # carry out of the loop
+ &mul ($word); # ap[num-1]*bp[i]
+ &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &adc ("edx",0); # carry-out of the tp[num-1] addition
+ &add ("eax",$carry); # +=carry
+ &adc ("edx",0); # edx:eax is stored below, after m is computed
+
+ &mov ($word,$_n0); # word=n0
+ &mov ($inp,$_np); # inp=np
+ &imul ($word,&DWP($frame,"esp")); # m=n0*tp[0], low 32 bits only
+
+ &xor ($j,$j); # j=0, collects the carry of the next add
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
+ &adc ($j,0); # j=CF of the add above (the mov in between leaves flags alone)
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
+
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &adc ("edx",0); # only the carry in edx is kept; the low word is not stored
+ &mov ($j,1); # reduction loop continues from j=1
+
+&set_label("2ndmadd",16); # tp[j-1]=tp[j]+np[j]*m, i.e. result lands one word lower
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j]
+ &mov ($carry,"edx"); # carry=high word of the previous step
+ &mul ($word); # np[j]*m
+ &lea ($j,&DWP(1,$j)); # j++
+ &add ("eax",&DWP($frame-4,"esp",$j,4)); # +=tp[j]
+ &adc ("edx",0); # carry-out of the tp[j] addition
+ &add ("eax",$carry); # +=carry
+ &adc ("edx",0); # carry-out of the carry addition
+ &mov (&DWP($frame-8,"esp",$j,4),"eax"); # tp[j-1]=
+ &cmp ($j,$num);
+ &jb (&label("2ndmadd")); # loop while j<num-1 (unsigned compare)
+
+ &mov ("eax",&DWP(0,$inp,$num,4)); # np[num-1]
+ &mov ($carry,"edx"); # carry out of the loop
+ &mul ($word); # np[num-1]*m
+ &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &adc ("edx",0); # carry-out of the tp[num-1] addition
+ &add ("eax",$carry); # +=carry
+ &adc ("edx",0); # carry-out of the carry addition
+ &mov (&DWP($frame-4,"esp",$num,4),"eax"); # tp[num-2]=
+
+ &xor ("eax","eax"); # eax=0, receives the top carry
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
+
+ &mov ($carry,$_bp); # &bp[i]
+ &add ($carry,4); # step to the next bp word
+ &cmp ($carry,$_bpend); # reached &bp[num]?
+ &je (&label("x86done")); # yes: every bp word has been processed
+ &mov ($word,&DWP(0,$carry)); # bp[i]
+ &mov ($inp,$_ap); # inp=ap again (it pointed at np during reduction)
+ &mov ($_bp,$carry); # &bp[++i]
+ &xor ($j,$j); # j=0
+ &xor ("edx","edx"); # incoming carry=0
+ &jmp (&label("1stmadd")); # next outer iteration
+
+&set_label("x86done",16); # falls through to common_tail
+ &mov ($np,$_np); # make adjustments for tail processing
+ &add ($num,1); # undo the num-1 adjustment made at the top of this path
+}
+
+&set_label("common_tail",16);