x86-mont.pl sse2 tune-up and integer-only squaring procedure.
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005
11 #
12 # This is a "teaser" code, as it can be improved in several ways...
13 # First of all non-SSE2 path should be implemented (yes, for now it
14 # performs Montgomery multiplication/convolution only on SSE2-capable
15 # CPUs such as P4, others fall down to original code). Then inner loop
16 # can be unrolled and modulo-scheduled to improve ILP and possibly
17 # moved to 128-bit XMM register bank (though it would require input
18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
19 # squaring procedure should give further performance improvement...
20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23 # December 2006
24 #
25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26 # Integer-only code [being equipped with dedicated squaring procedure]
27 # gives >=30% on rsa512 sign benchmark...
28
# Script preamble: derive the directory this script lives in, add it and
# the sibling perlasm directory to @INC, load the x86asm.pl helper and
# initialise it with the first command-line argument and the script name.
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30 push(@INC,"${dir}","${dir}../../perlasm");
31 require "x86asm.pl";
32
33 &asm_init($ARGV[0],$0);
34
# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# somewhere on the command line.
35 $sse2=0;
36 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
# The SSE2 path consults OPENSSL_ia32cap_P at run time, so declare it.
38 &external_label("OPENSSL_ia32cap_P") if ($sse2);
39
# bn_mul_mont prologue: name the working registers, describe the stack
# frame, reject num<4 (returning 0 in eax), then carve the frame plus the
# temporary vector tp out of the stack and save the arguments in it.
40 &function_begin("bn_mul_mont");
41
# Register aliases used by the SSE2 path and the common tail.  Note that
# $rp and $bp deliberately share edi.
42 $i="edx";
43 $j="ecx";
44 $ap="esi";
45 $rp="edi";      $bp="edi";              # overlapping variables!!!
46 $np="ebp";
47 $num="ebx";
48
# Eight dword slots at the top of the new stack; tp[] starts right after
# them, at $frame bytes above esp.
49 $_rp=&DWP(4*0,"esp");                   # stack top layout
50 $_ap=&DWP(4*1,"esp");
51 $_bp=&DWP(4*2,"esp");
52 $_np=&DWP(4*3,"esp");
53 $_n0=&DWP(4*4,"esp");
54 $_num=&DWP(4*5,"esp");
55 $_sp=&DWP(4*6,"esp");
56 $_bpend=&DWP(4*7,"esp");
57 $frame=32;                              # size of above frame rounded up to 16n
58
# eax is pre-cleared so that the early exit below returns 0.
59         &xor    ("eax","eax");
60         &mov    ("edi",&wparam(5));     # int num
61         &cmp    ("edi",4);
62         &jl     (&label("just_leave"));
63
64         ################################# load argument block...
65         &mov    ("eax",&wparam(0));     # BN_ULONG *rp
66         &mov    ("ebx",&wparam(1));     # const BN_ULONG *ap
67         &mov    ("ecx",&wparam(2));     # const BN_ULONG *bp
68         &mov    ("edx",&wparam(3));     # const BN_ULONG *np
69         &mov    ("esi",&wparam(4));     # const BN_ULONG *n0
70         #&mov   ("edi",&wparam(5));     # int num
71
# edi is negated around the lea so that esp moves *down* by
# $frame+4*(num+2) bytes; afterwards edi holds num+2 again.
72         &mov    ("ebp","esp");          # saved stack pointer!
73         &add    ("edi",2);              # extra two words on top of tp
74         &neg    ("edi");
75         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
76         &neg    ("edi");
77         &and    ("esp",-4096);          # minimize TLB utilization
78
79         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
80         &mov    ($_rp,"eax");           # ... save a copy of argument block
81         &mov    ($_ap,"ebx");
82         &mov    ($_bp,"ecx");
83         &mov    ($_np,"edx");
84         &mov    ($_n0,"esi");
# edi is num+2 here, so -3 yields num-1; from now on $num means num-1.
85         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
86         #&mov   ($_num,$num);           # redundant as $num is not reused
87         &mov    ($_sp,"ebp");           # saved stack pointer!
88 \f
# SSE2 path, emitted only when $sse2 is set.  This part performs the
# run-time capability test and the first pass (i=0), which fills tp[]
# from ap[]*bp[0] plus np[]*m1 without reading any previous tp[] contents.
89 if($sse2) {
90 $acc0="mm0";    # mmx register bank layout
91 $acc1="mm1";
92 $car0="mm2";
93 $car1="mm3";
94 $mul0="mm4";
95 $mul1="mm5";
96 $temp="mm6";
97 $mask="mm7";
98
# Test bit 26 of the first word of OPENSSL_ia32cap_P; if it is clear,
# skip to the integer-only code at "non_sse2".
99         &picmeup("eax","OPENSSL_ia32cap_P");
100         &bt     (&DWP(0,"eax"),26);
101         &jnc    (&label("non_sse2"));
102
103         &mov    ("eax",-1);
104         &movd   ($mask,"eax");          # mask 32 lower bits
105
106         &mov    ($ap,$_ap);             # load input pointers
107         &mov    ($bp,$_bp);
108         &mov    ($np,$_np);
109
110         &xor    ($i,$i);                # i=0
111         &xor    ($j,$j);                # j=0
112
113         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
114         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
115         &movd   ($car1,&DWP(0,$np));            # np[0]
116
# $car0 tracks the carry of the ap[]*bp[0] chain, $car1 the carry of the
# np[]*m1 chain; $mul1 becomes m1 after the multiplication by n0.
117         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
118         &movq   ($car0,$mul1);
119         &movq   ($acc0,$mul1);                  # I wish movd worked for
120         &pand   ($acc0,$mask);                  # inter-register transfers
121
122         &pmuludq($mul1,$_n0);                   # *=n0
123
124         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
125         &paddq  ($car1,$acc0);
126
127         &movd   ($acc1,&DWP(4,$np));            # np[1]
128         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
129
130         &psrlq  ($car0,32);
131         &psrlq  ($car1,32);
132
133         &inc    ($j);                           # j++
# Modulo-scheduled loop: operands for iteration j+1 are loaded while
# iteration j is being accumulated; results land one slot lower (tp[j-1]).
134 &set_label("1st");
135         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
136         &pmuludq($acc1,$mul1);                  # np[j]*m1
137         &paddq  ($car0,$acc0);                  # +=c0
138         &paddq  ($car1,$acc1);                  # +=c1
139
140         &movq   ($acc0,$car0);
141         &pand   ($acc0,$mask);
142         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
143         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
144         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
145         &psrlq  ($car0,32);
146         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
147         &psrlq  ($car1,32);
148
149         &lea    ($j,&DWP(1,$j));
150         &cmp    ($j,$num);
151         &jl     (&label("1st"));
152
# Loop epilogue: process the last word, whose operands were already
# loaded by the final loop iteration.
153         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
154         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
155         &paddq  ($car0,$acc0);                  # +=c0
156         &paddq  ($car1,$acc1);                  # +=c1
157
158         &movq   ($acc0,$car0);
159         &pand   ($acc0,$mask);
160         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
161         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
162
163         &psrlq  ($car0,32);
164         &psrlq  ($car1,32);
165
# Sum of both carries is stored as a 64-bit quantity in the top two slots.
166         &paddq  ($car1,$car0);
167         &movq   (&DWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
168 \f
# SSE2 path, passes i>=1: for every further word bp[i], accumulate
# ap[]*bp[i] and np[]*m1 on top of the existing tp[] contents, again
# storing each result one slot lower.  Ends with emms and a jump to the
# common tail, and closes the if($sse2) block.
169         &inc    ($i);                           # i++
170 &set_label("outer");
171         &xor    ($j,$j);                        # j=0
172
173         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
174         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
175         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
176         &movd   ($car1,&DWP(0,$np));            # np[0]
177         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
178
179         &paddq  ($mul1,$temp);                  # +=tp[0]
180         &movq   ($acc0,$mul1);
181         &movq   ($car0,$mul1);
182         &pand   ($acc0,$mask);
183
184         &pmuludq($mul1,$_n0);                   # *=n0
185
186         &pmuludq($car1,$mul1);
187         &paddq  ($car1,$acc0);
188
189         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
190         &movd   ($acc1,&DWP(4,$np));            # np[1]
191         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
192
193         &psrlq  ($car0,32);
194         &psrlq  ($car1,32);
195         &paddq  ($car0,$temp);                  # +=tp[1]
196
197         &inc    ($j);                           # j++
# $num doubles as the inner-loop down-counter here: it is decremented to
# zero by the loop and restored from $j once the loop is done.
198         &dec    ($num);
199 &set_label("inner");
200         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
201         &pmuludq($acc1,$mul1);                  # np[j]*m1
202         &paddq  ($car0,$acc0);                  # +=c0
203         &paddq  ($car1,$acc1);                  # +=c1
204
205         &movq   ($acc0,$car0);
206         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
207         &pand   ($acc0,$mask);
208         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
209         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
210         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
211         &psrlq  ($car0,32);
212         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
213         &psrlq  ($car1,32);
214         &paddq  ($car0,$temp);                  # +=tp[j+1]
215
# The jnz below tests the flags left by dec; lea is used for j++ because
# it leaves the flags untouched.
216         &dec    ($num);
217         &lea    ($j,&DWP(1,$j));                # j++
218         &jnz    (&label("inner"));
219
220         &mov    ($num,$j);
221         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
222         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
223         &paddq  ($car0,$acc0);                  # +=c0
224         &paddq  ($car1,$acc1);                  # +=c1
225
226         &movq   ($acc0,$car0);
227         &pand   ($acc0,$mask);
228         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
229         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
230         &psrlq  ($car0,32);
231         &psrlq  ($car1,32);
232
233         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
234         &paddq  ($car1,$car0);
235         &paddq  ($car1,$temp);
236         &movq   (&DWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
237
# $num holds num-1, so "jle" keeps iterating while i<=num-1.
238         &lea    ($i,&DWP(1,$i));                # i++
239         &cmp    ($i,$num);
240         &jle    (&label("outer"));
241
242         &emms   ();                             # done with mmx bank
243         &jmp    (&label("common_tail"));
244
# Integer-only code follows; reached when the SSE2 capability bit is clear.
245 &set_label("non_sse2",16);
246 }
247 \f
# Integer-only path.  The "if (0)" branch is disabled: it would restore
# esp and return 0 instead of running the integer code.  The else branch
# decides between general multiplication and dedicated squaring, then
# (for multiplication) runs the first pass tp[] = ap[]*bp[0] and primes
# the first reduction pass.
248 if (0) {
249         &mov    ("esp",$_sp);
250         &xor    ("eax","eax");  # signal "not fast enough [yet]"
251         &jmp    (&label("just_leave"));
252         # While the below code provides competitive performance for
253         # all key lengths on modern cores, it's still a tad slower
254         # for >=2048-bits keys on *elder* CPUs:-( "Competitive" means
255         # compared to the original integer-only assembler. 512-bit
256         # RSA sign is better by >=30%, but that's about all one can
257         # say about all CPUs...
258 } else {
259 $inp="esi";     # integer path uses these registers differently
260 $word="edi";
261 $carry="ebp";
262
# $carry ends up zero only if num is even AND ap==bp; the "or" sets ZF for
# the jz, and the intervening mov does not disturb the flags.
# ($num holds num-1 here, hence the +1 before testing the low bit.)
263         &mov    ($inp,$_ap);
264         &lea    ($carry,&DWP(1,$num));
265         &mov    ($word,$_bp);
266         &xor    ($j,$j);                                # j=0
267         &mov    ("edx",$inp);
268         &and    ($carry,1);                             # see if num is even
269         &sub    ("edx",$word);                          # see if ap==bp
270         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
271         &or     ($carry,"edx");
272         &mov    ($word,&DWP(0,$word));                  # bp[0]
273         &jz     (&label("bn_sqr_mont"));
274         &mov    ($_bpend,"eax");
275         &mov    ("eax",&DWP(0,$inp));
276         &xor    ("edx","edx");
277
# First pass: tp[j] = low word of ap[j]*bp[0] + carry, edx carries the
# high word into the next iteration.
278 &set_label("mull",16);
279         &mov    ($carry,"edx");
280         &mul    ($word);                                # ap[j]*bp[0]
281         &add    ($carry,"eax");
282         &lea    ($j,&DWP(1,$j));
283         &adc    ("edx",0);
284         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
285         &cmp    ($j,$num);
286         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
287         &jl     (&label("mull"));
288
# Last word of the first pass, interleaved with the set-up of the
# reduction: $word becomes m = n0*tp[0] and $inp is switched to np.
289         &mov    ($carry,"edx");
290         &mul    ($word);                                # ap[num-1]*bp[0]
291          &mov   ($word,$_n0);
292         &add    ("eax",$carry);
293          &mov   ($inp,$_np);
294         &adc    ("edx",0);
295          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
296
297         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
298         &xor    ($j,$j);
299         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
300         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
301
# Only the carry (edx plus CF) of np[0]*m+tp[0] is kept; eax is
# immediately reloaded with np[1].  mov leaves CF intact for the adc.
302         &mov    ("eax",&DWP(0,$inp));                   # np[0]
303         &mul    ($word);                                # np[0]*m
304         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
305         &mov    ("eax",&DWP(4,$inp));                   # np[1]
306         &adc    ("edx",0);
307         &inc    ($j);
308
309         &jmp    (&label("2ndmadd"));
310 \f\f
# Integer-only multiplication, passes i>=1.  "1stmadd" adds ap[]*bp[i] to
# tp[]; "2ndmadd" adds np[]*m and stores each word one slot lower.  The
# code after "2ndmadd" advances the bp pointer and either loops back to
# "1stmadd" or leaves through "x86done" once &bp[num] is reached.
311 &set_label("1stmadd",16);
312         &mov    ($carry,"edx");
313         &mul    ($word);                                # ap[j]*bp[i]
314         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
315         &lea    ($j,&DWP(1,$j));
316         &adc    ("edx",0);
317         &add    ($carry,"eax");
318         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
319         &adc    ("edx",0);
320         &cmp    ($j,$num);
321         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
322         &jl     (&label("1stmadd"));
323
# Last word of the multiply-accumulate pass, interleaved with loading
# n0 and np and computing m = n0*tp[0] for the reduction.
324         &mov    ($carry,"edx");
325         &mul    ($word);                                # ap[num-1]*bp[i]
326         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
327          &mov   ($word,$_n0);
328         &adc    ("edx",0);
329          &mov   ($inp,$_np);
330         &add    ($carry,"eax");
331         &adc    ("edx",0);
332          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
333
# $j is cleared and then collects the carry out of tp[num], which is
# written to tp[num+1].
334         &xor    ($j,$j);
335         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
336         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
337         &adc    ($j,0);
338          &mov   ("eax",&DWP(0,$inp));                   # np[0]
339         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
340         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
341
# Only the carry of np[0]*m+tp[0] is kept; eax is reloaded with np[1].
342         &mul    ($word);                                # np[0]*m
343         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
344         &mov    ("eax",&DWP(4,$inp));                   # np[1]
345         &adc    ("edx",0);
346         &mov    ($j,1);
347 \f
348 &set_label("2ndmadd",16);
349         &mov    ($carry,"edx");
350         &mul    ($word);                                # np[j]*m
351         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
352         &lea    ($j,&DWP(1,$j));
353         &adc    ("edx",0);
354         &add    ($carry,"eax");
355         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
356         &adc    ("edx",0);
357         &cmp    ($j,$num);
358         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
359         &jl     (&label("2ndmadd"));
360
# On loop exit $j equals $num, so the $j-indexed load below reads tp[num-1].
361         &mov    ($carry,"edx");
362         &mul    ($word);                                # np[j]*m
363         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
364         &adc    ("edx",0);
365         &add    ($carry,"eax");
366         &adc    ("edx",0);
367         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
368
# Fold the two top words, interleaved with advancing the saved bp pointer
# by one word and comparing it against the end pointer saved in $_bpend.
# The je consumes the flags of that cmp; the mov in between is flag-neutral.
369         &xor    ("eax","eax");
370          &mov   ($j,$_bp);                              # &bp[i]
371         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
372         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
373          &lea   ($j,&DWP(4,$j));
374         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
375          &cmp   ($j,$_bpend);
376         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
377         &je     (&label("x86done"));
378
# Set up the next pass: load the next bp word, switch $inp back to ap.
379         &mov    ($word,&DWP(0,$j));                     # bp[i]
380         &mov    ($inp,$_ap);
381         &mov    ($_bp,$j);                              # &bp[++i]
382         &xor    ($j,$j);
383         &xor    ("edx","edx");
384         &mov    ("eax",&DWP(0,$inp));
385         &jmp    (&label("1stmadd"));
386 \f
# Dedicated squaring, first pass (i=0); entered when ap==bp and num is
# even.  tp[0] receives the low word of ap[0]*ap[0]; every cross product
# ap[j]*ap[0] is doubled on the fly by shifting it left one bit, with
# $sbit holding the bit shifted out of the previous word.
# $sbit shares its register with $num, so num-1 is parked in $_num, and
# the $_bp slot is reused as the pass counter i.
387 &set_label("bn_sqr_mont",16);
388 $sbit=$num;
389         &mov    ($_num,$num);
390         &mov    ($_bp,$j);                              # i=0
391
# The high word of the square is halved before entering the loop (its low
# bit is kept in $sbit) so that the doubling inside the loop restores it.
392         &mov    ("eax",$word);                          # ap[0]
393         &mul    ($word);                                # ap[0]*ap[0]
394         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
395         &mov    ($sbit,"edx");
396         &shr    ("edx",1);
397         &and    ($sbit,1);
398         &inc    ($j);
399 &set_label("sqr",16);
400         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
401         &mov    ($carry,"edx");
402         &mul    ($word);                                # ap[j]*ap[0]
403         &add    ("eax",$carry);
404         &lea    ($j,&DWP(1,$j));
405         &adc    ("edx",0);
# $sbit is 0 or 1 in this loop and 2*eax has a clear low bit, so the lea
# cannot overflow here.
406         &lea    ($carry,&DWP(0,$sbit,"eax",2));
407         &shr    ("eax",31);
408         &cmp    ($j,$_num);
409         &mov    ($sbit,"eax");
410         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
411         &jl     (&label("sqr"));
412
# Last cross product, interleaved with loading n0/np and computing
# m = n0*tp[0] for the reduction pass.
413         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
414         &mov    ($carry,"edx");
415         &mul    ($word);                                # ap[num-1]*ap[0]
416         &add    ("eax",$carry);
417          &mov   ($word,$_n0);
418         &adc    ("edx",0);
419          &mov   ($inp,$_np);
420         &lea    ($carry,&DWP(0,$sbit,"eax",2));
421          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
422         &shr    ("eax",31);
423         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
424
# Double the final high word as well; its top bit becomes tp[num+1].
425         &lea    ($carry,&DWP(0,"eax","edx",2));
426          &mov   ("eax",&DWP(0,$inp));                   # np[0]
427         &shr    ("edx",31);
428         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
429         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
430
# Prime the reduction: keep only the carry of np[0]*m+tp[0], and restore
# $num (num-1) from $j now that $sbit is no longer live.
431         &mul    ($word);                                # np[0]*m
432         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
433         &mov    ($num,$j);
434         &adc    ("edx",0);
435         &mov    ("eax",&DWP(4,$inp));                   # np[1]
436         &mov    ($j,1);
437 \f\f
# Reduction pass of the squaring path: tp[] += np[]*m, stored one slot
# lower.  The loop body is unrolled to handle two words (j and j+1) per
# iteration and advances j by 2.
# NOTE(review): the two-word stride is presumably why the squaring path
# is only entered for even num -- confirm against the entry test.
438 &set_label("3rdmadd",16);
439         &mov    ($carry,"edx");
440         &mul    ($word);                                # np[j]*m
441         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
442         &adc    ("edx",0);
443         &add    ($carry,"eax");
444         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
445         &adc    ("edx",0);
446         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
447
448         &mov    ($carry,"edx");
449         &mul    ($word);                                # np[j+1]*m
450         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
451         &lea    ($j,&DWP(2,$j));
452         &adc    ("edx",0);
453         &add    ($carry,"eax");
454         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
455         &adc    ("edx",0);
456         &cmp    ($j,$num);
457         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
458         &jl     (&label("3rdmadd"));
459
# Final word of the reduction (its np operand was loaded by the loop).
460         &mov    ($carry,"edx");
461         &mul    ($word);                                # np[j]*m
462         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
463         &adc    ("edx",0);
464         &add    ($carry,"eax");
465         &adc    ("edx",0);
466         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
467
# Fold the two top words, reload the pass counter i from the $_bp slot and
# leave through "x86done" once i has reached $num (which holds num-1).
468         &mov    ($j,$_bp);                              # i
469         &xor    ("eax","eax");
470         &mov    ($inp,$_ap);
471         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
472         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
473         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
474         &cmp    ($j,$num);
475         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
476         &je     (&label("x86done"));
477 \f
# Dedicated squaring, passes i>=1: add ap[i]*ap[i] to tp[i], then add the
# doubled cross products 2*ap[j]*ap[i] (j>i) to tp[j], and finally prime
# the reduction pass and jump to "3rdmadd".
#
# Carry handling: inside "sqradd" $sbit is the carry into the next word.
# It is the bit shifted out by the doubling plus the carries produced by
# the additions, so it can be as large as 2.  It must therefore be added
# with an explicit add/adc pair.  Folding it into an lea together with the
# doubling (sbit+2*x) wraps modulo 2^32 and silently loses a carry when
# 2*x is 0xFFFFFFFE and $sbit is 2.
478         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
479         &lea    ($j,&DWP(1,$j));
480         &mov    ("eax",$word);
481         &mov    ($_bp,$j);                              # ++i
482         &mul    ($word);                                # ap[i]*ap[i]
483         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
484         &adc    ("edx",0);
485         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
486         &xor    ($carry,$carry);
487         &cmp    ($j,$num);
488         &lea    ($j,&DWP(1,$j));
489         &je     (&label("sqrlast"));
490
# Halve the high word of the square, keeping its low bit in $sbit, so
# that the doubling inside the loop restores it.
491         &mov    ($sbit,"edx");                          # zaps $num
492         &shr    ("edx",1);
493         &and    ($sbit,1);
494 &set_label("sqradd",16);
495         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
496         &mov    ($carry,"edx");
497         &mul    ($word);                                # ap[j]*ap[i]
498         &add    ("eax",$carry);
499         &lea    ($j,&DWP(1,$j));
500         &adc    ("edx",0);
501         &lea    ($carry,&DWP(0,"eax","eax",1));         # 2*low word
502         &shr    ("eax",31);                             # bit shifted out
503         &add    ($carry,&DWP($frame-4,"esp",$j,4));     # +=tp[j]
504         &adc    ("eax",0);
            &add    ($carry,$sbit);                         # +=carry-in
            &adc    ("eax",0);
505         &cmp    ($j,$_num);
506         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
507         &mov    ($sbit,"eax");
508         &jle    (&label("sqradd"));
509
# Double the last high word and add the pending carry-in, propagating
# any overflow into $carry (which becomes tp[num+1]).
510         &mov    ($carry,"edx");
511         &add    ("edx","edx");
512         &shr    ($carry,31);
            &add    ("edx",$sbit);
            &adc    ($carry,0);
513 &set_label("sqrlast");
514         &mov    ($word,$_n0);
515         &mov    ($inp,$_np);
516         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
517
518         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
519         &mov    ("eax",&DWP(0,$inp));                   # np[0]
520         &adc    ($carry,0);
521         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
522         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
523
# Prime the reduction: keep only the carry of np[0]*m+tp[0] and restore
# $num (num-1) from $j, which equals num at this point.
524         &mul    ($word);                                # np[0]*m
525         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
526         &lea    ($num,&DWP(-1,$j));
527         &adc    ("edx",0);
528         &mov    ($j,1);
529         &mov    ("eax",&DWP(4,$inp));                   # np[1]
530
531         &jmp    (&label("3rdmadd"));
532 \f
# Exit of the integer path plus the tail shared with the SSE2 path:
# conditionally subtract np[] from tp[], write the result to rp[],
# overwrite the temporary vector, restore esp and return 1.
533 &set_label("x86done",4);
534         &mov    ($np,$_np);     # make adjustments for tail processing
535 }
536
# On entry $num holds num-1 and $np points at np[].  The subtraction is
# attempted when the overflow word tp[num] is non-zero or when
# tp[num-1]>=np[num-1]; otherwise tp[] is copied to rp[] unchanged.
537 &set_label("common_tail",16);
538         &mov    ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
539         &mov    ($rp,$_rp);                     # load result pointer
540                                                 # [$ap and $bp are zapped]
541         &xor    ($i,$i);                        # i=0
542         &mov    ($j,$num);                      # j=num-1
543         &cmp    ("esi",0);                      # clears CF unconditionally
544         &jnz    (&label("sub"));
545         &mov    ("eax",&DWP($frame,"esp",$j,4));
546         &cmp    ("eax",&DWP(0,$np,$j,4));       # tp[num-1]-np[num-1]?
547         &jae    (&label("sub"));                # if taken CF is cleared
# Copy tp[] to rp[] from the top word down; each tp[] slot is overwritten
# with the loop index afterwards.
548 &set_label("copy",16);
549         &mov    ("eax",&DWP($frame,"esp",$j,4));
550         &mov    (&DWP(0,$rp,$j,4),"eax");       # rp[i]=tp[i]
551         &mov    (&DWP($frame,"esp",$j,4),$j);   # zap temporary vector
552         &dec    ($j);
553         &jge    (&label("copy"));
554         &jmp    (&label("exit"));
555
# rp[] = tp[] - np[] with the borrow chained through CF; lea and dec are
# used for the counters because neither modifies CF.  If the final borrow
# is not absorbed by the overflow word, tp[] was smaller than np[] and
# the unreduced tp[] is copied over rp[] instead.
556 &set_label("sub",16);
557         &mov    ("eax",&DWP($frame,"esp",$i,4));
558         &sbb    ("eax",&DWP(0,$np,$i,4));
559         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
560         &lea    ($i,&DWP(1,$i));                # i++
561         &dec    ($j);                           # doesn't affect CF!
562         &jge    (&label("sub"));
563         &mov    ($j,$num);                      # j=num-1
564         &sbb    ("esi",0);                      # esi holds upmost overflow bit
565         &jc     (&label("copy"));
566 &set_label("zap",16);
567         &mov    (&DWP($frame,"esp",$j,4),$i);   # zap temporary vector
568         &dec    ($j);
569         &jge    (&label("zap"));
570
# eax=1 reports success; "just_leave" is also the target of the early
# exit taken for num<4, which arrives with eax=0.
571 &set_label("exit",4);
572         &mov    ("esp",$_sp);           # pull saved stack pointer
573         &mov    ("eax",1);
574 &set_label("just_leave");
575 &function_end("bn_mul_mont");
576
577 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
578
579 &asm_finish();