On Windows, page walking is known as __chkstk.
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005
11 #
12 # This is a "teaser" code, as it can be improved in several ways...
13 # First of all non-SSE2 path should be implemented (yes, for now it
14 # performs Montgomery multiplication/convolution only on SSE2-capable
15 # CPUs such as P4, others fall down to original code). Then inner loop
16 # can be unrolled and modulo-scheduled to improve ILP and possibly
17 # moved to 128-bit XMM register bank (though it would require input
18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
19 # squaring procedure should give further performance improvement...
20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23 # December 2006
24 #
25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26 # Integer-only code [being equipped with dedicated squaring procedure]
27 # gives ~40% on rsa512 sign benchmark...
28
# Derive the directory this script lives in from $0 and add it, plus the
# sibling ../../perlasm directory, to @INC so that x86asm.pl can be found.
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30 push(@INC,"${dir}","${dir}../../perlasm");
31 require "x86asm.pl";
32
# The last command-line argument names the output file; everything printed
# from here on goes there.  Abort right away if it cannot be opened, because
# otherwise all generated code would be silently discarded.  The two-argument
# form of open is kept on purpose so existing invocations behave as before.
33 $output = pop;
34 open STDOUT,">$output" or die "can't open $output: $!";
35  
# Hand the first command-line argument and the script name to the perlasm
# back end.
36 &asm_init($ARGV[0], $0);
37
# $sse2 is 1 when any argument contains -DOPENSSL_IA32_SSE2, 0 otherwise.
38 $sse2 = 0;
39 $sse2 = 1 if (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV);
40
# The SSE2 path consults OPENSSL_ia32cap_P, so declare it only in that case.
41 if ($sse2) { &external_label("OPENSSL_ia32cap_P"); }
42
# Start emitting bn_mul_mont.  The scalars below are generator-side aliases:
# plain register names for the loop counters and pointers, and esp-relative
# operands for the slots of the fixed frame kept at the top of the stack.
# Note that $ap/$tp share esi and $rp/$bp share edi, so only one of each
# pair is live at any given point.
43 &function_begin("bn_mul_mont");
44
45 $i="edx";
46 $j="ecx";
47 $ap="esi";      $tp="esi";              # overlapping variables!!!
48 $rp="edi";      $bp="edi";              # overlapping variables!!!
49 $np="ebp";
50 $num="ebx";
51
52 $_num=&DWP(4*0,"esp");                  # stack top layout
53 $_rp=&DWP(4*1,"esp");
54 $_ap=&DWP(4*2,"esp");
55 $_bp=&DWP(4*3,"esp");
56 $_np=&DWP(4*4,"esp");
57 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");   # same slot, dword and qword view
58 $_sp=&DWP(4*6,"esp");
59 $_bpend=&DWP(4*7,"esp");
60 $frame=32;                              # size of above frame rounded up to 16n
61
# Entry check and stack allocation.  eax is cleared first and is still 0 when
# num < 4 sends control straight to "just_leave".  Otherwise esp is lowered by
# $frame+4*(num+2) bytes, shifted relative to edx as described below, and
# finally rounded down to a 64-byte boundary.  The incoming esp is kept in ebp.
62         &xor    ("eax","eax");
63         &mov    ("edi",&wparam(5));     # int num
64         &cmp    ("edi",4);
65         &jl     (&label("just_leave"));
66
67         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
68         &lea    ("edx",&wparam(1));     # address of the ap argument slot (lea, not a load)
# NOTE(review): because this is lea, the 2K-window arithmetic below works on
# the address of the argument slot rather than on the ap pointer value itself
# - confirm this is intended.
69         &mov    ("ebp","esp");          # saved stack pointer!
70         &add    ("edi",2);              # extra two words on top of tp
71         &neg    ("edi");
72         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
73         &neg    ("edi");                # edi is num+2 again
74
75         # minimize cache contention by arranging 2K window between stack
76         # pointer and ap argument [np is also position sensitive vector,
77         # but it's assumed to be near ap, as it's allocated at ~same
78         # time].
79         &mov    ("eax","esp");
80         &sub    ("eax","edx");
81         &and    ("eax",2047);
82         &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
83
84         &xor    ("edx","esp");
85         &and    ("edx",2048);
86         &xor    ("edx",2048);
87         &sub    ("esp","edx");          # this splits them apart modulo 4096
88
89         &and    ("esp",-64);            # align to cache line
90
91         # An OS-agnostic version of __chkstk.
92         #
93         # Some OSes (Windows) insist on stack being "wired" to
94         # physical memory in strictly sequential manner, i.e. if stack
95         # allocation spans two pages, then reference to farmost one can
96         # be punishable by SEGV. But page walking can do good even on
97         # other OSes, because it guarantees that villain thread hits
98         # the guard page before it can make damage to innocent one...
#
# eax starts as the distance from the new esp up to the saved esp (in ebp),
# rounded down to a multiple of 4096.  Each iteration reads one dword at
# esp+eax and then lowers eax by 4096; the loop ends when that subtraction
# borrows, i.e. after the read at esp+0.  Pages are therefore touched from
# the one nearest the old stack pointer down to the new one.
# NOTE(review): esp has already been moved to its final value before any
# page is touched - confirm that is acceptable on all targeted platforms.
99         &mov    ("eax","ebp");
100         &sub    ("eax","esp");
101         &and    ("eax",-4096);
102 &set_label("page_walk");
103         &mov    ("edx",&DWP(0,"esp","eax"));   # touch the page; the value read is not used
104         &sub    ("eax",4096);
105         &data_byte(0x2e);                      # prefix byte for the jnc; presumably a branch hint - confirm
106         &jnc    (&label("page_walk"));
107
# Copy the caller's arguments (reached through esi) into the local frame at
# the top of the new stack.  esi is overwritten with the n0 pointer and then
# dereferenced, so only n0[0] is stored in $_n0.
108         ################################# load argument block...
109         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
110         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
111         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
112         &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
113         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
114         #&mov   ("edi",&DWP(5*4,"esi"));# int num
115
116         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
117         &mov    ($_rp,"eax");           # ... save a copy of argument block
118         &mov    ($_ap,"ebx");
119         &mov    ($_bp,"ecx");
120         &mov    ($_np,"edx");
121         &mov    ($_n0,"esi");
# edi still holds num+2 from the allocation code, hence the -3.
122         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
123         #&mov   ($_num,$num);           # redundant as $num is not reused
124         &mov    ($_sp,"ebp");           # saved stack pointer!
125 \f
# SSE2 code path, generated only when -DOPENSSL_IA32_SSE2 was seen on the
# command line.  At run time it tests bit 26 of the first OPENSSL_ia32cap_P
# word and branches to "non_sse2" when that bit is clear.  The first pass
# ("1st") handles bp[0]; the "outer"/"inner" loops handle bp[1] onwards, each
# pass interleaving the ap[]*bp[i] products with the np[]*m1 reduction using
# pmuludq on the MMX register bank.  $num holds num-1 on entry.  The path
# ends with emms and a jump to "common_tail".
126 if($sse2) {
127 $acc0="mm0";    # mmx register bank layout
128 $acc1="mm1";
129 $car0="mm2";
130 $car1="mm3";
131 $mul0="mm4";
132 $mul1="mm5";
133 $temp="mm6";
134 $mask="mm7";
135
136         &picmeup("eax","OPENSSL_ia32cap_P");
137         &bt     (&DWP(0,"eax"),26);     # presumably the SSE2 capability bit - confirm
138         &jnc    (&label("non_sse2"));
139
140         &mov    ("eax",-1);
141         &movd   ($mask,"eax");          # mask 32 lower bits
142
143         &mov    ($ap,$_ap);             # load input pointers
144         &mov    ($bp,$_bp);
145         &mov    ($np,$_np);
146
147         &xor    ($i,$i);                # i=0
148         &xor    ($j,$j);                # j=0
149
150         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
151         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
152         &movd   ($car1,&DWP(0,$np));            # np[0]
153
154         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
155         &movq   ($car0,$mul1);
156         &movq   ($acc0,$mul1);                  # I wish movd worked for
157         &pand   ($acc0,$mask);                  # inter-register transfers
158
159         &pmuludq($mul1,$_n0q);                  # *=n0
160
161         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
162         &paddq  ($car1,$acc0);
163
164         &movd   ($acc1,&DWP(4,$np));            # np[1]
165         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
166
167         &psrlq  ($car0,32);
168         &psrlq  ($car1,32);
169
170         &inc    ($j);                           # j++
171 &set_label("1st",16);
172         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
173         &pmuludq($acc1,$mul1);                  # np[j]*m1
174         &paddq  ($car0,$acc0);                  # +=c0
175         &paddq  ($car1,$acc1);                  # +=c1
176
177         &movq   ($acc0,$car0);
178         &pand   ($acc0,$mask);
179         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
180         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
181         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
182         &psrlq  ($car0,32);
183         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
184         &psrlq  ($car1,32);
185
186         &lea    ($j,&DWP(1,$j));
187         &cmp    ($j,$num);
188         &jl     (&label("1st"));
189
# Last word of the first pass, handled outside the loop.
190         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
191         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
192         &paddq  ($car0,$acc0);                  # +=c0
193         &paddq  ($car1,$acc1);                  # +=c1
194
195         &movq   ($acc0,$car0);
196         &pand   ($acc0,$mask);
197         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
198         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
199
200         &psrlq  ($car0,32);
201         &psrlq  ($car1,32);
202
203         &paddq  ($car1,$car0);
204         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
205 \f
206         &inc    ($i);                           # i++
207 &set_label("outer");
208         &xor    ($j,$j);                        # j=0
209
210         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
211         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
212         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
213         &movd   ($car1,&DWP(0,$np));            # np[0]
214         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
215
216         &paddq  ($mul1,$temp);                  # +=tp[0]
217         &movq   ($acc0,$mul1);
218         &movq   ($car0,$mul1);
219         &pand   ($acc0,$mask);
220
221         &pmuludq($mul1,$_n0q);                  # *=n0
222
223         &pmuludq($car1,$mul1);
224         &paddq  ($car1,$acc0);
225
226         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
227         &movd   ($acc1,&DWP(4,$np));            # np[1]
228         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
229
230         &psrlq  ($car0,32);
231         &psrlq  ($car1,32);
232         &paddq  ($car0,$temp);                  # +=tp[1]
233
234         &inc    ($j);                           # j++
# The inner loop counts $num down to zero instead of comparing $j with it;
# $num is restored from $j right after the loop.
235         &dec    ($num);
236 &set_label("inner");
237         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
238         &pmuludq($acc1,$mul1);                  # np[j]*m1
239         &paddq  ($car0,$acc0);                  # +=c0
240         &paddq  ($car1,$acc1);                  # +=c1
241
242         &movq   ($acc0,$car0);
243         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
244         &pand   ($acc0,$mask);
245         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
246         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
247         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
248         &psrlq  ($car0,32);
249         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
250         &psrlq  ($car1,32);
251         &paddq  ($car0,$temp);                  # +=tp[j+1]
252
253         &dec    ($num);
254         &lea    ($j,&DWP(1,$j));                # j++
255         &jnz    (&label("inner"));              # tests ZF left by the dec above
256
257         &mov    ($num,$j);                      # restore $num after the countdown
258         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
259         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
260         &paddq  ($car0,$acc0);                  # +=c0
261         &paddq  ($car1,$acc1);                  # +=c1
262
263         &movq   ($acc0,$car0);
264         &pand   ($acc0,$mask);
265         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
266         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
267         &psrlq  ($car0,32);
268         &psrlq  ($car1,32);
269
270         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
271         &paddq  ($car1,$car0);
272         &paddq  ($car1,$temp);
273         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
274
275         &lea    ($i,&DWP(1,$i));                # i++
276         &cmp    ($i,$num);
277         &jle    (&label("outer"));
278
279         &emms   ();                             # done with mmx bank
280         &jmp    (&label("common_tail"));
281
282 &set_label("non_sse2",16);
283 }
284 \f
# Integer-only code path.  The "if (0)" arm is disabled at generation time,
# so only the else arm is emitted.  It contains a generic multiplication
# ("mull", "1stmadd", "2ndmadd") and a dedicated squaring variant
# ("bn_sqr_mont", "sqr", "3rdmadd", "sqradd", "sqrlast") that is entered when
# ap==bp and num is even.  Both variants leave through "common_tail".
285 if (0) {
286         &mov    ("esp",$_sp);
287         &xor    ("eax","eax");  # signal "not fast enough [yet]"
288         &jmp    (&label("just_leave"));
289         # While the below code provides competitive performance for
290         # all key lengths on modern Intel cores, it's still more
291         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
292         # means compared to the original integer-only assembler.
293         # 512-bit RSA sign is better by ~40%, but that's about all
294         # one can say about all CPUs...
295 } else {
296 $inp="esi";     # integer path uses these registers differently
297 $word="edi";
298 $carry="ebp";
299
# Dispatch: $carry ends up as (num&1)|(ap-bp); it is zero, and the jz below
# is taken, only when num is even and ap equals bp.  The mov between the or
# and the jz does not change the flags.
300         &mov    ($inp,$_ap);
301         &lea    ($carry,&DWP(1,$num));
302         &mov    ($word,$_bp);
303         &xor    ($j,$j);                                # j=0
304         &mov    ("edx",$inp);
305         &and    ($carry,1);                             # see if num is even
306         &sub    ("edx",$word);                          # see if ap==bp
307         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
308         &or     ($carry,"edx");
309         &mov    ($word,&DWP(0,$word));                  # bp[0]
310         &jz     (&label("bn_sqr_mont"));
311         &mov    ($_bpend,"eax");
312         &mov    ("eax",&DWP(0,$inp));
313         &xor    ("edx","edx");
314
315 &set_label("mull",16);
316         &mov    ($carry,"edx");
317         &mul    ($word);                                # ap[j]*bp[0]
318         &add    ($carry,"eax");
319         &lea    ($j,&DWP(1,$j));
320         &adc    ("edx",0);
321         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
322         &cmp    ($j,$num);
323         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
324         &jl     (&label("mull"));
325
326         &mov    ($carry,"edx");
327         &mul    ($word);                                # ap[num-1]*bp[0]
328          &mov   ($word,$_n0);
329         &add    ("eax",$carry);
330          &mov   ($inp,$_np);
331         &adc    ("edx",0);
332          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
333
334         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
335         &xor    ($j,$j);
336         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
337         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
338
339         &mov    ("eax",&DWP(0,$inp));                   # np[0]
340         &mul    ($word);                                # np[0]*m
341         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
342         &mov    ("eax",&DWP(4,$inp));                   # np[1]
343         &adc    ("edx",0);
344         &inc    ($j);
345
346         &jmp    (&label("2ndmadd"));
347 \f\f
348 &set_label("1stmadd",16);
349         &mov    ($carry,"edx");
350         &mul    ($word);                                # ap[j]*bp[i]
351         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
352         &lea    ($j,&DWP(1,$j));
353         &adc    ("edx",0);
354         &add    ($carry,"eax");
355         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
356         &adc    ("edx",0);
357         &cmp    ($j,$num);
358         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
359         &jl     (&label("1stmadd"));
360
361         &mov    ($carry,"edx");
362         &mul    ($word);                                # ap[num-1]*bp[i]
363         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
364          &mov   ($word,$_n0);
365         &adc    ("edx",0);
366          &mov   ($inp,$_np);
367         &add    ($carry,"eax");
368         &adc    ("edx",0);
369          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
370
371         &xor    ($j,$j);
372         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
373         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
374         &adc    ($j,0);
375          &mov   ("eax",&DWP(0,$inp));                   # np[0]
376         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
377         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
378
379         &mul    ($word);                                # np[0]*m
380         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
381         &mov    ("eax",&DWP(4,$inp));                   # np[1]
382         &adc    ("edx",0);
383         &mov    ($j,1);
384 \f
385 &set_label("2ndmadd",16);
386         &mov    ($carry,"edx");
387         &mul    ($word);                                # np[j]*m
388         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
389         &lea    ($j,&DWP(1,$j));
390         &adc    ("edx",0);
391         &add    ($carry,"eax");
392         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
393         &adc    ("edx",0);
394         &cmp    ($j,$num);
395         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
396         &jl     (&label("2ndmadd"));
397
398         &mov    ($carry,"edx");
399         &mul    ($word);                                # np[j]*m
400         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
401         &adc    ("edx",0);
402         &add    ($carry,"eax");
403         &adc    ("edx",0);
404         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
405
# Advance the bp cursor kept in $_bp and stop once it reaches $_bpend.
406         &xor    ("eax","eax");
407          &mov   ($j,$_bp);                              # &bp[i]
408         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
409         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
410          &lea   ($j,&DWP(4,$j));
411         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
412          &cmp   ($j,$_bpend);
413         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
414         &je     (&label("common_tail"));
415
416         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
417         &mov    ($inp,$_ap);
418         &mov    ($_bp,$j);                              # &bp[++i]
419         &xor    ($j,$j);
420         &xor    ("edx","edx");
421         &mov    ("eax",&DWP(0,$inp));
422         &jmp    (&label("1stmadd"));
423 \f
# Squaring variant.  $sbit is an alias for the register that normally holds
# $num, so $num is parked in $_num and the $_bp slot is reused to hold the
# index i instead of a pointer.
424 &set_label("bn_sqr_mont",16);
425 $sbit=$num;
426         &mov    ($_num,$num);
427         &mov    ($_bp,$j);                              # i=0
428
429         &mov    ("eax",$word);                          # ap[0]
430         &mul    ($word);                                # ap[0]*ap[0]
431         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
432         &mov    ($sbit,"edx");
433         &shr    ("edx",1);
434         &and    ($sbit,1);
435         &inc    ($j);
436 &set_label("sqr",16);
437         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
438         &mov    ($carry,"edx");
439         &mul    ($word);                                # ap[j]*ap[0]
440         &add    ("eax",$carry);
441         &lea    ($j,&DWP(1,$j));
442         &adc    ("edx",0);
443         &lea    ($carry,&DWP(0,$sbit,"eax",2));
444         &shr    ("eax",31);
445         &cmp    ($j,$_num);
446         &mov    ($sbit,"eax");
447         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
448         &jl     (&label("sqr"));
449
450         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
451         &mov    ($carry,"edx");
452         &mul    ($word);                                # ap[num-1]*ap[0]
453         &add    ("eax",$carry);
454          &mov   ($word,$_n0);
455         &adc    ("edx",0);
456          &mov   ($inp,$_np);
457         &lea    ($carry,&DWP(0,$sbit,"eax",2));
458          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
459         &shr    ("eax",31);
460         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
461
462         &lea    ($carry,&DWP(0,"eax","edx",2));
463          &mov   ("eax",&DWP(0,$inp));                   # np[0]
464         &shr    ("edx",31);
465         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
466         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
467
468         &mul    ($word);                                # np[0]*m
469         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
470         &mov    ($num,$j);
471         &adc    ("edx",0);
472         &mov    ("eax",&DWP(4,$inp));                   # np[1]
473         &mov    ($j,1);
474 \f\f
# Reduction loop of the squaring variant; it handles two words of np per
# iteration ($j advances by 2).
475 &set_label("3rdmadd",16);
476         &mov    ($carry,"edx");
477         &mul    ($word);                                # np[j]*m
478         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
479         &adc    ("edx",0);
480         &add    ($carry,"eax");
481         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
482         &adc    ("edx",0);
483         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
484
485         &mov    ($carry,"edx");
486         &mul    ($word);                                # np[j+1]*m
487         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
488         &lea    ($j,&DWP(2,$j));
489         &adc    ("edx",0);
490         &add    ($carry,"eax");
491         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
492         &adc    ("edx",0);
493         &cmp    ($j,$num);
494         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
495         &jl     (&label("3rdmadd"));
496
497         &mov    ($carry,"edx");
498         &mul    ($word);                                # np[j]*m
499         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
500         &adc    ("edx",0);
501         &add    ($carry,"eax");
502         &adc    ("edx",0);
503         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
504
505         &mov    ($j,$_bp);                              # i
506         &xor    ("eax","eax");
507         &mov    ($inp,$_ap);
508         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
509         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
510         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
511         &cmp    ($j,$num);
512         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
513         &je     (&label("common_tail"));
514 \f
515         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
516         &lea    ($j,&DWP(1,$j));
517         &mov    ("eax",$word);
518         &mov    ($_bp,$j);                              # ++i
519         &mul    ($word);                                # ap[i]*ap[i]
520         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
521         &adc    ("edx",0);
522         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
523         &xor    ($carry,$carry);
524         &cmp    ($j,$num);
525         &lea    ($j,&DWP(1,$j));                        # lea leaves the cmp flags intact for the je
526         &je     (&label("sqrlast"));
527
528         &mov    ($sbit,"edx");                          # zaps $num
529         &shr    ("edx",1);
530         &and    ($sbit,1);
531 &set_label("sqradd",16);
532         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
533         &mov    ($carry,"edx");
534         &mul    ($word);                                # ap[j]*ap[i]
535         &add    ("eax",$carry);
536         &lea    ($carry,&DWP(0,"eax","eax"));
537         &adc    ("edx",0);
538         &shr    ("eax",31);
539         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
540         &lea    ($j,&DWP(1,$j));
541         &adc    ("eax",0);
542         &add    ($carry,$sbit);
543         &adc    ("eax",0);
544         &cmp    ($j,$_num);
545         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
546         &mov    ($sbit,"eax");
547         &jle    (&label("sqradd"));
548
549         &mov    ($carry,"edx");
550         &add    ("edx","edx");
551         &shr    ($carry,31);
552         &add    ("edx",$sbit);
553         &adc    ($carry,0);
554 &set_label("sqrlast");
555         &mov    ($word,$_n0);
556         &mov    ($inp,$_np);
557         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
558
559         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
560         &mov    ("eax",&DWP(0,$inp));                   # np[0]
561         &adc    ($carry,0);
562         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
563         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
564
565         &mul    ($word);                                # np[0]*m
566         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
567         &lea    ($num,&DWP(-1,$j));                     # rebuild $num, which $sbit overwrote
568         &adc    ("edx",0);
569         &mov    ($j,1);
570         &mov    ("eax",&DWP(4,$inp));                   # np[1]
571
572         &jmp    (&label("3rdmadd"));
573 }
574 \f
# Shared epilogue for all code paths.  The "sub" loop writes tp-np into rp
# while propagating the borrow.  The final borrow is then turned into an
# all-ones/all-zeroes mask that selects, without branching, either tp or the
# just-written rp as the source for the "copy" loop.  That loop stores the
# chosen words into rp and overwrites the temporary vector as it goes.
# Finally esp is restored from $_sp and eax is set to 1.
575 &set_label("common_tail",16);
576         &mov    ($np,$_np);                     # load modulus pointer
577         &mov    ($rp,$_rp);                     # load result pointer
578         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
579
580         &mov    ("eax",&DWP(0,$tp));            # tp[0]
581         &mov    ($j,$num);                      # j=num-1
582         &xor    ($i,$i);                        # i=0 and clear CF!
583
584 &set_label("sub",16);
585         &sbb    ("eax",&DWP(0,$np,$i,4));
586         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
587         &dec    ($j);                           # doesn't affect CF!
588         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
589         &lea    ($i,&DWP(1,$i));                # i++
590         &jge    (&label("sub"));                # still sees the flags from the dec above
591
592         &sbb    ("eax",0);                      # handle upmost overflow bit
593         &and    ($tp,"eax");
594         &not    ("eax");
595         &mov    ($np,$rp);
596         &and    ($np,"eax");
597         &or     ($tp,$np);                      # tp=carry?tp:rp
598
# $j is what the "sub" loop left behind (it exits once the dec drives $j
# negative), so the temporary vector is overwritten with that value.
599 &set_label("copy",16);                          # copy or in-place refresh
600         &mov    ("eax",&DWP(0,$tp,$num,4));
601         &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
602         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
603         &dec    ($num);
604         &jge    (&label("copy"));
605
606         &mov    ("esp",$_sp);           # pull saved stack pointer
607         &mov    ("eax",1);
# The early exit for num < 4 lands here with eax still 0.
608 &set_label("just_leave");
609 &function_end("bn_mul_mont");
610
# Append the identification string to the generated output and let the
# perlasm back end finish the module.
611 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
612
613 &asm_finish();
614
# Buffered write errors (a full disk, for instance) only surface when the
# handle is closed, so treat a failed close as fatal rather than leave a
# truncated assembly file behind unnoticed.
615 close STDOUT or die "error closing STDOUT: $!";