make update
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005
11 #
12 # This is a "teaser" code, as it can be improved in several ways...
13 # First of all non-SSE2 path should be implemented (yes, for now it
14 # performs Montgomery multiplication/convolution only on SSE2-capable
15 # CPUs such as P4, others fall down to original code). Then inner loop
16 # can be unrolled and modulo-scheduled to improve ILP and possibly
17 # moved to 128-bit XMM register bank (though it would require input
18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
19 # squaring procedure should give further performance improvement...
20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23 # December 2006
24 #
25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26 # Integer-only code [being equipped with dedicated squaring procedure]
27 # gives ~40% on rsa512 sign benchmark...
28 # Generator preamble: find this script's directory so x86asm.pl can be located and loaded.
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30 push(@INC,"${dir}","${dir}../../perlasm");
31 require "x86asm.pl";
32
33 &asm_init($ARGV[0],$0);
34 # The SSE2 path is generated only when -DOPENSSL_IA32_SSE2 is among the arguments.
35 $sse2=0;
36 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
38 &external_label("OPENSSL_ia32cap_P") if ($sse2);
39 # bn_mul_mont: arguments are read through wparam(0..5) as rp, ap, bp, np, n0, num (see the argument-block loads below).
40 &function_begin("bn_mul_mont");
41 # Symbolic register names used by the generated code.
42 $i="edx";
43 $j="ecx";
44 $ap="esi";      $tp="esi";              # overlapping variables!!!
45 $rp="edi";      $bp="edi";              # overlapping variables!!!
46 $np="ebp";
47 $num="ebx";
48
49 $_num=&DWP(4*0,"esp");                  # stack top layout
50 $_rp=&DWP(4*1,"esp");
51 $_ap=&DWP(4*2,"esp");
52 $_bp=&DWP(4*3,"esp");
53 $_np=&DWP(4*4,"esp");
54 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
55 $_sp=&DWP(4*6,"esp");
56 $_bpend=&DWP(4*7,"esp");
57 $frame=32;                              # size of above frame rounded up to 16n
58 # Leave early with eax=0 when num<4; eax=1 is set only on the success path just before just_leave.
59         &xor    ("eax","eax");
60         &mov    ("edi",&wparam(5));     # int num
61         &cmp    ("edi",4);
62         &jl     (&label("just_leave"));
63
64         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
65         &lea    ("edx",&wparam(1));     # load ap
66         &add    ("edi",2);              # extra two words on top of tp
67         &neg    ("edi");
68         &lea    ("ebp",&DWP(-$frame,"esp","edi",4));    # future alloca($frame+4*(num+2))
69         &neg    ("edi");                # edi is num+2 again
70
71         # minimize cache contention by arranging 2K window between stack
72         # pointer and ap argument [np is also position sensitive vector,
73         # but it's assumed to be near ap, as it's allocated at ~same
74         # time].
75         &mov    ("eax","ebp");
76         &sub    ("eax","edx");
77         &and    ("eax",2047);
78         &sub    ("ebp","eax");          # this aligns sp and ap modulo 2048
79
80         &xor    ("edx","ebp");
81         &and    ("edx",2048);
82         &xor    ("edx",2048);
83         &sub    ("ebp","edx");          # this splits them apart modulo 4096
84
85         &and    ("ebp",-64);            # align to cache line
86
87         # Some OSes, *cough*-dows, insist on stack being "wired" to
88         # physical memory in strictly sequential manner, i.e. if stack
89         # allocation spans two pages, then reference to farmost one can
90         # be punishable by SEGV. But page walking can do good even on
91         # other OSes, because it guarantees that villain thread hits
92         # the guard page before it can make damage to innocent one...
93         &mov    ("eax","esp");
94         &sub    ("eax","ebp");
95         &and    ("eax",-4096);
96         &mov    ("edx","esp");          # saved stack pointer!
97         &lea    ("esp",&DWP(0,"ebp","eax"));
98         &mov    ("eax",&DWP(0,"esp"));  # touch the word at the new esp
99         &cmp    ("esp","ebp");
100         &ja     (&label("page_walk"));
101         &jmp    (&label("page_walk_done"));
102 # page_walk: lower esp in 4096-byte steps, reading one word at each step, until esp is no longer above ebp.
103 &set_label("page_walk",16);
104         &lea    ("esp",&DWP(-4096,"esp"));
105         &mov    ("eax",&DWP(0,"esp"));
106         &cmp    ("esp","ebp");
107         &ja     (&label("page_walk"));
108 &set_label("page_walk_done");
109
110         ################################# load argument block...
111         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
112         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
113         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
114         &mov    ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
115         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
116         #&mov   ("edi",&DWP(5*4,"esi"));# int num
117
118         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
119         &mov    ($_rp,"eax");           # ... save a copy of argument block
120         &mov    ($_ap,"ebx");
121         &mov    ($_bp,"ecx");
122         &mov    ($_np,"ebp");
123         &mov    ($_n0,"esi");
124         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 (edi still holds num+2) to assist modulo-scheduling
125         #&mov   ($_num,$num);           # redundant as $num is not reused
126         &mov    ($_sp,"edx");           # saved stack pointer!
127 \f
128 if($sse2) {  # SSE2/MMX multiplication path; emitted only when $sse2 was set from the command line
129 $acc0="mm0";    # mmx register bank layout
130 $acc1="mm1";
131 $car0="mm2";
132 $car1="mm3";
133 $mul0="mm4";
134 $mul1="mm5";
135 $temp="mm6";
136 $mask="mm7";
137 # Runtime dispatch: bit 26 of the first OPENSSL_ia32cap_P word (presumably the SSE2 flag) must be set, else go to non_sse2.
138         &picmeup("eax","OPENSSL_ia32cap_P");
139         &bt     (&DWP(0,"eax"),26);
140         &jnc    (&label("non_sse2"));
141
142         &mov    ("eax",-1);
143         &movd   ($mask,"eax");          # mask 32 lower bits
144
145         &mov    ($ap,$_ap);             # load input pointers
146         &mov    ($bp,$_bp);
147         &mov    ($np,$_np);
148
149         &xor    ($i,$i);                # i=0
150         &xor    ($j,$j);                # j=0
151
152         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
153         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
154         &movd   ($car1,&DWP(0,$np));            # np[0]
155
156         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
157         &movq   ($car0,$mul1);
158         &movq   ($acc0,$mul1);                  # I wish movd worked for
159         &pand   ($acc0,$mask);                  # inter-register transfers
160
161         &pmuludq($mul1,$_n0q);                  # *=n0
162
163         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
164         &paddq  ($car1,$acc0);
165
166         &movd   ($acc1,&DWP(4,$np));            # np[1]
167         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
168
169         &psrlq  ($car0,32);
170         &psrlq  ($car1,32);
171 # First pass (i=0): tp[] is written from ap[]*bp[0] and np[]*m1; no previous tp[] value is read.
172         &inc    ($j);                           # j++
173 &set_label("1st",16);
174         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
175         &pmuludq($acc1,$mul1);                  # np[j]*m1
176         &paddq  ($car0,$acc0);                  # +=c0
177         &paddq  ($car1,$acc1);                  # +=c1
178
179         &movq   ($acc0,$car0);
180         &pand   ($acc0,$mask);
181         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
182         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
183         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
184         &psrlq  ($car0,32);
185         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
186         &psrlq  ($car1,32);
187
188         &lea    ($j,&DWP(1,$j));
189         &cmp    ($j,$num);
190         &jl     (&label("1st"));
191 # Epilogue of the first pass: last word pair, then the two top words of tp are stored together.
192         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
193         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
194         &paddq  ($car0,$acc0);                  # +=c0
195         &paddq  ($car1,$acc1);                  # +=c1
196
197         &movq   ($acc0,$car0);
198         &pand   ($acc0,$mask);
199         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
200         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
201
202         &psrlq  ($car0,32);
203         &psrlq  ($car1,32);
204
205         &paddq  ($car1,$car0);
206         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
207 \f
208         &inc    ($i);                           # i++
209 &set_label("outer");                            # one pass per bp[i]; repeated while i<=$num (see cmp/jle below)
210         &xor    ($j,$j);                        # j=0
211
212         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
213         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
214         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
215         &movd   ($car1,&DWP(0,$np));            # np[0]
216         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
217
218         &paddq  ($mul1,$temp);                  # +=tp[0]
219         &movq   ($acc0,$mul1);
220         &movq   ($car0,$mul1);
221         &pand   ($acc0,$mask);
222
223         &pmuludq($mul1,$_n0q);                  # *=n0
224
225         &pmuludq($car1,$mul1);
226         &paddq  ($car1,$acc0);
227
228         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
229         &movd   ($acc1,&DWP(4,$np));            # np[1]
230         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
231
232         &psrlq  ($car0,32);
233         &psrlq  ($car1,32);
234         &paddq  ($car0,$temp);                  # +=tp[1]
235
236         &inc    ($j);                           # j++
237         &dec    ($num);                         # $num doubles as the inner-loop down-counter
238 &set_label("inner");
239         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
240         &pmuludq($acc1,$mul1);                  # np[j]*m1
241         &paddq  ($car0,$acc0);                  # +=c0
242         &paddq  ($car1,$acc1);                  # +=c1
243
244         &movq   ($acc0,$car0);
245         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
246         &pand   ($acc0,$mask);
247         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
248         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
249         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
250         &psrlq  ($car0,32);
251         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
252         &psrlq  ($car1,32);
253         &paddq  ($car0,$temp);                  # +=tp[j+1]
254
255         &dec    ($num);
256         &lea    ($j,&DWP(1,$j));                # j++ (lea leaves the flags from dec intact for jnz)
257         &jnz    (&label("inner"));
258
259         &mov    ($num,$j);                      # restore $num from $j after the countdown
260         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
261         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
262         &paddq  ($car0,$acc0);                  # +=c0
263         &paddq  ($car1,$acc1);                  # +=c1
264
265         &movq   ($acc0,$car0);
266         &pand   ($acc0,$mask);
267         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
268         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
269         &psrlq  ($car0,32);
270         &psrlq  ($car1,32);
271
272         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
273         &paddq  ($car1,$car0);
274         &paddq  ($car1,$temp);
275         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
276
277         &lea    ($i,&DWP(1,$i));                # i++
278         &cmp    ($i,$num);
279         &jle    (&label("outer"));
280
281         &emms   ();                             # done with mmx bank
282         &jmp    (&label("common_tail"));
283
284 &set_label("non_sse2",16);                      # target of the runtime check above; falls through to the integer-only code
285 }
286 \f
287 if (0) {  # disabled at generation time: would restore esp and leave with eax=0 instead of using the integer path
288         &mov    ("esp",$_sp);
289         &xor    ("eax","eax");  # signal "not fast enough [yet]"
290         &jmp    (&label("just_leave"));
291         # While the below code provides competitive performance for
292         # all key lengths on modern Intel cores, it's still more
293         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
294         # means compared to the original integer-only assembler.
295         # 512-bit RSA sign is better by ~40%, but that's about all
296         # one can say about all CPUs...
297 } else {
298 $inp="esi";     # integer path uses these registers differently
299 $word="edi";
300 $carry="ebp";
301 # Dispatch: the jz below takes the squaring path only when ap==bp AND num is even ($carry|edx is zero).
302         &mov    ($inp,$_ap);
303         &lea    ($carry,&DWP(1,$num));
304         &mov    ($word,$_bp);
305         &xor    ($j,$j);                                # j=0
306         &mov    ("edx",$inp);
307         &and    ($carry,1);                             # see if num is even
308         &sub    ("edx",$word);                          # see if ap==bp
309         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
310         &or     ($carry,"edx");
311         &mov    ($word,&DWP(0,$word));                  # bp[0]
312         &jz     (&label("bn_sqr_mont"));
313         &mov    ($_bpend,"eax");
314         &mov    ("eax",&DWP(0,$inp));
315         &xor    ("edx","edx");
316 # mull: first row, tp[] = ap[]*bp[0]; nothing is accumulated from tp[] yet.
317 &set_label("mull",16);
318         &mov    ($carry,"edx");
319         &mul    ($word);                                # ap[j]*bp[0]
320         &add    ($carry,"eax");
321         &lea    ($j,&DWP(1,$j));
322         &adc    ("edx",0);
323         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
324         &cmp    ($j,$num);
325         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
326         &jl     (&label("mull"));
327
328         &mov    ($carry,"edx");
329         &mul    ($word);                                # ap[num-1]*bp[0]
330          &mov   ($word,$_n0);
331         &add    ("eax",$carry);
332          &mov   ($inp,$_np);
333         &adc    ("edx",0);
334          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
335
336         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
337         &xor    ($j,$j);
338         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
339         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
340
341         &mov    ("eax",&DWP(0,$inp));                   # np[0]
342         &mul    ($word);                                # np[0]*m
343         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
344         &mov    ("eax",&DWP(4,$inp));                   # np[1] (low word of the sum is discarded; only its carry is used)
345         &adc    ("edx",0);
346         &inc    ($j);
347 # The first row was produced by mull, so skip 1stmadd and go straight to the np[]*m loop.
348         &jmp    (&label("2ndmadd"));
349 \f\f
350 &set_label("1stmadd",16);
351         &mov    ($carry,"edx");
352         &mul    ($word);                                # ap[j]*bp[i]
353         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
354         &lea    ($j,&DWP(1,$j));
355         &adc    ("edx",0);
356         &add    ($carry,"eax");
357         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
358         &adc    ("edx",0);
359         &cmp    ($j,$num);
360         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
361         &jl     (&label("1stmadd"));
362
363         &mov    ($carry,"edx");
364         &mul    ($word);                                # ap[num-1]*bp[i]
365         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
366          &mov   ($word,$_n0);
367         &adc    ("edx",0);
368          &mov   ($inp,$_np);
369         &add    ($carry,"eax");
370         &adc    ("edx",0);
371          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
372
373         &xor    ($j,$j);
374         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
375         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
376         &adc    ($j,0);
377          &mov   ("eax",&DWP(0,$inp));                   # np[0]
378         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
379         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
380
381         &mul    ($word);                                # np[0]*m
382         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
383         &mov    ("eax",&DWP(4,$inp));                   # np[1]
384         &adc    ("edx",0);
385         &mov    ($j,1);
386 \f
387 &set_label("2ndmadd",16);
388         &mov    ($carry,"edx");
389         &mul    ($word);                                # np[j]*m
390         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
391         &lea    ($j,&DWP(1,$j));
392         &adc    ("edx",0);
393         &add    ($carry,"eax");
394         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
395         &adc    ("edx",0);
396         &cmp    ($j,$num);
397         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
398         &jl     (&label("2ndmadd"));
399
400         &mov    ($carry,"edx");
401         &mul    ($word);                                # np[j]*m
402         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
403         &adc    ("edx",0);
404         &add    ($carry,"eax");
405         &adc    ("edx",0);
406         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
407
408         &xor    ("eax","eax");
409          &mov   ($j,$_bp);                              # &bp[i]
410         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
411         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
412          &lea   ($j,&DWP(4,$j));
413         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
414          &cmp   ($j,$_bpend);
415         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
416         &je     (&label("common_tail"));
417 # More words of bp remain: load the next one and start another 1stmadd row.
418         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
419         &mov    ($inp,$_ap);
420         &mov    ($_bp,$j);                              # &bp[++i]
421         &xor    ($j,$j);
422         &xor    ("edx","edx");
423         &mov    ("eax",&DWP(0,$inp));
424         &jmp    (&label("1stmadd"));
425 \f
426 &set_label("bn_sqr_mont",16);
427 $sbit=$num;                                             # $sbit shares ebx with $num; $num is parked in $_num just below
428         &mov    ($_num,$num);
429         &mov    ($_bp,$j);                              # i=0
430
431         &mov    ("eax",$word);                          # ap[0]
432         &mul    ($word);                                # ap[0]*ap[0]
433         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
434         &mov    ($sbit,"edx");
435         &shr    ("edx",1);
436         &and    ($sbit,1);
437         &inc    ($j);
438 &set_label("sqr",16);
439         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
440         &mov    ($carry,"edx");
441         &mul    ($word);                                # ap[j]*ap[0]
442         &add    ("eax",$carry);
443         &lea    ($j,&DWP(1,$j));
444         &adc    ("edx",0);
445         &lea    ($carry,&DWP(0,$sbit,"eax",2));
446         &shr    ("eax",31);
447         &cmp    ($j,$_num);
448         &mov    ($sbit,"eax");
449         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
450         &jl     (&label("sqr"));
451
452         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
453         &mov    ($carry,"edx");
454         &mul    ($word);                                # ap[num-1]*ap[0]
455         &add    ("eax",$carry);
456          &mov   ($word,$_n0);
457         &adc    ("edx",0);
458          &mov   ($inp,$_np);
459         &lea    ($carry,&DWP(0,$sbit,"eax",2));
460          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
461         &shr    ("eax",31);
462         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
463
464         &lea    ($carry,&DWP(0,"eax","edx",2));
465          &mov   ("eax",&DWP(0,$inp));                   # np[0]
466         &shr    ("edx",31);
467         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
468         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
469
470         &mul    ($word);                                # np[0]*m
471         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
472         &mov    ($num,$j);
473         &adc    ("edx",0);
474         &mov    ("eax",&DWP(4,$inp));                   # np[1]
475         &mov    ($j,1);
476 \f\f
477 &set_label("3rdmadd",16);                               # np[]*m loop of the squaring path; handles two words per iteration
478         &mov    ($carry,"edx");
479         &mul    ($word);                                # np[j]*m
480         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
481         &adc    ("edx",0);
482         &add    ($carry,"eax");
483         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
484         &adc    ("edx",0);
485         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
486
487         &mov    ($carry,"edx");
488         &mul    ($word);                                # np[j+1]*m
489         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
490         &lea    ($j,&DWP(2,$j));
491         &adc    ("edx",0);
492         &add    ($carry,"eax");
493         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
494         &adc    ("edx",0);
495         &cmp    ($j,$num);
496         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
497         &jl     (&label("3rdmadd"));
498
499         &mov    ($carry,"edx");
500         &mul    ($word);                                # np[j]*m
501         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
502         &adc    ("edx",0);
503         &add    ($carry,"eax");
504         &adc    ("edx",0);
505         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
506
507         &mov    ($j,$_bp);                              # i
508         &xor    ("eax","eax");
509         &mov    ($inp,$_ap);
510         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
511         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
512         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
513         &cmp    ($j,$num);
514         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
515         &je     (&label("common_tail"));
516 \f
517         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
518         &lea    ($j,&DWP(1,$j));
519         &mov    ("eax",$word);
520         &mov    ($_bp,$j);                              # ++i
521         &mul    ($word);                                # ap[i]*ap[i]
522         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
523         &adc    ("edx",0);
524         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
525         &xor    ($carry,$carry);
526         &cmp    ($j,$num);
527         &lea    ($j,&DWP(1,$j));
528         &je     (&label("sqrlast"));
529
530         &mov    ($sbit,"edx");                          # zaps $num
531         &shr    ("edx",1);
532         &and    ($sbit,1);
533 &set_label("sqradd",16);
534         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
535         &mov    ($carry,"edx");
536         &mul    ($word);                                # ap[j]*ap[i]
537         &add    ("eax",$carry);
538         &lea    ($carry,&DWP(0,"eax","eax"));
539         &adc    ("edx",0);
540         &shr    ("eax",31);
541         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
542         &lea    ($j,&DWP(1,$j));
543         &adc    ("eax",0);
544         &add    ($carry,$sbit);
545         &adc    ("eax",0);
546         &cmp    ($j,$_num);
547         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
548         &mov    ($sbit,"eax");
549         &jle    (&label("sqradd"));
550
551         &mov    ($carry,"edx");
552         &add    ("edx","edx");
553         &shr    ($carry,31);
554         &add    ("edx",$sbit);
555         &adc    ($carry,0);
556 &set_label("sqrlast");
557         &mov    ($word,$_n0);
558         &mov    ($inp,$_np);
559         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
560
561         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
562         &mov    ("eax",&DWP(0,$inp));                   # np[0]
563         &adc    ($carry,0);
564         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
565         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
566
567         &mul    ($word);                                # np[0]*m
568         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
569         &lea    ($num,&DWP(-1,$j));                     # re-establish $num (ebx was reused as $sbit) from $j
570         &adc    ("edx",0);
571         &mov    ($j,1);
572         &mov    ("eax",&DWP(4,$inp));                   # np[1]
573
574         &jmp    (&label("3rdmadd"));
575 }
576 \f
577 &set_label("common_tail",16);                   # shared by the SSE2 and integer paths: subtract np, pick the result, copy it to rp
578         &mov    ($np,$_np);                     # load modulus pointer
579         &mov    ($rp,$_rp);                     # load result pointer
580         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
581
582         &mov    ("eax",&DWP(0,$tp));            # tp[0]
583         &mov    ($j,$num);                      # j=num-1
584         &xor    ($i,$i);                        # i=0 and clear CF!
585 # sub: rp[] = tp[] - np[]; the borrow is chained through CF, and jge tests the flags left by dec (mov/lea do not alter flags).
586 &set_label("sub",16);
587         &sbb    ("eax",&DWP(0,$np,$i,4));
588         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
589         &dec    ($j);                           # doesn't affect CF!
590         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
591         &lea    ($i,&DWP(1,$i));                # i++
592         &jge    (&label("sub"));
593 # eax = top word of tp minus the final borrow; the and/not/or below use it as a select mask (presumably 0 or all-ones here).
594         &sbb    ("eax",0);                      # handle upmost overflow bit
595         &and    ($tp,"eax");
596         &not    ("eax");
597         &mov    ($np,$rp);
598         &and    ($np,"eax");
599         &or     ($tp,$np);                      # tp=carry?tp:rp
600
601 &set_label("copy",16);                          # copy or in-place refresh
602         &mov    ("eax",&DWP(0,$tp,$num,4));
603         &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
604         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector (with the value left in $j by the sub loop)
605         &dec    ($num);
606         &jge    (&label("copy"));
607
608         &mov    ("esp",$_sp);           # pull saved stack pointer
609         &mov    ("eax",1);              # success; the early exits reach just_leave with eax=0 instead
610 &set_label("just_leave");
611 &function_end("bn_mul_mont");
612
613 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
614
615 &asm_finish();