Explain *cough*-dows
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005
11 #
12 # This is a "teaser" code, as it can be improved in several ways...
13 # First of all non-SSE2 path should be implemented (yes, for now it
14 # performs Montgomery multiplication/convolution only on SSE2-capable
15 # CPUs such as P4, others fall down to original code). Then inner loop
16 # can be unrolled and modulo-scheduled to improve ILP and possibly
17 # moved to 128-bit XMM register bank (though it would require input
18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
19 # squaring procedure should give further performance improvement...
20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23 # December 2006
24 #
25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26 # Integer-only code [being equipped with dedicated squaring procedure]
27 # gives ~40% on rsa512 sign benchmark...
28
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;          # directory part of this script's own path
30 push(@INC,"${dir}","${dir}../../perlasm");      # let the require below find x86asm.pl
31 require "x86asm.pl";
32 # The last command-line argument names the output file; STDOUT is redirected to it.
33 $output = pop;
34 open STDOUT,">$output" or die "can't open $output: $!";    # without the check a failed open went unnoticed
35  
36 &asm_init($ARGV[0],$0);                         # first remaining argument and the script name are handed to x86asm.pl
37 # SSE2 code is generated only if -DOPENSSL_IA32_SSE2 appears among the remaining arguments.
38 $sse2=0;
39 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
40
41 &external_label("OPENSSL_ia32cap_P") if ($sse2);        # symbol consulted by the run-time check in the SSE2 path
42
43 &function_begin("bn_mul_mont");         # start of the generated bn_mul_mont; its arguments are accessed below via &wparam()
44 # Symbolic names for the general-purpose registers used by the loops below.
45 $i="edx";                               # index i (SSE2 path and common tail)
46 $j="ecx";                               # index j
47 $ap="esi";      $tp="esi";              # overlapping variables!!! ($tp is only used in the common tail)
48 $rp="edi";      $bp="edi";              # overlapping variables!!! ($rp is only used in the common tail)
49 $np="ebp";                              # modulus pointer
50 $num="ebx";                             # word count; loaded as num-1 after the argument block is copied
51 # Scratch slots at the bottom of the allocated stack area, addressed off esp.
52 $_num=&DWP(4*0,"esp");                  # stack top layout
53 $_rp=&DWP(4*1,"esp");                   # copy of the rp argument
54 $_ap=&DWP(4*2,"esp");                   # copy of the ap argument
55 $_bp=&DWP(4*3,"esp");                   # copy of the bp argument; the integer path reuses it as running pointer/index
56 $_np=&DWP(4*4,"esp");                   # copy of the np argument
57 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");  # n0[0]; the same slot addressed as a quadword for pmuludq
58 $_sp=&DWP(4*6,"esp");                   # caller's stack pointer, restored before returning
59 $_bpend=&DWP(4*7,"esp");                # &bp[num], end marker used by the integer multiplication path
60 $frame=32;                              # size of above frame rounded up to 16n
61
62         &xor    ("eax","eax");          # eax=0: value left in eax if we bail out right below
63         &mov    ("edi",&wparam(5));     # int num
64         &cmp    ("edi",4);
65         &jl     (&label("just_leave")); # num<4 is not handled here
66
67         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
68         &lea    ("edx",&wparam(1));     # NOTE(review): lea yields the address of the ap argument slot, not ap itself, although this comment used to read "load ap" -- confirm intent
69         &mov    ("ebp","esp");          # saved stack pointer!
70         &add    ("edi",2);              # extra two words on top of tp
71         &neg    ("edi");
72         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
73         &neg    ("edi");                # edi is num+2 again
74
75         # minimize cache contention by arranging 2K window between stack
76         # pointer and ap argument [np is also position sensitive vector,
77         # but it's assumed to be near ap, as it's allocated at ~same
78         # time].
79         &mov    ("eax","esp");
80         &sub    ("eax","edx");
81         &and    ("eax",2047);
82         &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
83
84         &xor    ("edx","esp");
85         &and    ("edx",2048);
86         &xor    ("edx",2048);
87         &sub    ("esp","edx");          # this splits them apart modulo 4096
88
89         &and    ("esp",-64);            # align to cache line
90
91         # Some OSes (Windows) insist on stack being "wired" to
92         # physical memory in strictly sequential manner, i.e. if stack
93         # allocation spans two pages, then reference to farthest one can
94         # be punishable by SEGV. But page walking can do good even on
95         # other OSes, because it guarantees that villain thread hits
96         # the guard page before it can make damage to innocent one...
97         &mov    ("eax","ebp");
98         &sub    ("eax","esp");
99         &and    ("eax",-4096);          # distance from the new esp up to the old one, rounded down to 4K
100 &set_label("page_walk");
101         &mov    ("edx",&DWP(0,"esp","eax"));    # touch one word per 4K step, highest address first
102         &sub    ("eax",4096);
103         &data_byte(0x2e);               # raw prefix byte emitted in front of the jnc; presumably a branch hint -- confirm
104         &jnc    (&label("page_walk"));  # loop until the subtraction borrows, i.e. offset 0 has been touched
105
106         ################################# load argument block...
107         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
108         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
109         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
110         &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
111         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 (esi no longer points at the argument block)
112         #&mov   ("edi",&DWP(5*4,"esi"));# int num (not needed: edi already holds num+2)
113
114         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
115         &mov    ($_rp,"eax");           # ... save a copy of argument block
116         &mov    ($_ap,"ebx");
117         &mov    ($_bp,"ecx");
118         &mov    ($_np,"edx");
119         &mov    ($_n0,"esi");
120         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling (edi holds num+2 here)
121         #&mov   ($_num,$num);           # redundant as $num is not reused
122         &mov    ($_sp,"ebp");           # saved stack pointer!
123 \f
124 if($sse2) {     # SSE2 path: emitted only with -DOPENSSL_IA32_SSE2, taken only if the run-time check below passes
125 $acc0="mm0";    # mmx register bank layout
126 $acc1="mm1";    # acc0/acc1: current ap[j]*bp[i] and np[j]*m1 products
127 $car0="mm2";    # car0/car1: running 64-bit sums whose upper halves carry into the next word
128 $car1="mm3";
129 $mul0="mm4";    # bp[i]
130 $mul1="mm5";    # m1, the per-row multiplier derived from n0
131 $temp="mm6";    # tp word being added in
132 $mask="mm7";    # 0x00000000ffffffff
133
134         &picmeup("eax","OPENSSL_ia32cap_P");    # eax = address of OPENSSL_ia32cap_P
135         &bt     (&DWP(0,"eax"),26);             # bit 26 of the first capability word; presumably the SSE2 flag -- confirm
136         &jnc    (&label("non_sse2"));           # bit clear: use the integer code instead
137
138         &mov    ("eax",-1);
139         &movd   ($mask,"eax");          # mask 32 lower bits
140
141         &mov    ($ap,$_ap);             # load input pointers
142         &mov    ($bp,$_bp);
143         &mov    ($np,$_np);
144
145         &xor    ($i,$i);                # i=0
146         &xor    ($j,$j);                # j=0
147         # ---- first pass (i=0): there is no previous tp[] to add in
148         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
149         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
150         &movd   ($car1,&DWP(0,$np));            # np[0]
151
152         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
153         &movq   ($car0,$mul1);
154         &movq   ($acc0,$mul1);                  # I wish movd worked for
155         &pand   ($acc0,$mask);                  # inter-register transfers
156
157         &pmuludq($mul1,$_n0q);                  # *=n0
158
159         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
160         &paddq  ($car1,$acc0);
161
162         &movd   ($acc1,&DWP(4,$np));            # np[1]
163         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
164
165         &psrlq  ($car0,32);                     # keep only the carries out of word 0
166         &psrlq  ($car1,32);
167
168         &inc    ($j);                           # j++
169 &set_label("1st",16);
170         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
171         &pmuludq($acc1,$mul1);                  # np[j]*m1
172         &paddq  ($car0,$acc0);                  # +=c0
173         &paddq  ($car1,$acc1);                  # +=c1
174
175         &movq   ($acc0,$car0);
176         &pand   ($acc0,$mask);
177         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
178         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
179         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
180         &psrlq  ($car0,32);
181         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
182         &psrlq  ($car1,32);
183
184         &lea    ($j,&DWP(1,$j));                # j++ without touching flags
185         &cmp    ($j,$num);
186         &jl     (&label("1st"));
187         # ---- last word of the first pass, handled outside the loop
188         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
189         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
190         &paddq  ($car0,$acc0);                  # +=c0
191         &paddq  ($car1,$acc1);                  # +=c1
192
193         &movq   ($acc0,$car0);
194         &pand   ($acc0,$mask);
195         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
196         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
197
198         &psrlq  ($car0,32);
199         &psrlq  ($car1,32);
200
201         &paddq  ($car1,$car0);                  # sum of both final carries
202         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
203 \f
204         &inc    ($i);                           # i++
205 &set_label("outer");                            # one iteration per remaining word bp[i]
206         &xor    ($j,$j);                        # j=0
207
208         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
209         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
210         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
211         &movd   ($car1,&DWP(0,$np));            # np[0]
212         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
213
214         &paddq  ($mul1,$temp);                  # +=tp[0]
215         &movq   ($acc0,$mul1);
216         &movq   ($car0,$mul1);
217         &pand   ($acc0,$mask);
218
219         &pmuludq($mul1,$_n0q);                  # *=n0
220
221         &pmuludq($car1,$mul1);                  # np[0]*m1
222         &paddq  ($car1,$acc0);
223
224         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
225         &movd   ($acc1,&DWP(4,$np));            # np[1]
226         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
227
228         &psrlq  ($car0,32);
229         &psrlq  ($car1,32);
230         &paddq  ($car0,$temp);                  # +=tp[1]
231
232         &inc    ($j);                           # j++
233         &dec    ($num);                         # $num doubles as the inner loop's down-counter
234 &set_label("inner");
235         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
236         &pmuludq($acc1,$mul1);                  # np[j]*m1
237         &paddq  ($car0,$acc0);                  # +=c0
238         &paddq  ($car1,$acc1);                  # +=c1
239
240         &movq   ($acc0,$car0);
241         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
242         &pand   ($acc0,$mask);
243         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
244         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
245         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
246         &psrlq  ($car0,32);
247         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
248         &psrlq  ($car1,32);
249         &paddq  ($car0,$temp);                  # +=tp[j+1]
250
251         &dec    ($num);
252         &lea    ($j,&DWP(1,$j));                # j++
253         &jnz    (&label("inner"));              # flags come from the dec above; lea leaves them alone
254
255         &mov    ($num,$j);                      # restore $num from the final j
256         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
257         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
258         &paddq  ($car0,$acc0);                  # +=c0
259         &paddq  ($car1,$acc1);                  # +=c1
260
261         &movq   ($acc0,$car0);
262         &pand   ($acc0,$mask);
263         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
264         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
265         &psrlq  ($car0,32);
266         &psrlq  ($car1,32);
267
268         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
269         &paddq  ($car1,$car0);
270         &paddq  ($car1,$temp);
271         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
272
273         &lea    ($i,&DWP(1,$i));                # i++
274         &cmp    ($i,$num);
275         &jle    (&label("outer"));              # continue while i<=$num ($num is the word count minus one)
276
277         &emms   ();                             # done with mmx bank
278         &jmp    (&label("common_tail"));
279
280 &set_label("non_sse2",16);                      # target of the failed run-time check; integer code follows
281 }
282 \f
283 if (0) {        # disabled stub: would decline (return 0) instead of running the integer code in the else branch
284         &mov    ("esp",$_sp);
285         &xor    ("eax","eax");  # signal "not fast enough [yet]"
286         &jmp    (&label("just_leave"));
287         # While the below code provides competitive performance for
288         # all key lengths on modern Intel cores, it's still more
289         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
290         # means compared to the original integer-only assembler.
291         # 512-bit RSA sign is better by ~40%, but that's about all
292         # one can say about all CPUs...
293 } else {
294 $inp="esi";     # integer path uses these registers differently
295 $word="edi";    # current multiplier word (bp[i], ap[i] or m)
296 $carry="ebp";   # carry/accumulator word
297
298         &mov    ($inp,$_ap);
299         &lea    ($carry,&DWP(1,$num));                  # $num holds num-1, so this is the caller's num
300         &mov    ($word,$_bp);
301         &xor    ($j,$j);                                # j=0
302         &mov    ("edx",$inp);
303         &and    ($carry,1);                             # see if num is even
304         &sub    ("edx",$word);                          # see if ap==bp
305         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
306         &or     ($carry,"edx");                         # zero only if num is even and ap==bp
307         &mov    ($word,&DWP(0,$word));                  # bp[0]
308         &jz     (&label("bn_sqr_mont"));                # flags still from the or above: use dedicated squaring
309         &mov    ($_bpend,"eax");
310         &mov    ("eax",&DWP(0,$inp));                   # ap[0]
311         &xor    ("edx","edx");                          # no carry into the first word
312
313 &set_label("mull",16);                                  # first row: tp[] = ap[]*bp[0]
314         &mov    ($carry,"edx");
315         &mul    ($word);                                # ap[j]*bp[0]
316         &add    ($carry,"eax");
317         &lea    ($j,&DWP(1,$j));
318         &adc    ("edx",0);
319         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
320         &cmp    ($j,$num);
321         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
322         &jl     (&label("mull"));
323
324         &mov    ($carry,"edx");
325         &mul    ($word);                                # ap[num-1]*bp[0]
326          &mov   ($word,$_n0);
327         &add    ("eax",$carry);
328          &mov   ($inp,$_np);
329         &adc    ("edx",0);
330          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
331
332         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
333         &xor    ($j,$j);
334         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
335         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
336
337         &mov    ("eax",&DWP(0,$inp));                   # np[0]
338         &mul    ($word);                                # np[0]*m
339         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
340         &mov    ("eax",&DWP(4,$inp));                   # np[1]
341         &adc    ("edx",0);                              # only the carry of the sum above is kept; its low word is dropped
342         &inc    ($j);
343
344         &jmp    (&label("2ndmadd"));                    # reduce this first row
345 \f\f
346 &set_label("1stmadd",16);                               # rows i>0: tp[] += ap[]*bp[i]
347         &mov    ($carry,"edx");
348         &mul    ($word);                                # ap[j]*bp[i]
349         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
350         &lea    ($j,&DWP(1,$j));
351         &adc    ("edx",0);
352         &add    ($carry,"eax");
353         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
354         &adc    ("edx",0);
355         &cmp    ($j,$num);
356         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
357         &jl     (&label("1stmadd"));
358
359         &mov    ($carry,"edx");
360         &mul    ($word);                                # ap[num-1]*bp[i]
361         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
362          &mov   ($word,$_n0);
363         &adc    ("edx",0);
364          &mov   ($inp,$_np);
365         &add    ($carry,"eax");
366         &adc    ("edx",0);
367          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
368
369         &xor    ($j,$j);
370         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
371         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
372         &adc    ($j,0);                                 # j = carry out of the addition two lines up
373          &mov   ("eax",&DWP(0,$inp));                   # np[0]
374         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
375         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
376
377         &mul    ($word);                                # np[0]*m
378         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
379         &mov    ("eax",&DWP(4,$inp));                   # np[1]
380         &adc    ("edx",0);                              # only the carry of the sum above is kept
381         &mov    ($j,1);                                 # reduction loop below starts at j=1
382 \f
383 &set_label("2ndmadd",16);                               # reduction: add np[]*m and store one word lower
384         &mov    ($carry,"edx");
385         &mul    ($word);                                # np[j]*m
386         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
387         &lea    ($j,&DWP(1,$j));
388         &adc    ("edx",0);
389         &add    ($carry,"eax");
390         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
391         &adc    ("edx",0);
392         &cmp    ($j,$num);
393         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
394         &jl     (&label("2ndmadd"));
395
396         &mov    ($carry,"edx");
397         &mul    ($word);                                # np[j]*m
398         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
399         &adc    ("edx",0);
400         &add    ($carry,"eax");
401         &adc    ("edx",0);
402         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
403
404         &xor    ("eax","eax");
405          &mov   ($j,$_bp);                              # &bp[i]
406         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
407         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
408          &lea   ($j,&DWP(4,$j));                        # &bp[i+1]
409         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
410          &cmp   ($j,$_bpend);
411         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
412         &je     (&label("common_tail"));                # every word of bp has been consumed
413
414         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
415         &mov    ($inp,$_ap);
416         &mov    ($_bp,$j);                              # &bp[++i]
417         &xor    ($j,$j);
418         &xor    ("edx","edx");
419         &mov    ("eax",&DWP(0,$inp));                   # ap[0]
420         &jmp    (&label("1stmadd"));
421 \f
422 &set_label("bn_sqr_mont",16);                           # squaring path: entered when ap==bp and num is even
423 $sbit=$num;                                             # $sbit shares ebx with $num, which is parked in $_num
424         &mov    ($_num,$num);
425         &mov    ($_bp,$j);                              # i=0
426
427         &mov    ("eax",$word);                          # ap[0]
428         &mul    ($word);                                # ap[0]*ap[0]
429         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
430         &mov    ($sbit,"edx");
431         &shr    ("edx",1);                              # carry is pre-halved because the loop below doubles its sums
432         &and    ($sbit,1);                              # ... and the bit shifted out is re-added through $sbit
433         &inc    ($j);
434 &set_label("sqr",16);
435         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
436         &mov    ($carry,"edx");
437         &mul    ($word);                                # ap[j]*ap[0]
438         &add    ("eax",$carry);
439         &lea    ($j,&DWP(1,$j));
440         &adc    ("edx",0);
441         &lea    ($carry,&DWP(0,$sbit,"eax",2));         # 2*eax + bit carried from the previous word
442         &shr    ("eax",31);                             # bit shifted out by the doubling
443         &cmp    ($j,$_num);
444         &mov    ($sbit,"eax");
445         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
446         &jl     (&label("sqr"));
447
448         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
449         &mov    ($carry,"edx");
450         &mul    ($word);                                # ap[num-1]*ap[0]
451         &add    ("eax",$carry);
452          &mov   ($word,$_n0);
453         &adc    ("edx",0);
454          &mov   ($inp,$_np);
455         &lea    ($carry,&DWP(0,$sbit,"eax",2));
456          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
457         &shr    ("eax",31);
458         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
459
460         &lea    ($carry,&DWP(0,"eax","edx",2));
461          &mov   ("eax",&DWP(0,$inp));                   # np[0]
462         &shr    ("edx",31);
463         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
464         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
465
466         &mul    ($word);                                # np[0]*m
467         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
468         &mov    ($num,$j);                              # ebx is $num again from here on
469         &adc    ("edx",0);                              # carry of the add two lines up; mov does not touch flags
470         &mov    ("eax",&DWP(4,$inp));                   # np[1]
471         &mov    ($j,1);
472 \f\f
473 &set_label("3rdmadd",16);                               # reduction for the squaring path, two words per iteration
474         &mov    ($carry,"edx");
475         &mul    ($word);                                # np[j]*m
476         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
477         &adc    ("edx",0);
478         &add    ($carry,"eax");
479         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
480         &adc    ("edx",0);
481         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
482
483         &mov    ($carry,"edx");
484         &mul    ($word);                                # np[j+1]*m
485         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
486         &lea    ($j,&DWP(2,$j));                        # j+=2
487         &adc    ("edx",0);
488         &add    ($carry,"eax");
489         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
490         &adc    ("edx",0);
491         &cmp    ($j,$num);
492         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
493         &jl     (&label("3rdmadd"));
494
495         &mov    ($carry,"edx");
496         &mul    ($word);                                # np[j]*m
497         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
498         &adc    ("edx",0);
499         &add    ($carry,"eax");
500         &adc    ("edx",0);
501         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
502
503         &mov    ($j,$_bp);                              # i
504         &xor    ("eax","eax");
505         &mov    ($inp,$_ap);
506         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
507         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
508         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
509         &cmp    ($j,$num);
510         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
511         &je     (&label("common_tail"));                # i has reached $num: all rows are done
512 \f
513         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
514         &lea    ($j,&DWP(1,$j));
515         &mov    ("eax",$word);
516         &mov    ($_bp,$j);                              # ++i
517         &mul    ($word);                                # ap[i]*ap[i]
518         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
519         &adc    ("edx",0);
520         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
521         &xor    ($carry,$carry);
522         &cmp    ($j,$num);
523         &lea    ($j,&DWP(1,$j));                        # j=i+1; lea keeps the flags of the cmp
524         &je     (&label("sqrlast"));                    # last row has no cross products left
525
526         &mov    ($sbit,"edx");                          # zaps $num
527         &shr    ("edx",1);
528         &and    ($sbit,1);
529 &set_label("sqradd",16);                                # tp[j] += 2*ap[j]*ap[i] for j>i
530         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
531         &mov    ($carry,"edx");
532         &mul    ($word);                                # ap[j]*ap[i]
533         &add    ("eax",$carry);
534         &lea    ($carry,&DWP(0,"eax","eax"));           # 2*eax
535         &adc    ("edx",0);
536         &shr    ("eax",31);                             # bit shifted out by the doubling
537         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
538         &lea    ($j,&DWP(1,$j));
539         &adc    ("eax",0);
540         &add    ($carry,$sbit);
541         &adc    ("eax",0);
542         &cmp    ($j,$_num);
543         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
544         &mov    ($sbit,"eax");
545         &jle    (&label("sqradd"));
546
547         &mov    ($carry,"edx");
548         &add    ("edx","edx");                          # double the final high word
549         &shr    ($carry,31);
550         &add    ("edx",$sbit);
551         &adc    ($carry,0);
552 &set_label("sqrlast");
553         &mov    ($word,$_n0);
554         &mov    ($inp,$_np);
555         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
556
557         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
558         &mov    ("eax",&DWP(0,$inp));                   # np[0]
559         &adc    ($carry,0);
560         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
561         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
562
563         &mul    ($word);                                # np[0]*m
564         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
565         &lea    ($num,&DWP(-1,$j));                     # $num = j-1, rebuilding the value zapped by $sbit
566         &adc    ("edx",0);                              # carry of the add two lines up; lea does not touch flags
567         &mov    ($j,1);
568         &mov    ("eax",&DWP(4,$inp));                   # np[1]
569
570         &jmp    (&label("3rdmadd"));                    # reduce this row
571 }
572 \f
573 &set_label("common_tail",16);                   # shared ending: subtract the modulus, then pick and copy the result
574         &mov    ($np,$_np);                     # load modulus pointer
575         &mov    ($rp,$_rp);                     # load result pointer
576         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
577
578         &mov    ("eax",&DWP(0,$tp));            # tp[0]
579         &mov    ($j,$num);                      # j=num-1
580         &xor    ($i,$i);                        # i=0 and clear CF!
581
582 &set_label("sub",16);
583         &sbb    ("eax",&DWP(0,$np,$i,4));
584         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
585         &dec    ($j);                           # doesn't affect CF!
586         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
587         &lea    ($i,&DWP(1,$i));                # i++
588         &jge    (&label("sub"));                # sign test on the dec above; mov and lea leave flags alone
589
590         &sbb    ("eax",0);                      # handle upmost overflow bit
591         &and    ($tp,"eax");                    # branch-free select between the two pointers...
592         &not    ("eax");
593         &mov    ($np,$rp);
594         &and    ($np,"eax");
595         &or     ($tp,$np);                      # tp=carry?tp:rp
596
597 &set_label("copy",16);                          # copy or in-place refresh
598         &mov    ("eax",&DWP(0,$tp,$num,4));
599         &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
600         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector (with the value $j was left at by the sub loop)
601         &dec    ($num);
602         &jge    (&label("copy"));
603
604         &mov    ("esp",$_sp);           # pull saved stack pointer
605         &mov    ("eax",1);              # eax=1 on this path, as opposed to 0 on the early exits
606 &set_label("just_leave");               # early exits jump here with eax=0 and esp already valid
607 &function_end("bn_mul_mont");
608
609 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");    # identification string placed in the generated module
610
611 &asm_finish();          # x86asm.pl helper; presumably writes out the accumulated module -- confirm
612 # Buffered write errors (e.g. a full disk) only surface at close, so a truncated output file must fail the build.
613 close STDOUT or die "error closing STDOUT: $!";