bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005
11 #
12 # This is a "teaser" code, as it can be improved in several ways...
13 # First of all non-SSE2 path should be implemented (yes, for now it
14 # performs Montgomery multiplication/convolution only on SSE2-capable
15 # CPUs such as P4, others fall down to original code). Then inner loop
16 # can be unrolled and modulo-scheduled to improve ILP and possibly
17 # moved to 128-bit XMM register bank (though it would require input
18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
19 # squaring procedure should give further performance improvement...
20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23 # December 2006
24 #
25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26 # Integer-only code [being equipped with dedicated squaring procedure]
27 # gives ~40% on rsa512 sign benchmark...
28
# Generator set-up: take the directory part of this script's own path ($0)
# and add it, plus its ../../perlasm sibling, to @INC so that x86asm.pl
# can be loaded from either place.
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30 push(@INC,"${dir}","${dir}../../perlasm");
31 require "x86asm.pl";
32
# Hand the first command-line argument and the script name to the perlasm
# layer (presumably the first argument selects the assembler flavour --
# confirm against x86asm.pl).
33 &asm_init($ARGV[0],$0);
34
# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# somewhere on the command line.
35 $sse2=0;
36 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
# The SSE2 path tests OPENSSL_ia32cap_P at run time, so declare it external.
38 &external_label("OPENSSL_ia32cap_P") if ($sse2);
39
# Start of bn_mul_mont.  Going by the argument comments further down, the
# six word-sized arguments are rp, ap, bp, np, n0 and num.  eax is cleared
# before the early exit taken when num<4 and set to 1 by the regular
# epilogue.
40 &function_begin("bn_mul_mont");
41
# Symbolic register names.  $ap/$tp share esi and $rp/$bp share edi, so
# only one of each pair can be live at any time.
42 $i="edx";
43 $j="ecx";
44 $ap="esi";      $tp="esi";              # overlapping variables!!!
45 $rp="edi";      $bp="edi";              # overlapping variables!!!
46 $np="ebp";
47 $num="ebx";
48
# Private frame at the top of the freshly allocated stack area; the
# temporary vector tp[] starts right after it, at $frame(%esp).
49 $_num=&DWP(4*0,"esp");                  # stack top layout
50 $_rp=&DWP(4*1,"esp");
51 $_ap=&DWP(4*2,"esp");
52 $_bp=&DWP(4*3,"esp");
53 $_np=&DWP(4*4,"esp");
54 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
55 $_sp=&DWP(4*6,"esp");
56 $_bpend=&DWP(4*7,"esp");
57 $frame=32;                              # size of above frame rounded up to 16n
58
        # eax=0 is what is left in eax if we bail out on num<4 below.
59         &xor    ("eax","eax");
60         &mov    ("edi",&wparam(5));     # int num
61         &cmp    ("edi",4);
62         &jl     (&label("just_leave"));
63
64         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
        # NOTE(review): lea yields the address of the ap argument slot, not
        # the ap pointer stored in it; confirm this is what the 2K-window
        # arithmetic below is meant to work with.
65         &lea    ("edx",&wparam(1));     # load ap
66         &mov    ("ebp","esp");          # saved stack pointer!
67         &add    ("edi",2);              # extra two words on top of tp
68         &neg    ("edi");
69         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
70         &neg    ("edi");                # back to positive: edi=num+2
71
72         # minimize cache contention by arranging 2K window between stack
73         # pointer and ap argument [np is also position sensitive vector,
74         # but it's assumed to be near ap, as it's allocated at ~same
75         # time].
76         &mov    ("eax","esp");
77         &sub    ("eax","edx");
78         &and    ("eax",2047);
79         &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
80
81         &xor    ("edx","esp");
82         &and    ("edx",2048);
83         &xor    ("edx",2048);
84         &sub    ("esp","edx");          # this splits them apart modulo 4096
85
86         &and    ("esp",-64);            # align to cache line
87
88         # Some OSes, *cough*-dows, insist on stack being "wired" to
89         # physical memory in strictly sequential manner, i.e. if stack
90         # allocation spans two pages, then reference to farmost one can
91         # be punishable by SEGV. But page walking can do good even on
92         # other OSes, because it guarantees that villain thread hits
93         # the guard page before it can make damage to innocent one...
        #
        # eax = distance from the new esp up to the saved one, rounded
        # down to a multiple of 4096.  The loop reads one word at esp+eax
        # and steps eax down by 4096 until the subtraction borrows, so the
        # last read is at offset 0.  The loaded value is discarded.
94         &mov    ("eax","ebp");
95         &sub    ("eax","esp");
96         &and    ("eax",-4096);
97 &set_label("page_walk");
98         &mov    ("edx",&DWP(0,"esp","eax"));
99         &sub    ("eax",4096);
        # Raw 0x2e byte emitted in front of the jnc (presumably a
        # branch-hint prefix -- TODO confirm).
100         &data_byte(0x2e);
101         &jnc    (&label("page_walk"));
102
103         ################################# load argument block...
104         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
105         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
106         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
107         &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
108         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
109         #&mov   ("edi",&DWP(5*4,"esi"));# int num
110
111         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
112         &mov    ($_rp,"eax");           # ... save a copy of argument block
113         &mov    ($_ap,"ebx");
114         &mov    ($_bp,"ecx");
115         &mov    ($_np,"edx");
116         &mov    ($_n0,"esi");
        # edi still holds num+2 here, hence the -3 to arrive at num-1.
117         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
118         #&mov   ($_num,$num);           # redundant as $num is not reused
119         &mov    ($_sp,"ebp");           # saved stack pointer!
120 \f
# SSE2 code path.  Generated only when -DOPENSSL_IA32_SSE2 was given on
# the command line, and entered at run time only when bit 26 of the first
# OPENSSL_ia32cap_P word is set; otherwise control continues at
# "non_sse2".  On completion it jumps to "common_tail".
121 if($sse2) {
122 $acc0="mm0";    # mmx register bank layout
123 $acc1="mm1";
124 $car0="mm2";
125 $car1="mm3";
126 $mul0="mm4";
127 $mul1="mm5";
128 $temp="mm6";
129 $mask="mm7";
130
131         &picmeup("eax","OPENSSL_ia32cap_P");
132         &bt     (&DWP(0,"eax"),26);     # capability bit 26 clear -> integer path
133         &jnc    (&label("non_sse2"));
134
135         &mov    ("eax",-1);
136         &movd   ($mask,"eax");          # mask 32 lower bits
137
138         &mov    ($ap,$_ap);             # load input pointers
139         &mov    ($bp,$_bp);
140         &mov    ($np,$_np);
141
142         &xor    ($i,$i);                # i=0
143         &xor    ($j,$j);                # j=0
144
        # First pass (i=0): tp[] is not read yet, only written.
145         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
146         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
147         &movd   ($car1,&DWP(0,$np));            # np[0]
148
149         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
150         &movq   ($car0,$mul1);
151         &movq   ($acc0,$mul1);                  # I wish movd worked for
152         &pand   ($acc0,$mask);                  # inter-register transfers
153
154         &pmuludq($mul1,$_n0q);                  # *=n0
155
156         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
157         &paddq  ($car1,$acc0);
158
159         &movd   ($acc1,&DWP(4,$np));            # np[1]
160         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
161
162         &psrlq  ($car0,32);
163         &psrlq  ($car1,32);
164
165         &inc    ($j);                           # j++
166 &set_label("1st",16);
167         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
168         &pmuludq($acc1,$mul1);                  # np[j]*m1
169         &paddq  ($car0,$acc0);                  # +=c0
170         &paddq  ($car1,$acc1);                  # +=c1
171
172         &movq   ($acc0,$car0);
173         &pand   ($acc0,$mask);
174         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
175         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
176         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
177         &psrlq  ($car0,32);
178         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
179         &psrlq  ($car1,32);
180
181         &lea    ($j,&DWP(1,$j));
182         &cmp    ($j,$num);
183         &jl     (&label("1st"));
184
        # Loop tail: the operands for the last word were already loaded
        # by the final loop iteration.
185         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
186         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
187         &paddq  ($car0,$acc0);                  # +=c0
188         &paddq  ($car1,$acc1);                  # +=c1
189
190         &movq   ($acc0,$car0);
191         &pand   ($acc0,$mask);
192         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
193         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
194
195         &psrlq  ($car0,32);
196         &psrlq  ($car1,32);
197
198         &paddq  ($car1,$car0);
199         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
200 \f
        # Remaining passes: same as above, but tp[] is accumulated into.
201         &inc    ($i);                           # i++
202 &set_label("outer");
203         &xor    ($j,$j);                        # j=0
204
205         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
206         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
207         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
208         &movd   ($car1,&DWP(0,$np));            # np[0]
209         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
210
211         &paddq  ($mul1,$temp);                  # +=tp[0]
212         &movq   ($acc0,$mul1);
213         &movq   ($car0,$mul1);
214         &pand   ($acc0,$mask);
215
216         &pmuludq($mul1,$_n0q);                  # *=n0
217
218         &pmuludq($car1,$mul1);
219         &paddq  ($car1,$acc0);
220
221         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
222         &movd   ($acc1,&DWP(4,$np));            # np[1]
223         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
224
225         &psrlq  ($car0,32);
226         &psrlq  ($car1,32);
227         &paddq  ($car0,$temp);                  # +=tp[1]
228
229         &inc    ($j);                           # j++
230         &dec    ($num);                         # $num doubles as inner-loop down-counter
231 &set_label("inner");
232         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
233         &pmuludq($acc1,$mul1);                  # np[j]*m1
234         &paddq  ($car0,$acc0);                  # +=c0
235         &paddq  ($car1,$acc1);                  # +=c1
236
237         &movq   ($acc0,$car0);
238         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
239         &pand   ($acc0,$mask);
240         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
241         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
242         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
243         &psrlq  ($car0,32);
244         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
245         &psrlq  ($car1,32);
246         &paddq  ($car0,$temp);                  # +=tp[j+1]
247
        # The flags tested by jnz come from this dec; the lea in between
        # leaves them untouched.
248         &dec    ($num);
249         &lea    ($j,&DWP(1,$j));                # j++
250         &jnz    (&label("inner"));
251
252         &mov    ($num,$j);                      # restore $num: j has reached num-1
253         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
254         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
255         &paddq  ($car0,$acc0);                  # +=c0
256         &paddq  ($car1,$acc1);                  # +=c1
257
258         &movq   ($acc0,$car0);
259         &pand   ($acc0,$mask);
260         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
261         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
262         &psrlq  ($car0,32);
263         &psrlq  ($car1,32);
264
265         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
266         &paddq  ($car1,$car0);
267         &paddq  ($car1,$temp);
268         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
269
270         &lea    ($i,&DWP(1,$i));                # i++
271         &cmp    ($i,$num);
272         &jle    (&label("outer"));
273
274         &emms   ();                             # done with mmx bank
275         &jmp    (&label("common_tail"));
276
277 &set_label("non_sse2",16);
278 }
279 \f
# Integer-only code path.  The "if (0)" arm is a disabled stub that would
# restore esp and leave with eax=0; since the condition is a constant 0
# it is never generated and the "else" arm is always emitted.
280 if (0) {
281         &mov    ("esp",$_sp);
282         &xor    ("eax","eax");  # signal "not fast enough [yet]"
283         &jmp    (&label("just_leave"));
284         # While the below code provides competitive performance for
285         # all key lengths on modern Intel cores, it's still more
286         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
287         # means compared to the original integer-only assembler.
288         # 512-bit RSA sign is better by ~40%, but that's about all
289         # one can say about all CPUs...
290 } else {
291 $inp="esi";     # integer path uses these registers differently
292 $word="edi";
293 $carry="ebp";
294
        # Dispatch.  $num holds num-1, so $carry=(num-1+1)&1 is the low
        # bit of num; OR-ing in ap-bp leaves zero only when num is even
        # AND ap==bp, in which case the dedicated squaring code is used.
        # The mov between the or and the jz does not disturb the flags.
295         &mov    ($inp,$_ap);
296         &lea    ($carry,&DWP(1,$num));
297         &mov    ($word,$_bp);
298         &xor    ($j,$j);                                # j=0
299         &mov    ("edx",$inp);
300         &and    ($carry,1);                             # see if num is even
301         &sub    ("edx",$word);                          # see if ap==bp
302         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
303         &or     ($carry,"edx");
304         &mov    ($word,&DWP(0,$word));                  # bp[0]
305         &jz     (&label("bn_sqr_mont"));
306         &mov    ($_bpend,"eax");                        # end-of-bp sentinel for the outer loop
307         &mov    ("eax",&DWP(0,$inp));
308         &xor    ("edx","edx");
309
        # First pass: tp[] = ap[] * bp[0], nothing to accumulate yet.
310 &set_label("mull",16);
311         &mov    ($carry,"edx");
312         &mul    ($word);                                # ap[j]*bp[0]
313         &add    ($carry,"eax");
314         &lea    ($j,&DWP(1,$j));
315         &adc    ("edx",0);
316         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
317         &cmp    ($j,$num);
318         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
319         &jl     (&label("mull"));
320
        # Last word of the pass, interleaved (extra-indented lines) with
        # the set-up for the reduction pass: $word becomes m=n0*tp[0] and
        # $inp becomes np.
321         &mov    ($carry,"edx");
322         &mul    ($word);                                # ap[num-1]*bp[0]
323          &mov   ($word,$_n0);
324         &add    ("eax",$carry);
325          &mov   ($inp,$_np);
326         &adc    ("edx",0);
327          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
328
329         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
330         &xor    ($j,$j);
331         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
332         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
333
334         &mov    ("eax",&DWP(0,$inp));                   # np[0]
335         &mul    ($word);                                # np[0]*m
336         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0], only the carry is kept
337         &mov    ("eax",&DWP(4,$inp));                   # np[1]
338         &adc    ("edx",0);
339         &inc    ($j);
340
341         &jmp    (&label("2ndmadd"));
342 \f\f
        # Subsequent passes: tp[] += ap[] * bp[i].
343 &set_label("1stmadd",16);
344         &mov    ($carry,"edx");
345         &mul    ($word);                                # ap[j]*bp[i]
346         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
347         &lea    ($j,&DWP(1,$j));
348         &adc    ("edx",0);
349         &add    ($carry,"eax");
350         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
351         &adc    ("edx",0);
352         &cmp    ($j,$num);
353         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
354         &jl     (&label("1stmadd"));
355
356         &mov    ($carry,"edx");
357         &mul    ($word);                                # ap[num-1]*bp[i]
358         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
359          &mov   ($word,$_n0);
360         &adc    ("edx",0);
361          &mov   ($inp,$_np);
362         &add    ($carry,"eax");
363         &adc    ("edx",0);
364          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
365
366         &xor    ($j,$j);
367         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
368         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
369         &adc    ($j,0);
370          &mov   ("eax",&DWP(0,$inp));                   # np[0]
371         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
372         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
373
374         &mul    ($word);                                # np[0]*m
375         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0], only the carry is kept
376         &mov    ("eax",&DWP(4,$inp));                   # np[1]
377         &adc    ("edx",0);
378         &mov    ($j,1);
379 \f
        # Reduction pass: tp[] += np[] * m, with every result word stored
        # one slot lower than the one it was read from.
380 &set_label("2ndmadd",16);
381         &mov    ($carry,"edx");
382         &mul    ($word);                                # np[j]*m
383         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
384         &lea    ($j,&DWP(1,$j));
385         &adc    ("edx",0);
386         &add    ($carry,"eax");
387         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
388         &adc    ("edx",0);
389         &cmp    ($j,$num);
390         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
391         &jl     (&label("2ndmadd"));
392
393         &mov    ($carry,"edx");
394         &mul    ($word);                                # np[j]*m
395         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
396         &adc    ("edx",0);
397         &add    ($carry,"eax");
398         &adc    ("edx",0);
399         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
400
        # Fold the top words and advance the bp pointer kept in $_bp;
        # finish once it reaches the $_bpend sentinel.
401         &xor    ("eax","eax");
402          &mov   ($j,$_bp);                              # &bp[i]
403         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
404         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
405          &lea   ($j,&DWP(4,$j));
406         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
407          &cmp   ($j,$_bpend);
408         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
409         &je     (&label("common_tail"));
410
411         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
412         &mov    ($inp,$_ap);
413         &mov    ($_bp,$j);                              # &bp[++i]
414         &xor    ($j,$j);
415         &xor    ("edx","edx");
416         &mov    ("eax",&DWP(0,$inp));
417         &jmp    (&label("1stmadd"));
418 \f
        # Dedicated squaring, reached only when ap==bp and num is even
        # (see the dispatch at the top of this branch).  $sbit shares ebx
        # with $num, which is why $num is parked in $_num first; the $_bp
        # slot is reused to hold the outer index i.
419 &set_label("bn_sqr_mont",16);
420 $sbit=$num;
421         &mov    ($_num,$num);
422         &mov    ($_bp,$j);                              # i=0
423
424         &mov    ("eax",$word);                          # ap[0]
425         &mul    ($word);                                # ap[0]*ap[0]
426         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
427         &mov    ($sbit,"edx");
428         &shr    ("edx",1);
429         &and    ($sbit,1);
430         &inc    ($j);
        # Cross products are doubled on the fly: each word is stored as
        # 2*eax+$sbit, and the bit shifted out of eax becomes the next $sbit.
431 &set_label("sqr",16);
432         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
433         &mov    ($carry,"edx");
434         &mul    ($word);                                # ap[j]*ap[0]
435         &add    ("eax",$carry);
436         &lea    ($j,&DWP(1,$j));
437         &adc    ("edx",0);
438         &lea    ($carry,&DWP(0,$sbit,"eax",2));
439         &shr    ("eax",31);
440         &cmp    ($j,$_num);
441         &mov    ($sbit,"eax");
442         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
443         &jl     (&label("sqr"));
444
445         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
446         &mov    ($carry,"edx");
447         &mul    ($word);                                # ap[num-1]*ap[0]
448         &add    ("eax",$carry);
449          &mov   ($word,$_n0);
450         &adc    ("edx",0);
451          &mov   ($inp,$_np);
452         &lea    ($carry,&DWP(0,$sbit,"eax",2));
453          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
454         &shr    ("eax",31);
455         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
456
457         &lea    ($carry,&DWP(0,"eax","edx",2));
458          &mov   ("eax",&DWP(0,$inp));                   # np[0]
459         &shr    ("edx",31);
460         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
461         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
462
463         &mul    ($word);                                # np[0]*m
464         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0], only the carry is kept
465         &mov    ($num,$j);                              # $sbit no longer needed: ebx is $num again
466         &adc    ("edx",0);
467         &mov    ("eax",&DWP(4,$inp));                   # np[1]
468         &mov    ($j,1);
469 \f\f
        # Reduction pass for the squaring code, two words per iteration
        # (presumably the reason this path insists on even num -- confirm).
470 &set_label("3rdmadd",16);
471         &mov    ($carry,"edx");
472         &mul    ($word);                                # np[j]*m
473         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
474         &adc    ("edx",0);
475         &add    ($carry,"eax");
476         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
477         &adc    ("edx",0);
478         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
479
480         &mov    ($carry,"edx");
481         &mul    ($word);                                # np[j+1]*m
482         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
483         &lea    ($j,&DWP(2,$j));
484         &adc    ("edx",0);
485         &add    ($carry,"eax");
486         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
487         &adc    ("edx",0);
488         &cmp    ($j,$num);
489         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
490         &jl     (&label("3rdmadd"));
491
492         &mov    ($carry,"edx");
493         &mul    ($word);                                # np[j]*m
494         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
495         &adc    ("edx",0);
496         &add    ($carry,"eax");
497         &adc    ("edx",0);
498         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
499
        # Fold the top words; finish once the outer index i kept in $_bp
        # equals $num (which holds num-1).
500         &mov    ($j,$_bp);                              # i
501         &xor    ("eax","eax");
502         &mov    ($inp,$_ap);
503         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
504         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
505         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
506         &cmp    ($j,$num);
507         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
508         &je     (&label("common_tail"));
509 \f
        # Next outer iteration: add the square term ap[i]*ap[i], then the
        # doubled cross products ap[j]*ap[i] for the remaining j.
510         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
511         &lea    ($j,&DWP(1,$j));
512         &mov    ("eax",$word);
513         &mov    ($_bp,$j);                              # ++i
514         &mul    ($word);                                # ap[i]*ap[i]
515         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
516         &adc    ("edx",0);
517         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
518         &xor    ($carry,$carry);
        # The je below uses the flags of this cmp; the lea in between
        # does not modify them.
519         &cmp    ($j,$num);
520         &lea    ($j,&DWP(1,$j));
521         &je     (&label("sqrlast"));
522
523         &mov    ($sbit,"edx");                          # zaps $num
524         &shr    ("edx",1);
525         &and    ($sbit,1);
526 &set_label("sqradd",16);
527         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
528         &mov    ($carry,"edx");
529         &mul    ($word);                                # ap[j]*ap[i]
530         &add    ("eax",$carry);
531         &lea    ($carry,&DWP(0,"eax","eax"));
532         &adc    ("edx",0);
533         &shr    ("eax",31);
534         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
535         &lea    ($j,&DWP(1,$j));
536         &adc    ("eax",0);
537         &add    ($carry,$sbit);
538         &adc    ("eax",0);
539         &cmp    ($j,$_num);
540         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
541         &mov    ($sbit,"eax");
542         &jle    (&label("sqradd"));
543
544         &mov    ($carry,"edx");
545         &add    ("edx","edx");
546         &shr    ($carry,31);
547         &add    ("edx",$sbit);
548         &adc    ($carry,0);
        # Set up m=n0*tp[0] and np, store the top words, rebuild $num
        # from $j and rejoin the two-word reduction loop.
549 &set_label("sqrlast");
550         &mov    ($word,$_n0);
551         &mov    ($inp,$_np);
552         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
553
554         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
555         &mov    ("eax",&DWP(0,$inp));                   # np[0]
556         &adc    ($carry,0);
557         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
558         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
559
560         &mul    ($word);                                # np[0]*m
561         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0], only the carry is kept
562         &lea    ($num,&DWP(-1,$j));
563         &adc    ("edx",0);
564         &mov    ($j,1);
565         &mov    ("eax",&DWP(4,$inp));                   # np[1]
566
567         &jmp    (&label("3rdmadd"));
568 }
569 \f
# Tail shared by the SSE2 and integer paths.  It writes tp-np to rp while
# propagating the borrow, then uses the final borrow to pick either tp or
# the freshly written rp as the copy source, copies that to rp and
# overwrites the temporary vector on the stack.  $num holds num-1 on entry.
570 &set_label("common_tail",16);
571         &mov    ($np,$_np);                     # load modulus pointer
572         &mov    ($rp,$_rp);                     # load result pointer
573         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
574
575         &mov    ("eax",&DWP(0,$tp));            # tp[0]
576         &mov    ($j,$num);                      # j=num-1
577         &xor    ($i,$i);                        # i=0 and clear CF!
578
        # CF carries the borrow from one iteration to the next, so the
        # loop body only uses instructions that leave CF alone; the jge
        # tests the flags set by the dec.
579 &set_label("sub",16);
580         &sbb    ("eax",&DWP(0,$np,$i,4));
581         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
582         &dec    ($j);                           # doesn't affect CF!
583         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
584         &lea    ($i,&DWP(1,$i));                # i++
585         &jge    (&label("sub"));
586
        # eax holds the top word tp[num] here; after the sbb it serves as
        # an all-zeros/all-ones style mask to select between the two
        # pointers without a branch.
587         &sbb    ("eax",0);                      # handle upmost overflow bit
588         &and    ($tp,"eax");
589         &not    ("eax");
590         &mov    ($np,$rp);
591         &and    ($np,"eax");
592         &or     ($tp,$np);                      # tp=carry?tp:rp
593
        # Walks from index num-1 down to 0.  $j was counted down to -1 by
        # the "sub" loop, and that leftover value is what gets written
        # over each word of the temporary vector.
594 &set_label("copy",16);                          # copy or in-place refresh
595         &mov    ("eax",&DWP(0,$tp,$num,4));
596         &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
597         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
598         &dec    ($num);
599         &jge    (&label("copy"));
600
601         &mov    ("esp",$_sp);           # pull saved stack pointer
602         &mov    ("eax",1);              # eax=1 on the regular exit
        # The num<4 early exit lands here with eax still 0 and esp untouched.
603 &set_label("just_leave");
604 &function_end("bn_mul_mont");
605
# Embed an identification string in the output, then finish the module.
606 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
607
608 &asm_finish();