Update copyright year
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # October 2005
18 #
19 # This is a "teaser" code, as it can be improved in several ways...
20 # First of all non-SSE2 path should be implemented (yes, for now it
21 # performs Montgomery multiplication/convolution only on SSE2-capable
22 # CPUs such as P4, others fall down to original code). Then inner loop
23 # can be unrolled and modulo-scheduled to improve ILP and possibly
24 # moved to 128-bit XMM register bank (though it would require input
25 # rearrangement and/or increase bus bandwidth utilization). Dedicated
26 # squaring procedure should give further performance improvement...
27 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
28 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
29
30 # December 2006
31 #
32 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
33 # Integer-only code [being equipped with dedicated squaring procedure]
34 # gives ~40% on rsa512 sign benchmark...
35
36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37 push(@INC,"${dir}","${dir}../../perlasm");
38 require "x86asm.pl";
39
40 $output = pop;                          # last command-line argument names the output file
41 open STDOUT,">",$output or die "can't open $output: $!";       # 3-arg open; abort instead of silently losing the generated code
42
43 &asm_init($ARGV[0]);                    # first remaining argument is handed to asm_init (from x86asm.pl)
44
45 $sse2=0;                                # SSE2 code is emitted only when -DOPENSSL_IA32_SSE2 is among the arguments
46 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
47
48 &external_label("OPENSSL_ia32cap_P") if ($sse2);
49
50 &function_begin("bn_mul_mont");
51 # Symbolic names for the general-purpose registers used by the code below.
52 $i="edx";                               # loop counter i
53 $j="ecx";                               # loop counter j
54 $ap="esi";      $tp="esi";              # overlapping variables!!!
55 $rp="edi";      $bp="edi";              # overlapping variables!!!
56 $np="ebp";                              # modulus pointer
57 $num="ebx";                             # set to num-1 once the arguments have been loaded
58
59 $_num=&DWP(4*0,"esp");                  # stack top layout
60 $_rp=&DWP(4*1,"esp");                   # saved rp argument
61 $_ap=&DWP(4*2,"esp");                   # saved ap argument
62 $_bp=&DWP(4*3,"esp");                   # saved bp argument; the integer paths reuse it as their &bp[i]/i cursor
63 $_np=&DWP(4*4,"esp");                   # saved np argument
64 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");  # n0[0]; $_n0q addresses the same slot as a qword operand
65 $_sp=&DWP(4*6,"esp");                   # caller's stack pointer
66 $_bpend=&DWP(4*7,"esp");                # &bp[num], where the integer multiplication path stops
67 $frame=32;                              # size of above frame rounded up to 16n
68
69         &xor    ("eax","eax");          # eax=0 is what remains in eax if we leave right below
70         &mov    ("edi",&wparam(5));     # int num
71         &cmp    ("edi",4);
72         &jl     (&label("just_leave")); # num<4: jump straight to the exit
73
74         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
75         &lea    ("edx",&wparam(1));     # load ap -- NOTE(review): lea yields the address of the ap argument slot, not the ap value; confirm intent
76         &add    ("edi",2);              # extra two words on top of tp
77         &neg    ("edi");                # edi=-(num+2) so that the lea below subtracts
78         &lea    ("ebp",&DWP(-$frame,"esp","edi",4));    # future alloca($frame+4*(num+2))
79         &neg    ("edi");                # back to edi=num+2
80
81         # minimize cache contention by arranging 2K window between stack
82         # pointer and ap argument [np is also position sensitive vector,
83         # but it's assumed to be near ap, as it's allocated at ~same
84         # time].
85         &mov    ("eax","ebp");
86         &sub    ("eax","edx");
87         &and    ("eax",2047);
88         &sub    ("ebp","eax");          # this aligns sp and ap modulo 2048
89
90         &xor    ("edx","ebp");
91         &and    ("edx",2048);
92         &xor    ("edx",2048);
93         &sub    ("ebp","edx");          # this splits them apart modulo 4096
94
95         &and    ("ebp",-64);            # align to cache line
96
97         # An OS-agnostic version of __chkstk.
98         #
99         # Some OSes (Windows) insist on stack being "wired" to
100         # physical memory in strictly sequential manner, i.e. if stack
101         # allocation spans two pages, then reference to farmost one can
102         # be punishable by SEGV. But page walking can do good even on
103         # other OSes, because it guarantees that villain thread hits
104         # the guard page before it can make damage to innocent one...
105         &mov    ("eax","esp");
106         &sub    ("eax","ebp");
107         &and    ("eax",-4096);          # distance to the target, rounded down to a 4096 multiple
108         &mov    ("edx","esp");          # saved stack pointer!
109         &lea    ("esp",&DWP(0,"ebp","eax"));
110         &mov    ("eax",&DWP(0,"esp"));  # touch the word at the new stack top
111         &cmp    ("esp","ebp");
112         &ja     (&label("page_walk"));  # still above the target: step down 4096 bytes at a time
113         &jmp    (&label("page_walk_done"));
114
115 &set_label("page_walk",16);
116         &lea    ("esp",&DWP(-4096,"esp"));
117         &mov    ("eax",&DWP(0,"esp"));  # touch each step on the way down
118         &cmp    ("esp","ebp");
119         &ja     (&label("page_walk"));
120 &set_label("page_walk_done");
121
122         ################################# load argument block...
123         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
124         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
125         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
126         &mov    ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
127         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
128         #&mov   ("edi",&DWP(5*4,"esi"));# int num
129
130         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
131         &mov    ($_rp,"eax");           # ... save a copy of argument block
132         &mov    ($_ap,"ebx");
133         &mov    ($_bp,"ecx");
134         &mov    ($_np,"ebp");
135         &mov    ($_n0,"esi");           # the value n0[0], not the pointer
136         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling (edi holds num+2 at this point)
137         #&mov   ($_num,$num);           # redundant as $num is not reused
138         &mov    ($_sp,"edx");           # saved stack pointer!
139 \f
140 if($sse2) {     # this whole path is generated only when SSE2 support was requested on the command line
141 $acc0="mm0";    # mmx register bank layout
142 $acc1="mm1";
143 $car0="mm2";
144 $car1="mm3";
145 $mul0="mm4";
146 $mul1="mm5";
147 $temp="mm6";
148 $mask="mm7";
149
150         &picmeup("eax","OPENSSL_ia32cap_P");
151         &bt     (&DWP(0,"eax"),26);     # run-time check: bit 26 of the first capability word
152         &jnc    (&label("non_sse2"));   # bit clear: fall back to the code after non_sse2
153
154         &mov    ("eax",-1);
155         &movd   ($mask,"eax");          # mask 32 lower bits
156
157         &mov    ($ap,$_ap);             # load input pointers
158         &mov    ($bp,$_bp);
159         &mov    ($np,$_np);
160
161         &xor    ($i,$i);                # i=0
162         &xor    ($j,$j);                # j=0
163
164         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
165         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
166         &movd   ($car1,&DWP(0,$np));            # np[0]
167
168         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
169         &movq   ($car0,$mul1);
170         &movq   ($acc0,$mul1);                  # I wish movd worked for
171         &pand   ($acc0,$mask);                  # inter-register transfers
172
173         &pmuludq($mul1,$_n0q);                  # *=n0
174
175         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
176         &paddq  ($car1,$acc0);
177
178         &movd   ($acc1,&DWP(4,$np));            # np[1]
179         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
180
181         &psrlq  ($car0,32);
182         &psrlq  ($car1,32);
183
184         &inc    ($j);                           # j++
185 &set_label("1st",16);
186         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
187         &pmuludq($acc1,$mul1);                  # np[j]*m1
188         &paddq  ($car0,$acc0);                  # +=c0
189         &paddq  ($car1,$acc1);                  # +=c1
190
191         &movq   ($acc0,$car0);
192         &pand   ($acc0,$mask);
193         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
194         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
195         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
196         &psrlq  ($car0,32);
197         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
198         &psrlq  ($car1,32);
199
200         &lea    ($j,&DWP(1,$j));
201         &cmp    ($j,$num);
202         &jl     (&label("1st"));
203
204         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
205         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
206         &paddq  ($car0,$acc0);                  # +=c0
207         &paddq  ($car1,$acc1);                  # +=c1
208
209         &movq   ($acc0,$car0);
210         &pand   ($acc0,$mask);
211         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
212         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
213
214         &psrlq  ($car0,32);
215         &psrlq  ($car1,32);
216
217         &paddq  ($car1,$car0);
218         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
219 \f
220         &inc    ($i);                           # i++
221 &set_label("outer");
222         &xor    ($j,$j);                        # j=0
223
224         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
225         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
226         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
227         &movd   ($car1,&DWP(0,$np));            # np[0]
228         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
229
230         &paddq  ($mul1,$temp);                  # +=tp[0]
231         &movq   ($acc0,$mul1);
232         &movq   ($car0,$mul1);
233         &pand   ($acc0,$mask);
234
235         &pmuludq($mul1,$_n0q);                  # *=n0
236
237         &pmuludq($car1,$mul1);
238         &paddq  ($car1,$acc0);
239
240         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
241         &movd   ($acc1,&DWP(4,$np));            # np[1]
242         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
243
244         &psrlq  ($car0,32);
245         &psrlq  ($car1,32);
246         &paddq  ($car0,$temp);                  # +=tp[1]
247
248         &inc    ($j);                           # j++
249         &dec    ($num);                         # $num doubles as the inner loop's down-counter
250 &set_label("inner");
251         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
252         &pmuludq($acc1,$mul1);                  # np[j]*m1
253         &paddq  ($car0,$acc0);                  # +=c0
254         &paddq  ($car1,$acc1);                  # +=c1
255
256         &movq   ($acc0,$car0);
257         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
258         &pand   ($acc0,$mask);
259         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
260         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
261         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
262         &psrlq  ($car0,32);
263         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
264         &psrlq  ($car1,32);
265         &paddq  ($car0,$temp);                  # +=tp[j+1]
266
267         &dec    ($num);                         # sets ZF for the jnz below
268         &lea    ($j,&DWP(1,$j));                # j++
269         &jnz    (&label("inner"));
270
271         &mov    ($num,$j);                      # $num was counted down to zero above; reload it from j
272         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
273         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
274         &paddq  ($car0,$acc0);                  # +=c0
275         &paddq  ($car1,$acc1);                  # +=c1
276
277         &movq   ($acc0,$car0);
278         &pand   ($acc0,$mask);
279         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
280         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
281         &psrlq  ($car0,32);
282         &psrlq  ($car1,32);
283
284         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
285         &paddq  ($car1,$car0);
286         &paddq  ($car1,$temp);
287         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
288
289         &lea    ($i,&DWP(1,$i));                # i++
290         &cmp    ($i,$num);
291         &jle    (&label("outer"));
292
293         &emms   ();                             # done with mmx bank
294         &jmp    (&label("common_tail"));
295
296 &set_label("non_sse2",16);
297 }
298 \f
299 if (0) {        # disabled at generation time: this branch would restore esp and leave with eax=0
300         &mov    ("esp",$_sp);
301         &xor    ("eax","eax");  # signal "not fast enough [yet]"
302         &jmp    (&label("just_leave"));
303         # While the below code provides competitive performance for
304         # all key lengths on modern Intel cores, it's still more
305         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
306         # means compared to the original integer-only assembler.
307         # 512-bit RSA sign is better by ~40%, but that's about all
308         # one can say about all CPUs...
309 } else {
310 $inp="esi";     # integer path uses these registers differently
311 $word="edi";
312 $carry="ebp";
313
314         &mov    ($inp,$_ap);
315         &lea    ($carry,&DWP(1,$num));
316         &mov    ($word,$_bp);
317         &xor    ($j,$j);                                # j=0
318         &mov    ("edx",$inp);
319         &and    ($carry,1);                             # see if num is even
320         &sub    ("edx",$word);                          # see if ap==bp
321         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
322         &or     ($carry,"edx");
323         &mov    ($word,&DWP(0,$word));                  # bp[0]
324         &jz     (&label("bn_sqr_mont"));                # flags from the or above: ap==bp and num even
325         &mov    ($_bpend,"eax");
326         &mov    ("eax",&DWP(0,$inp));
327         &xor    ("edx","edx");
328
329 &set_label("mull",16);
330         &mov    ($carry,"edx");
331         &mul    ($word);                                # ap[j]*bp[0]
332         &add    ($carry,"eax");
333         &lea    ($j,&DWP(1,$j));
334         &adc    ("edx",0);
335         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
336         &cmp    ($j,$num);
337         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
338         &jl     (&label("mull"));
339
340         &mov    ($carry,"edx");
341         &mul    ($word);                                # ap[num-1]*bp[0]
342          &mov   ($word,$_n0);
343         &add    ("eax",$carry);
344          &mov   ($inp,$_np);
345         &adc    ("edx",0);
346          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
347
348         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
349         &xor    ($j,$j);
350         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
351         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
352
353         &mov    ("eax",&DWP(0,$inp));                   # np[0]
354         &mul    ($word);                                # np[0]*m
355         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0] (only the carry is kept; eax is reloaded next)
356         &mov    ("eax",&DWP(4,$inp));                   # np[1]
357         &adc    ("edx",0);
358         &inc    ($j);
359
360         &jmp    (&label("2ndmadd"));
361 \f\f
362 &set_label("1stmadd",16);
363         &mov    ($carry,"edx");
364         &mul    ($word);                                # ap[j]*bp[i]
365         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
366         &lea    ($j,&DWP(1,$j));
367         &adc    ("edx",0);
368         &add    ($carry,"eax");
369         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
370         &adc    ("edx",0);
371         &cmp    ($j,$num);
372         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
373         &jl     (&label("1stmadd"));
374
375         &mov    ($carry,"edx");
376         &mul    ($word);                                # ap[num-1]*bp[i]
377         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
378          &mov   ($word,$_n0);
379         &adc    ("edx",0);
380          &mov   ($inp,$_np);
381         &add    ($carry,"eax");
382         &adc    ("edx",0);
383          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
384
385         &xor    ($j,$j);
386         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
387         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
388         &adc    ($j,0);
389          &mov   ("eax",&DWP(0,$inp));                   # np[0]
390         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
391         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
392
393         &mul    ($word);                                # np[0]*m
394         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
395         &mov    ("eax",&DWP(4,$inp));                   # np[1]
396         &adc    ("edx",0);
397         &mov    ($j,1);
398 \f
399 &set_label("2ndmadd",16);
400         &mov    ($carry,"edx");
401         &mul    ($word);                                # np[j]*m
402         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
403         &lea    ($j,&DWP(1,$j));
404         &adc    ("edx",0);
405         &add    ($carry,"eax");
406         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
407         &adc    ("edx",0);
408         &cmp    ($j,$num);
409         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
410         &jl     (&label("2ndmadd"));
411
412         &mov    ($carry,"edx");
413         &mul    ($word);                                # np[j]*m
414         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
415         &adc    ("edx",0);
416         &add    ($carry,"eax");
417         &adc    ("edx",0);
418         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
419
420         &xor    ("eax","eax");
421          &mov   ($j,$_bp);                              # &bp[i]
422         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
423         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
424          &lea   ($j,&DWP(4,$j));
425         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
426          &cmp   ($j,$_bpend);
427         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
428         &je     (&label("common_tail"));                # flags from the cmp above: all words of bp consumed
429
430         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
431         &mov    ($inp,$_ap);
432         &mov    ($_bp,$j);                              # &bp[++i]
433         &xor    ($j,$j);
434         &xor    ("edx","edx");
435         &mov    ("eax",&DWP(0,$inp));
436         &jmp    (&label("1stmadd"));
437 \f
438 &set_label("bn_sqr_mont",16);
439 $sbit=$num;     # $sbit shares its register with $num, so num is kept in $_num
440         &mov    ($_num,$num);
441         &mov    ($_bp,$j);                              # i=0
442
443         &mov    ("eax",$word);                          # ap[0]
444         &mul    ($word);                                # ap[0]*ap[0]
445         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
446         &mov    ($sbit,"edx");
447         &shr    ("edx",1);
448         &and    ($sbit,1);
449         &inc    ($j);
450 &set_label("sqr",16);
451         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
452         &mov    ($carry,"edx");
453         &mul    ($word);                                # ap[j]*ap[0]
454         &add    ("eax",$carry);
455         &lea    ($j,&DWP(1,$j));
456         &adc    ("edx",0);
457         &lea    ($carry,&DWP(0,$sbit,"eax",2));
458         &shr    ("eax",31);
459         &cmp    ($j,$_num);
460         &mov    ($sbit,"eax");
461         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
462         &jl     (&label("sqr"));
463
464         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
465         &mov    ($carry,"edx");
466         &mul    ($word);                                # ap[num-1]*ap[0]
467         &add    ("eax",$carry);
468          &mov   ($word,$_n0);
469         &adc    ("edx",0);
470          &mov   ($inp,$_np);
471         &lea    ($carry,&DWP(0,$sbit,"eax",2));
472          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
473         &shr    ("eax",31);
474         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
475
476         &lea    ($carry,&DWP(0,"eax","edx",2));
477          &mov   ("eax",&DWP(0,$inp));                   # np[0]
478         &shr    ("edx",31);
479         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
480         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
481
482         &mul    ($word);                                # np[0]*m
483         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
484         &mov    ($num,$j);
485         &adc    ("edx",0);
486         &mov    ("eax",&DWP(4,$inp));                   # np[1]
487         &mov    ($j,1);
488 \f\f
489 &set_label("3rdmadd",16);
490         &mov    ($carry,"edx");
491         &mul    ($word);                                # np[j]*m
492         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
493         &adc    ("edx",0);
494         &add    ($carry,"eax");
495         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
496         &adc    ("edx",0);
497         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
498
499         &mov    ($carry,"edx");
500         &mul    ($word);                                # np[j+1]*m
501         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
502         &lea    ($j,&DWP(2,$j));
503         &adc    ("edx",0);
504         &add    ($carry,"eax");
505         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
506         &adc    ("edx",0);
507         &cmp    ($j,$num);
508         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
509         &jl     (&label("3rdmadd"));
510
511         &mov    ($carry,"edx");
512         &mul    ($word);                                # np[j]*m
513         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
514         &adc    ("edx",0);
515         &add    ($carry,"eax");
516         &adc    ("edx",0);
517         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
518
519         &mov    ($j,$_bp);                              # i
520         &xor    ("eax","eax");
521         &mov    ($inp,$_ap);
522         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
523         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
524         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
525         &cmp    ($j,$num);
526         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
527         &je     (&label("common_tail"));                # flags from the cmp above: i has reached $num
528 \f
529         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
530         &lea    ($j,&DWP(1,$j));
531         &mov    ("eax",$word);
532         &mov    ($_bp,$j);                              # ++i
533         &mul    ($word);                                # ap[i]*ap[i]
534         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
535         &adc    ("edx",0);
536         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
537         &xor    ($carry,$carry);
538         &cmp    ($j,$num);
539         &lea    ($j,&DWP(1,$j));
540         &je     (&label("sqrlast"));
541
542         &mov    ($sbit,"edx");                          # zaps $num
543         &shr    ("edx",1);
544         &and    ($sbit,1);
545 &set_label("sqradd",16);
546         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
547         &mov    ($carry,"edx");
548         &mul    ($word);                                # ap[j]*ap[i]
549         &add    ("eax",$carry);
550         &lea    ($carry,&DWP(0,"eax","eax"));
551         &adc    ("edx",0);
552         &shr    ("eax",31);
553         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
554         &lea    ($j,&DWP(1,$j));
555         &adc    ("eax",0);
556         &add    ($carry,$sbit);
557         &adc    ("eax",0);
558         &cmp    ($j,$_num);
559         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
560         &mov    ($sbit,"eax");
561         &jle    (&label("sqradd"));
562
563         &mov    ($carry,"edx");
564         &add    ("edx","edx");
565         &shr    ($carry,31);
566         &add    ("edx",$sbit);
567         &adc    ($carry,0);
568 &set_label("sqrlast");
569         &mov    ($word,$_n0);
570         &mov    ($inp,$_np);
571         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
572
573         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
574         &mov    ("eax",&DWP(0,$inp));                   # np[0]
575         &adc    ($carry,0);
576         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
577         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
578
579         &mul    ($word);                                # np[0]*m
580         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
581         &lea    ($num,&DWP(-1,$j));                     # rebuild $num from j after $sbit reused its register
582         &adc    ("edx",0);
583         &mov    ($j,1);
584         &mov    ("eax",&DWP(4,$inp));                   # np[1]
585
586         &jmp    (&label("3rdmadd"));
587 }
588 \f
589 &set_label("common_tail",16);           # reached from every multiplication/squaring path above
590         &mov    ($np,$_np);                     # load modulus pointer
591         &mov    ($rp,$_rp);                     # load result pointer
592         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
593
594         &mov    ("eax",&DWP(0,$tp));            # tp[0]
595         &mov    ($j,$num);                      # j=num-1
596         &xor    ($i,$i);                        # i=0 and clear CF!
597
598 &set_label("sub",16);
599         &sbb    ("eax",&DWP(0,$np,$i,4));
600         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
601         &dec    ($j);                           # doesn't affect CF!
602         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
603         &lea    ($i,&DWP(1,$i));                # i++
604         &jge    (&label("sub"));                # flags still come from the dec above
605
606         &sbb    ("eax",0);                      # handle upmost overflow bit
607         &mov    ("edx",-1);
608         &xor    ("edx","eax");                  # edx=~eax, the complementary mask for the copy loop
609         &jmp    (&label("copy"));               # NOTE(review): the target is the label right below; looks redundant -- confirm
610
611 &set_label("copy",16);                          # conditional copy
612         &mov    ($tp,&DWP($frame,"esp",$num,4));        # $tp and $np now serve as scratch data registers
613         &mov    ($np,&DWP(0,$rp,$num,4));
614         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
615         &and    ($tp,"eax");                    # keep the tp[] word under mask eax
616         &and    ($np,"edx");                    # keep the rp[] word under mask edx
617         &or     ($np,$tp);
618         &mov    (&DWP(0,$rp,$num,4),$np);
619         &dec    ($num);
620         &jge    (&label("copy"));
621
622         &mov    ("esp",$_sp);           # pull saved stack pointer
623         &mov    ("eax",1);              # eax=1 on the completed path (the early exit leaves 0)
624 &set_label("just_leave");
625 &function_end("bn_mul_mont");
626
627 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
628
629 &asm_finish();
630
631 close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface at close; do not leave a truncated file unnoticed