Update copyright year
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # October 2005
18 #
19 # This is "teaser" code, as it can be improved in several ways...
20 # First of all, a non-SSE2 path should be implemented (yes, for now it
21 # performs Montgomery multiplication/convolution only on SSE2-capable
22 # CPUs such as P4; others fall back to the original code). Then the inner
23 # loop can be unrolled and modulo-scheduled to improve ILP and possibly
24 # moved to the 128-bit XMM register bank (though it would require input
25 # rearrangement and/or increase bus bandwidth utilization). A dedicated
26 # squaring procedure should give further performance improvement...
27 # Yet, for a draft, the code improves the rsa512 *sign* benchmark by
28 # 110%(!), the rsa1024 one by 70% and the rsa4096 one by 20%:-)
29
30 # December 2006
31 #
32 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
33 # Integer-only code [being equipped with dedicated squaring procedure]
34 # gives ~40% on rsa512 sign benchmark...
35
# Locate the perlasm helpers relative to this script's own path and pull
# in x86asm.pl, which supplies the &mov/&lea/&DWP/... generator calls
# used throughout the rest of the file.
36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37 push(@INC,"${dir}","${dir}../../perlasm");
38 require "x86asm.pl";
39
# The last command-line argument, if any, names the output file and
# STDOUT is redirected to it.  Three-argument open takes the name
# literally, and a failure is reported right here instead of surfacing
# only at the final "close STDOUT".
40 $output = pop and (open(STDOUT,">",$output) or die "can't open $output: $!");
41
42 &asm_init($ARGV[0]);                    # first remaining argument is handed to asm_init
43
# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the remaining arguments.
44 $sse2=0;
45 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
46
47 &external_label("OPENSSL_ia32cap_P") if ($sse2);
48
# Start of bn_mul_mont.  The argument slots read below are, in order,
# rp, ap, bp, np, n0 and num (wparam(0) through wparam(5)).  This part
# rejects num<4, carves a scratch frame plus num+2 words of tp below the
# current stack pointer, touches every page on the way down, and copies
# the arguments into the new frame.
49 &function_begin("bn_mul_mont");
50
51 $i="edx";                               # loop index registers
52 $j="ecx";
53 $ap="esi";      $tp="esi";              # overlapping variables!!!
54 $rp="edi";      $bp="edi";              # overlapping variables!!!
55 $np="ebp";
56 $num="ebx";
57
58 $_num=&DWP(4*0,"esp");                  # stack top layout
59 $_rp=&DWP(4*1,"esp");
60 $_ap=&DWP(4*2,"esp");
61 $_bp=&DWP(4*3,"esp");
62 $_np=&DWP(4*4,"esp");
63 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");  # same offset, dword and qword views
64 $_sp=&DWP(4*6,"esp");                   # stack pointer as it was before the frame was carved
65 $_bpend=&DWP(4*7,"esp");                # &bp[num], end marker used by the integer path
66 $frame=32;                              # size of above frame rounded up to 16n
67
68         &xor    ("eax","eax");          # eax=0 is what the early exit leaves behind
69         &mov    ("edi",&wparam(5));     # int num
70         &cmp    ("edi",4);
71         &jl     (&label("just_leave")); # num<4: leave with eax=0
72
73         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
            # NOTE(review): lea yields the address of the ap argument slot,
            # not the ap pointer stored in it -- confirm this is intended
            # for the alignment arithmetic below.
74         &lea    ("edx",&wparam(1));     # "load ap" (see note above)
75         &add    ("edi",2);              # extra two words on top of tp
76         &neg    ("edi");
77         &lea    ("ebp",&DWP(-$frame,"esp","edi",4));    # future alloca($frame+4*(num+2))
78         &neg    ("edi");                # edi is num+2 again
79
80         # minimize cache contention by arranging 2K window between stack
81         # pointer and ap argument [np is also position sensitive vector,
82         # but it's assumed to be near ap, as it's allocated at ~same
83         # time].
84         &mov    ("eax","ebp");
85         &sub    ("eax","edx");
86         &and    ("eax",2047);
87         &sub    ("ebp","eax");          # this aligns sp and ap modulo 2048
88
89         &xor    ("edx","ebp");
90         &and    ("edx",2048);
91         &xor    ("edx",2048);
92         &sub    ("ebp","edx");          # this splits them apart modulo 4096
93
94         &and    ("ebp",-64);            # align to cache line
95
96         # An OS-agnostic version of __chkstk.
97         #
98         # Some OSes (Windows) insist on the stack being "wired" to
99         # physical memory in a strictly sequential manner, i.e. if a stack
100         # allocation spans two pages, then a reference to the farthest one
101         # can be punished by SEGV. But page walking can do good even on
102         # other OSes, because it guarantees that a villain thread hits
103         # the guard page before it can do damage to an innocent one...
104         &mov    ("eax","esp");
105         &sub    ("eax","ebp");          # distance from current esp down to the new frame
106         &and    ("eax",-4096);          # ...rounded down to a multiple of 4096
107         &mov    ("edx","esp");          # saved stack pointer!
108         &lea    ("esp",&DWP(0,"ebp","eax"));    # first probe address
109         &mov    ("eax",&DWP(0,"esp"));  # touch it
110         &cmp    ("esp","ebp");
111         &ja     (&label("page_walk"));  # still above the target: keep walking
112         &jmp    (&label("page_walk_done"));
113
114 &set_label("page_walk",16);
115         &lea    ("esp",&DWP(-4096,"esp"));      # step down 4096 bytes
116         &mov    ("eax",&DWP(0,"esp"));  # touch it
117         &cmp    ("esp","ebp");
118         &ja     (&label("page_walk"));
119 &set_label("page_walk_done");
120
121         ################################# load argument block...
122         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
123         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
124         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
125         &mov    ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
126         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
127         #&mov   ("edi",&DWP(5*4,"esi"));# int num
128
129         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
130         &mov    ($_rp,"eax");           # ... save a copy of argument block
131         &mov    ($_ap,"ebx");
132         &mov    ($_bp,"ecx");
133         &mov    ($_np,"ebp");
134         &mov    ($_n0,"esi");
135         &lea    ($num,&DWP(-3,"edi"));  # edi holds num+2, so this is num=num-1 to assist modulo-scheduling
136         #&mov   ($_num,$num);           # redundant as $num is not reused
137         &mov    ($_sp,"edx");           # saved stack pointer!
138 \f
# SSE2 path.  Generated only when $sse2 was requested on the command
# line; at run time it is bypassed (jump to non_sse2) when bit 26 of the
# first OPENSSL_ia32cap_P word is clear.  The first pass ("1st") builds
# tp from ap[]*bp[0], each "outer" pass adds ap[]*bp[i], and every pass
# folds in np[]*m1 while storing the result one word lower in tp.
139 if($sse2) {
140 $acc0="mm0";    # mmx register bank layout
141 $acc1="mm1";
142 $car0="mm2";
143 $car1="mm3";
144 $mul0="mm4";
145 $mul1="mm5";
146 $temp="mm6";
147 $mask="mm7";
148
149         &picmeup("eax","OPENSSL_ia32cap_P");
150         &bt     (&DWP(0,"eax"),26);
151         &jnc    (&label("non_sse2"));  # bit clear: fall back to the code after non_sse2
152
153         &mov    ("eax",-1);
154         &movd   ($mask,"eax");          # mask 32 lower bits
155
156         &mov    ($ap,$_ap);             # load input pointers
157         &mov    ($bp,$_bp);
158         &mov    ($np,$_np);
159
160         &xor    ($i,$i);                # i=0
161         &xor    ($j,$j);                # j=0
162
163         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
164         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
165         &movd   ($car1,&DWP(0,$np));            # np[0]
166
167         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
168         &movq   ($car0,$mul1);
169         &movq   ($acc0,$mul1);                  # I wish movd worked for
170         &pand   ($acc0,$mask);                  # inter-register transfers
171
172         &pmuludq($mul1,$_n0q);                  # *=n0
173
174         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
175         &paddq  ($car1,$acc0);
176
177         &movd   ($acc1,&DWP(4,$np));            # np[1]
178         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
179
180         &psrlq  ($car0,32);                     # keep only the upper halves as carries
181         &psrlq  ($car1,32);
182
183         &inc    ($j);                           # j++
184 &set_label("1st",16);
185         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
186         &pmuludq($acc1,$mul1);                  # np[j]*m1
187         &paddq  ($car0,$acc0);                  # +=c0
188         &paddq  ($car1,$acc1);                  # +=c1
189
190         &movq   ($acc0,$car0);
191         &pand   ($acc0,$mask);
192         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
193         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
194         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
195         &psrlq  ($car0,32);
196         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
197         &psrlq  ($car1,32);
198
199         &lea    ($j,&DWP(1,$j));
200         &cmp    ($j,$num);
201         &jl     (&label("1st"));
202
203         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
204         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
205         &paddq  ($car0,$acc0);                  # +=c0
206         &paddq  ($car1,$acc1);                  # +=c1
207
208         &movq   ($acc0,$car0);
209         &pand   ($acc0,$mask);
210         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
211         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
212
213         &psrlq  ($car0,32);
214         &psrlq  ($car1,32);
215
216         &paddq  ($car1,$car0);
217         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
218 \f
219         &inc    ($i);                           # i++
220 &set_label("outer");
221         &xor    ($j,$j);                        # j=0
222
223         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
224         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
225         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
226         &movd   ($car1,&DWP(0,$np));            # np[0]
227         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
228
229         &paddq  ($mul1,$temp);                  # +=tp[0]
230         &movq   ($acc0,$mul1);
231         &movq   ($car0,$mul1);
232         &pand   ($acc0,$mask);
233
234         &pmuludq($mul1,$_n0q);                  # *=n0
235
236         &pmuludq($car1,$mul1);                  # np[0]*m1
237         &paddq  ($car1,$acc0);
238
239         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
240         &movd   ($acc1,&DWP(4,$np));            # np[1]
241         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
242
243         &psrlq  ($car0,32);
244         &psrlq  ($car1,32);
245         &paddq  ($car0,$temp);                  # +=tp[1]
246
247         &inc    ($j);                           # j++
248         &dec    ($num);                         # num doubles as the inner loop's down-counter
249 &set_label("inner");
250         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
251         &pmuludq($acc1,$mul1);                  # np[j]*m1
252         &paddq  ($car0,$acc0);                  # +=c0
253         &paddq  ($car1,$acc1);                  # +=c1
254
255         &movq   ($acc0,$car0);
256         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
257         &pand   ($acc0,$mask);
258         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
259         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
260         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
261         &psrlq  ($car0,32);
262         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
263         &psrlq  ($car1,32);
264         &paddq  ($car0,$temp);                  # +=tp[j+1]
265
266         &dec    ($num);
267         &lea    ($j,&DWP(1,$j));                # j++
268         &jnz    (&label("inner"));              # loops until the dec above reaches zero
269
270         &mov    ($num,$j);                      # counter ran down to 0; j now holds the value num had
271         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
272         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
273         &paddq  ($car0,$acc0);                  # +=c0
274         &paddq  ($car1,$acc1);                  # +=c1
275
276         &movq   ($acc0,$car0);
277         &pand   ($acc0,$mask);
278         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
279         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
280         &psrlq  ($car0,32);
281         &psrlq  ($car1,32);
282
283         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
284         &paddq  ($car1,$car0);
285         &paddq  ($car1,$temp);
286         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
287
288         &lea    ($i,&DWP(1,$i));                # i++
289         &cmp    ($i,$num);
290         &jle    (&label("outer"));              # repeat while i<=num (the register holds num-1)
291
292         &emms   ();                             # done with mmx bank
293         &jmp    (&label("common_tail"));
294
295 &set_label("non_sse2",16);
296 }
297 \f
# Integer-only path.  The "if (0)" arm is a disabled alternative that
# would simply bail out with eax=0; the else arm is what is generated.
# It uses the dedicated squaring code (bn_sqr_mont) when num is even and
# ap==bp, and the general multiply (mull/1stmadd/2ndmadd) otherwise.
# Both variants finish by jumping to common_tail.
298 if (0) {
299         &mov    ("esp",$_sp);
300         &xor    ("eax","eax");  # signal "not fast enough [yet]"
301         &jmp    (&label("just_leave"));
302         # While the below code provides competitive performance for
303         # all key lengths on modern Intel cores, it's still more
304         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
305         # means compared to the original integer-only assembler.
306         # 512-bit RSA sign is better by ~40%, but that's about all
307         # one can say about all CPUs...
308 } else {
309 $inp="esi";     # integer path uses these registers differently
310 $word="edi";
311 $carry="ebp";
312
313         &mov    ($inp,$_ap);
314         &lea    ($carry,&DWP(1,$num));                  # register num+1, i.e. the original num
315         &mov    ($word,$_bp);
316         &xor    ($j,$j);                                # j=0
317         &mov    ("edx",$inp);
318         &and    ($carry,1);                             # see if num is even
319         &sub    ("edx",$word);                          # see if ap==bp
320         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
321         &or     ($carry,"edx");                         # zero only if num is even and ap==bp
322         &mov    ($word,&DWP(0,$word));                  # bp[0]
323         &jz     (&label("bn_sqr_mont"));                # acts on the flags of the "or" above
324         &mov    ($_bpend,"eax");
325         &mov    ("eax",&DWP(0,$inp));                   # ap[0]
326         &xor    ("edx","edx");
327
328 &set_label("mull",16);
329         &mov    ($carry,"edx");
330         &mul    ($word);                                # ap[j]*bp[0]
331         &add    ($carry,"eax");
332         &lea    ($j,&DWP(1,$j));
333         &adc    ("edx",0);
334         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
335         &cmp    ($j,$num);
336         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
337         &jl     (&label("mull"));
338
339         &mov    ($carry,"edx");
340         &mul    ($word);                                # ap[num-1]*bp[0]
341          &mov   ($word,$_n0);
342         &add    ("eax",$carry);
343          &mov   ($inp,$_np);
344         &adc    ("edx",0);
345          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
346
347         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
348         &xor    ($j,$j);
349         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
350         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
351
352         &mov    ("eax",&DWP(0,$inp));                   # np[0]
353         &mul    ($word);                                # np[0]*m
354         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
355         &mov    ("eax",&DWP(4,$inp));                   # np[1]
356         &adc    ("edx",0);
357         &inc    ($j);
358
359         &jmp    (&label("2ndmadd"));                    # join the reduction loop shared with 1stmadd
360 \f\f
361 &set_label("1stmadd",16);
362         &mov    ($carry,"edx");
363         &mul    ($word);                                # ap[j]*bp[i]
364         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
365         &lea    ($j,&DWP(1,$j));
366         &adc    ("edx",0);
367         &add    ($carry,"eax");
368         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
369         &adc    ("edx",0);
370         &cmp    ($j,$num);
371         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
372         &jl     (&label("1stmadd"));
373
374         &mov    ($carry,"edx");
375         &mul    ($word);                                # ap[num-1]*bp[i]
376         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
377          &mov   ($word,$_n0);
378         &adc    ("edx",0);
379          &mov   ($inp,$_np);
380         &add    ($carry,"eax");
381         &adc    ("edx",0);
382          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
383
384         &xor    ($j,$j);
385         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
386         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
387         &adc    ($j,0);
388          &mov   ("eax",&DWP(0,$inp));                   # np[0]
389         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
390         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
391
392         &mul    ($word);                                # np[0]*m
393         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
394         &mov    ("eax",&DWP(4,$inp));                   # np[1]
395         &adc    ("edx",0);
396         &mov    ($j,1);
397 \f
398 &set_label("2ndmadd",16);
399         &mov    ($carry,"edx");
400         &mul    ($word);                                # np[j]*m
401         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
402         &lea    ($j,&DWP(1,$j));
403         &adc    ("edx",0);
404         &add    ($carry,"eax");
405         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
406         &adc    ("edx",0);
407         &cmp    ($j,$num);
408         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
409         &jl     (&label("2ndmadd"));
410
411         &mov    ($carry,"edx");
412         &mul    ($word);                                # np[j]*m
413         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
414         &adc    ("edx",0);
415         &add    ($carry,"eax");
416         &adc    ("edx",0);
417         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
418
419         &xor    ("eax","eax");
420          &mov   ($j,$_bp);                              # &bp[i]
421         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
422         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
423          &lea   ($j,&DWP(4,$j));                        # advance to the next word of bp
424         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
425          &cmp   ($j,$_bpend);
426         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
427         &je     (&label("common_tail"));                # pointer reached &bp[num]: done
428
429         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
430         &mov    ($inp,$_ap);
431         &mov    ($_bp,$j);                              # &bp[++i]
432         &xor    ($j,$j);
433         &xor    ("edx","edx");
434         &mov    ("eax",&DWP(0,$inp));                   # ap[0]
435         &jmp    (&label("1stmadd"));
436 \f
437 &set_label("bn_sqr_mont",16);
438 $sbit=$num;                                             # sbit reuses num's register; num is parked in $_num
439         &mov    ($_num,$num);
440         &mov    ($_bp,$j);                              # i=0
441
442         &mov    ("eax",$word);                          # ap[0]
443         &mul    ($word);                                # ap[0]*ap[0]
444         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
445         &mov    ($sbit,"edx");
446         &shr    ("edx",1);
447         &and    ($sbit,1);
448         &inc    ($j);
449 &set_label("sqr",16);
450         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
451         &mov    ($carry,"edx");
452         &mul    ($word);                                # ap[j]*ap[0]
453         &add    ("eax",$carry);
454         &lea    ($j,&DWP(1,$j));
455         &adc    ("edx",0);
456         &lea    ($carry,&DWP(0,$sbit,"eax",2));         # 2*eax plus the bit carried in sbit
457         &shr    ("eax",31);                             # top bit of eax becomes the next sbit
458         &cmp    ($j,$_num);
459         &mov    ($sbit,"eax");
460         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
461         &jl     (&label("sqr"));
462
463         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
464         &mov    ($carry,"edx");
465         &mul    ($word);                                # ap[num-1]*ap[0]
466         &add    ("eax",$carry);
467          &mov   ($word,$_n0);
468         &adc    ("edx",0);
469          &mov   ($inp,$_np);
470         &lea    ($carry,&DWP(0,$sbit,"eax",2));
471          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
472         &shr    ("eax",31);
473         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
474
475         &lea    ($carry,&DWP(0,"eax","edx",2));
476          &mov   ("eax",&DWP(0,$inp));                   # np[0]
477         &shr    ("edx",31);
478         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
479         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
480
481         &mul    ($word);                                # np[0]*m
482         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
483         &mov    ($num,$j);                              # register was used as sbit; reload num from j
484         &adc    ("edx",0);
485         &mov    ("eax",&DWP(4,$inp));                   # np[1]
486         &mov    ($j,1);
487 \f\f
488 &set_label("3rdmadd",16);                               # reduction loop, two words per iteration
489         &mov    ($carry,"edx");
490         &mul    ($word);                                # np[j]*m
491         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
492         &adc    ("edx",0);
493         &add    ($carry,"eax");
494         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
495         &adc    ("edx",0);
496         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
497
498         &mov    ($carry,"edx");
499         &mul    ($word);                                # np[j+1]*m
500         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
501         &lea    ($j,&DWP(2,$j));
502         &adc    ("edx",0);
503         &add    ($carry,"eax");
504         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
505         &adc    ("edx",0);
506         &cmp    ($j,$num);
507         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
508         &jl     (&label("3rdmadd"));
509
510         &mov    ($carry,"edx");
511         &mul    ($word);                                # np[j]*m
512         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
513         &adc    ("edx",0);
514         &add    ($carry,"eax");
515         &adc    ("edx",0);
516         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
517
518         &mov    ($j,$_bp);                              # i
519         &xor    ("eax","eax");
520         &mov    ($inp,$_ap);
521         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
522         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
523         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
524         &cmp    ($j,$num);
525         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
526         &je     (&label("common_tail"));                # i reached num: done
527 \f
528         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
529         &lea    ($j,&DWP(1,$j));
530         &mov    ("eax",$word);
531         &mov    ($_bp,$j);                              # ++i
532         &mul    ($word);                                # ap[i]*ap[i]
533         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
534         &adc    ("edx",0);
535         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
536         &xor    ($carry,$carry);
537         &cmp    ($j,$num);
538         &lea    ($j,&DWP(1,$j));
539         &je     (&label("sqrlast"));                    # acts on the cmp above (lea leaves flags alone)
540
541         &mov    ($sbit,"edx");                          # zaps $num
542         &shr    ("edx",1);
543         &and    ($sbit,1);
544 &set_label("sqradd",16);
545         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
546         &mov    ($carry,"edx");
547         &mul    ($word);                                # ap[j]*ap[i]
548         &add    ("eax",$carry);
549         &lea    ($carry,&DWP(0,"eax","eax"));           # 2*eax
550         &adc    ("edx",0);
551         &shr    ("eax",31);
552         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
553         &lea    ($j,&DWP(1,$j));
554         &adc    ("eax",0);
555         &add    ($carry,$sbit);
556         &adc    ("eax",0);
557         &cmp    ($j,$_num);
558         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
559         &mov    ($sbit,"eax");
560         &jle    (&label("sqradd"));
561
562         &mov    ($carry,"edx");
563         &add    ("edx","edx");
564         &shr    ($carry,31);
565         &add    ("edx",$sbit);
566         &adc    ($carry,0);
567 &set_label("sqrlast");
568         &mov    ($word,$_n0);
569         &mov    ($inp,$_np);
570         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
571
572         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
573         &mov    ("eax",&DWP(0,$inp));                   # np[0]
574         &adc    ($carry,0);
575         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
576         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
577
578         &mul    ($word);                                # np[0]*m
579         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
580         &lea    ($num,&DWP(-1,$j));                     # rebuild num from j
581         &adc    ("edx",0);
582         &mov    ($j,1);
583         &mov    ("eax",&DWP(4,$inp));                   # np[1]
584
585         &jmp    (&label("3rdmadd"));
586 }
587 \f
# Shared tail, reached from both the SSE2 and the integer code.  The
# "sub" loop stores tp-np into rp word by word with borrow; the "copy"
# loop then rewrites each rp word as (tp & eax) | (rp & edx), where eax
# comes from the final borrow step and edx is its complement, and
# overwrites the tp word it just read.  Finally the saved stack pointer
# is restored and eax is set to 1.
588 &set_label("common_tail",16);
589         &mov    ($np,$_np);                     # load modulus pointer
590         &mov    ($rp,$_rp);                     # load result pointer
591         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
592
593         &mov    ("eax",&DWP(0,$tp));            # tp[0]
594         &mov    ($j,$num);                      # j=num-1
595         &xor    ($i,$i);                        # i=0 and clear CF!
596
597 &set_label("sub",16);
598         &sbb    ("eax",&DWP(0,$np,$i,4));
599         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
600         &dec    ($j);                           # doesn't affect CF!
601         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
602         &lea    ($i,&DWP(1,$i));                # i++
603         &jge    (&label("sub"));                # acts on the dec above; loops while j>=0
604
605         &sbb    ("eax",0);                      # handle upmost overflow bit
606         &mov    ("edx",-1);
607         &xor    ("edx","eax");                  # edx = ~eax
608         &jmp    (&label("copy"));               # presumably skips padding before the aligned label
609
610 &set_label("copy",16);                          # conditional copy
611         &mov    ($tp,&DWP($frame,"esp",$num,4));        # tp word (the tp/np registers are reused as scratch)
612         &mov    ($np,&DWP(0,$rp,$num,4));               # rp word, i.e. tp-np from the loop above
613         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
614         &and    ($tp,"eax");
615         &and    ($np,"edx");
616         &or     ($np,$tp);
617         &mov    (&DWP(0,$rp,$num,4),$np);       # rp word = masked combination of the two
618         &dec    ($num);
619         &jge    (&label("copy"));               # walk from index num-1 down to 0
620
621         &mov    ("esp",$_sp);           # pull saved stack pointer
622         &mov    ("eax",1);              # normal exit: eax=1 (the num<4 early exit leaves 0)
623 &set_label("just_leave");
624 &function_end("bn_mul_mont");
625
# Append the identification string, let x86asm.pl finish the output,
# and make sure everything written to STDOUT actually got flushed.
626 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
627
628 &asm_finish();
629
630 die "error closing STDOUT: $!" unless close STDOUT;