9994b0bf9652a79b7006d15e8c639063a482344c
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # October 2005
18 #
19 # This is a "teaser" code, as it can be improved in several ways...
20 # First of all non-SSE2 path should be implemented (yes, for now it
21 # performs Montgomery multiplication/convolution only on SSE2-capable
22 # CPUs such as P4, others fall down to original code). Then inner loop
23 # can be unrolled and modulo-scheduled to improve ILP and possibly
24 # moved to 128-bit XMM register bank (though it would require input
25 # rearrangement and/or increase bus bandwidth utilization). Dedicated
26 # squaring procedure should give further performance improvement...
27 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
28 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
29
30 # December 2006
31 #
32 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
33 # Integer-only code [being equipped with dedicated squaring procedure]
34 # gives ~40% on rsa512 sign benchmark...
35
36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;     # directory part of this script's path
37 push(@INC,"${dir}","${dir}../../perlasm");
38 require "x86asm.pl";
39 # The last command-line argument names the output file; all generated text goes to STDOUT.
40 $output = pop;
41 open STDOUT,">$output" or die "can't open $output: $!";        # stop here rather than emit to the wrong handle
42 # Hand the first remaining argument and this script's name to x86asm.pl.
43 &asm_init($ARGV[0],$0);
44 # The SSE2 code path is generated only if -DOPENSSL_IA32_SSE2 is among the arguments.
45 $sse2=0;
46 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
47
48 &external_label("OPENSSL_ia32cap_P") if ($sse2);
49
50 &function_begin("bn_mul_mont");
51 # Emits bn_mul_mont(rp,ap,bp,np,n0,num): eax=0 on the early exit for num<4, eax=1 once the common tail has run.
52 $i="edx";
53 $j="ecx";
54 $ap="esi";      $tp="esi";              # overlapping variables!!!
55 $rp="edi";      $bp="edi";              # overlapping variables!!!
56 $np="ebp";
57 $num="ebx";
58
59 $_num=&DWP(4*0,"esp");                  # stack top layout
60 $_rp=&DWP(4*1,"esp");
61 $_ap=&DWP(4*2,"esp");
62 $_bp=&DWP(4*3,"esp");
63 $_np=&DWP(4*4,"esp");
64 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
65 $_sp=&DWP(4*6,"esp");
66 $_bpend=&DWP(4*7,"esp");
67 $frame=32;                              # size of above frame rounded up to 16n
68
69         &xor    ("eax","eax");          # return value 0 in case we leave early
70         &mov    ("edi",&wparam(5));     # int num
71         &cmp    ("edi",4);
72         &jl     (&label("just_leave")); # num<4: leave with eax=0
73
74         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
75         &lea    ("edx",&wparam(1));     # address of the ap argument slot; NOTE(review): lea, so not the ap pointer value itself - confirm intent
76         &mov    ("ebp","esp");          # saved stack pointer!
77         &add    ("edi",2);              # extra two words on top of tp
78         &neg    ("edi");
79         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
80         &neg    ("edi");                # back to num+2
81
82         # minimize cache contention by arranging 2K window between stack
83         # pointer and ap argument [np is also position sensitive vector,
84         # but it's assumed to be near ap, as it's allocated at ~same
85         # time].
86         &mov    ("eax","esp");
87         &sub    ("eax","edx");
88         &and    ("eax",2047);
89         &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
90
91         &xor    ("edx","esp");
92         &and    ("edx",2048);
93         &xor    ("edx",2048);
94         &sub    ("esp","edx");          # this splits them apart modulo 4096
95
96         &and    ("esp",-64);            # align to cache line
97
98         # An OS-agnostic version of __chkstk.
99         #
100         # Some OSes (Windows) insist on stack being "wired" to
101         # physical memory in strictly sequential manner, i.e. if stack
102         # allocation spans two pages, then reference to farmost one can
103         # be punishable by SEGV. But page walking can do good even on
104         # other OSes, because it guarantees that villain thread hits
105         # the guard page before it can make damage to innocent one...
106         &mov    ("eax","ebp");
107         &sub    ("eax","esp");
108         &and    ("eax",-4096);          # distance from new to saved sp, rounded down to a 4096 multiple
109 &set_label("page_walk");
110         &mov    ("edx",&DWP(0,"esp","eax"));    # read one word at esp+eax
111         &sub    ("eax",4096);           # step eax down by 4096
112         &data_byte(0x2e);               # raw 0x2e prefix byte in front of jnc; presumably a branch hint - confirm
113         &jnc    (&label("page_walk"));  # loop until the subtraction borrows
114
115         ################################# load argument block...
116         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
117         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
118         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
119         &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
120         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
121         #&mov   ("edi",&DWP(5*4,"esi"));# int num
122
123         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
124         &mov    ($_rp,"eax");           # ... save a copy of argument block
125         &mov    ($_ap,"ebx");
126         &mov    ($_bp,"ecx");
127         &mov    ($_np,"edx");
128         &mov    ($_n0,"esi");
129         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling (edi holds num+2 here)
130         #&mov   ($_num,$num);           # redundant as $num is not reused
131         &mov    ($_sp,"ebp");           # saved stack pointer!
132 \f
133 if($sse2) {     # this path is emitted only when -DOPENSSL_IA32_SSE2 was given
134 $acc0="mm0";    # mmx register bank layout
135 $acc1="mm1";
136 $car0="mm2";
137 $car1="mm3";
138 $mul0="mm4";
139 $mul1="mm5";
140 $temp="mm6";
141 $mask="mm7";
142
143         &picmeup("eax","OPENSSL_ia32cap_P");
144         &bt     (&DWP(0,"eax"),26);     # test bit 26 of the first capability word (presumably the SSE2 flag - confirm)
145         &jnc    (&label("non_sse2"));   # bit clear: fall back to the code after non_sse2
146
147         &mov    ("eax",-1);
148         &movd   ($mask,"eax");          # mask 32 lower bits
149
150         &mov    ($ap,$_ap);             # load input pointers
151         &mov    ($bp,$_bp);
152         &mov    ($np,$_np);
153
154         &xor    ($i,$i);                # i=0
155         &xor    ($j,$j);                # j=0
156
157         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
158         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
159         &movd   ($car1,&DWP(0,$np));            # np[0]
160
161         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
162         &movq   ($car0,$mul1);
163         &movq   ($acc0,$mul1);                  # I wish movd worked for
164         &pand   ($acc0,$mask);                  # inter-register transfers
165
166         &pmuludq($mul1,$_n0q);                  # *=n0
167
168         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
169         &paddq  ($car1,$acc0);
170
171         &movd   ($acc1,&DWP(4,$np));            # np[1]
172         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
173
174         &psrlq  ($car0,32);
175         &psrlq  ($car1,32);
176
177         &inc    ($j);                           # j++
178 &set_label("1st",16);
179         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
180         &pmuludq($acc1,$mul1);                  # np[j]*m1
181         &paddq  ($car0,$acc0);                  # +=c0
182         &paddq  ($car1,$acc1);                  # +=c1
183
184         &movq   ($acc0,$car0);
185         &pand   ($acc0,$mask);
186         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
187         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
188         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
189         &psrlq  ($car0,32);
190         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
191         &psrlq  ($car1,32);
192
193         &lea    ($j,&DWP(1,$j));
194         &cmp    ($j,$num);
195         &jl     (&label("1st"));
196
197         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
198         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
199         &paddq  ($car0,$acc0);                  # +=c0
200         &paddq  ($car1,$acc1);                  # +=c1
201
202         &movq   ($acc0,$car0);
203         &pand   ($acc0,$mask);
204         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
205         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
206
207         &psrlq  ($car0,32);
208         &psrlq  ($car1,32);
209
210         &paddq  ($car1,$car0);
211         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
212 \f
213         &inc    ($i);                           # i++
214 &set_label("outer");
215         &xor    ($j,$j);                        # j=0
216
217         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
218         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
219         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
220         &movd   ($car1,&DWP(0,$np));            # np[0]
221         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
222
223         &paddq  ($mul1,$temp);                  # +=tp[0]
224         &movq   ($acc0,$mul1);
225         &movq   ($car0,$mul1);
226         &pand   ($acc0,$mask);
227
228         &pmuludq($mul1,$_n0q);                  # *=n0
229
230         &pmuludq($car1,$mul1);
231         &paddq  ($car1,$acc0);
232
233         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
234         &movd   ($acc1,&DWP(4,$np));            # np[1]
235         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
236
237         &psrlq  ($car0,32);
238         &psrlq  ($car1,32);
239         &paddq  ($car0,$temp);                  # +=tp[1]
240
241         &inc    ($j);                           # j++
242         &dec    ($num);                         # the inner loop counts $num down to zero
243 &set_label("inner");
244         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
245         &pmuludq($acc1,$mul1);                  # np[j]*m1
246         &paddq  ($car0,$acc0);                  # +=c0
247         &paddq  ($car1,$acc1);                  # +=c1
248
249         &movq   ($acc0,$car0);
250         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
251         &pand   ($acc0,$mask);
252         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
253         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
254         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
255         &psrlq  ($car0,32);
256         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
257         &psrlq  ($car1,32);
258         &paddq  ($car0,$temp);                  # +=tp[j+1]
259
260         &dec    ($num);
261         &lea    ($j,&DWP(1,$j));                # j++ (lea, so the flags from dec survive)
262         &jnz    (&label("inner"));
263
264         &mov    ($num,$j);                      # reload $num from $j after the count-down
265         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
266         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
267         &paddq  ($car0,$acc0);                  # +=c0
268         &paddq  ($car1,$acc1);                  # +=c1
269
270         &movq   ($acc0,$car0);
271         &pand   ($acc0,$mask);
272         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
273         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
274         &psrlq  ($car0,32);
275         &psrlq  ($car1,32);
276
277         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
278         &paddq  ($car1,$car0);
279         &paddq  ($car1,$temp);
280         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
281
282         &lea    ($i,&DWP(1,$i));                # i++
283         &cmp    ($i,$num);
284         &jle    (&label("outer"));              # repeat while i<=$num
285
286         &emms   ();                             # done with mmx bank
287         &jmp    (&label("common_tail"));
288
289 &set_label("non_sse2",16);
290 }
291 \f
292 if (0) {        # disabled branch: the integer code in the else branch is what gets emitted
293         &mov    ("esp",$_sp);
294         &xor    ("eax","eax");  # signal "not fast enough [yet]"
295         &jmp    (&label("just_leave"));
296         # While the below code provides competitive performance for
297         # all key lengths on modern Intel cores, it's still more
298         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
299         # means compared to the original integer-only assembler.
300         # 512-bit RSA sign is better by ~40%, but that's about all
301         # one can say about all CPUs...
302 } else {
303 $inp="esi";     # integer path uses these registers differently
304 $word="edi";
305 $carry="ebp";
306
307         &mov    ($inp,$_ap);
308         &lea    ($carry,&DWP(1,$num));                  # $num holds num-1, so this is num
309         &mov    ($word,$_bp);
310         &xor    ($j,$j);                                # j=0
311         &mov    ("edx",$inp);
312         &and    ($carry,1);                             # see if num is even
313         &sub    ("edx",$word);                          # see if ap==bp
314         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
315         &or     ($carry,"edx");
316         &mov    ($word,&DWP(0,$word));                  # bp[0]
317         &jz     (&label("bn_sqr_mont"));                # flags from the or: taken only if num is even and ap==bp
318         &mov    ($_bpend,"eax");
319         &mov    ("eax",&DWP(0,$inp));
320         &xor    ("edx","edx");
321
322 &set_label("mull",16);
323         &mov    ($carry,"edx");
324         &mul    ($word);                                # ap[j]*bp[0]
325         &add    ($carry,"eax");
326         &lea    ($j,&DWP(1,$j));
327         &adc    ("edx",0);
328         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
329         &cmp    ($j,$num);
330         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
331         &jl     (&label("mull"));
332
333         &mov    ($carry,"edx");
334         &mul    ($word);                                # ap[num-1]*bp[0]
335          &mov   ($word,$_n0);
336         &add    ("eax",$carry);
337          &mov   ($inp,$_np);
338         &adc    ("edx",0);
339          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
340
341         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
342         &xor    ($j,$j);
343         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
344         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
345
346         &mov    ("eax",&DWP(0,$inp));                   # np[0]
347         &mul    ($word);                                # np[0]*m
348         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
349         &mov    ("eax",&DWP(4,$inp));                   # np[1] (only the carry of the add above is kept)
350         &adc    ("edx",0);
351         &inc    ($j);
352
353         &jmp    (&label("2ndmadd"));
354 \f\f
355 &set_label("1stmadd",16);
356         &mov    ($carry,"edx");
357         &mul    ($word);                                # ap[j]*bp[i]
358         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
359         &lea    ($j,&DWP(1,$j));
360         &adc    ("edx",0);
361         &add    ($carry,"eax");
362         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
363         &adc    ("edx",0);
364         &cmp    ($j,$num);
365         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
366         &jl     (&label("1stmadd"));
367
368         &mov    ($carry,"edx");
369         &mul    ($word);                                # ap[num-1]*bp[i]
370         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
371          &mov   ($word,$_n0);
372         &adc    ("edx",0);
373          &mov   ($inp,$_np);
374         &add    ($carry,"eax");
375         &adc    ("edx",0);
376          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
377
378         &xor    ($j,$j);
379         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
380         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
381         &adc    ($j,0);
382          &mov   ("eax",&DWP(0,$inp));                   # np[0]
383         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
384         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
385
386         &mul    ($word);                                # np[0]*m
387         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
388         &mov    ("eax",&DWP(4,$inp));                   # np[1] (only the carry of the add above is kept)
389         &adc    ("edx",0);
390         &mov    ($j,1);
391 \f
392 &set_label("2ndmadd",16);
393         &mov    ($carry,"edx");
394         &mul    ($word);                                # np[j]*m
395         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
396         &lea    ($j,&DWP(1,$j));
397         &adc    ("edx",0);
398         &add    ($carry,"eax");
399         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
400         &adc    ("edx",0);
401         &cmp    ($j,$num);
402         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
403         &jl     (&label("2ndmadd"));
404
405         &mov    ($carry,"edx");
406         &mul    ($word);                                # np[j]*m
407         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
408         &adc    ("edx",0);
409         &add    ($carry,"eax");
410         &adc    ("edx",0);
411         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
412
413         &xor    ("eax","eax");
414          &mov   ($j,$_bp);                              # &bp[i]
415         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
416         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
417          &lea   ($j,&DWP(4,$j));
418         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
419          &cmp   ($j,$_bpend);
420         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
421         &je     (&label("common_tail"));                # advanced pointer equals the saved &bp[num]: done
422
423         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
424         &mov    ($inp,$_ap);
425         &mov    ($_bp,$j);                              # &bp[++i]
426         &xor    ($j,$j);
427         &xor    ("edx","edx");
428         &mov    ("eax",&DWP(0,$inp));
429         &jmp    (&label("1stmadd"));
430 \f
431 &set_label("bn_sqr_mont",16);
432 $sbit=$num;     # $sbit shares the register of $num
433         &mov    ($_num,$num);                           # keep num-1 in the frame, the register is reused as $sbit
434         &mov    ($_bp,$j);                              # i=0
435
436         &mov    ("eax",$word);                          # ap[0]
437         &mul    ($word);                                # ap[0]*ap[0]
438         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
439         &mov    ($sbit,"edx");
440         &shr    ("edx",1);
441         &and    ($sbit,1);
442         &inc    ($j);
443 &set_label("sqr",16);
444         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
445         &mov    ($carry,"edx");
446         &mul    ($word);                                # ap[j]*ap[0]
447         &add    ("eax",$carry);
448         &lea    ($j,&DWP(1,$j));
449         &adc    ("edx",0);
450         &lea    ($carry,&DWP(0,$sbit,"eax",2));
451         &shr    ("eax",31);
452         &cmp    ($j,$_num);
453         &mov    ($sbit,"eax");
454         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
455         &jl     (&label("sqr"));
456
457         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
458         &mov    ($carry,"edx");
459         &mul    ($word);                                # ap[num-1]*ap[0]
460         &add    ("eax",$carry);
461          &mov   ($word,$_n0);
462         &adc    ("edx",0);
463          &mov   ($inp,$_np);
464         &lea    ($carry,&DWP(0,$sbit,"eax",2));
465          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
466         &shr    ("eax",31);
467         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
468
469         &lea    ($carry,&DWP(0,"eax","edx",2));
470          &mov   ("eax",&DWP(0,$inp));                   # np[0]
471         &shr    ("edx",31);
472         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
473         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
474
475         &mul    ($word);                                # np[0]*m
476         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
477         &mov    ($num,$j);                              # reload $num from $j (register was used as $sbit)
478         &adc    ("edx",0);
479         &mov    ("eax",&DWP(4,$inp));                   # np[1]
480         &mov    ($j,1);
481 \f\f
482 &set_label("3rdmadd",16);
483         &mov    ($carry,"edx");
484         &mul    ($word);                                # np[j]*m
485         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
486         &adc    ("edx",0);
487         &add    ($carry,"eax");
488         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
489         &adc    ("edx",0);
490         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
491
492         &mov    ($carry,"edx");
493         &mul    ($word);                                # np[j+1]*m
494         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
495         &lea    ($j,&DWP(2,$j));
496         &adc    ("edx",0);
497         &add    ($carry,"eax");
498         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
499         &adc    ("edx",0);
500         &cmp    ($j,$num);
501         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
502         &jl     (&label("3rdmadd"));
503
504         &mov    ($carry,"edx");
505         &mul    ($word);                                # np[j]*m
506         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
507         &adc    ("edx",0);
508         &add    ($carry,"eax");
509         &adc    ("edx",0);
510         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
511
512         &mov    ($j,$_bp);                              # i
513         &xor    ("eax","eax");
514         &mov    ($inp,$_ap);
515         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
516         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
517         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
518         &cmp    ($j,$num);
519         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
520         &je     (&label("common_tail"));                # i has reached $num: squaring is complete
521 \f
522         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
523         &lea    ($j,&DWP(1,$j));
524         &mov    ("eax",$word);
525         &mov    ($_bp,$j);                              # ++i
526         &mul    ($word);                                # ap[i]*ap[i]
527         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
528         &adc    ("edx",0);
529         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
530         &xor    ($carry,$carry);
531         &cmp    ($j,$num);
532         &lea    ($j,&DWP(1,$j));
533         &je     (&label("sqrlast"));
534
535         &mov    ($sbit,"edx");                          # zaps $num
536         &shr    ("edx",1);
537         &and    ($sbit,1);
538 &set_label("sqradd",16);
539         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
540         &mov    ($carry,"edx");
541         &mul    ($word);                                # ap[j]*ap[i]
542         &add    ("eax",$carry);
543         &lea    ($carry,&DWP(0,"eax","eax"));
544         &adc    ("edx",0);
545         &shr    ("eax",31);
546         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
547         &lea    ($j,&DWP(1,$j));
548         &adc    ("eax",0);
549         &add    ($carry,$sbit);
550         &adc    ("eax",0);
551         &cmp    ($j,$_num);
552         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
553         &mov    ($sbit,"eax");
554         &jle    (&label("sqradd"));
555
556         &mov    ($carry,"edx");
557         &add    ("edx","edx");
558         &shr    ($carry,31);
559         &add    ("edx",$sbit);
560         &adc    ($carry,0);
561 &set_label("sqrlast");
562         &mov    ($word,$_n0);
563         &mov    ($inp,$_np);
564         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
565
566         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
567         &mov    ("eax",&DWP(0,$inp));                   # np[0]
568         &adc    ($carry,0);
569         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
570         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
571
572         &mul    ($word);                                # np[0]*m
573         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
574         &lea    ($num,&DWP(-1,$j));                     # rebuild $num as j-1 (lea leaves the carry intact)
575         &adc    ("edx",0);
576         &mov    ($j,1);
577         &mov    ("eax",&DWP(4,$inp));                   # np[1]
578
579         &jmp    (&label("3rdmadd"));
580 }
581 \f
582 &set_label("common_tail",16);           # both multiplication paths jump here for the final subtraction and copy
583         &mov    ($np,$_np);                     # load modulus pointer
584         &mov    ($rp,$_rp);                     # load result pointer
585         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
586
587         &mov    ("eax",&DWP(0,$tp));            # tp[0]
588         &mov    ($j,$num);                      # j=num-1
589         &xor    ($i,$i);                        # i=0 and clear CF!
590
591 &set_label("sub",16);
592         &sbb    ("eax",&DWP(0,$np,$i,4));
593         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
594         &dec    ($j);                           # doesn't affect CF!
595         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
596         &lea    ($i,&DWP(1,$i));                # i++
597         &jge    (&label("sub"));                # flags still come from dec: loop until j goes negative
598
599         &sbb    ("eax",0);                      # handle upmost overflow bit
600         &and    ($tp,"eax");
601         &not    ("eax");
602         &mov    ($np,$rp);
603         &and    ($np,"eax");
604         &or     ($tp,$np);                      # tp=carry?tp:rp
605
606 &set_label("copy",16);                          # copy or in-place refresh
607         &mov    ("eax",&DWP(0,$tp,$num,4));
608         &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
609         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
610         &dec    ($num);
611         &jge    (&label("copy"));
612
613         &mov    ("esp",$_sp);           # pull saved stack pointer
614         &mov    ("eax",1);              # return value 1
615 &set_label("just_leave");               # the early exit for num<4 arrives here with eax=0
616 &function_end("bn_mul_mont");
617
618 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
619
620 &asm_finish();
621 # Buffered write errors (e.g. a full disk) only surface at close; fail instead of leaving a truncated file.
622 close STDOUT or die "error closing STDOUT: $!";