alpha-mont.pl: gcc portability fix and make-rule.
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005
11 #
12 # This is "teaser" code, as it can be improved in several ways...
13 # First of all non-SSE2 path should be implemented (yes, for now it
14 # performs Montgomery multiplication/convolution only on SSE2-capable
15 # CPUs such as P4, others fall back to original code). Then inner loop
16 # can be unrolled and modulo-scheduled to improve ILP and possibly
17 # moved to 128-bit XMM register bank (though it would require input
18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
19 # squaring procedure should give further performance improvement...
20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
# Script set-up: load the perlasm x86 back-end, decide whether the SSE2 code
# path is generated, and open the bn_mul_mont function.
#
# "perlasm" and "../../perlasm" are appended to the module search path so
# that the require below can find x86asm.pl from either location.
23 push(@INC,"perlasm","../../perlasm");
24 require "x86asm.pl";
25
# The first command-line argument and this script's own name go to asm_init.
26 &asm_init($ARGV[0],$0);
27
# $sse2 becomes 1 as soon as any command-line argument contains the text
# -DOPENSSL_IA32_SSE2; it stays 0 otherwise.
28 $sse2=0;
29 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
30
# The capability vector is only referenced by the SSE2 path, so it is only
# declared when that path is generated.
31 &external_label("OPENSSL_ia32cap_P") if ($sse2);
32
# The second argument is an EXTRN declaration for _OPENSSL_ia32cap_P when
# $sse2 is set and an empty string otherwise.
33 &function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
34
# Symbolic register names used by the SSE2 path and by the common tail.
# $rp and $bp deliberately name the same register (edi), so only one of
# them can be live at any time.
35 $i="edx";       # outer loop index
36 $j="ecx";       # inner loop index
37 $ap="esi";
38 $rp="edi";      $bp="edi";              # overlapping variables!!!
39 $np="ebp";
40 $num="ebx";
41
# Eight dword slots at the bottom of the allocated stack area. The temporary
# vector tp[] is addressed starting at offset $frame from esp.
42 $_rp=&DWP(4*0,"esp");                   # stack top layout
43 $_ap=&DWP(4*1,"esp");
44 $_bp=&DWP(4*2,"esp");
45 $_np=&DWP(4*3,"esp");
46 $_n0=&DWP(4*4,"esp");
47 $_num=&DWP(4*5,"esp");
48 $_sp=&DWP(4*6,"esp");                   # caller's esp, reloaded on every exit
49 $_bpend=&DWP(4*7,"esp");                # only referenced by the integer path
50 $frame=32;                              # size of above frame rounded up to 16n
51
# Prologue: reject short inputs, load the six arguments, carve the frame plus
# the temporary vector out of the stack, and save the arguments in the frame.
52         &xor    ("eax","eax");          # eax stays 0 on the early exit below
53         &mov    ("edi",&wparam(5));     # int num
54         &cmp    ("edi",3);
55         &jb     (&label("just_leave")); # unsigned compare: leave when num<3
56
57         ################################# load argument block...
58         &mov    ("eax",&wparam(0));     # BN_ULONG *rp
59         &mov    ("ebx",&wparam(1));     # const BN_ULONG *ap
60         &mov    ("ecx",&wparam(2));     # const BN_ULONG *bp
61         &mov    ("edx",&wparam(3));     # const BN_ULONG *np
62         &mov    ("esi",&wparam(4));     # const BN_ULONG *n0
63         #&mov   ("edi",&wparam(5));     # int num
64
65         &mov    ("ebp","esp");          # saved stack pointer!
66         &add    ("edi",2);              # extra two words on top of tp
67         &neg    ("edi");                # edi=-(num+2), used as a negative index
68         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
69         &neg    ("edi");                # back to num+2
70         &and    ("esp",-4096);          # minimize TLB utilization
71
# From here on the $_xx slots are addressed relative to the new esp.
72         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
73         &mov    ($_rp,"eax");           # ... save a copy of argument block
74         &mov    ($_ap,"ebx");
75         &mov    ($_bp,"ecx");
76         &mov    ($_np,"edx");
77         &mov    ($_n0,"esi");           # the value n0[0], not the pointer
78         &lea    ($num,&DWP(-2,"edi"));  # num is restored to its original value
79         #&mov   ($_num,$num);           # redundant as $num is not reused
80         &mov    ($_sp,"ebp");           # saved stack pointer!
81
# MMX/SSE2 code path, generated only when $sse2 is set. At run time it is
# entered only when bit 26 of the dword at OPENSSL_ia32cap_P is set; otherwise
# control jumps to the non_sse2 label placed at the end of this block.
# The path fills tp[] (at $frame above esp) and finishes in common_tail.
82 if($sse2) {
83 $acc0="mm0";    # mmx register bank layout
84 $acc1="mm1";
85 $car0="mm2";
86 $car1="mm3";
87 $mul0="mm4";
88 $mul1="mm5";
89 $temp="mm6";
90 $mask="mm7";
91
92         &picmeup("eax","OPENSSL_ia32cap_P");
93         &bt     (&DWP(0,"eax"),26);     # presumably the SSE2 capability bit - confirm
94         &jnc    (&label("non_sse2"));   # bit clear: skip this path
95
96         &mov    ("eax",-1);
97         &movd   ($mask,"eax");          # mask 32 lower bits
98
99         &mov    ($ap,$_ap);             # load input pointers
100         &mov    ($bp,$_bp);
101         &mov    ($np,$_np);
102
103         &xor    ($i,$i);                # i=0
104         &xor    ($j,$j);                # j=0
105
# First pass (i=0): word 0 is handled here, words 1..num-1 in the "1st" loop.
# $car0 carries the ap[]*bp[0] chain, $car1 the np[]*m1 chain.
106         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
107         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
108         &movd   ($car1,&DWP(0,$np));            # np[0]
109
110         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
111         &movq   ($car0,$mul1);
112         &movq   ($acc0,$mul1);                  # I wish movd worked for
113         &pand   ($acc0,$mask);                  # inter-register transfers
114
115         &pmuludq($mul1,$_n0);                   # *=n0
116
117         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
118         &paddq  ($car1,$acc0);
119
120         &psrlq  ($car0,32);                     # keep only the carry words
121         &psrlq  ($car1,32);
122
123         &inc    ($j);                           # j++
124 &set_label("1st");
125         &movd   ($acc0,&DWP(0,$ap,$j,4));       # ap[j]
126         &movd   ($acc1,&DWP(0,$np,$j,4));       # np[j]
127         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
128         &pmuludq($acc1,$mul1);                  # np[j]*m1
129
130         &paddq  ($car0,$acc0);                  # +=c0
131         &movq   ($acc0,$car0);
132         &pand   ($acc0,$mask);
133
134         &paddq  ($car1,$acc1);                  # +=c1
135         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
136         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
137
138         &psrlq  ($car0,32);
139         &psrlq  ($car1,32);
140
141         &lea    ($j,&DWP(1,$j));
142         &cmp    ($j,$num);
143         &jl     (&label("1st"));                # loop while j<num
144
# Combine both carry chains and store the 64-bit sum as tp[num-1],tp[num].
145         &paddq  ($car1,$car0);
146         &movq   (&DWP($frame-4,"esp",$num,4),$car1);
147
148         &inc    ($i);                           # i++
# Remaining passes (i=1..num-1): same shape as the first pass, but the
# previous contents of tp[] are added in as well.
149 &set_label("outer");
150         &xor    ($j,$j);                        # j=0
151
152         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
153         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
154         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
155         &movd   ($car1,&DWP(0,$np));            # np[0]
156         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
157
158         &paddq  ($mul1,$temp);                  # +=tp[0]
159         &movq   ($acc0,$mul1);
160         &movq   ($car0,$mul1);
161         &pand   ($acc0,$mask);
162
163         &pmuludq($mul1,$_n0);                   # *=n0
164
165         &pmuludq($car1,$mul1);                  # np[0]*m1
166         &paddq  ($car1,$acc0);
167
168         &psrlq  ($car0,32);
169         &psrlq  ($car1,32);
170
171         &inc    ($j);                           # j++
172 &set_label("inner");
173         &movd   ($acc0,&DWP(0,$ap,$j,4));       # ap[j]
174         &movd   ($acc1,&DWP(0,$np,$j,4));       # np[j]
175         &movd   ($temp,&DWP($frame,"esp",$j,4));# tp[j]
176         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
177         &pmuludq($acc1,$mul1);                  # np[j]*m1
178         &paddq  ($car0,$temp);                  # +=tp[j]
179         &paddq  ($car0,$acc0);                  # +=c0
180         &movq   ($acc0,$car0);
181         &pand   ($acc0,$mask);
182
183         &paddq  ($car1,$acc1);                  # +=c1
184         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
185         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
186
187         &psrlq  ($car0,32);
188         &psrlq  ($car1,32);
189
190         &lea    ($j,&DWP(1,$j));                # j++
191         &cmp    ($j,$num);
192         &jl     (&label("inner"));              # loop while j<num
193
# Add both carries and the old tp[num], then store the 64-bit sum as
# tp[num-1],tp[num].
194         &movd   ($temp,&DWP($frame,"esp",$num,4));
195         &paddq  ($car1,$car0);
196         &paddq  ($car1,$temp);
197         &movq   (&DWP($frame-4,"esp",$num,4),$car1);
198
199         &lea    ($i,&DWP(1,$i));                # i++
200         &cmp    ($i,$num);
201         &jl     (&label("outer"));              # loop while i<num
202
203         &emms   ();                             # done with mmx bank
204         &jmp    (&label("common_tail"));
205
# Target of the capability test above; the integer path follows this label.
206 &set_label("non_sse2",16);
207 }
208
# Integer (non-SSE2) path. The condition below is the constant 1, so only the
# first branch is ever generated: it restores the stack pointer and leaves
# with eax=0. The else branch is therefore never emitted; it is a draft
# integer-only implementation, kept for the reason given in the first branch.
209 if (1) {
210         &mov    ("esp",$_sp);
211         &xor    ("eax","eax");  # signal "not fast enough [yet]"
212         &jmp    (&label("just_leave"));
213         # The code below gives ~15% improvement on 512-bit benchmark
214         # *only*:-( On all other key lengths it's slower for up to 20%.
215         # This is because the original code path holds down the overall
216         # amount of multiplications by ~25% by deploying bn_sqr_words.
217         # In other words, for the code below to be competitive,
218         # dedicated squaring procedure is a must...
219 } else {
220 $inp="esi";     # integer path uses these registers differently
221 $word="edi";
222 $carry="ebp";
223
224         &sub    ($num,1);               # non-SSE2 path uses num-1
225
# Set-up: $_bpend marks one past the last word of bp[] and is the
# termination test for the outer loop; $word holds the current bp[i].
226         &mov    ($inp,$_ap);
227         &mov    ($word,$_bp);
228         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
229         &mov    ($word,&DWP(0,$word));                  # bp[0]
230         &mov    ($_bpend,"eax");
231         &xor    ($j,$j);
232         &xor    ("edx","edx");
233
# mull: first pass, tp[] = ap[]*bp[0]; edx carries the high word forward.
234 &set_label("mull",16);
235         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
236         &mov    ($carry,"edx");
237         &mul    ($word);                                # ap[j]*bp[0]
238         &lea    ($j,&DWP(1,$j));
239         &add    ("eax",$carry);
240         &adc    ("edx",0);
241         &mov    (&DWP($frame-4,"esp",$j,4),"eax");      # tp[j]=
242         &cmp    ($j,$num);
243         &jb     (&label("mull"));
244
245         &mov    ("eax",&DWP(0,$inp,$num,4));            # ap[num-1]
246         &mov    ($carry,"edx");
247         &mul    ($word);                                # ap[num-1]*bp[0]
248         &add    ("eax",$carry);
249         &adc    ("edx",0);
250
# Switch to the reduction: $word becomes n0*tp[0] and $inp points at np[].
251         &mov    ($word,$_n0);
252         &mov    ($inp,$_np);
253         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
254
255         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
256         &xor    ($j,$j);
257         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
258         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
259
260         &mov    ("eax",&DWP(0,$inp));                   # np[0]
261         &mul    ($word);                                # np[0]*m
262         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
263         &adc    ("edx",0);
264         &mov    ($j,1);
265
266         &jmp    (&label("2ndmadd"));
267
# 1stmadd: later passes, tp[] += ap[]*bp[i].
268 &set_label("1stmadd",16);
269         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
270         &mov    ($carry,"edx");
271         &mul    ($word);                                # ap[j]*bp[i]
272         &lea    ($j,&DWP(1,$j));
273         &add    ("eax",&DWP($frame-4,"esp",$j,4));      # +=tp[j]
274         &adc    ("edx",0);
275         &add    ("eax",$carry);
276         &adc    ("edx",0);
277         &mov    (&DWP($frame-4,"esp",$j,4),"eax");      # tp[j]=
278         &cmp    ($j,$num);
279         &jb     (&label("1stmadd"));
280
281         &mov    ("eax",&DWP(0,$inp,$num,4));            # ap[num-1]
282         &mov    ($carry,"edx");
283         &mul    ($word);                                # ap[num-1]*bp[i]
284         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
285         &adc    ("edx",0);
286         &add    ("eax",$carry);
287         &adc    ("edx",0);
288
289         &mov    ($word,$_n0);
290         &mov    ($inp,$_np);
291         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
292
293         &xor    ($j,$j);
294         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
295         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
296         &adc    ($j,0);
297         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
298         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
299
300         &mov    ("eax",&DWP(0,$inp));                   # np[0]
301         &mul    ($word);                                # np[0]*m
302         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
303         &adc    ("edx",0);
304         &mov    ($j,1);
305
# 2ndmadd: tp[] += np[]*m, with every result word stored one slot lower.
306 &set_label("2ndmadd",16);
307         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j]
308         &mov    ($carry,"edx");
309         &mul    ($word);                                # np[j]*m
310         &lea    ($j,&DWP(1,$j));
311         &add    ("eax",&DWP($frame-4,"esp",$j,4));      # +=tp[j]
312         &adc    ("edx",0);
313         &add    ("eax",$carry);
314         &adc    ("edx",0);
315         &mov    (&DWP($frame-8,"esp",$j,4),"eax");      # tp[j-1]=
316         &cmp    ($j,$num);
317         &jb     (&label("2ndmadd"));
318
319         &mov    ("eax",&DWP(0,$inp,$num,4));            # np[num-1]
320         &mov    ($carry,"edx");
321         &mul    ($word);                                # np[num-1]*m
322         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
323         &adc    ("edx",0);
324         &add    ("eax",$carry);
325         &adc    ("edx",0);
326         &mov    (&DWP($frame-4,"esp",$num,4),"eax");    # tp[num-2]=
327
328         &xor    ("eax","eax");
329         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
330         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
331         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
332         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
333
# Advance the saved bp pointer by one word; finish once it reaches $_bpend.
334         &mov    ($carry,$_bp);                          # &bp[i]
335         &add    ($carry,4);
336         &cmp    ($carry,$_bpend);
337         &je     (&label("x86done"));
338         &mov    ($word,&DWP(0,$carry));                 # bp[i]
339         &mov    ($inp,$_ap);
340         &mov    ($_bp,$carry);                          # &bp[++i]
341         &xor    ($j,$j);
342         &xor    ("edx","edx");
343         &jmp    (&label("1stmadd"));
344
# Reload $np and undo the earlier "sub $num,1" before falling into common_tail.
345 &set_label("x86done",16);
346         &mov    ($np,$_np);     # make adjustments for tail processing
347         &add    ($num,1);
348 }
349
# Common tail: write the result to rp[]. It expects tp[0..num] at $frame
# above esp, $num in its register and $np pointing at np[].
# If tp[num] is non-zero or tp[num-1]>=np[num-1], tp-np is written to rp[];
# should that subtraction end in a borrow, or if neither condition holds,
# tp[] is copied to rp[] unchanged. Both routes overwrite tp[0..num-1].
350 &set_label("common_tail",16);
351         &mov    ("esi",&DWP($frame,"esp",$num,4));# load upmost overflow bit
352         &mov    ($rp,$_rp);                     # load result pointer
353                                                 # [$ap and $bp are zapped]
354         &xor    ($i,$i);                        # i=0
355         &lea    ($j,&DWP(-1,$num));             # j=num-1
356         &cmp    ("esi",0);                      # clears CF unconditionally
357         &jnz    (&label("sub"));
358         &mov    ("eax",&DWP($frame,"esp",$j,4));
359         &cmp    ("eax",&DWP(0,$np,$j,4));       # tp[num-1]-np[num-1]?
360         &jae    (&label("sub"));                # if taken CF is cleared
# copy: runs j from num-1 down to 0.
361 &set_label("copy",16);
362         &mov    ("eax",&DWP($frame,"esp",$j,4));
363         &mov    (&DWP(0,$rp,$j,4),"eax");       # rp[j]=tp[j]
364         &mov    (&DWP($frame,"esp",$j,4),$j);   # zap temporary vector (stores the value of j)
365         &dec    ($j);
366         &jge    (&label("copy"));
367         &jmp    (&label("exit"));
368
# sub: runs i upwards and j downwards; CF carries the borrow between
# iterations, so only instructions that leave CF alone may sit in between.
369 &set_label("sub",16);
370         &mov    ("eax",&DWP($frame,"esp",$i,4));
371         &sbb    ("eax",&DWP(0,$np,$i,4));
372         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
373         &lea    ($i,&DWP(1,$i));                # i++
374         &dec    ($j);                           # doesn't affect CF!
375         &jge    (&label("sub"));
376         &lea    ($j,&DWP(-1,$num));             # j=num-1
377         &sbb    ("esi",0);                      # esi holds upmost overflow bit
378         &jc     (&label("copy"));               # final borrow: redo as a plain copy
# zap: overwrites tp[num-1..0] with the current value of $i.
# NOTE(review): neither this loop nor "copy" touches tp[num] - confirm that
# leaving the overflow word on the stack is acceptable.
379 &set_label("zap",16);
380         &mov    (&DWP($frame,"esp",$j,4),$i);   # zap temporary vector
381         &dec    ($j);
382         &jge    (&label("zap"));
383
384 &set_label("exit",4);
385         &mov    ("esp",$_sp);           # pull saved stack pointer
386         &mov    ("eax",1);              # eax=1 here; paths jumping to just_leave arrive with eax=0
387 &set_label("just_leave");
388 &function_end("bn_mul_mont");
389
390 &asm_finish();