Minor sparcv9 clean-ups.
[openssl.git] / crypto / bn / asm / x86-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8
9 # October 2005
10 #
11 # This is "teaser" code, as it can be improved in several ways...
12 # First of all non-SSE2 path should be implemented (yes, for now it
13 # performs Montgomery multiplication/convolution only on SSE2-capable
14 # CPUs such as P4, others fall down to original code). Then inner loop
15 # can be unrolled and modulo-scheduled to improve ILP and possibly
16 # moved to 128-bit XMM register bank (though it would require input
17 # rearrangement and/or increase bus bandwidth utilization). Dedicated
18 # squaring procedure should give further performance improvement...
19 # Yet, even as a draft, the code improves the rsa512 *sign* benchmark by
20 # 110%(!), the rsa1024 one by 70% and the rsa4096 one by 20%:-)
21 # Generator preamble: load the perlasm helper, detect the SSE2 build flag, open bn_mul_mont and name the registers/stack slots used below.
22 push(@INC,"perlasm","../../perlasm");   # extra search directories for the helper required next
23 require "x86asm.pl";
24
25 &asm_init($ARGV[0],$0);
26
27 $sse2=0;
28 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }     # SSE2 path is emitted only if this flag appears among the arguments
29
30 &external_label("OPENSSL_ia32cap_P") if ($sse2);
31
32 &function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
33
34 $i="ebx";                               # outer loop index
35 $j="ecx";                               # inner loop index
36 $ap="esi";
37 $rp="edi";      $bp="edi";              # overlapping variables!!! both alias edi: $bp is loaded for the multiplication loops, $rp only afterwards
38 $np="edx";
39 $num="ebp";
40
41 $_rp=&DWP(4*0,"esp");                   # stack top layout
42 $_ap=&DWP(4*1,"esp");
43 $_bp=&DWP(4*2,"esp");
44 $_np=&DWP(4*3,"esp");
45 $_n0=&DWP(4*4,"esp");
46 $_num=&DWP(4*5,"esp");                  # slot reserved; the only store to it below is commented out
47 $_sp=&DWP(4*6,"esp");
48 $frame=32;                              # size of above frame (7 dwords = 28 bytes) rounded up to 16n
49
50 $acc0="mm0";                            # mmx register bank layout
51 $acc1="mm1";
52 $car0="mm2";
53 $car1="mm3";
54 $mul0="mm4";
55 $mul1="mm5";
56 $temp="mm6";
57 $mask="mm7";
58 # SSE2 code path of bn_mul_mont; emitted only when -DOPENSSL_IA32_SSE2 was passed to this script.
59 if($sse2) {
60         &picmeup("eax","OPENSSL_ia32cap_P");    # presumably loads the address of OPENSSL_ia32cap_P into eax (helper from x86asm.pl)
61         &bt     (&DWP(0,"eax"),26);             # CF = bit 26 of the first capability dword
62         &jnc    (&label("non_sse2"));           # bit clear: skip everything up to the non_sse2 label
63
64         ################################# load argument block...
65         &mov    ("eax",&wparam(0));     # BN_ULONG *rp
66         &mov    ("ebx",&wparam(1));     # const BN_ULONG *ap
67         &mov    ("ecx",&wparam(2));     # const BN_ULONG *bp
68         &mov    ("edx",&wparam(3));     # const BN_ULONG *np
69         &mov    ("esi",&wparam(4));     # const BN_ULONG *n0
70         &mov    ($num,&wparam(5));      # int num
71
72         &mov    ("edi","esp");          # saved stack pointer!
73         &add    ($num,1);               # extra word on top of tp
74         &neg    ($num);
75         &lea    ("esp",&DWP(-$frame,"esp",$num,4));     # alloca($frame+4*($num+1))
76         &neg    ($num);
77         &and    ("esp",-1024);          # round esp down to a 1024-byte boundary to minimize TLB utilization
78         &sub    ($num,1);               # num is restored to its original value
79                                         # and will remain constant from now...
80
81         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
82         &mov    ($_rp,"eax");           # ... save a copy of argument block
83         &mov    ($_ap,"ebx");
84         &mov    ($_bp,"ecx");
85         &mov    ($_np,"edx");
86         &mov    ($_n0,"esi");
87         #&mov   ($_num,$num);           # redundant in sse2 context
88         &mov    ($_sp,"edi");           # saved stack pointer!
89
90         &mov    ("eax",-1);
91         &movd   ($mask,"eax");          # mask 32 lower bits
92
93         &mov    ($ap,$_ap);             # load input pointers
94         &mov    ($bp,$_bp);
95         &mov    ($np,$_np);
96
97         &xor    ($i,$i);                # i=0
98         &xor    ($j,$j);                # j=0
99
100         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
101         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
102         &movd   ($car1,&DWP(0,$np));            # np[0]
103
104         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
105         &movq   ($car0,$mul1);
106         &movq   ($acc0,$mul1);                  # I wish movd worked for
107         &pand   ($acc0,$mask);                  # inter-register transfers
108
109         &pmuludq($mul1,$_n0);                   # *=n0
110
111         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
112         &paddq  ($car1,$acc0);
113
114         &psrlq  ($car0,32);                     # keep only the carries (upper halves)
115         &psrlq  ($car1,32);
116 # NOTE(review): the loop body runs before the j<num test, so ap[1]/np[1] are read even when num is 1 -- presumably callers never pass such a small num; confirm.
117         &inc    ($j);                           # j++
118 &set_label("1st");
119         &movd   ($acc0,&DWP(0,$ap,$j,4));       # ap[j]
120         &movd   ($acc1,&DWP(0,$np,$j,4));       # np[j]
121         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
122         &pmuludq($acc1,$mul1);                  # np[j]*m1
123
124         &paddq  ($car0,$acc0);                  # +=c0
125         &movq   ($acc0,$car0);
126         &pand   ($acc0,$mask);
127
128         &paddq  ($car1,$acc1);                  # +=c1
129         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
130         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
131
132         &psrlq  ($car0,32);
133         &psrlq  ($car1,32);
134
135         &lea    ($j,&DWP(1,$j));                # j++
136         &cmp    ($j,$num);
137         &jl     (&label("1st"));
138
139         &paddq  ($car1,$car0);                  # combine both carries
140         &movq   (&DWP($frame-4,"esp",$num,4),$car1);    # 64-bit store covering tp[num-1] and tp[num]
141
142         &inc    ($i);                           # i++
143 &set_label("outer");
144         &xor    ($j,$j);                        # j=0
145
146         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
147         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
148         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
149         &movd   ($car1,&DWP(0,$np));            # np[0]
150         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
151
152         &paddq  ($mul1,$temp);                  # +=tp[0]
153         &movq   ($acc0,$mul1);
154         &movq   ($car0,$mul1);
155         &pand   ($acc0,$mask);
156
157         &pmuludq($mul1,$_n0);                   # *=n0
158
159         &pmuludq($car1,$mul1);                  # np[0]*m1
160         &paddq  ($car1,$acc0);
161
162         &psrlq  ($car0,32);
163         &psrlq  ($car1,32);
164
165         &inc    ($j);                           # j++
166 &set_label("inner");
167         &movd   ($acc0,&DWP(0,$ap,$j,4));       # ap[j]
168         &movd   ($acc1,&DWP(0,$np,$j,4));       # np[j]
169         &movd   ($temp,&DWP($frame,"esp",$j,4));# tp[j]
170         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
171         &pmuludq($acc1,$mul1);                  # np[j]*m1
172         &paddq  ($car0,$temp);                  # +=tp[j]
173         &paddq  ($car0,$acc0);                  # +=c0
174         &movq   ($acc0,$car0);
175         &pand   ($acc0,$mask);
176
177         &paddq  ($car1,$acc1);                  # +=c1
178         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
179         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
180
181         &psrlq  ($car0,32);
182         &psrlq  ($car1,32);
183
184         &lea    ($j,&DWP(1,$j));                # j++
185         &cmp    ($j,$num);
186         &jl     (&label("inner"));
187
188         &movd   ($temp,&DWP($frame,"esp",$num,4));      # tp[num]
189         &paddq  ($car1,$car0);
190         &paddq  ($car1,$temp);                  # +=tp[num]
191         &movq   (&DWP($frame-4,"esp",$num,4),$car1);    # 64-bit store covering tp[num-1] and tp[num]
192
193         &lea    ($i,&DWP(1,$i));                # i++
194         &cmp    ($i,$num);
195         &jl     (&label("outer"));
196
197         &emms   ();                             # done with mmx bank
198
199         &mov    ("esi",&DWP($frame,"esp",$num,4));# load upmost overflow bit
200         &mov    ($rp,$_rp);                     # load result pointer
201                                                 # [$ap and $bp are zapped]
202         &xor    ($i,$i);                        # i=0
203         &lea    ($j,&DWP(-1,$num));             # j=num-1
204         &cmp    ("esi",0);                      # clears CF unconditionally
205         &jnz    (&label("sub"));                # overflow word set: go subtract
206         &mov    ("eax",&DWP($frame,"esp",$j,4));
207         &cmp    ("eax",&DWP(0,$np,$j,4));       # tp[num-1]-np[num-1]?
208         &jae    (&label("sub"));                # if taken CF is cleared
209 &set_label("copy");
210         &mov    ("eax",&DWP($frame,"esp",$j,4));
211         &mov    (&DWP(0,$rp,$j,4),"eax");       # rp[j]=tp[j]
212         &mov    (&DWP($frame,"esp",$j,4),$j);   # zap temporary vector (overwritten with the index value, not zero)
213         &dec    ($j);
214         &jge    (&label("copy"));
215         &jmp    (&label("exit_sse2"));
216
217 &set_label("sub",4);
218         &mov    ("eax",&DWP($frame,"esp",$i,4));
219         &sbb    ("eax",&DWP(0,$np,$i,4));
220         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
221         &lea    ($i,&DWP(1,$i));                # i++
222         &dec    ($j);                           # doesn't affect CF!
223         &jge    (&label("sub"));
224         &lea    ($j,&DWP(-1,$num));             # j=num-1
225         &sbb    ("esi",0);                      # esi holds upmost overflow bit
226         &jc     (&label("copy"));               # borrow out: redo rp from the untouched tp via the copy loop
227 &set_label("zap");
228         &mov    (&DWP($frame,"esp",$j,4),$i);   # zap temporary vector (overwritten with the value left in $i)
229         &dec    ($j);
230         &jge    (&label("zap"));
231
232 &set_label("exit_sse2");
233         &mov    ("esp",$_sp);           # pull saved stack pointer
234         &mov    ("eax",1);              # non-zero return, as opposed to the zero "not implemented" result after non_sse2
235         &jmp    (&label("leave"));
236 &set_label("non_sse2");
237 }
238 # Fallback result and common exit: reached directly when the SSE2 path is not generated, or via non_sse2 when the capability bit is clear.
239         &xor    ("eax","eax");  # zero signals "not implemented [yet]"
240
241 &set_label("leave");            # when the SSE2 path is generated it jumps here with eax=1
242 &function_end("bn_mul_mont");
243
244 &asm_finish();