3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # "Teaser" Montgomery multiplication module for IA-64. There are
13 # several possibilities for improvement:
15 # - modulo-scheduling outer loop would eliminate quite a number of
16 # stalls after ldf8, xma and getf.sig outside inner loop and
17 # improve shorter key performance;
18 # - shorter vector support [with input vectors being fetched only
19 # once] should be added;
20 # - 2x unroll with help of n0[1] would make the code scalable on
21 # "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22 # acute interest, because upcoming Tukwila's individual cores are
23 # reportedly based on Itanium 2 design;
24 # - dedicated squaring procedure(?);
26 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
28 # sign verify sign/s verify/s
29 # rsa 512 bits 0.000634s 0.000030s 1577.6 32877.3
30 # rsa 1024 bits 0.001246s 0.000058s 802.8 17181.5
31 # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
32 # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
33 # dsa 512 bits 0.000322s 0.000286s 3106.0 3499.0
34 # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
35 # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
39 # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
40 # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
41 # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
42 # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
43 # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
44 # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
45 # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
47 # 512-bit RSA sign performance does not improve, because this module
48 # doesn't handle short enough vectors (yet). Otherwise RSA sign
49 # improves by 60-30%, less for longer keys, while verify - by 35-13%.
50 # DSA performance improves by 40-30%.
# Scan the command-line arguments for a 64-bit data-model flag, spelled either
# "+DD64" or "-mlp64", and select the plain "add" form when one is present.
# The two spellings are grouped with (?:...|...). Square brackets would form a
# character class instead, matching any single one of the characters
# + D | - m l p in front of "64", so unrelated arguments such as "xm64"
# would have been accepted as well.
54 for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
55 } else { $ADDP="add"; }
61 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
62 // const BN_ULONG *bp,const BN_ULONG *np,
63 // const BN_ULONG *n0p,int num);
// Entry of bn_mul_mont (see the C prototype in the comment above): set up
// the register frame and return early when the operands are too short.
89 { .mmi; .save ar.pfs,prevfs
// register frame: 6 inputs, 2 locals, 0 outputs, 8 rotating registers
90 alloc prevfs=ar.pfs,6,2,0,8
94 { .mmi; .vframe prevsp
// p6 = (5 > in5), compared as signed 32-bit values. in5 is the sixth
// argument, i.e. num in the prototype.
97 cmp4.gt p6,p0=5,in5 };; // is num large enough?
98 { .mfi; nop.m 0 // align loop bodies
101 { .mib; mov ret0=r0 // signal "unhandled"
// with num below 5 return right here, ret0 being 0
104 (p6) br.ret.dpnt.many b0 };;
// Set-up for the first pass, the one that multiplies by bp[0]: name the
// rotating FP registers, preload the leading words of ap and np, move the
// stack pointer and start the first multiplications.
107 .rotf alo[6],nlo[4],ahi[8],nhi[6]
110 { .mmi; ldf8 bi=[bptr],8 // (*bp++)
// aptr and r30 are each post-incremented by 16. Going by the trailing
// comments aptr covers the even-indexed words and r30 the odd-indexed ones.
111 ldf8 alo[4]=[aptr],16 // ap[0]
113 { .mmi; ldf8 alo[3]=[r30],16 // ap[1]
114 ldf8 alo[2]=[aptr],16 // ap[2]
116 { .mmi; ldf8 alo[1]=[r30] // ap[3]
// The mnemonic is filled in by the generating Perl script (variable ADDP).
// in3 is the fourth argument, i.e. np in the prototype.
119 { .mmi; $ADDP nptr=0,in3
122 { .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
// r31 = num*8 + r31
124 shladd r31=num,3,r31 };;
125 { .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
// new stack pointer is r31 rounded down to a multiple of 16
128 { .mfb; and sp=-16,r31 // alloca
// xmpy.hu and xmpy.lu yield the high and low 64 bits of the unsigned product
129 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
132 xmpy.lu alo[4]=alo[4],bi
// branch prediction hint for the loop that ends in br.ctop .L1st_ctop
133 brp.loop.imp .L1st_ctop,.L1st_cend-16
// xma.hu and xma.lu yield the high and low 64 bits of a*b+c, c being the
// high word of the previous product
136 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
139 xma.lu alo[3]=alo[3],bi,ahi[2]
// The immediate has bits 0-4 and 21 set. Shifted left by 16 this turns on
// p16-p20 and p37 and leaves the other rotating predicates off.
140 mov pr.rot=0x20001f<<16
141 // ------^----- (p40) at first (p23)
142 // ----------^^ p[16:20]=1
// m0 = low 64 bits of alo[4]*n0
145 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
// f0 always reads as +0.0, so this leaves an integer zero in nhi[1]
148 fcvt.fxu.s1 nhi[1]=f0
// Software-pipelined body of the first pass, closed by br.ctop .L1st_ctop.
// The qualifying predicates p16-p23 enable the individual pipeline stages.
// The integer additions are qualified by the carry predicates p40/p42
// instead, and the trailing (p23) comments apparently name the stage they
// belong to. Every br.ctop rotates the rotating registers by one, so the
// carry written to p41/p39 in one iteration is read as p42/p40 in the next.
153 .pred.rel "mutex",p40,p42
154 { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
// high half of alo[2]*bi + ahi[1], the low half follows in the next bundle
155 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
// sum without carry-in (p40) or, in the next bundle, with carry-in (p42)
156 (p40) add n[2]=n[2],a[2] } // (p23)
157 { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
158 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
159 (p42) add n[2]=n[2],a[2],1 };; // (p23)
// move the low product word from the FP register into an integer register
160 { .mfi; (p21) getf.sig a[0]=alo[5]
// high half of nlo[2]*m0 + nhi[1], the low half follows in the next bundle
161 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
// carry-out: p41 is set when the sum wrapped around, p39 when it did not
162 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
163 { .mfi; (p23) st8 [tp_1]=n[2],8
164 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
165 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
166 { .mmb; (p21) getf.sig n[0]=nlo[3]
168 br.ctop.sptk .L1st_ctop };;
// Wind-down of the first pass: add the last high word into n[0] together
// with the pending carry, record the carry-out in topbit, store the sum and
// rewind aptr and nptr by len.
171 { .mmi; getf.sig a[0]=ahi[6] // (p24)
173 add num=-1,num };; // num--
174 { .mmi; .pred.rel "mutex",p40,p42
// n[0] += a[0], plus one when the carry predicate p42 is set
175 (p40) add n[0]=n[0],a[0]
176 (p42) add n[0]=n[0],a[0],1
177 sub aptr=aptr,len };; // rewind
178 { .mmi; .pred.rel "mutex",p40,p42
// p41 = the addition above wrapped around, p39 = it did not
179 (p40) cmp.ltu p41,p39=n[0],a[0]
180 (p42) cmp.leu p41,p39=n[0],a[0]
181 sub nptr=nptr,len };;
182 { .mmi; .pred.rel "mutex",p39,p41
// topbit = 0 without carry-out (p39), 1 with carry-out (p41)
183 (p39) add topbit=r0,r0
184 (p41) add topbit=r0,r0,1
186 { .mmi; st8 [tp_1]=n[0]
// Start-up code for a pass that multiplies by the next word of bp and
// accumulates into tp: preload the leading words of ap and np, fold tp[0]
// into the first product and prime the rotating predicates.
193 { .mmi; ldf8 bi=[bptr],8 // (*bp++)
194 ldf8 ahi[3]=[tptr] // tp[0]
196 { .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
197 ldf8 alo[3]=[r30],16 // ap[1]
199 { .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
// high half of alo[4]*bi + ahi[3], the matching xma.lu follows below
200 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
// branch prediction hint for the loop that ends in br.ctop .Linner_ctop
201 brp.loop.imp .Linner_ctop,.Linner_cend-16
203 { .mfb; ldf8 alo[1]=[r30] // ap[3]
204 xma.lu alo[4]=alo[4],bi,ahi[3]
206 { .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
// high and low halves of alo[3]*bi + ahi[2]
207 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
209 { .mfi; ldf8 nlo[1]=[r31] // np[1]
210 xma.lu alo[3]=alo[3],bi,ahi[2]
// The immediate has bits 0-4, 12 and 21 set. Shifted left by 16 this turns
// on p16-p20, p28 and p37 and leaves the other rotating predicates off.
211 mov pr.rot=0x20101f<<16
212 // ------^----- (p40) at first (p23)
213 // --------^--- (p30) at first (p22)
214 // ----------^^ p[16:20]=1
216 { .mfi; st8 [tptr]=r0 // tp[0] is already accounted
// m0 = low 64 bits of alo[4]*n0
217 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
// f0 always reads as +0.0, so nhi[1] starts out as integer zero
220 fcvt.fxu.s1 nhi[1]=f0
223 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
224 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
225 // in latter case accounts for two-tick pipeline stall, which means
226 // that its performance would be ~20% lower than optimal one. No
227 // attempt was made to address this, because original Itanium is
228 // hardly represented out in the wild...
// Software-pipelined inner loop, closed by br.ctop .Linner_ctop. Besides the
// two multiply-accumulate chains it adds the word loaded through tptr to the
// product word. Two carry chains are kept in predicates: p40/p42 select the
// add of n[2] and a[2] and receive their update through p41/p39, while
// p30/p32 select the add of a[1] and t[1] and receive theirs through
// p31/p29. The register rotation done by br.ctop turns each result
// predicate into the matching input predicate of the next iteration.
231 .pred.rel "mutex",p40,p42
232 .pred.rel "mutex",p30,p32
233 { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
// high half of alo[2]*bi + ahi[1], the low half follows in the next bundle
234 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
235 (p40) add n[2]=n[2],a[2] } // (p23)
236 { .mfi; (p16) nop.m 0
237 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
238 (p42) add n[2]=n[2],a[2],1 };; // (p23)
// move the low product word from the FP register into an integer register
239 { .mfi; (p21) getf.sig a[0]=alo[5]
// p41 = the sum n[2] wrapped around, p39 = it did not
241 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
// fetch the next accumulator word through tptr
242 { .mfi; (p21) ld8 t[0]=[tptr],8
244 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
245 { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
// high half of nlo[2]*m0 + nhi[1], the low half follows in the next bundle
246 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
// a[1] += t[1], without (p30) or with (p32) carry-in
247 (p30) add a[1]=a[1],t[1] } // (p22)
248 { .mfi; (p16) nop.m 0
249 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
250 (p32) add a[1]=a[1],t[1],1 };; // (p22)
251 { .mmi; (p21) getf.sig n[0]=nlo[3]
// p31 = the sum a[1] wrapped around, p29 = it did not
253 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
254 { .mmb; (p23) st8 [tp_1]=n[2],8
255 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
256 br.ctop.sptk .Linner_ctop };;
// Wind-down of one inner pass: add topbit and the pending carries into the
// last words, store the sum, derive the new topbit, rewind aptr and nptr by
// len and branch back to .Louter while p6 is set.
259 { .mmi; getf.sig a[0]=ahi[6] // (p24)
263 { .mmi; .pred.rel "mutex",p31,p33
// a[0] += topbit, plus one when the carry predicate p33 is set
264 (p31) add a[0]=a[0],topbit
265 (p33) add a[0]=a[0],topbit,1
267 { .mfi; .pred.rel "mutex",p31,p33
// p32 = the addition above wrapped around, p30 = it did not
268 (p31) cmp.ltu p32,p30=a[0],topbit
269 (p33) cmp.leu p32,p30=a[0],topbit
271 { .mfi; .pred.rel "mutex",p40,p42
// n[0] += a[0], plus one when the carry predicate p42 is set
272 (p40) add n[0]=n[0],a[0]
273 (p42) add n[0]=n[0],a[0],1
// The two compares in this bundle write the same predicate pair and are
// qualified by p40 and p42, so that is the pair the mutex annotation names.
275 { .mmi; .pred.rel "mutex",p40,p42
276 (p40) cmp.ltu p41,p39=n[0],a[0]
277 (p42) cmp.leu p41,p39=n[0],a[0]
// topbit becomes 1 when the first addition carried out (p32) ...
278 (p32) add topbit=r0,r0,1 }
280 { .mmi; st8 [tp_1]=n[0],8
282 sub aptr=aptr,len };; // rewind
283 { .mmi; sub nptr=nptr,len
// ... or when the second addition carried out (p41)
284 (p41) add topbit=r0,r0,1
// tp_1 = sp + 8. The mnemonic is filled in by the generating Perl script.
286 { .mmb; $ADDP tp_1=8,sp
287 add num=-1,num // num--
288 (p6) br.cond.sptk.many .Louter };;
// Subtraction loop, software-pipelined and closed by br.ctop .Lsub_ctop.
// Each turn loads one word through tptr and one through nptr, subtracts the
// second from the first with borrow and stores the difference through rptr.
// The borrow lives in predicates: p34/p32 written here are read as p35/p33
// after the register rotation done by br.ctop.
291 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
// The immediate has bits 0 and 16 set. Shifted left by 16 this turns on p16
// and p32 and leaves the other rotating predicates off.
294 mov pr.rot=0x10001<<16
295 // ------^---- (p33) at first (p17)
302 .pred.rel "mutex",p33,p35
303 { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
// n[1] = t[1] - n[1], without (p33) or with (p35) borrow-in
305 (p33) sub n[1]=t[1],n[1] } // (p17)
306 { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
308 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
309 { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
// borrow-out: p34 is set when the difference wrapped around, p32 otherwise
310 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
312 { .mib; (p18) nop.m 0
313 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
314 br.ctop.sptk .Lsub_ctop };;
// Fold the final borrow into topbit and use topbit as a mask to pick the
// source pointer for the copy loop without branching.
317 { .mmb; .pred.rel "mutex",p34,p36
// topbit stays as it is under p34 and is decremented by one under p36
// (presumably the no-borrow and borrow outcomes of the subtraction loop)
318 (p34) sub topbit=topbit,r0 // (p19)
319 (p36) sub topbit=topbit,r0,1
// branch prediction hint for the loop that ends in br.ctop .Lcopy_ctop
320 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
322 { .mmb; sub rptr=rptr,len // rewind
// nptr = (tptr AND topbit) OR (rptr AND NOT topbit), that is tptr when
// topbit is all ones and rptr when topbit is zero
325 { .mmi; and aptr=tptr,topbit
326 andcm bptr=rptr,topbit
328 { .mii; or nptr=aptr,bptr
// Copy loop, closed by br.ctop .Lcopy_ctop: words are read through nptr and,
// two stages later, written through rptr. In the same stage the word
// addressed by tptr is overwritten with zero.
333 { .mmb; (p16) ld8 n[0]=[nptr],8
334 (p18) st8 [tptr]=r0,8
336 { .mmb; (p16) nop.m 0
// n[2] is the value loaded into n[0] two rotations earlier
337 (p18) st8 [rptr]=n[2],8
338 br.ctop.sptk .Lcopy_ctop };;
// Successful exit: return 1 to the caller.
341 { .mmi; mov ret0=1 // signal "handled"
// rum clears the user mask bits selected by the immediate, here bit 5
342 rum 1<<5 // clear um.mfh
347 br.ret.sptk.many b0 };;
// Identification string stored as a data object named copyright. stringz
// appends a terminating NUL byte.
349 .type copyright#,\@object
351 stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
# Redirect STDOUT to the file named by the next command-line argument, if
# there is one. The three-argument form of open keeps the file name from
# being interpreted as part of the mode, and a failed open aborts with a
# message instead of silently leaving STDOUT where it was.
354 $output=shift and (open(STDOUT,">",$output) or die "can't open $output: $!");