fc539d52644548b13840c1ddbfc0b4d1b67a1a3c
[openssl.git] / crypto / bn / asm / ia64-mont.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # January 2010
11 #
12 # "Teaser" Montgomery multiplication module for IA-64. There are
13 # several possibilities for improvement:
14 #
15 # - modulo-scheduling outer loop would eliminate quite a number of
16 #   stalls after ldf8, xma and getf.sig outside inner loop and
17 #   improve shorter key performance;
18 # - shorter vector support [with input vectors being fetched only
19 #   once] should be added;
20 # - 2x unroll with help of n0[1] would make the code scalable on
21 #   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22 #   acute interest, because upcoming Tukwila's individual cores are
23 #   reportedly based on Itanium 2 design;
24 # - dedicated squaring procedure(?);
25 #
26 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
27 # this module is:
28 #                   sign    verify    sign/s verify/s
29 # rsa  512 bits 0.000634s 0.000030s   1577.6  32877.3
30 # rsa 1024 bits 0.001246s 0.000058s    802.8  17181.5
31 # rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
32 # rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
33 # dsa  512 bits 0.000322s 0.000286s   3106.0   3499.0
34 # dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
35 # dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
36 #
37 # ... and *without*:
38 #
39 # rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
40 # rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
41 # rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
42 # rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
43 # dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
44 # dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
45 # dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
46 #
47 # 512-bit RSA sign performance does not improve, because this module
48 # doesn't handle short enough vectors (yet). Otherwise RSA sign
49 # improves by 60-30%, less for longer keys, while verify - by 35-13%.
50 # DSA performance improves by 40-30%.
51
# Select the instruction used to copy pointer arguments into working
# registers; it is interpolated into the assembly below as $ADDP.
# On HP-UX the default is "addp4"; it is replaced by plain "add" as soon
# as any command-line argument requests the 64-bit data model ("+DD64" or
# "-mlp64").  Every other platform always uses plain "add".
if ($^O eq "hpux") {
    $ADDP="addp4";
    # Alternation, not a character class: the previous /[\+DD|\-mlp]64/
    # matched any ONE of the characters + D | - m l p followed by "64",
    # which also fired on unrelated arguments that merely contain e.g.
    # "l64" or "D64".
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }
56
57 $code=<<___;
58 .explicit
59 .text
60 \f
// bn_mul_mont: Montgomery multiplication entry point.
// The six arguments of the prototype below arrive in in0..in5
// (rp, ap, bp, np, n0p, num). ret0 is 0 when the call is declined
// (num < 5, nothing is stored) and 1 once the result has been stored
// through rp.
61 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
62 //                  const BN_ULONG *bp,const BN_ULONG *np,
63 //                  const BN_ULONG *n0p,int num);                       
64 .global bn_mul_mont#
65 .proc   bn_mul_mont#
// Scratch registers holding caller state that is restored on exit.
66 prevsp=r2;      // sp on entry
67 prevfs=r3;      // ar.pfs as returned by alloc
68 prevlc=r10;     // ar.lc on entry
69 prevpr=r11;     // predicate registers on entry
70
// Integer working registers.
71 rptr=r14;       // result pointer (from in0)
72 aptr=r15;       // walks ap[]
73 bptr=r16;       // walks bp[]
74 nptr=r17;       // walks np[]
75 tptr=r18;       // &tp[0]
76 tp_1=r19;       // &tp[-1]
77 num=r20;        // zero-extended num, counted down once per pass
78 len=r21;        // num*8, byte length of one vector
79 topbit=r22;     // carry out of the top word, kept between passes
80 lc=r23;         // value loaded into ar.lc for the pipelined loops
81
// Floating-point registers used as 64-bit integer multiplier operands.
82 bi=f6;          // current word of bp[]
83 n0=f7;          // word loaded through n0p
84 m0=f8;          // low word of (first partial product)*n0 for this pass
85
86 .align  64
87 bn_mul_mont:
88         .prologue
// Prologue: allocate the register frame (6 inputs, 2 locals, no outputs,
// 8 rotating registers), copy ar.pfs/ar.lc/sp/pr into
// prevfs/prevlc/prevsp/prevpr, and move ap and bp into aptr/bptr using
// the ADDP instruction picked by the Perl preamble.
// p6 is set when 5 > num; the routine then returns immediately with
// ret0 = 0, before sp or any memory has been modified.
89 { .mmi; .save   ar.pfs,prevfs
90         alloc   prevfs=ar.pfs,6,2,0,8
91         $ADDP   aptr=0,in1
92         .save   ar.lc,prevlc
93         mov     prevlc=ar.lc            }
94 { .mmi; .vframe prevsp
95         mov     prevsp=sp
96         $ADDP   bptr=0,in2
97         cmp4.gt p6,p0=5,in5             };;     // is num large enough?
98 { .mfi; nop.m   0                               // align loop bodies
99         nop.f   0
100         nop.i   0                       }
101 { .mib; mov     ret0=r0                         // signal "unhandled"
102         .save   pr,prevpr
103         mov     prevpr=pr
104 (p6)    br.ret.dpnt.many        b0      };;
105
106         .body
// Rotating register names used by the pipelined loops.
107         .rotf           alo[6],nlo[4],ahi[8],nhi[6]
108         .rotr           a[3],n[3],t[2]
109
// Preamble of the first pass (bp[0]); this pass never reads tp[].
// It preloads bp[0], ap[0..3], np[0..1] and the n0 word, derives num/len/lc,
// carves num*8+16 bytes off the stack (rounded down to a 16-byte boundary)
// for the temporary vector, computes the first two ap[]*bp[0] partial
// products and m0, and programs pr.rot/ar.lc/ar.ec for the loop below.
110 { .mmi; ldf8            bi=[bptr],8             // (*bp++)
111         ldf8            alo[4]=[aptr],16        // ap[0]
112         $ADDP           r30=8,in1       };;
113 { .mmi; ldf8            alo[3]=[r30],16         // ap[1]
114         ldf8            alo[2]=[aptr],16        // ap[2]
115         $ADDP           in4=0,in4       };;
116 { .mmi; ldf8            alo[1]=[r30]            // ap[3]
117         ldf8            n0=[in4]                // n0
118         $ADDP           rptr=0,in0              }
119 { .mmi; $ADDP           nptr=0,in3
120         mov             r31=16
121         zxt4            num=in5         };;
122 { .mmi; ldf8            nlo[2]=[nptr],8         // np[0]
123         shladd          len=num,3,r0            // len = num*8
124         shladd          r31=num,3,r31   };;     // r31 = num*8+16
125 { .mmi; ldf8            nlo[1]=[nptr],8         // np[1]
126         add             lc=-5,num               // lc = num-5
127         sub             r31=sp,r31      };;
128 { .mfb; and             sp=-16,r31              // alloca
129         xmpy.hu         ahi[2]=alo[4],bi        // ap[0]*bp[0]
130         nop.b           0               }
131 { .mfb; nop.m           0
132         xmpy.lu         alo[4]=alo[4],bi
133         brp.loop.imp    .L1st_ctop,.L1st_cend-16
134                                         };;
135 { .mfi; nop.m           0
136         xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
137         add             tp_1=8,sp       }       // stores start at sp+8
138 { .mfi; nop.m           0
139         xma.lu          alo[3]=alo[3],bi,ahi[2]
140         mov             pr.rot=0x20001f<<16
141                         // ------^----- (p40) at first (p23)
142                         // ----------^^ p[16:20]=1
143                                         };;
144 { .mfi; nop.m           0
145         xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[0])*n0
146         mov             ar.lc=lc        }
// f0 reads as +0.0, so the np[]*m0 high-word chain starts from zero.
147 { .mfi; nop.m           0
148         fcvt.fxu.s1     nhi[1]=f0
149         mov             ar.ec=8         };;
150
// First-pass loop, software-pipelined with br.ctop (ar.lc and ar.ec are
// set just before it). Stage predicates as used below:
//   p16  fetch the next ap[] word
//   p18  ap[j]*bi + previous high word (xma.hu/xma.lu); fetch next np[] word
//   p20  np[j]*m0 + previous high word
//   p21  move both low words to integer registers a[0] and n[0]
//   p23  add them with carry-in and store the sum through tp_1
// p40/p42 select the p23 add without/with carry-in; the matching compare
// writes p41 (carry) and p39 (no carry), which rotate into p42/p40.
151 .align  32
152 .L1st_ctop:
153 .pred.rel       "mutex",p40,p42
154 { .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
155         (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
156         (p40)   add             n[2]=n[2],a[2]          }   // (p23)
157 { .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)(p16)
158         (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
159         (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
160 { .mfi; (p21)   getf.sig        a[0]=alo[5]
161         (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
162         (p42)   cmp.leu         p41,p39=n[2],a[2]       }   // (p23)
163 { .mfi; (p23)   st8             [tp_1]=n[2],8
164         (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
165         (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
166 { .mmb; (p21)   getf.sig        n[0]=nlo[3]
167         (p16)   nop.m           0
168         br.ctop.sptk    .L1st_ctop                      };;
169 .L1st_cend:
170
// Drain of the first pass: the two remaining high words (ahi[6], nhi[4])
// are added together with the pending carry (p40/p42) and stored as the
// last word through tp_1; the carry out of that add becomes topbit (0 or 1).
// aptr and nptr are rewound by len bytes, tptr/tp_1 are pointed at
// sp+16/sp+8 for the next pass, and num is decremented for the pass done.
171 { .mmi; getf.sig        a[0]=ahi[6]             // (p24)
172         getf.sig        n[0]=nhi[4]
173         add             num=-1,num      };;     // num--
174 { .mmi; .pred.rel       "mutex",p40,p42
175 (p40)   add             n[0]=n[0],a[0]
176 (p42)   add             n[0]=n[0],a[0],1
177         sub             aptr=aptr,len   };;     // rewind
178 { .mmi; .pred.rel       "mutex",p40,p42
179 (p40)   cmp.ltu         p41,p39=n[0],a[0]
180 (p42)   cmp.leu         p41,p39=n[0],a[0]
181         sub             nptr=nptr,len   };;
182 { .mmi; .pred.rel       "mutex",p39,p41
183 (p39)   add             topbit=r0,r0
184 (p41)   add             topbit=r0,r0,1
185         nop.i           0               }       
186 { .mmi; st8             [tp_1]=n[0]
187         add             tptr=16,sp
188         add             tp_1=8,sp       };;
189 \f
190 \f
// One outer pass per remaining bp[] word. The preamble mirrors the first
// pass with these differences visible below: tp[0] is loaded and folded
// into ap[0]*bp[i] before m0 is derived, tp[0] is then overwritten with
// zero, clrrrb.pr is issued before the predicates are re-seeded, and
// pr.rot sets one more bit for the tp[] carry chain (p30/p32) that the
// inner loop adds.
191 .Louter:
192 { .mmi; ldf8            bi=[bptr],8             // (*bp++)
193         ldf8            ahi[3]=[tptr]           // tp[0]
194         add             r30=8,aptr      };;
195 { .mmi; ldf8            alo[4]=[aptr],16        // ap[0]
196         ldf8            alo[3]=[r30],16         // ap[1]
197         add             r31=8,nptr      };;
198 { .mfb; ldf8            alo[2]=[aptr],16        // ap[2]
199         xma.hu          ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
200         brp.loop.imp    .Linner_ctop,.Linner_cend-16
201                                         }
202 { .mfb; ldf8            alo[1]=[r30]            // ap[3]
203         xma.lu          alo[4]=alo[4],bi,ahi[3]
204         clrrrb.pr                       };;
205 { .mfi; ldf8            nlo[2]=[nptr],16        // np[0]
206         xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
207         nop.i           0               }
208 { .mfi; ldf8            nlo[1]=[r31]            // np[1]
209         xma.lu          alo[3]=alo[3],bi,ahi[2]
210         mov             pr.rot=0x20101f<<16
211                         // ------^----- (p40) at first (p23)
212                         // --------^--- (p30) at first (p22)
213                         // ----------^^ p[16:20]=1
214                                         };;
215 { .mfi; st8             [tptr]=r0               // tp[0] is already accounted
216         xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[i]+tp[0])*n0
217         mov             ar.lc=lc        }
// As in the first pass, the np[]*m0 high-word chain starts from zero.
218 { .mfi;
219         fcvt.fxu.s1     nhi[1]=f0
220         mov             ar.ec=8         };;
221
222 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
223 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
224 // in latter case accounts for two-tick pipeline stall, which means
225 // that its performance would be ~20% lower than optimal one. No
226 // attempt was made to address this, because original Itanium is
227 // hardly represented out in the wild...
//
// Stage predicates as used below (same skeleton as the first-pass loop
// plus one extra stage that folds in the previous tp[] word):
//   p16  fetch the next ap[] word
//   p18  ap[j]*bi + previous high word; fetch next np[] word
//   p20  np[j]*m0 + previous high word
//   p21  move both low words to a[0]/n[0]; load the next tp[] word into t[0]
//   p22  a[1] += t[1] with carry-in chosen by p30/p32; compare writes p31/p29
//   p23  n[2] += a[2] with carry-in chosen by p40/p42; compare writes
//        p41/p39; the sum is stored through tp_1
228 .align  32
229 .Linner_ctop:
230 .pred.rel       "mutex",p40,p42
231 .pred.rel       "mutex",p30,p32
232 { .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
233         (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
234         (p40)   add             n[2]=n[2],a[2]          }   // (p23)
235 { .mfi; (p16)   nop.m           0
236         (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
237         (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
238 { .mfi; (p21)   getf.sig        a[0]=alo[5]
239         (p16)   nop.f           0
240         (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
241 { .mfi; (p21)   ld8             t[0]=[tptr],8
242         (p16)   nop.f           0
243         (p42)   cmp.leu         p41,p39=n[2],a[2]       };; // (p23)
244 { .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)
245         (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
246         (p30)   add             a[1]=a[1],t[1]          }   // (p22)
247 { .mfi; (p16)   nop.m           0
248         (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
249         (p32)   add             a[1]=a[1],t[1],1        };; // (p22)
250 { .mmi; (p21)   getf.sig        n[0]=nlo[3]
251         (p16)   nop.m           0
252         (p30)   cmp.ltu         p31,p29=a[1],t[1]       }   // (p22)
253 { .mmb; (p23)   st8             [tp_1]=n[2],8
254         (p32)   cmp.leu         p31,p29=a[1],t[1]           // (p22)
255         br.ctop.sptk    .Linner_ctop                    };;
256 .Linner_cend:
257
// Drain of an outer pass. The remaining high words (ahi[6], nhi[4]) are
// combined with the previous topbit and both pending carries (p31/p33 from
// the tp[] chain, p40/p42 from the np[] chain); the sum is stored as the
// last tp[] word and topbit becomes 1 if either add carried out.
// Pointers are then rewound and the loop repeats while num, tested before
// its decrement, is not 1.
258 { .mmi; getf.sig        a[0]=ahi[6]             // (p24)
259         getf.sig        n[0]=nhi[4]
260         nop.i           0               };;
261
262 { .mmi; .pred.rel       "mutex",p31,p33
263 (p31)   add             a[0]=a[0],topbit
264 (p33)   add             a[0]=a[0],topbit,1
265         mov             topbit=r0       };;
// NOTE(review): topbit was cleared by the group above, so the two compares
// below test a[0] against 0 rather than against the value that was added.
// Confirm this is the intended carry check.
266 { .mfi; .pred.rel       "mutex",p31,p33
267 (p31)   cmp.ltu         p32,p30=a[0],topbit
268 (p33)   cmp.leu         p32,p30=a[0],topbit
269                                         }
270 { .mfi; .pred.rel       "mutex",p40,p42
271 (p40)   add             n[0]=n[0],a[0]
272 (p42)   add             n[0]=n[0],a[0],1
273                                         };;
// NOTE(review): the annotation names p44/p46 while the compares below are
// predicated on p40/p42. Confirm whether "mutex",p40,p42 was intended.
274 { .mmi; .pred.rel       "mutex",p44,p46
275 (p40)   cmp.ltu         p41,p39=n[0],a[0]
276 (p42)   cmp.leu         p41,p39=n[0],a[0]
277 (p32)   add             topbit=r0,r0,1  }       // carry from a[0] add
278
279 { .mmi; st8             [tp_1]=n[0],8
280         cmp4.ne         p6,p0=1,num             // p6: more passes remain
281         sub             aptr=aptr,len   };;     // rewind
282 { .mmi; sub             nptr=nptr,len
283 (p41)   add             topbit=r0,r0,1          // carry from n[0] add
284         add             tptr=16,sp      }
285 { .mmb; add             tp_1=8,sp
286         add             num=-1,num              // num--
287 (p6)    br.cond.sptk.many       .Louter };;
288 \f
// Final subtraction: each iteration loads one tp[] word and one np[] word,
// computes tp - np with borrow and stores the difference through rptr.
// lc was num-5 for the multiply loops; adding 4 makes ar.lc = num-1.
// Stage predicates: p16 loads, p17 subtracts (p33 without, p35 with
// borrow-in), p18 stores. The compares write p34 (borrow) and p32 (no
// borrow), which rotate into p35/p33 for the next word.
289 { .mbb; add             lc=4,lc
290         brp.loop.imp    .Lsub_ctop,.Lsub_cend-16
291         clrrrb.pr                       };;
292 { .mii; nop.m           0
293         mov             pr.rot=0x10001<<16
294                         // ------^---- (p33) at first (p17)
295         mov             ar.lc=lc        }
296 { .mii; nop.m           0
297         mov             ar.ec=3
298         nop.i           0               };;
299
300 .Lsub_ctop:
301 .pred.rel       "mutex",p33,p35
302 { .mfi; (p16)   ld8             t[0]=[tptr],8               // t=*(tp++)
303         (p16)   nop.f           0
304         (p33)   sub             n[1]=t[1],n[1]          }   // (p17)
305 { .mfi; (p16)   ld8             n[0]=[nptr],8               // n=*(np++)
306         (p16)   nop.f           0
307         (p35)   sub             n[1]=t[1],n[1],1        };; // (p17)
308 { .mib; (p18)   st8             [rptr]=n[2],8               // *(rp++)=r
309         (p33)   cmp.gtu         p34,p32=n[1],t[1]           // (p17)
310         (p18)   nop.b           0                       }
311 { .mib; (p18)   nop.m           0
312         (p35)   cmp.geu         p34,p32=n[1],t[1]           // (p17)
313         br.ctop.sptk    .Lsub_ctop                      };;
314 .Lsub_cend:
315
// Branch-free choice of the final result. topbit is first reduced by the
// borrow state left by the subtraction loop (p34: subtract 0, p36: subtract
// 1). It is then used as a bit mask:
//   nptr = (tptr AND topbit) OR (rptr AND NOT topbit)
// so an all-ones topbit selects tp[] and a zero topbit selects the
// difference already stored in rp[].
// NOTE(review): this relies on topbit being 0 or all-ones here; a value of
// 1 would blend the two addresses. Presumably excluded by the bounds of
// the reduction, confirm.
// The copy loop then moves one word per iteration from the selected buffer
// to rp[] while overwriting tp[] with zeros.
316 { .mmb; .pred.rel       "mutex",p34,p36
317 (p34)   sub     topbit=topbit,r0        // (p19)
318 (p36)   sub     topbit=topbit,r0,1
319         brp.loop.imp    .Lcopy_ctop,.Lcopy_cend-16
320                                         }
321 { .mmb; sub     rptr=rptr,len           // rewind
322         sub     tptr=tptr,len
323         clrrrb.pr                       };;
324 { .mmi; and     aptr=tptr,topbit
325         andcm   bptr=rptr,topbit
326         mov     pr.rot=1<<16            };;
327 { .mii; or      nptr=aptr,bptr
328         mov     ar.lc=lc
329         mov     ar.ec=3                 };;
330
// p16 loads a word from the selected source; two stages later p18 wipes the
// tp[] word and stores the loaded value to rp[].
331 .Lcopy_ctop:
332 { .mmb; (p16)   ld8     n[0]=[nptr],8
333         (p18)   st8     [tptr]=r0,8
334         (p16)   nop.b   0               }
335 { .mmb; (p16)   nop.m   0
336         (p18)   st8     [rptr]=n[2],8
337         br.ctop.sptk    .Lcopy_ctop     };;
338 .Lcopy_cend:
339
// Epilogue: report success in ret0, then restore ar.lc, sp (which releases
// the stack area carved out for tp[]) and the predicate registers from the
// copies taken in the prologue before returning through b0.
340 { .mmi; mov             ret0=1                  // signal "handled"
341         rum             1<<5                    // clear um.mfh
342         mov             ar.lc=prevlc    }
343 { .mib; .restore        sp
344         mov             sp=prevsp
345         mov             pr=prevpr,-2            // mask -2: all but p0
346         br.ret.sptk.many        b0      };;
347 .endp   bn_mul_mont
// Identification string emitted into the object file.
348 .type   copyright#,\@object
349 copyright:
350 stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
351 ___
352
# Emit the generated assembly.  The first command-line argument, when
# present (and true), names the output file; otherwise the text goes to
# the inherited STDOUT.  Failing to open or to flush the output is fatal,
# so a build can no longer continue with a missing or truncated file, nor
# have the assembly silently land on the original STDOUT.
if ($output=shift) {
    open STDOUT,">$output" or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";