0084213fb551c826f1c81959ea5f6845214b843f
[openssl.git] / crypto / bn / asm / ia64-mont.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # January 2010
11 #
12 # "Teaser" Montgomery multiplication module for IA-64. There are
13 # several possibilities for improvement:
14 #
15 # - modulo-scheduling outer loop would eliminate quite a number of
16 #   stalls after ldf8, xma and getf.sig outside inner loop and
17 #   improve shorter key performance;
18 # - shorter vector support [with input vectors being fetched only
19 #   once] should be added;
20 # - 2x unroll with help of n0[1] would make the code scalable on
21 #   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22 #   acute interest, because upcoming Tukwila's individual cores are
23 #   reportedly based on Itanium 2 design;
24 # - dedicated squaring procedure(?);
25 #
26 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
27 # this module is:
28 #                   sign    verify    sign/s verify/s
29 # rsa  512 bits 0.000634s 0.000030s   1577.6  32877.3
30 # rsa 1024 bits 0.001246s 0.000058s    802.8  17181.5
31 # rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
32 # rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
33 # dsa  512 bits 0.000322s 0.000286s   3106.0   3499.0
34 # dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
35 # dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
36 #
37 # ... and *without*:
38 #
39 # rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
40 # rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
41 # rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
42 # rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
43 # dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
44 # dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
45 # dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
46 #
47 # 512-bit RSA sign performance does not improve, because this module
48 # doesn't handle short enough vectors (yet). Otherwise RSA sign
49 # improves by 60-30%, less for longer keys, while verify - by 35-13%.
50 # DSA performance improves by 40-30%.
51
# Pick the mnemonic used for pointer arithmetic in the generated code.
# On HP-UX the default is "addp4"; it is switched to plain "add" when
# +DD64 or -mlp64 (presumably the compilers' 64-bit data model flags)
# appears among the command-line arguments.  Every other platform uses
# plain "add".
52 if ($^O eq "hpux") {
53     $ADDP="addp4";
    # The alternatives must be a group, not a character class: [...]
    # would match any single listed character followed by "64".
54     for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
55 } else { $ADDP="add"; }
56
# First part of the generated assembly: register aliases, prologue, stack
# scratch allocation and the first pass over ap[]/np[] using bp[0].
57 $code=<<___;
58 .explicit
59 .text
60 \f
61 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
62 //                  const BN_ULONG *bp,const BN_ULONG *np,
63 //                  const BN_ULONG *n0p,int num);
// Returns 0 ("unhandled") when num<5, otherwise 1 ("handled").
64 .global bn_mul_mont#
65 .proc   bn_mul_mont#
66 prevsp=r2;      // caller's sp, restored on exit
67 prevfs=r3;      // ar.pfs value produced by alloc
68 prevlc=r10;     // caller's ar.lc, restored on exit
69 prevpr=r11;     // caller's predicates, restored on exit
70
71 rptr=r14;       // result pointer, copy of in0
72 aptr=r15;       // walks ap[]
73 bptr=r16;       // walks bp[]
74 nptr=r17;       // walks np[]
75 tptr=r18;       // &tp[0]
76 tp_1=r19;       // &tp[-1]
77 num=r20;        // zero-extended num, decremented once per pass
78 len=r21;        // num*8, vector length in bytes
79 topbit=r22;     // carry out of the most significant word of a pass
80 lc=r23;         // value loaded into ar.lc before each pipelined loop
81
82 bi=f6;          // current word of bp[]
83 n0=f7;          // word loaded from n0p
84 m0=f8;          // low 64 bits of (low word of the running sum)*n0
85
86 .align  64
87 bn_mul_mont:
88         .prologue
89 { .mmi; .save   ar.pfs,prevfs
90         alloc   prevfs=ar.pfs,6,2,0,8           // 6 inputs, 2 locals, 0 outputs, 8 rotating
91         $ADDP   aptr=0,in1
92         .save   ar.lc,prevlc
93         mov     prevlc=ar.lc            }
94 { .mmi; .vframe prevsp
95         mov     prevsp=sp
96         $ADDP   bptr=0,in2
97         cmp4.gt p6,p0=5,in5             };;     // p6=(num<5): too short for this code
98 { .mfi; nop.m   0                               // align loop bodies
99         nop.f   0
100         nop.i   0                       }
101 { .mib; mov     ret0=r0                         // signal "unhandled"
102         .save   pr,prevpr
103         mov     prevpr=pr
104 (p6)    br.ret.dpnt.many        b0      };;     // bail out with 0 when num<5
105
106         .body
// Names for the rotating FP and general registers used by the
// pipelined loops below.
107         .rotf           alo[6],nlo[4],ahi[8],nhi[6]
108         .rotr           a[3],n[3],t[2]
109
110 { .mmi; ldf8            bi=[bptr],8             // (*bp++)
111         ldf8            alo[4]=[aptr],16        // ap[0]
112         $ADDP           r30=8,in1       };;     // r30=&ap[1]
113 { .mmi; ldf8            alo[3]=[r30],16         // ap[1]
114         ldf8            alo[2]=[aptr],16        // ap[2]
115         $ADDP           in4=0,in4       };;
116 { .mmi; ldf8            alo[1]=[r30]            // ap[3]
117         ldf8            n0=[in4]                // n0
118         $ADDP           rptr=0,in0              }
119 { .mmi; $ADDP           nptr=0,in3
120         mov             r31=16
121         zxt4            num=in5         };;
122 { .mmi; ldf8            nlo[2]=[nptr],8         // np[0]
123         shladd          len=num,3,r0            // len=num*8
124         shladd          r31=num,3,r31   };;     // r31=num*8+16 bytes of scratch
125 { .mmi; ldf8            nlo[1]=[nptr],8         // np[1]
126         add             lc=-5,num               // lc=num-5
127         sub             r31=sp,r31      };;
128 { .mfb; and             sp=-16,r31              // alloca, rounded down to 16 bytes
129         xmpy.hu         ahi[2]=alo[4],bi        // ap[0]*bp[0]
130         nop.b           0               }
131 { .mfb; nop.m           0
132         xmpy.lu         alo[4]=alo[4],bi
133         brp.loop.imp    .L1st_ctop,.L1st_cend-16
134                                         };;
135 { .mfi; nop.m           0
136         xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
137         $ADDP           tp_1=8,sp       }       // tp_1=sp+8
138 { .mfi; nop.m           0
139         xma.lu          alo[3]=alo[3],bi,ahi[2]
140         mov             pr.rot=0x20001f<<16
141                         // ------^----- (p40) at first (p23)
142                         // ----------^^ p[16:20]=1
143                                         };;
144 { .mfi; nop.m           0
145         xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[0])*n0
146         mov             ar.lc=lc        }
147 { .mfi; nop.m           0
148         fcvt.fxu.s1     nhi[1]=f0               // start with nhi[1]=0 (f0 reads as 0.0)
149         mov             ar.ec=8         };;     // epilogue count for br.ctop
150
// First pass: multiply-accumulate ap[] by bi and np[] by m0, add the
// two streams with carry (p40/p42 select add without/with carry-in)
// and store the sums through tp_1.
151 .align  32
152 .L1st_ctop:
153 .pred.rel       "mutex",p40,p42
154 { .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
155         (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
156         (p40)   add             n[2]=n[2],a[2]          }   // (p23)
157 { .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)(p16)
158         (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
159         (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
160 { .mfi; (p21)   getf.sig        a[0]=alo[5]
161         (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
162         (p42)   cmp.leu         p41,p39=n[2],a[2]       }   // (p23)
163 { .mfi; (p23)   st8             [tp_1]=n[2],8
164         (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
165         (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
166 { .mmb; (p21)   getf.sig        n[0]=nlo[3]
167         (p16)   nop.m           0
168         br.ctop.sptk    .L1st_ctop                      };;
169 .L1st_cend:
170
// Tail of the first pass: add the two final high words, store the sum
// and record the carry out in topbit.
171 { .mmi; getf.sig        a[0]=ahi[6]             // (p24)
172         getf.sig        n[0]=nhi[4]
173         add             num=-1,num      };;     // num--
174 { .mmi; .pred.rel       "mutex",p40,p42
175 (p40)   add             n[0]=n[0],a[0]
176 (p42)   add             n[0]=n[0],a[0],1
177         sub             aptr=aptr,len   };;     // rewind
178 { .mmi; .pred.rel       "mutex",p40,p42
179 (p40)   cmp.ltu         p41,p39=n[0],a[0]
180 (p42)   cmp.leu         p41,p39=n[0],a[0]
181         sub             nptr=nptr,len   };;     // rewind
182 { .mmi; .pred.rel       "mutex",p39,p41
183 (p39)   add             topbit=r0,r0            // no carry out: topbit=0
184 (p41)   add             topbit=r0,r0,1          // carry out: topbit=1
185         nop.i           0               }
186 { .mmi; st8             [tp_1]=n[0]
187         $ADDP           tptr=16,sp              // tptr=sp+16
188         $ADDP           tp_1=8,sp       };;     // tp_1=sp+8
189 ___
190 \f\f
# Second part of the generated assembly: the outer loop over the remaining
# bp[] words, the final subtraction of np[], the copy into rp[] and the
# epilogue.
191 $code.=<<___;
// One pass per remaining bp[] word.  Previous sums are read through
// tptr and new sums are written through tp_1, which starts one word
// below tptr.
192 .Louter:
193 { .mmi; ldf8            bi=[bptr],8             // (*bp++)
194         ldf8            ahi[3]=[tptr]           // tp[0]
195         add             r30=8,aptr      };;     // r30=&ap[1]
196 { .mmi; ldf8            alo[4]=[aptr],16        // ap[0]
197         ldf8            alo[3]=[r30],16         // ap[1]
198         add             r31=8,nptr      };;     // r31=&np[1]
199 { .mfb; ldf8            alo[2]=[aptr],16        // ap[2]
200         xma.hu          ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
201         brp.loop.imp    .Linner_ctop,.Linner_cend-16
202                                         }
203 { .mfb; ldf8            alo[1]=[r30]            // ap[3]
204         xma.lu          alo[4]=alo[4],bi,ahi[3]
205         clrrrb.pr                       };;     // reset predicate rotation base
206 { .mfi; ldf8            nlo[2]=[nptr],16        // np[0]
207         xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
208         nop.i           0               }
209 { .mfi; ldf8            nlo[1]=[r31]            // np[1]
210         xma.lu          alo[3]=alo[3],bi,ahi[2]
211         mov             pr.rot=0x20101f<<16
212                         // ------^----- (p40) at first (p23)
213                         // --------^--- (p30) at first (p22)
214                         // ----------^^ p[16:20]=1
215                                         };;
216 { .mfi; st8             [tptr]=r0               // tp[0] is already accounted
217         xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[i]+tp[0])*n0
218         mov             ar.lc=lc        }
219 { .mfi;
220         fcvt.fxu.s1     nhi[1]=f0               // start with nhi[1]=0 (f0 reads as 0.0)
221         mov             ar.ec=8         };;     // epilogue count for br.ctop
222
223 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
224 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
225 // in latter case accounts for two-tick pipeline stall, which means
226 // that its performance would be ~20% lower than optimal one. No
227 // attempt was made to address this, because original Itanium is
228 // hardly represented out in the wild...
// Two carry chains run side by side: p30/p32 for a[]+t[] (product plus
// previous tp[] word) and p40/p42 for n[]+a[].
229 .align  32
230 .Linner_ctop:
231 .pred.rel       "mutex",p40,p42
232 .pred.rel       "mutex",p30,p32
233 { .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
234         (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
235         (p40)   add             n[2]=n[2],a[2]          }   // (p23)
236 { .mfi; (p16)   nop.m           0
237         (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
238         (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
239 { .mfi; (p21)   getf.sig        a[0]=alo[5]
240         (p16)   nop.f           0
241         (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
242 { .mfi; (p21)   ld8             t[0]=[tptr],8
243         (p16)   nop.f           0
244         (p42)   cmp.leu         p41,p39=n[2],a[2]       };; // (p23)
245 { .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)
246         (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
247         (p30)   add             a[1]=a[1],t[1]          }   // (p22)
248 { .mfi; (p16)   nop.m           0
249         (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
250         (p32)   add             a[1]=a[1],t[1],1        };; // (p22)
251 { .mmi; (p21)   getf.sig        n[0]=nlo[3]
252         (p16)   nop.m           0
253         (p30)   cmp.ltu         p31,p29=a[1],t[1]       }   // (p22)
254 { .mmb; (p23)   st8             [tp_1]=n[2],8
255         (p32)   cmp.leu         p31,p29=a[1],t[1]           // (p22)
256         br.ctop.sptk    .Linner_ctop                    };;
257 .Linner_cend:
258
// Tail of the pass: fold the previous topbit into the final high word,
// add the np[] high word, store the sum and compute the new topbit.
259 { .mmi; getf.sig        a[0]=ahi[6]             // (p24)
260         getf.sig        n[0]=nhi[4]
261         nop.i           0               };;
262
263 { .mmi; .pred.rel       "mutex",p31,p33
264 (p31)   add             a[0]=a[0],topbit
265 (p33)   add             a[0]=a[0],topbit,1
266         mov             topbit=r0       };;     // old topbit consumed; rebuilt below
267 { .mfi; .pred.rel       "mutex",p31,p33
268 (p31)   cmp.ltu         p32,p30=a[0],topbit     // NOTE(review): topbit was zeroed in the previous group,
269 (p33)   cmp.leu         p32,p30=a[0],topbit     // so these seem to compare against 0 - confirm carry detection
270                                         }
271 { .mfi; .pred.rel       "mutex",p40,p42
272 (p40)   add             n[0]=n[0],a[0]
273 (p42)   add             n[0]=n[0],a[0],1
274                                         };;
275 { .mmi; .pred.rel       "mutex",p44,p46         // NOTE(review): names p44,p46 but the compares use p40,p42 - confirm
276 (p40)   cmp.ltu         p41,p39=n[0],a[0]
277 (p42)   cmp.leu         p41,p39=n[0],a[0]
278 (p32)   add             topbit=r0,r0,1  }       // carry from a[0]+topbit
279
280 { .mmi; st8             [tp_1]=n[0],8
281         cmp4.ne         p6,p0=1,num             // p6=(num!=1): more bp[] words left
282         sub             aptr=aptr,len   };;     // rewind
283 { .mmi; sub             nptr=nptr,len           // rewind
284 (p41)   add             topbit=r0,r0,1          // carry from n[0]+a[0]
285         $ADDP           tptr=16,sp      }       // tptr=sp+16
286 { .mmb; $ADDP           tp_1=8,sp               // tp_1=sp+8
287         add             num=-1,num              // num--
288 (p6)    br.cond.sptk.many       .Louter };;
289 \f
// Final subtraction: rp[] = tp[] - np[] word by word, with the borrow
// carried in rotating predicates.
290 { .mbb; add             lc=4,lc                 // lc=(original num)-1
291         brp.loop.imp    .Lsub_ctop,.Lsub_cend-16
292         clrrrb.pr                       };;     // reset predicate rotation base
293 { .mii; nop.m           0
294         mov             pr.rot=0x10001<<16
295                         // ------^---- (p33) at first (p17)
296         mov             ar.lc=lc        }
297 { .mii; nop.m           0
298         mov             ar.ec=3
299         nop.i           0               };;
300
301 .Lsub_ctop:
302 .pred.rel       "mutex",p33,p35
303 { .mfi; (p16)   ld8             t[0]=[tptr],8               // t=*(tp++)
304         (p16)   nop.f           0
305         (p33)   sub             n[1]=t[1],n[1]          }   // (p17)
306 { .mfi; (p16)   ld8             n[0]=[nptr],8               // n=*(np++)
307         (p16)   nop.f           0
308         (p35)   sub             n[1]=t[1],n[1],1        };; // (p17)
309 { .mib; (p18)   st8             [rptr]=n[2],8               // *(rp++)=r
310         (p33)   cmp.gtu         p34,p32=n[1],t[1]           // (p17)
311         (p18)   nop.b           0                       }
312 { .mib; (p18)   nop.m           0
313         (p35)   cmp.geu         p34,p32=n[1],t[1]           // (p17)
314         br.ctop.sptk    .Lsub_ctop                      };;
315 .Lsub_cend:
316
// Fold the final borrow into topbit, then use topbit as a mask to pick
// the copy source without branching.
317 { .mmb; .pred.rel       "mutex",p34,p36
318 (p34)   sub     topbit=topbit,r0        // (p19)
319 (p36)   sub     topbit=topbit,r0,1      // topbit-=1
320         brp.loop.imp    .Lcopy_ctop,.Lcopy_cend-16
321                                         }
322 { .mmb; sub     rptr=rptr,len           // rewind
323         sub     tptr=tptr,len           // rewind
324         clrrrb.pr                       };;
325 { .mmi; and     aptr=tptr,topbit        // aptr=tptr & topbit
326         andcm   bptr=rptr,topbit        // bptr=rptr & ~topbit
327         mov     pr.rot=1<<16            };;
328 { .mii; or      nptr=aptr,bptr          // presumably topbit is 0 or all-ones here, giving rptr or tptr
329         mov     ar.lc=lc
330         mov     ar.ec=3                 };;
331
// Copy the selected vector into rp[] and overwrite tp[] with zeros.
332 .Lcopy_ctop:
333 { .mmb; (p16)   ld8     n[0]=[nptr],8
334         (p18)   st8     [tptr]=r0,8
335         (p16)   nop.b   0               }
336 { .mmb; (p16)   nop.m   0
337         (p18)   st8     [rptr]=n[2],8
338         br.ctop.sptk    .Lcopy_ctop     };;
339 .Lcopy_cend:
340
341 { .mmi; mov             ret0=1                  // signal "handled"
342         rum             1<<5                    // clear um.mfh
343         mov             ar.lc=prevlc    }       // restore caller's ar.lc
344 { .mib; .restore        sp
345         mov             sp=prevsp               // release the scratch area
346         mov             pr=prevpr,-2            // restore caller's predicates
347         br.ret.sptk.many        b0      };;
348 .endp   bn_mul_mont
349 .type   copyright#,\@object
350 copyright:
351 stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
352 ___
353
# Emit the generated assembly: to the file named by the first remaining
# command-line argument when one is given, otherwise to the inherited
# STDOUT.  Failing to open or to flush/close the output is fatal, so a
# missing or truncated file is never mistaken for a good one.
354 $output=shift and (open(STDOUT,">",$output) or die "can't open $output: $!");
355 print $code;
356 close STDOUT or die "error closing STDOUT: $!";