crypto/bn/asm/ia64-mont.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9
  10 # January 2010
  11 #
  12 # "Teaser" Montgomery multiplication module for IA-64. There are
  13 # several possibilities for improvement:
  14 #
  15 # - modulo-scheduling outer loop would eliminate quite a number of
  16 #   stalls after ldf8, xma and getf.sig outside inner loop and
  17 #   improve shorter key performance;
  18 # - shorter vector support [with input vectors being fetched only
  19 #   once] should be added;
  20 # - 2x unroll with help of n0[1] would make the code scalable on
  21 #   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
  22 #   acute interest, because upcoming Tukwila's individual cores are
  23 #   reportedly based on Itanium 2 design;
  24 # - dedicated squaring procedure(?);
  25 #
  26 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
  27 # this module is:
  28 #                   sign    verify    sign/s verify/s
  29 # rsa  512 bits 0.000634s 0.000030s   1577.6  32877.3
  30 # rsa 1024 bits 0.001246s 0.000058s    802.8  17181.5
  31 # rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
  32 # rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
  33 # dsa  512 bits 0.000322s 0.000286s   3106.0   3499.0
  34 # dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
  35 # dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
  36 #
  37 # ... and *without*:
  38 #
  39 # rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
  40 # rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
  41 # rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
  42 # rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
  43 # dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
  44 # dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
  45 # dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
  46 #
  47 # 512-bit RSA sign performance does not improve, because this module
  48 # doesn't handle short enough vectors (yet). Otherwise RSA sign
  49 # improves by 60-30%, less for longer keys, while verify - by 35-13%.
  50 # DSA performance improves by 40-30%.
  51
  52 if ($^O eq "hpux") {                # HP-UX only: pick the mnemonic that $ADDP expands to in the generated code
  53     $ADDP="addp4";                  # default is addp4; presumably for 32-bit pointers (ILP32) - confirm
  54     for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }     # +DD64 or -mlp64 among the arguments selects plain add; was the character class [\+DD|\-mlp], which matched any single one of those characters before 64
  55 } else { $ADDP="add"; }             # every other OS: plain add
  56
  57 $code=<<___;
  58 .explicit       // explicit mode: instruction-group stops are written out by hand
  59 .text
  60 \f
  61 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
  62 //                  const BN_ULONG *bp,const BN_ULONG *np,
  63 //                  const BN_ULONG *n0p,int num);
  64 .global bn_mul_mont#    // returns 1 ("handled"), or 0 ("unhandled") when num is below 5
  65 .proc   bn_mul_mont#
  66 prevsp=r2;      // caller's sp, restored before returning
  67 prevfs=r3;      // ar.pfs as delivered by alloc
  68 prevlc=r10;     // caller's ar.lc, restored before returning
  69 prevpr=r11;     // caller's predicates, restored before returning
  70
  71 rptr=r14;       // walks rp
  72 aptr=r15;       // walks ap
  73 bptr=r16;       // walks bp
  74 nptr=r17;       // walks np
  75 tptr=r18;       // &tp[0]
  76 tp_1=r19;       // &tp[-1]
  77 num=r20;        // zero-extended num; counted down once per pass over bp
  78 len=r21;        // num*8, used to rewind the pointers
  79 topbit=r22;     // carry out of the top word of tp
  80 lc=r23;         // value loaded into ar.lc
  81
  82 bi=f6;          // current word of bp
  83 n0=f7;          // word loaded from n0p
  84 m0=f8;          // low half of (ap[0]*bp[i]+tp[0])*n0 for the current pass
  85
  86 .align  64
  87 bn_mul_mont:
  88         .prologue
  89 { .mmi; .save   ar.pfs,prevfs
  90         alloc   prevfs=ar.pfs,6,2,0,8           // 6 inputs, 2 locals, 0 outputs, 8 rotating
  91         $ADDP   aptr=0,in1                      // aptr = ap
  92         .save   ar.lc,prevlc
  93         mov     prevlc=ar.lc            }       // keep caller's loop counter
  94 { .mmi; .vframe prevsp
  95         mov     prevsp=sp                       // keep caller's sp
  96         $ADDP   bptr=0,in2                      // bptr = bp
  97         cmp4.gt p6,p0=5,in5             };;     // is num large enough?
  98 { .mfi; nop.m   0                               // align loop bodies
  99         nop.f   0
 100         nop.i   0                       }
 101 { .mib; mov     ret0=r0                         // signal "unhandled"
 102         .save   pr,prevpr
 103         mov     prevpr=pr                       // keep caller's predicates
 104 (p6)    br.ret.dpnt.many        b0      };;     // num below 5: return 0 right away
 105
 106         .body
 107         .rotf           alo[6],nlo[4],ahi[8],nhi[6]     // names for rotating FP registers
 108         .rotr           a[3],n[3],t[2]                  // names for rotating general registers, 8 in total
 109
 110 { .mmi; ldf8            bi=[bptr],8             // (*bp++)
 111         ldf8            alo[4]=[aptr],16        // ap[0]
 112         $ADDP           r30=8,in1       };;     // r30 = &ap[1]
 113 { .mmi; ldf8            alo[3]=[r30],16         // ap[1]
 114         ldf8            alo[2]=[aptr],16        // ap[2]
 115         $ADDP           in4=0,in4       };;     // in4 = n0p
 116 { .mmi; ldf8            alo[1]=[r30]            // ap[3]
 117         ldf8            n0=[in4]                // n0
 118         $ADDP           rptr=0,in0              }       // rptr = rp
 119 { .mmi; $ADDP           nptr=0,in3              // nptr = np
 120         mov             r31=16                  // 16 extra bytes on top of the num*8 added below
 121         zxt4            num=in5         };;     // num = in5 zero-extended from 32 bits
 122 { .mmi; ldf8            nlo[2]=[nptr],8         // np[0]
 123         shladd          len=num,3,r0            // len = num*8
 124         shladd          r31=num,3,r31   };;     // r31 = num*8+16
 125 { .mmi; ldf8            nlo[1]=[nptr],8         // np[1]
 126         add             lc=-5,num               // lc = num-5
 127         sub             r31=sp,r31      };;     // r31 = sp-(num*8+16)
 128 { .mfb; and             sp=-16,r31              // alloca
 129         xmpy.hu         ahi[2]=alo[4],bi        // ap[0]*bp[0]
 130         nop.b           0               }
 131 { .mfb; nop.m           0
 132         xmpy.lu         alo[4]=alo[4],bi        // low half of ap[0]*bp[0]
 133         brp.loop.imp    .L1st_ctop,.L1st_cend-16
 134                                         };;
 135 { .mfi; nop.m           0
 136         xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
 137         $ADDP           tp_1=8,sp       }       // tp_1 = sp+8
 138 { .mfi; nop.m           0
 139         xma.lu          alo[3]=alo[3],bi,ahi[2] // low half of ap[1]*bp[0]+ahi[2]
 140         mov             pr.rot=0x20001f<<16
 141                         // ------^----- (p40) at first (p23)
 142                         // ----------^^ p[16:20]=1
 143                                         };;
 144 { .mfi; nop.m           0
 145         xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[0])*n0
 146         mov             ar.lc=lc        }       // loop count register = num-5
 147 { .mfi; nop.m           0
 148         fcvt.fxu.s1     nhi[1]=f0               // nhi[1] = 0, the np[]*m0 chain starts without a high part
 149         mov             ar.ec=8         };;     // epilogue count = 8
 150
 151 .align  32
 152 .L1st_ctop:
 153 .pred.rel       "mutex",p40,p42
 154 { .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
 155         (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]     // high half of alo[2]*bi+ahi[1]
 156         (p40)   add             n[2]=n[2],a[2]          }   // (p23) sum without carry in
 157 { .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)(p16)
 158         (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]     // low half of alo[2]*bi+ahi[1]
 159         (p42)   add             n[2]=n[2],a[2],1        };; // (p23) sum with carry in
 160 { .mfi; (p21)   getf.sig        a[0]=alo[5]                 // low product word to a general register
 161         (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]     // high half of nlo[2]*m0+nhi[1]
 162         (p42)   cmp.leu         p41,p39=n[2],a[2]       }   // (p23) carry out: p41 if set, p39 if not
 163 { .mfi; (p23)   st8             [tp_1]=n[2],8               // store the sum word, advance tp_1
 164         (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]     // low half of nlo[2]*m0+nhi[1]
 165         (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23) carry out: p41 if set, p39 if not
 166 { .mmb; (p21)   getf.sig        n[0]=nlo[3]                 // low product word to a general register
 167         (p16)   nop.m           0
 168         br.ctop.sptk    .L1st_ctop                      };;
 169 .L1st_cend:
 170
 171 { .mmi; getf.sig        a[0]=ahi[6]             // (p24)
 172         getf.sig        n[0]=nhi[4]             // last high half of the np[]*m0 chain
 173         add             num=-1,num      };;     // num--
 174 { .mmi; .pred.rel       "mutex",p40,p42
 175 (p40)   add             n[0]=n[0],a[0]          // top word, no carry in
 176 (p42)   add             n[0]=n[0],a[0],1        // top word, carry in
 177         sub             aptr=aptr,len   };;     // rewind
 178 { .mmi; .pred.rel       "mutex",p40,p42
 179 (p40)   cmp.ltu         p41,p39=n[0],a[0]       // carry out: p41 if set, p39 if not
 180 (p42)   cmp.leu         p41,p39=n[0],a[0]
 181         sub             nptr=nptr,len   };;     // rewind nptr as well
 182 { .mmi; .pred.rel       "mutex",p39,p41
 183 (p39)   add             topbit=r0,r0            // topbit = 0
 184 (p41)   add             topbit=r0,r0,1          // topbit = 1
 185         nop.i           0               }
 186 { .mmi; st8             [tp_1]=n[0]             // store the top word
 187         $ADDP           tptr=16,sp              // tptr = sp+16
 188         $ADDP           tp_1=8,sp       };;     // tp_1 = sp+8
 189 ___
 190 \f\f
 191 $code.=<<___;
 192 .Louter:
 193 { .mmi; ldf8            bi=[bptr],8             // (*bp++)
 194         ldf8            ahi[3]=[tptr]           // tp[0]
 195         add             r30=8,aptr      };;     // r30 = &ap[1]
 196 { .mmi; ldf8            alo[4]=[aptr],16        // ap[0]
 197         ldf8            alo[3]=[r30],16         // ap[1]
 198         add             r31=8,nptr      };;     // r31 = &np[1]
 199 { .mfb; ldf8            alo[2]=[aptr],16        // ap[2]
 200         xma.hu          ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
 201         brp.loop.imp    .Linner_ctop,.Linner_cend-16
 202                                         }
 203 { .mfb; ldf8            alo[1]=[r30]            // ap[3]
 204         xma.lu          alo[4]=alo[4],bi,ahi[3] // low half of ap[0]*bp[i]+tp[0]
 205         clrrrb.pr                       };;     // reset the predicate rotation base
 206 { .mfi; ldf8            nlo[2]=[nptr],16        // np[0]
 207         xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
 208         nop.i           0               }
 209 { .mfi; ldf8            nlo[1]=[r31]            // np[1]
 210         xma.lu          alo[3]=alo[3],bi,ahi[2] // low half of ap[1]*bp[i]+ahi[2]
 211         mov             pr.rot=0x20101f<<16
 212                         // ------^----- (p40) at first (p23)
 213                         // --------^--- (p30) at first (p22)
 214                         // ----------^^ p[16:20]=1
 215                                         };;
 216 { .mfi; st8             [tptr]=r0               // tp[0] is already accounted
 217         xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[i]+tp[0])*n0
 218         mov             ar.lc=lc        }       // loop count register = lc
 219 { .mfi;
 220         fcvt.fxu.s1     nhi[1]=f0               // nhi[1] = 0, the np[]*m0 chain starts without a high part
 221         mov             ar.ec=8         };;     // epilogue count = 8
 222
 223 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
 224 // 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of 7
 225 // in the latter case accounts for a two-tick pipeline stall, which means
 226 // that its performance would be ~20% lower than the optimal one. No
 227 // attempt was made to address this, because the original Itanium is
 228 // hardly represented out in the wild...
 229 .align  32
 230 .Linner_ctop:
 231 .pred.rel       "mutex",p40,p42
 232 .pred.rel       "mutex",p30,p32
 233 { .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
 234         (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]     // high half of alo[2]*bi+ahi[1]
 235         (p40)   add             n[2]=n[2],a[2]          }   // (p23) sum without carry in
 236 { .mfi; (p16)   nop.m           0
 237         (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]     // low half of alo[2]*bi+ahi[1]
 238         (p42)   add             n[2]=n[2],a[2],1        };; // (p23) sum with carry in
 239 { .mfi; (p21)   getf.sig        a[0]=alo[5]                 // low product word to a general register
 240         (p16)   nop.f           0
 241         (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23) carry out: p41 if set, p39 if not
 242 { .mfi; (p21)   ld8             t[0]=[tptr],8               // t=*(tptr++)
 243         (p16)   nop.f           0
 244         (p42)   cmp.leu         p41,p39=n[2],a[2]       };; // (p23) carry out: p41 if set, p39 if not
 245 { .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)
 246         (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]     // high half of nlo[2]*m0+nhi[1]
 247         (p30)   add             a[1]=a[1],t[1]          }   // (p22) add the tp word, no carry in
 248 { .mfi; (p16)   nop.m           0
 249         (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]     // low half of nlo[2]*m0+nhi[1]
 250         (p32)   add             a[1]=a[1],t[1],1        };; // (p22) add the tp word, carry in
 251 { .mmi; (p21)   getf.sig        n[0]=nlo[3]                 // low product word to a general register
 252         (p16)   nop.m           0
 253         (p30)   cmp.ltu         p31,p29=a[1],t[1]       }   // (p22) carry out: p31 if set, p29 if not
 254 { .mmb; (p23)   st8             [tp_1]=n[2],8               // store the sum word, advance tp_1
 255         (p32)   cmp.leu         p31,p29=a[1],t[1]           // (p22) carry out: p31 if set, p29 if not
 256         br.ctop.sptk    .Linner_ctop                    };;
 257 .Linner_cend:
 258
 259 { .mmi; getf.sig        a[0]=ahi[6]             // (p24)
 260         getf.sig        n[0]=nhi[4]             // last high half of the np[]*m0 chain
 261         nop.i           0               };;
 262
 263 { .mmi; .pred.rel       "mutex",p31,p33
 264 (p31)   add             a[0]=a[0],topbit        // fold in the previous pass's top carry
 265 (p33)   add             a[0]=a[0],topbit,1      // same, plus the carry from the tp additions
 266         mov             topbit=r0       };;     // the two adds above still read the old topbit
 267 { .mfi; .pred.rel       "mutex",p31,p33
 268 (p31)   cmp.ltu         p32,p30=a[0],topbit     // topbit is 0 by now, so this never sets p32
 269 (p33)   cmp.leu         p32,p30=a[0],topbit     // sets p32 only when a[0] wrapped to 0
 270                                         }
 271 { .mfi; .pred.rel       "mutex",p40,p42
 272 (p40)   add             n[0]=n[0],a[0]          // top word, no carry in
 273 (p42)   add             n[0]=n[0],a[0],1        // top word, carry in
 274                                         };;
 275 { .mmi; .pred.rel       "mutex",p40,p42         // was p44,p46; the compares below are predicated on p40 and p42
 276 (p40)   cmp.ltu         p41,p39=n[0],a[0]       // carry out: p41 if set, p39 if not
 277 (p42)   cmp.leu         p41,p39=n[0],a[0]
 278 (p32)   add             topbit=r0,r0,1  }       // carry out of a[0]: topbit = 1
 279
 280 { .mmi; st8             [tp_1]=n[0],8           // store the top word
 281         cmp4.ne         p6,p0=1,num             // p6 set while num is not 1, i.e. another pass follows
 282         sub             aptr=aptr,len   };;     // rewind
 283 { .mmi; sub             nptr=nptr,len           // rewind nptr as well
 284 (p41)   add             topbit=r0,r0,1          // carry out of n[0]: topbit = 1
 285         $ADDP           tptr=16,sp      }       // tptr = sp+16
 286 { .mmb; $ADDP           tp_1=8,sp               // tp_1 = sp+8
 287         add             num=-1,num              // num--
 288 (p6)    br.cond.sptk.many       .Louter };;
 289 \f
 290 { .mbb; add             lc=4,lc                 // lc was num-5, now num-1
 291         brp.loop.imp    .Lsub_ctop,.Lsub_cend-16
 292         clrrrb.pr                       };;     // reset the predicate rotation base
 293 { .mii; nop.m           0
 294         mov             pr.rot=0x10001<<16
 295                         // ------^---- (p33) at first (p17)
 296         mov             ar.lc=lc        }       // loop count register = num-1
 297 { .mii; nop.m           0
 298         mov             ar.ec=3                 // epilogue count = 3
 299         nop.i           0               };;
 300
 301 .Lsub_ctop:
 302 .pred.rel       "mutex",p33,p35
 303 { .mfi; (p16)   ld8             t[0]=[tptr],8               // t=*(tp++)
 304         (p16)   nop.f           0
 305         (p33)   sub             n[1]=t[1],n[1]          }   // (p17) t-n, no borrow in
 306 { .mfi; (p16)   ld8             n[0]=[nptr],8               // n=*(np++)
 307         (p16)   nop.f           0
 308         (p35)   sub             n[1]=t[1],n[1],1        };; // (p17) t-n-1, borrow in
 309 { .mib; (p18)   st8             [rptr]=n[2],8               // *(rp++)=r
 310         (p33)   cmp.gtu         p34,p32=n[1],t[1]           // (p17) borrow out: p34 if set, p32 if not
 311         (p18)   nop.b           0                       }
 312 { .mib; (p18)   nop.m           0
 313         (p35)   cmp.geu         p34,p32=n[1],t[1]           // (p17) borrow out: p34 if set, p32 if not
 314         br.ctop.sptk    .Lsub_ctop                      };;
 315 .Lsub_cend:
 316
 317 { .mmb; .pred.rel       "mutex",p34,p36
 318 (p34)   sub     topbit=topbit,r0        // (p19) topbit left as is
 319 (p36)   sub     topbit=topbit,r0,1      // topbit = topbit-1
 320         brp.loop.imp    .Lcopy_ctop,.Lcopy_cend-16
 321                                         }
 322 { .mmb; sub     rptr=rptr,len           // rewind
 323         sub     tptr=tptr,len           // rewind tptr as well
 324         clrrrb.pr                       };;     // reset the predicate rotation base
 325 { .mmi; and     aptr=tptr,topbit        // branch-free select; presumably topbit is 0 or all ones here - confirm
 326         andcm   bptr=rptr,topbit        // bptr = rptr with the topbit bits cleared
 327         mov     pr.rot=1<<16            };;     // only p16 set
 328 { .mii; or      nptr=aptr,bptr          // copy source: tptr when topbit is all ones, rptr when it is 0
 329         mov     ar.lc=lc                // loop count register = lc
 330         mov     ar.ec=3                 };;     // epilogue count = 3
 331
 332 .Lcopy_ctop:
 333 { .mmb; (p16)   ld8     n[0]=[nptr],8   // n=*(nptr++)
 334         (p18)   st8     [tptr]=r0,8     // zero the tp word
 335         (p16)   nop.b   0               }
 336 { .mmb; (p16)   nop.m   0
 337         (p18)   st8     [rptr]=n[2],8   // *(rp++)=n
 338         br.ctop.sptk    .Lcopy_ctop     };;
 339 .Lcopy_cend:
 340
 341 { .mmi; mov             ret0=1                  // signal "handled"
 342         rum             1<<5                    // clear um.mfh
 343         mov             ar.lc=prevlc    }       // restore caller's loop counter
 344 { .mib; .restore        sp
 345         mov             sp=prevsp               // restore caller's sp, dropping the alloca area
 346         mov             pr=prevpr,-2            // restore caller's predicates; mask -2 has every bit set except bit 0
 347         br.ret.sptk.many        b0      };;
 348 .endp   bn_mul_mont
 349 .type   copyright#,\@object
 350 copyright:
 351 stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
 352 ___
 353
 354 $output=shift and (open STDOUT,">",$output or die "can't open $output: $!");      # optional first argument names the file that replaces STDOUT; three-argument open, failure is fatal
 355 print $code;                                            # emit the generated assembly
 356 close STDOUT or die "error closing STDOUT: $!";         # report buffered write errors instead of dropping them