3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # On 21264 RSA sign performance improves by 70/35/20/15 percent for
11 # 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12 # instructed to '-tune host' code with in-line assembler. Other
13 # benchmarks improve by 15-20%. To anchor it to something else, the
14 # code provides approximately the same performance per GHz as AMD64.
15 # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x difference.
# Perl variables holding the register names (a0..a5) used for the six
# function arguments. Each trailing comment gives the C parameter that the
# register carries, in declaration order.
19 $rp="a0"; # BN_ULONG *rp,
20 $ap="a1"; # const BN_ULONG *ap,
21 $bp="a2"; # const BN_ULONG *bp,
22 $np="a3"; # const BN_ULONG *np,
23 $n0="a4"; # const BN_ULONG *n0,
24 $num="a5"; # int num);
73 ldq $hi0,0($ap) # ap[0] - quadword at offset 0 from $ap, loaded into $hi0
77 ldq $bi,0($bp) # bp[0] - quadword at offset 0 from $bp, loaded into $bi
83 ldq $hi1,0($np) # np[0] - quadword at offset 0 from $np, loaded into $hi1
196 s8addq $j,$np,$nj #U0 # $nj = $np + 8*$j, address of element j of np (U0/L0/L1/U1 tags look like 21264 issue-slot notes - TODO confirm)
200 addq $alo,$hi0,$lo0 #L1 # $lo0 = $alo + $hi0 (mod 2^64)
203 mulq $aj,$bi,$alo #U1 # $alo = low 64 bits of $aj * $bi
204 cmpult $lo0,$hi0,AT #L0 # AT = 1 if the sum in $lo0 wrapped (sum < addend $hi0), else 0
205 addq $nlo,$hi1,$lo1 #L1 # $lo1 = $nlo + $hi1 (mod 2^64)
208 mulq $nj,$m1,$nlo #U1 # $nlo = low 64 bits of $nj * $m1
209 addq $ahi,AT,$hi0 #L0 # $hi0 = $ahi + carry held in AT
210 addq $lo0,$tj,$lo0 #L1 # $lo0 = $lo0 + $tj (mod 2^64)
211 cmpult $lo1,$hi1,v0 #U0 # v0 = 1 if the sum in $lo1 wrapped (sum < addend $hi1), else 0
213 umulh $aj,$bi,$ahi #U1 # $ahi = high 64 bits of unsigned $aj * $bi
214 cmpult $lo0,$tj,AT #L0 # AT = 1 if adding $tj into $lo0 wrapped, else 0
215 addq $lo1,$lo0,$lo1 #L1 # $lo1 = $lo1 + $lo0 (mod 2^64)
216 addq $nhi,v0,$hi1 #U0 # $hi1 = $nhi + carry held in v0
218 umulh $nj,$m1,$nhi #U1 # $nhi = high 64 bits of unsigned $nj * $m1
219 s8addq $j,$ap,$aj #L0 # $aj = $ap + 8*$j, address of element j of ap
220 cmpult $lo1,$lo0,v0 #L1 # v0 = 1 if adding $lo0 into $lo1 wrapped, else 0
221 cmplt $j,$num,$tj #U0 # borrow $tj as a flag, set to 1 if $j < $num (signed compare), else 0
223 addq $hi0,AT,$hi0 #L0 # fold the second carry (AT) into $hi0
224 addq $hi1,v0,$hi1 #U1 # fold the second carry (v0) into $hi1
247 cmpult $lo1,$hi0,$hi1 # $hi1 = 1 if $lo1 < $hi0 (unsigned compare), else 0
253 cmplt $i,$num,$tj # borrow $tj as a flag, set to 1 if $i < $num (signed compare), else 0
279 .Lsub: ldq $lo0,($tp) # $lo0 = quadword at the address held in $tp
284 cmpult $lo1,$lo0,$hi0 # $hi0 = 1 if $lo1 < $lo0 (unsigned compare), else 0
300 .Lzap: stq zero,($tp) # store the zero register, clearing the quadword at the address held in $tp
317 .asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" # identification string emitted into the output