3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
11 # Montgomery multiplication routine for x86_64. While it gives a modest
12 # 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
13 # than twice as fast (>2x). The most common case, rsa1024 sign, is
14 # improved by a respectable 50%. It remains to be seen whether loop
15 # unrolling and a dedicated squaring routine can provide further improvement...
# Re-open STDOUT as a pipe into the x86_64-xlate.pl translator, started
# with the same perl binary that runs this script; the value of the
# output variable is handed to the translator as its argument. Anything
# this script prints afterwards therefore goes through the translator.
18 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
# Perl names for the registers referenced by the generated code. The
# trailing comments on the first six assignments spell out, argument by
# argument, the C prototype of the routine.
# NOTE(review): %rdi,%rsi,%rdx,%rcx,%r8,%r9 is the System V AMD64
# integer-argument order -- confirm how the translator maps this for
# other calling conventions.
21 $rp="%rdi"; # BN_ULONG *rp,
22 $ap="%rsi"; # const BN_ULONG *ap,
23 $bp="%rdx"; # const BN_ULONG *bp,
24 $np="%rcx"; # const BN_ULONG *np,
25 $n0="%r8"; # const BN_ULONG *n0,
26 $num="%r9"; # int num);
# bp is deliberately assigned twice: from this line on the name refers
# to %r12, and the generated code copies the incoming %rdx into it
# (presumably because mulq overwrites %rdx with the high product word).
29 $bp="%r12"; # reassign $bp
# bn_mul_mont -- declared to the translator as a function taking six
# arguments (rp, ap, bp, np, n0, num). Going by the inline comments, the
# routine accumulates into a scratch vector tp[] carved out of the
# stack, then writes either tp or tp-np to rp, overwrites tp, and
# restores the stack pointer.
# NOTE(review): the comments below describe only the instructions that
# are present; loop control and register save/restore should be checked
# against the complete routine.
40 .type bn_mul_mont,\@function,6
# --- scratch vector tp[] on the stack ---------------------------------
# %rax is presumably -(num+2) here, so this lea lowers %rsp by
# 8*(num+2) bytes -- TODO confirm against the setup of %rax.
53 lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
# Round %rsp down to a 1024-byte boundary.
54 and \$-1024,%rsp # minimize TLB usage
# Slot tp[num+1] keeps the value that is reloaded into %rsp on the exit
# path. %rbp is assumed to hold the pre-adjustment %rsp -- TODO confirm.
55 mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
# bp arrived in %rdx, which mulq overwrites with the high half of every
# product, so the pointer is moved to its reassigned register.
56 mov %rdx,$bp # $bp reassigned, remember?
# n0 arrives as a pointer; from here on the register holds the word it
# pointed to.
58 mov ($n0),$n0 # pull n0[0] value
# --- pass for bp[0] ----------------------------------------------------
63 mov ($bp),$m0 # m0=bp[0]
# mulq: unsigned %rax * operand -> %rdx:%rax (high:low).
65 mulq $m0 # ap[0]*bp[0]
# Two-operand imulq keeps only the low 64 bits of the product.
69 imulq $n0,%rax # "tp[0]"*n0
# The sum is not stored anywhere ("discarded"); presumably only the
# carry out of this add is consumed afterwards -- TODO confirm.
73 add $lo0,%rax # discarded
80 mulq $m0 # ap[j]*bp[0]
90 add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
# Each result word is written one slot behind the index being processed.
92 mov %rax,-8(%rsp,$j,8) # tp[j-1]
# Top of the vector: tp[num-1] receives hi1, tp[num] receives %rdx.
102 mov $hi1,-8(%rsp,$num,8)
103 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
# --- passes for bp[i] --------------------------------------------------
# Same shape as the bp[0] pass, with the previous contents of tp[]
# added in.
110 mov ($bp,$i,8),$m0 # m0=bp[i]
111 mov ($ap),%rax # ap[0]
112 mulq $m0 # ap[0]*bp[i]
113 add (%rsp),%rax # ap[0]*bp[i]+tp[0]
118 imulq $n0,%rax # tp[0]*n0
# NOTE(review): the operand is addressed as np[j]; the trailing comment
# implies j is 0 at this point -- confirm.
121 mulq ($np,$j,8) # np[0]*m1
122 add $lo0,%rax # discarded
130 mulq $m0 # ap[j]*bp[i]
133 add (%rsp,$j,8),%rax # ap[j]*bp[i]+tp[j]
142 add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
144 mov %rax,-8(%rsp,$j,8) # tp[j-1]
# tp[num], written by the previous pass, is folded into hi1 before the
# top two slots are rewritten.
154 add (%rsp,$num,8),$hi1 # pull upmost overflow bit
156 mov $hi1,-8(%rsp,$num,8)
157 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
# --- final step: copy tp, or subtract np from it ----------------------
164 lea -1($num),$j # j=num-1
# A non-zero overflow word goes straight to the subtract path. A compare
# against 0 cannot borrow, so CF is 0 on arrival at .Lsub.
165 cmp \$0,%rdx # %rdx still holds upmost overflow bit
166 jnz .Lsub # CF is cleared by compare with 0
# Otherwise only the most significant words are compared; jae is taken
# exactly when this cmp left CF=0 (unsigned tp[num-1] >= np[num-1]).
# %rax is assumed to hold tp[num-1] here -- TODO confirm.
168 cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
169 jae .Lsub # if taken CF was cleared by above cmp
# Copy path: both stores are indexed by j; whatever value the i register
# holds is what gets written over tp[j].
173 mov %rax,($rp,$j,8) # rp[i]=tp[i]
174 mov $i,(%rsp,$j,8) # zap temporary vector
# Reload the stack pointer that was stashed in tp[num+1].
179 mov 8(%rsp,$num,8),%rsp # restore %rsp
# Subtract path: i indexes tp[] and rp[]; j is decremented each
# iteration.
190 .Lsub: mov (%rsp,$i,8),%rax
192 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j]
# dec leaves CF unchanged, so a borrow can be carried from one iteration
# to the next.
194 dec $j # doesn't affect CF!
196 lea -1($num),$j # j=num-1
# CF set at this point sends control back to the copy path, which
# stores tp into rp again.
198 jc .Lcopy # tp was less than np
# Overwrite tp[j] with the current value of the i register.
200 .Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
204 .size bn_mul_mont,.-bn_mul_mont