3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
13 # countermeasures. The subroutines are produced by replacing bp[i]
14 # references in their x86_64-mont.pl counterparts with cache-neutral
15 # references to powers table computed in BN_mod_exp_mont_consttime.
16 # In addition subroutine that scatters elements of the powers table
17 # is implemented, so that scatter-/gathering can be tuned without
18 # bn_exp.c modifications.
# Command-line/flavour handling and output plumbing shared by the
# x86_64 perlasm modules: locate the x86_64-xlate.pl translator next to
# this script (or in ../../perlasm) and pipe all generated code through it.
22 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
24 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
26 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29 die "can't locate x86_64-xlate.pl";
# Quote the interpreter and script paths so the pipe still works when
# $^X or $dir contains spaces, and fail loudly if the translator can't
# be spawned instead of silently writing to a dead pipe.
31 open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
# Register map for bn_mul_mont_gather5 in System V AMD64 order (the
# Win64 calling-convention differences are handled where visible below,
# e.g. the 7th-argument load at a different stack offset).
33 # int bn_mul_mont_gather5(
34 $rp="%rdi"; # BN_ULONG *rp,
35 $ap="%rsi"; # const BN_ULONG *ap,
36 $bp="%rdx"; # const BN_ULONG *bp,
37 $np="%rcx"; # const BN_ULONG *np,
38 $n0="%r8"; # const BN_ULONG *n0,
39 $num="%r9"; # int num,
40 # int idx); # 0 to 2^5-1, "index" in $bp holding
41 # pre-computed powers of a', interlaced
42 # in such manner that b[0] is $bp[idx],
43 # b[1] is [2^5+idx], etc.
# bn_mul_mont_gather5: Montgomery multiplication where bp[i] references
# are replaced by cache-neutral masked gathers from the powers table,
# so the memory access pattern does not depend on the secret index
# (cache-timing countermeasure; see file header).
55 .globl bn_mul_mont_gather5
56 .type bn_mul_mont_gather5,\@function,6
67 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
79 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
80 and \$-1024,%rsp # minimize TLB usage
82 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
84 mov $bp,%r12 # reassign $bp
# Gather parameters: each table element is spread across $STRIDE bytes;
# one of the four cache lines is selected with the xmm4-xmm7 masks so
# all four lines are always touched regardless of the index.
87 $STRIDE=2**5*8; # 5 is "window size"
88 $N=$STRIDE/4; # should match cache line size
91 shr \$`log($N/8)/log(2)`,%r10
94 lea .Lmagic_masks(%rip),%rax
95 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
96 lea 96($bp,%r11,8),$bp # pointer within 1st cache line
97 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
98 movq 8(%rax,%r10,8),%xmm5 # cache line contains element
99 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
100 movq 24(%rax,%r10,8),%xmm7
# Load one 64-bit lane from each of the four candidate cache lines;
# the masks above select the live one (masking itself not visible in
# this excerpt).
102 movq `0*$STRIDE/4-96`($bp),%xmm0
103 movq `1*$STRIDE/4-96`($bp),%xmm1
105 movq `2*$STRIDE/4-96`($bp),%xmm2
107 movq `3*$STRIDE/4-96`($bp),%xmm3
115 movq %xmm0,$m0 # m0=bp[0]
117 mov ($n0),$n0 # pull n0[0] value
# First iteration (i=0): prefetch next table element while multiplying.
123 movq `0*$STRIDE/4-96`($bp),%xmm0
124 movq `1*$STRIDE/4-96`($bp),%xmm1
126 movq `2*$STRIDE/4-96`($bp),%xmm2
130 mulq $m0 # ap[0]*bp[0]
134 movq `3*$STRIDE/4-96`($bp),%xmm3
139 imulq $lo0,$m1 # "tp[0]"*n0
147 add %rax,$lo0 # discarded
160 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
163 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
167 mulq $m0 # ap[j]*bp[0]
178 movq %xmm0,$m0 # bp[1]
181 mov ($ap),%rax # ap[0]
183 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
185 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
192 mov $hi1,-8(%rsp,$num,8)
193 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
# Outer loop (i=1..num-1): same structure, accumulating into tp[].
203 movq `0*$STRIDE/4-96`($bp),%xmm0
204 movq `1*$STRIDE/4-96`($bp),%xmm1
206 movq `2*$STRIDE/4-96`($bp),%xmm2
209 mulq $m0 # ap[0]*bp[i]
210 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
214 movq `3*$STRIDE/4-96`($bp),%xmm3
219 imulq $lo0,$m1 # tp[0]*n0
227 add %rax,$lo0 # discarded
230 mov 8(%rsp),$lo0 # tp[1]
241 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
244 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
248 mulq $m0 # ap[j]*bp[i]
252 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
261 movq %xmm0,$m0 # bp[i+1]
264 mov ($ap),%rax # ap[0]
266 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
269 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
275 add $lo0,$hi1 # pull upmost overflow bit
277 mov $hi1,-8(%rsp,$num,8)
278 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
# Final conditional subtraction of the modulus: compute tp-np with a
# borrow chain, then copy from tp or rp depending on the borrow.
284 xor $i,$i # i=0 and clear CF!
285 mov (%rsp),%rax # tp[0]
286 lea (%rsp),$ap # borrow ap for tp
290 .Lsub: sbb ($np,$i,8),%rax
291 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
292 mov 8($ap,$i,8),%rax # tp[i+1]
294 dec $j # doesn't affect CF!
297 sbb \$0,%rax # handle upmost overflow bit
304 or $np,$ap # ap=borrow?tp:rp
306 .Lcopy: # copy or in-place refresh
308 mov $i,(%rsp,$i,8) # zap temporary vector
309 mov %rax,($rp,$i,8) # rp[i]=tp[i]
314 mov 8(%rsp,$num,8),%rsi # restore %rsp
325 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
# bn_mul4x_mont_gather5: 4-way unrolled variant of the routine above,
# same cache-neutral gather scheme, four tp[] words in flight per inner
# iteration (accumulators @A for ap-products, @N for np-products).
328 my @A=("%r10","%r11");
329 my @N=("%r13","%rdi");
331 .type bn_mul4x_mont_gather5,\@function,6
333 bn_mul4x_mont_gather5:
335 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
347 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
348 and \$-1024,%rsp # minimize TLB usage
350 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
352 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
353 mov %rdx,%r12 # reassign $bp
# Same gather setup as bn_mul_mont_gather5: select one of four cache
# lines via the .Lmagic_masks entries while touching all of them.
356 $STRIDE=2**5*8; # 5 is "window size"
357 $N=$STRIDE/4; # should match cache line size
360 shr \$`log($N/8)/log(2)`,%r10
363 lea .Lmagic_masks(%rip),%rax
364 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
365 lea 96($bp,%r11,8),$bp # pointer within 1st cache line
366 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
367 movq 8(%rax,%r10,8),%xmm5 # cache line contains element
368 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
369 movq 24(%rax,%r10,8),%xmm7
371 movq `0*$STRIDE/4-96`($bp),%xmm0
372 movq `1*$STRIDE/4-96`($bp),%xmm1
374 movq `2*$STRIDE/4-96`($bp),%xmm2
376 movq `3*$STRIDE/4-96`($bp),%xmm3
384 movq %xmm0,$m0 # m0=bp[0]
385 mov ($n0),$n0 # pull n0[0] value
# First outer iteration (bp[0]); prefetch bp[1] during the multiplies.
391 movq `0*$STRIDE/4-96`($bp),%xmm0
392 movq `1*$STRIDE/4-96`($bp),%xmm1
394 movq `2*$STRIDE/4-96`($bp),%xmm2
398 mulq $m0 # ap[0]*bp[0]
402 movq `3*$STRIDE/4-96`($bp),%xmm3
407 imulq $A[0],$m1 # "tp[0]"*n0
415 add %rax,$A[0] # discarded
438 mulq $m0 # ap[j]*bp[0]
440 mov -16($np,$j,8),%rax
446 mov -8($ap,$j,8),%rax
448 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
450 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
453 mulq $m0 # ap[j]*bp[0]
455 mov -8($np,$j,8),%rax
463 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
465 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
468 mulq $m0 # ap[j]*bp[0]
478 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
480 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
483 mulq $m0 # ap[j]*bp[0]
492 mov -16($ap,$j,8),%rax
494 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
496 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
501 mulq $m0 # ap[j]*bp[0]
503 mov -16($np,$j,8),%rax
509 mov -8($ap,$j,8),%rax
511 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
513 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
516 mulq $m0 # ap[j]*bp[0]
518 mov -8($np,$j,8),%rax
524 mov ($ap),%rax # ap[0]
526 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
528 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
531 movq %xmm0,$m0 # bp[1]
536 mov $N[0],-8(%rsp,$j,8)
537 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
# Outer loop body (bp[i], i>=1): add running tp[] into the products.
543 movq `0*$STRIDE/4-96`($bp),%xmm0
544 movq `1*$STRIDE/4-96`($bp),%xmm1
546 movq `2*$STRIDE/4-96`($bp),%xmm2
551 mulq $m0 # ap[0]*bp[i]
552 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
556 movq `3*$STRIDE/4-96`($bp),%xmm3
561 imulq $A[0],$m1 # tp[0]*n0
569 add %rax,$A[0] # "$N[0]", discarded
574 mulq $m0 # ap[j]*bp[i]
578 add 8(%rsp),$A[1] # +tp[1]
586 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
589 mov $N[1],(%rsp) # tp[j-1]
594 mulq $m0 # ap[j]*bp[i]
596 mov -16($np,$j,8),%rax
598 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
604 mov -8($ap,$j,8),%rax
608 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
611 mulq $m0 # ap[j]*bp[i]
613 mov -8($np,$j,8),%rax
615 add -8(%rsp,$j,8),$A[1]
625 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
628 mulq $m0 # ap[j]*bp[i]
632 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
642 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
645 mulq $m0 # ap[j]*bp[i]
649 add 8(%rsp,$j,8),$A[1]
656 mov -16($ap,$j,8),%rax
660 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
665 mulq $m0 # ap[j]*bp[i]
667 mov -16($np,$j,8),%rax
669 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
675 mov -8($ap,$j,8),%rax
679 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
682 mulq $m0 # ap[j]*bp[i]
684 mov -8($np,$j,8),%rax
686 add -8(%rsp,$j,8),$A[1]
693 mov ($ap),%rax # ap[0]
697 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
700 movq %xmm0,$m0 # bp[i+1]
705 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
707 mov $N[0],-8(%rsp,$j,8)
708 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
# Final conditional subtraction, 4 limbs per iteration; @ri reuses
# %rax/%rdx/$m0/$m1 which are free by this point.
714 my @ri=("%rax","%rdx",$m0,$m1);
716 mov 16(%rsp,$num,8),$rp # restore $rp
717 mov 0(%rsp),@ri[0] # tp[0]
719 mov 8(%rsp),@ri[1] # tp[1]
720 shr \$2,$num # num/=4
721 lea (%rsp),$ap # borrow ap for tp
722 xor $i,$i # i=0 and clear CF!
725 mov 16($ap),@ri[2] # tp[2]
726 mov 24($ap),@ri[3] # tp[3]
728 lea -1($num),$j # j=num/4-1
732 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
733 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
734 sbb 16($np,$i,8),@ri[2]
735 mov 32($ap,$i,8),@ri[0] # tp[i+1]
736 mov 40($ap,$i,8),@ri[1]
737 sbb 24($np,$i,8),@ri[3]
738 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
739 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
740 sbb 32($np,$i,8),@ri[0]
741 mov 48($ap,$i,8),@ri[2]
742 mov 56($ap,$i,8),@ri[3]
743 sbb 40($np,$i,8),@ri[1]
745 dec $j # doesn't affect CF!
748 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
749 mov 32($ap,$i,8),@ri[0] # load overflow bit
750 sbb 16($np,$i,8),@ri[2]
751 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
752 sbb 24($np,$i,8),@ri[3]
753 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
755 sbb \$0,@ri[0] # handle upmost overflow bit
756 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
763 or $np,$ap # ap=borrow?tp:rp
# 16-byte SSE2 copy/zap: refresh rp[] from the selected source and wipe
# the stack temporary (movdqa is safe on %rsp which is 1024-aligned).
770 .Lcopy4x: # copy or in-place refresh
771 movdqu 16($ap,$i),%xmm2
772 movdqu 32($ap,$i),%xmm1
773 movdqa %xmm0,16(%rsp,$i)
774 movdqu %xmm2,16($rp,$i)
775 movdqa %xmm0,32(%rsp,$i)
776 movdqu %xmm1,32($rp,$i)
782 movdqu 16($ap,$i),%xmm2
783 movdqa %xmm0,16(%rsp,$i)
784 movdqu %xmm2,16($rp,$i)
788 mov 8(%rsp,$num,8),%rsi # restore %rsp
799 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
# bn_scatter5: writes an element into the interlaced powers table at
# slot $idx (companion to the gather logic above), so scatter/gather
# layout can be tuned without touching bn_exp.c. Body largely outside
# this excerpt; abi-omnipotent = no calling-convention translation.
804 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
805 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
808 .type bn_scatter5,\@abi-omnipotent
811 lea ($tbl,$idx,8),$tbl
820 .size bn_scatter5,.-bn_scatter5
# Tail of the mask table (presumably part of .Lmagic_masks referenced
# above — start of the table not visible in this excerpt), followed by
# the CRYPTOGAMS identification string.
826 .long 0,0, 0,0, 0,0, -1,-1
827 .long 0,0, 0,0, 0,0, 0,0
828 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process $code: evaluate every `...` span as Perl so constant
# expressions (e.g. `0*$STRIDE/4-96`) become literal numbers before the
# text is emitted to the translator.
831 $code =~ s/\`([^\`]*)\`/eval($1)/gem;