From: Andy Polyakov
Date: Mon, 30 Apr 2018 20:59:51 +0000 (+0200)
Subject: bn/asm/*-mont.pl: harmonize with BN_from_montgomery_word.
X-Git-Tag: OpenSSL_1_1_0i~130
X-Git-Url: https://git.openssl.org/?p=openssl.git;a=commitdiff_plain;h=d067ba3bcc4944140578bdbf857affa6faf9e8c1

bn/asm/*-mont.pl: harmonize with BN_from_montgomery_word.

Montgomery multiplication post-conditions in some of the code paths were
formally non-constant-time. The cache access pattern was result-neutral,
but slightly asymmetric, which might have produced a signal [if the
processor reordered loads and stores at run time].

Reviewed-by: Rich Salz
(Merged from https://github.com/openssl/openssl/pull/6163)

(cherry picked from commit 774ff8fed67e19d4f5f0df2f59050f2737abab2a)

Resolved conflicts in ppc-mont.pl and x86_64-mont.pl.
---

diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl
index 1d68d6d072..068e450eb6 100644
--- a/crypto/bn/asm/alpha-mont.pl
+++ b/crypto/bn/asm/alpha-mont.pl
@@ -297,15 +297,12 @@ bn_mul_mont:
 	mov	sp,$tp
 	mov	$bp,$rp		# restore rp
 
-	and	sp,$hi0,$ap
-	bic	$bp,$hi0,$bp
-	bis	$bp,$ap,$ap	# ap=borrow?tp:rp
-
 .align	4
-.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
+.Lcopy:	ldq	$aj,0($tp)	# conditional copy
+	ldq	$nj,0($rp)
 	lda	$tp,8($tp)
 	lda	$rp,8($rp)
-	lda	$ap,8($ap)
+	cmoveq	$hi0,$nj,$aj
 	stq	zero,-8($tp)	# zap tp
 	cmpult	$tp,$tj,AT
 	stq	$aj,-8($rp)
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 0dc4fe95e4..152823c7d4 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -262,14 +262,15 @@ bn_mul_mont:
 	mov	$tp,sp			@ "rewind" $tp
 	sub	$rp,$rp,$aj		@ "rewind" $rp
 
-	and	$ap,$tp,$nhi
-	bic	$np,$rp,$nhi
-	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
-
-.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
+.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
+	ldr	$aj,[$rp]
 	str	sp,[$tp],#4		@ zap tp
-	str	$tj,[$rp],#4
-	cmp	$tp,$num
+#ifdef	__thumb2__
+	it	cc
+#endif
+	movcc	$aj,$tj
+	str	$aj,[$rp],#4
+	teq	$tp,$num		@ preserve carry
 	bne	.Lcopy
 
 	mov	sp,$num
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
index 5cc5c599f9..a535e2d5b1 100644
--- a/crypto/bn/asm/ia64-mont.pl
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -341,19 +341,19 @@ bn_mul_mont_general:
 { .mmb;	sub	rptr=rptr,len		// rewind
 	sub	tptr=tptr,len
 	clrrrb.pr			};;
-{ .mmi;	and	aptr=tptr,topbit
-	andcm	bptr=rptr,topbit
+{ .mmi;	mov	aptr=rptr
+	mov	bptr=tptr
 	mov	pr.rot=1<<16		};;
-{ .mii;	or	nptr=aptr,bptr
+{ .mii;	cmp.eq	p0,p6=topbit,r0
 	mov	ar.lc=lc
-	mov	ar.ec=3			};;
+	mov	ar.ec=2			};;
 
 .Lcopy_ctop:
-{ .mmb;	(p16)	ld8	n[0]=[nptr],8
-	(p18)	st8	[tptr]=r0,8
-	(p16)	nop.b	0		}
-{ .mmb;	(p16)	nop.m	0
-	(p18)	st8	[rptr]=n[2],8
+{ .mmi;	(p16)	ld8	a[0]=[aptr],8
+	(p16)	ld8	t[0]=[bptr],8
+	(p6)	mov	a[1]=t[1]	};;	// (p17)
+{ .mmb;	(p17)	st8	[rptr]=a[1],8
+	(p17)	st8	[tptr]=r0,8
 	br.ctop.sptk	.Lcopy_ctop	};;
 .Lcopy_cend:
 
diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
index a907571bec..d15c0b948d 100644
--- a/crypto/bn/asm/mips-mont.pl
+++ b/crypto/bn/asm/mips-mont.pl
@@ -384,15 +384,13 @@ $code.=<<___;
 	$PTR_SUB $rp,$num	# restore rp
 	not	$hi1,$hi0
 
-	and	$ap,$hi0,$sp
-	and	$bp,$hi1,$rp
-	or	$ap,$ap,$bp	# ap=borrow?tp:rp
-
-.align	4
-.Lcopy:	$LD	$aj,($ap)
-	$PTR_ADD $ap,$BNSZ
+.Lcopy:	$LD	$nj,($tp)	# conditional move
+	$LD	$aj,($rp)
 	$ST	$zero,($tp)
 	$PTR_ADD $tp,$BNSZ
+	and	$nj,$hi0
+	and	$aj,$hi1
+	or	$aj,$nj
 	sltu	$at,$tp,$tj
 	$ST	$aj,($rp)
 	bnez	$at,.Lcopy
diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
index 8aa94e8511..ef50ba1eb5 100644
--- a/crypto/bn/asm/parisc-mont.pl
+++ b/crypto/bn/asm/parisc-mont.pl
@@ -517,7 +517,6 @@ L\$sub
 
 	stws,ma		$hi1,4($rp)
 	subb		$ti0,%r0,$hi1
-	ldo		-4($tp),$tp
 ___
 $code.=<<___ if ($BN_SZ==8);
 	ldd,ma		8($tp),$ti0
@@ -532,21 +531,19 @@ L\$sub
 
 	extrd,u		$ti0,31,32,$ti0	; carry in flipped word order
 	sub,db		$ti0,%r0,$hi1
-	ldo		-8($tp),$tp
 ___
 $code.=<<___;
-	and		$tp,$hi1,$ap
-	andcm		$rp,$hi1,$bp
-	or		$ap,$bp,$np
-
+	ldo		`$LOCALS+32`($fp),$tp
 	sub		$rp,$arrsz,$rp	; rewind rp
 	subi		0,$arrsz,$idx
-	ldo		`$LOCALS+32`($fp),$tp
 L\$copy
-	ldd		$idx($np),$hi0
+	ldd		0($tp),$ti0
+	ldd		0($rp),$hi0
 	std,ma		%r0,8($tp)
-	addib,<>	8,$idx,.-8	; L\$copy
-	std,ma		$hi0,8($rp)
+	comiclr,=	0,$hi1,%r0
+	copy		$ti0,$hi0
+	addib,<>	8,$idx,L\$copy
+	std,ma		$hi0,8($rp)
 ___
 
 if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
@@ -856,17 +853,16 @@ L\$sub_pa11
 	stws,ma		$hi1,4($rp)
 	subb		$ti0,%r0,$hi1
-	ldo		-4($tp),$tp
 
-	and		$tp,$hi1,$ap
-	andcm		$rp,$hi1,$bp
-	or		$ap,$bp,$np
+	ldo		`$LOCALS+32`($fp),$tp
 	sub		$rp,$arrsz,$rp	; rewind rp
 	subi		0,$arrsz,$idx
-	ldo		`$LOCALS+32`($fp),$tp
 
 L\$copy_pa11
-	ldwx		$idx($np),$hi0
+	ldw		0($tp),$ti0
+	ldw		0($rp),$hi0
 	stws,ma		%r0,4($tp)
+	comiclr,=	0,$hi1,%r0
+	copy		$ti0,$hi0
 	addib,<>	4,$idx,L\$copy_pa11
 	stws,ma		$hi0,4($rp)
 
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index 5802260ca6..6dd3ccc256 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -301,15 +301,16 @@ Lsub:	$LDX	$tj,$tp,$j
 	li	$j,0
 	mtctr	$num
 	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
-	and	$ap,$tp,$ovf
-	andc	$np,$rp,$ovf
-	or	$ap,$ap,$np	; ap=borrow?tp:rp
 
 .align	4
-Lcopy:				; copy or in-place refresh
-	$LDX	$tj,$ap,$j
-	$STX	$tj,$rp,$j
+Lcopy:				; conditional copy
+	$LDX	$tj,$tp,$j
+	$LDX	$aj,$rp,$j
+	and	$tj,$tj,$ovf
+	andc	$aj,$aj,$ovf
 	$STX	$j,$tp,$j	; zap at once
+	or	$aj,$aj,$tj
+	$STX	$aj,$rp,$j
 	addi	$j,$j,$BNSZ
 	bdnz	Lcopy
 
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 1e19c958a1..377a6b3196 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -1501,16 +1501,14 @@ Lsub:	ldx	$t0,$tp,$i
 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
 
-	and	$ap,$tp,$ovf
-	andc	$np,$rp,$ovf
-	or	$ap,$ap,$np	; ap=borrow?tp:rp
-	addi	$t7,$ap,8
 	mtctr	$j
 
 .align	4
-Lcopy:				; copy or in-place refresh
-	ldx	$t0,$ap,$i
-	ldx	$t1,$t7,$i
+Lcopy:				; conditional copy
+	ldx	$t0,$tp,$i
+	ldx	$t1,$t4,$i
+	ldx	$t2,$rp,$i
+	ldx	$t3,$t6,$i
 	std	$i,8($nap_d)	; zap nap_d
 	std	$i,16($nap_d)
 	std	$i,24($nap_d)
@@ -1519,6 +1517,12 @@ Lcopy:				; copy or in-place refresh
 	std	$i,48($nap_d)
 	std	$i,56($nap_d)
 	stdu	$i,64($nap_d)
+	and	$t0,$t0,$ovf
+	and	$t1,$t1,$ovf
+	andc	$t2,$t2,$ovf
+	andc	$t3,$t3,$ovf
+	or	$t0,$t0,$t2
+	or	$t1,$t1,$t3
 	stdx	$t0,$rp,$i
 	stdx	$t1,$t6,$i
 	stdx	$i,$tp,$i	; zap tp at once
@@ -1561,20 +1565,21 @@ Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
 
-	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
+	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
 	subf	$rp,$num,$rp	; rewind rp
-	and	$ap,$tp,$ovf
-	andc	$np,$rp,$ovf
-	or	$ap,$ap,$np	; ap=borrow?tp:rp
 	addi	$tp,$sp,`$FRAME+$TRANSFER`
 	mtctr	$j
 
 .align	4
-Lcopy:				; copy or in-place refresh
+Lcopy:				; conditional copy
 	lwz	$t0,4($ap)
 	lwz	$t1,8($ap)
 	lwz	$t2,12($ap)
 	lwzu	$t3,16($ap)
+	lwz	$t4,4($rp)
+	lwz	$t5,8($rp)
+	lwz	$t6,12($rp)
+	lwz	$t7,16($rp)
 	std	$i,8($nap_d)	; zap nap_d
 	std	$i,16($nap_d)
 	std	$i,24($nap_d)
@@ -1583,6 +1588,18 @@ Lcopy:				; copy or in-place refresh
 	std	$i,48($nap_d)
 	std	$i,56($nap_d)
 	stdu	$i,64($nap_d)
+	and	$t0,$t0,$ovf
+	and	$t1,$t1,$ovf
+	and	$t2,$t2,$ovf
+	and	$t3,$t3,$ovf
+	andc	$t4,$t4,$ovf
+	andc	$t5,$t5,$ovf
+	andc	$t6,$t6,$ovf
+	andc	$t7,$t7,$ovf
+	or	$t0,$t0,$t4
+	or	$t1,$t1,$t5
+	or	$t2,$t2,$t6
+	or	$t3,$t3,$t7
 	stw	$t0,4($rp)
 	stw	$t1,8($rp)
 	stw	$t2,12($rp)
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
index 2205bc2ca0..7af1bcd30c 100644
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -252,16 +252,16 @@ $code.=<<___;
 	brct	$count,.Lsub
 	lghi	$ahi,0
 	slbgr	$AHI,$ahi	# handle upmost carry
-
-	ngr	$ap,$AHI
-	lghi	$np,-1
-	xgr	$np,$AHI
-	ngr	$np,$rp
-	ogr	$ap,$np		# ap=borrow?tp:rp
+	lghi	$NHI,-1
+	xgr	$NHI,$AHI
 
 	la	$j,0(%r0)
 	lgr	$count,$num
-.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
+.Lcopy:	lg	$ahi,$stdframe($j,$sp)	# conditional copy
+	lg	$alo,0($j,$rp)
+	ngr	$ahi,$AHI
+	ngr	$alo,$NHI
+	ogr	$alo,$ahi
 	_dswap	$alo
 	stg	$j,$stdframe($j,$sp)	# zap tp
 	stg	$alo,0($j,$rp)
diff --git a/crypto/bn/asm/sparct4-mont.pl b/crypto/bn/asm/sparct4-mont.pl
index 4faf66f10a..2e1d110c76 100755
--- a/crypto/bn/asm/sparct4-mont.pl
+++ b/crypto/bn/asm/sparct4-mont.pl
@@ -888,19 +888,17 @@ $code.=<<___;
 
 	sub	$tp, $num, $tp
 	sub	$rp, $num, $rp
-	subc	$ovf, %g0, $ovf	! handle upmost overflow bit
-	and	$tp, $ovf, $ap
-	andn	$rp, $ovf, $np
-	or	$np, $ap, $ap	! ap=borrow?tp:rp
+	subccc	$ovf, %g0, $ovf	! handle upmost overflow bit
 	ba	.Lcopy
 	sub	$num, 8, $cnt
 
 .align	16
-.Lcopy:				! copy or in-place refresh
-	ldx	[$ap+0], $t2
-	add	$ap, 8, $ap
+.Lcopy:				! conditional copy
+	ldx	[$tp], $tj
+	ldx	[$rp+0], $t2
 	stx	%g0, [$tp]	! zap
 	add	$tp, 8, $tp
+	movcs	%icc, $tj, $t2
 	stx	$t2, [$rp+0]
 	add	$rp, 8, $rp
 	brnz	$cnt, .Lcopy
@@ -1136,19 +1134,17 @@ $code.=<<___;
 
 	sub	$tp, $num, $tp
 	sub	$rp, $num, $rp
-	subc	$ovf, %g0, $ovf	! handle upmost overflow bit
-	and	$tp, $ovf, $ap
-	andn	$rp, $ovf, $np
-	or	$np, $ap, $ap	! ap=borrow?tp:rp
+	subccc	$ovf, %g0, $ovf	! handle upmost overflow bit
 	ba	.Lcopy_g5
 	sub	$num, 8, $cnt
 
 .align	16
-.Lcopy_g5:			! copy or in-place refresh
-	ldx	[$ap+0], $t2
-	add	$ap, 8, $ap
+.Lcopy_g5:			! conditional copy
+	ldx	[$tp], $tj
+	ldx	[$rp+0], $t2
 	stx	%g0, [$tp]	! zap
 	add	$tp, 8, $tp
+	movcs	%icc, $tj, $t2
 	stx	$t2, [$rp+0]
 	add	$rp, 8, $rp
 	brnz	$cnt, .Lcopy_g5
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
index 6807c8b6e0..55ceecb84c 100644
--- a/crypto/bn/asm/sparcv9-mont.pl
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -265,7 +265,6 @@ $fname:
 .Ltail:
 	add	$np,$num,$np
 	add	$rp,$num,$rp
-	mov	$tp,$ap
 	sub	%g0,$num,%o7		! k=-num
 	ba	.Lsub
 	subcc	%g0,%g0,%g0		! clear %icc.c
@@ -278,15 +277,14 @@ $fname:
 	add	%o7,4,%o7
 	brnz	%o7,.Lsub
 	st	%o1,[$i]
 
-	subc	$car2,0,$car2	! handle upmost overflow bit
-	and	$tp,$car2,$ap
-	andn	$rp,$car2,$np
-	or	$ap,$np,$ap
+	subccc	$car2,0,$car2	! handle upmost overflow bit
 	sub	%g0,$num,%o7
 .Lcopy:
-	ld	[$ap+%o7],%o0	! copy or in-place refresh
+	ld	[$tp+%o7],%o1	! conditional copy
+	ld	[$rp+%o7],%o0
 	st	%g0,[$tp+%o7]	! zap tp
+	movcs	%icc,%o1,%o0
 	st	%o0,[$rp+%o7]
 	add	%o7,4,%o7
 	brnz	%o7,.Lcopy
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl
index 9f81bc822e..8357cb5975 100644
--- a/crypto/bn/asm/via-mont.pl
+++ b/crypto/bn/asm/via-mont.pl
@@ -213,18 +213,15 @@ $sp=&DWP(28,"esp");
 
 	&mov	("eax",&DWP(0,"esi","edx",4));	# upmost overflow bit
 	&sbb	("eax",0);
 
-	&and	("esi","eax");
-	&not	("eax");
-	&mov	("ebp","edi");
-	&and	("ebp","eax");
-	&or	("esi","ebp");			# tp=carry?tp:rp
 	&mov	("ecx","edx");			# num
-	&xor	("edx","edx");			# i=0
+	&mov	("edx",0);			# i=0
 
 &set_label("copy",8);
-	&mov	("eax",&DWP(0,"esi","edx",4));
-	&mov	(&DWP(64,"esp","edx",4),"ecx");	# zap tp
+	&mov	("ebx",&DWP(0,"esi","edx",4));
+	&mov	("eax",&DWP(0,"edi","edx",4));
+	&mov	(&DWP(0,"esi","edx",4),"ecx");	# zap tp
+	&cmovc	("eax","ebx");
 	&mov	(&DWP(0,"edi","edx",4),"eax");
 	&lea	("edx",&DWP(1,"edx"));		# i++
 	&loop	(&label("copy"));
diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl
index 64dba4480f..6937fe26b3 100644
--- a/crypto/bn/asm/vis3-mont.pl
+++ b/crypto/bn/asm/vis3-mont.pl
@@ -310,23 +310,23 @@ $code.=<<___;
 
 	sub	$anp, $num, $anp
 	sub	$rp, $num, $rp
-	subc	$ovf, %g0, $ovf		! handle upmost overflow bit
-	and	$tp, $ovf, $ap
-	andn	$rp, $ovf, $np
-	or	$np, $ap, $ap		! ap=borrow?tp:rp
+	subccc	$ovf, %g0, $ovf		! handle upmost overflow bit
 	ba	.Lcopy
 	sub	$num, 8, $cnt
 
 .align	16
-.Lcopy:					! copy or in-place refresh
-	ld	[$ap+0], $t2
-	ld	[$ap+4], $t3
-	add	$ap, 8, $ap
+.Lcopy:					! conditional copy
+	ld	[$tp+0], $t0
+	ld	[$tp+4], $t1
+	ld	[$rp+0], $t2
+	ld	[$rp+4], $t3
 	stx	%g0, [$tp]		! zap
 	add	$tp, 8, $tp
 	stx	%g0, [$anp]		! zap
 	stx	%g0, [$anp+8]
 	add	$anp, 16, $anp
+	movcs	%icc, $t0, $t2
+	movcs	%icc, $t1, $t3
 	st	$t3, [$rp+0]		! flip order
 	st	$t2, [$rp+4]
 	add	$rp, 8, $rp
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 6787503666..3c1bb9d8e4 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -604,16 +604,18 @@ $sbit=$num;
 
 	&jge	(&label("sub"));
 	&sbb	("eax",0);			# handle upmost overflow bit
-	&and	($tp,"eax");
-	&not	("eax");
-	&mov	($np,$rp);
-	&and	($np,"eax");
-	&or	($tp,$np);			# tp=carry?tp:rp
-
-&set_label("copy",16);				# copy or in-place refresh
-	&mov	("eax",&DWP(0,$tp,$num,4));
-	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	("edx",-1);
+	&xor	("edx","eax");
+	&jmp	(&label("copy"));
+
+&set_label("copy",16);				# conditional copy
+	&mov	($tp,&DWP($frame,"esp",$num,4));
+	&mov	($np,&DWP(0,$rp,$num,4));
 	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&and	($tp,"eax");
+	&and	($np,"edx");
+	&or	($np,$tp);
+	&mov	(&DWP(0,$rp,$num,4),$np);
 	&dec	($num);
 	&jge	(&label("copy"));
 
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index df4cca5bfe..95524d2f7a 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -302,30 +302,30 @@ $code.=<<___;
 
 	xor	$i,$i			# i=0 and clear CF!
 	mov	(%rsp),%rax		# tp[0]
-	lea	(%rsp),$ap		# borrow ap for tp
 	mov	$num,$j			# j=num
-	jmp	.Lsub
+
 .align	16
 .Lsub:	sbb	($np,$i,8),%rax
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
-	mov	8($ap,$i,8),%rax	# tp[i+1]
+	mov	8(%rsp,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
 	dec	$j			# doesnn't affect CF!
 	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
+	mov	\$-1,%rbx
+	xor	%rax,%rbx		# not %rax
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
-.align	16
-.Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
-	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+
+.Lcopy:					# conditional copy
+	mov	($rp,$i,8),%rcx
+	mov	(%rsp,$i,8),%rdx
+	and	%rbx,%rcx
+	and	%rax,%rdx
+	mov	$num,(%rsp,$i,8)	# zap temporary vector
+	or	%rcx,%rdx
+	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
@@ -695,10 +695,10 @@ ___
 my @ri=("%rax","%rdx",$m0,$m1);
 $code.=<<___;
 	mov	16(%rsp,$num,8),$rp	# restore $rp
+	lea	-4($num),$j
 	mov	0(%rsp),@ri[0]		# tp[0]
-	pxor	%xmm0,%xmm0
 	mov	8(%rsp),@ri[1]		# tp[1]
-	shr	\$2,$num		# num/=4
+	shr	\$2,$j			# j=num/4-1
 	lea	(%rsp),$ap		# borrow ap for tp
 	xor	$i,$i			# i=0 and clear CF!
 
@@ -706,9 +706,7 @@ $code.=<<___;
 	mov	16($ap),@ri[2]		# tp[2]
 	mov	24($ap),@ri[3]		# tp[3]
 	sbb	8($np),@ri[1]
-	lea	-1($num),$j		# j=num/4-1
-	jmp	.Lsub4x
-.align	16
+
 .Lsub4x:
 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
@@ -735,34 +733,35 @@ $code.=<<___;
 	sbb	\$0,@ri[0]		# handle upmost overflow bit
 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
 
-	xor	$i,$i			# i=0
-	and	@ri[0],$ap
-	not	@ri[0]
-	mov	$rp,$np
-	and	@ri[0],$np
-	lea	-1($num),$j
-	or	$np,$ap			# ap=borrow?tp:rp
-
-	movdqu	($ap),%xmm1
-	movdqa	%xmm0,(%rsp)
-	movdqu	%xmm1,($rp)
+	pxor	%xmm0,%xmm0
+	movq	@ri[0],%xmm4
+	pcmpeqd	%xmm5,%xmm5
+	pshufd	\$0,%xmm4,%xmm4
+	mov	$num,$j
+	pxor	%xmm4,%xmm5
+	shr	\$2,$j			# j=num/4
+	xor	%eax,%eax		# i=0
+	jmp	.Lcopy4x
 .align	16
-.Lcopy4x:				# copy or in-place refresh
-	movdqu	16($ap,$i),%xmm2
-	movdqu	32($ap,$i),%xmm1
-	movdqa	%xmm0,16(%rsp,$i)
-	movdqu	%xmm2,16($rp,$i)
-	movdqa	%xmm0,32(%rsp,$i)
-	movdqu	%xmm1,32($rp,$i)
-	lea	32($i),$i
+.Lcopy4x:				# conditional copy
+	movdqa	(%rsp,%rax),%xmm1
+	movdqu	($rp,%rax),%xmm2
+	pand	%xmm4,%xmm1
+	pand	%xmm5,%xmm2
+	movdqa	16(%rsp,%rax),%xmm3
+	movdqa	%xmm0,(%rsp,%rax)
+	por	%xmm2,%xmm1
+	movdqu	16($rp,%rax),%xmm2
+	movdqu	%xmm1,($rp,%rax)
+	pand	%xmm4,%xmm3
+	pand	%xmm5,%xmm2
+	movdqa	%xmm0,16(%rsp,%rax)
+	por	%xmm2,%xmm3
+	movdqu	%xmm3,16($rp,%rax)
+	lea	32(%rax),%rax
 	dec	$j
 	jnz	.Lcopy4x
-
-	shl	\$2,$num
-	movdqu	16($ap,$i),%xmm2
-	movdqa	%xmm0,16(%rsp,$i)
-	movdqu	%xmm2,16($rp,$i)
 
 ___
 }
 $code.=<<___;
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 5779059ea2..31c60d9634 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -414,18 +414,19 @@ $code.=<<___;
 	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
+	mov	\$-1,%rbx
+	xor	%rax,%rbx
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
-.align	16
-.Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
+
+.Lcopy:					# conditional copy
+	mov	($rp,$i,8),%rcx
+	mov	(%rsp,$i,8),%rdx
+	and	%rbx,%rcx
+	and	%rax,%rdx
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	or	%rcx,%rdx
+	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
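Editor's note: every per-platform hunk above replaces the same pattern. The old code
picked a source pointer ("ap=borrow?tp:rp") and then copied from whichever vector was
selected, so the load addresses depended on the secret borrow; the new code always reads
both the temporary vector tp[] and the output rp[], combines them through a mask or a
conditional move, and zaps tp[] in the same pass, so the memory access pattern is
identical on both paths. The following is a minimal C sketch of that masked selection,
in the spirit of the C code path the subject line refers to. It is an illustration only,
not code from the tree: the function name copy_conditionally and the bn_word typedef are
hypothetical stand-ins.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t bn_word;           /* stand-in for a BN_ULONG-sized word */

    /*
     * rp[] holds tp[] - np[]; tp[] still holds the unreduced value; borrow is
     * 1 if the subtraction underflowed (tp < np), else 0.  Both vectors are
     * read unconditionally, the result is selected with a mask, and tp[] is
     * zeroed ("zapped"), mirroring the assembly loops above.
     */
    static void copy_conditionally(bn_word *rp, bn_word *tp, size_t num,
                                   bn_word borrow)
    {
        bn_word mask = 0 - borrow;      /* all-ones if borrow, all-zeros otherwise */
        size_t i;

        for (i = 0; i < num; i++) {
            /* rp[i] = borrow ? tp[i] : rp[i], without a branch */
            rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
            tp[i] = 0;                  /* zap temporary vector */
        }
    }

Platforms with conditional-move instructions (cmovc on x86, movcs/movcc on SPARC/ARM,
cmoveq on Alpha) fold the same selection into a register move instead of the
and/andc/or mask, with the same data-independent access pattern.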