From: Andy Polyakov Date: Sat, 20 Aug 2016 20:10:24 +0000 (+0200) Subject: ec/asm/ecp_nistz256-*.pl: addition to perform stricter reduction. X-Git-Tag: OpenSSL_1_1_0~50 X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=dfde4219fdebbb5a8a17602fea036f7690e517ea ec/asm/ecp_nistz256-*.pl: addition to perform stricter reduction. Addition was not preserving inputs' property of being fully reduced. Thanks to Brian Smith for reporting this. Reviewed-by: Rich Salz --- diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl index 73b7a55806..de3cd5cd0f 100755 --- a/crypto/ec/asm/ecp_nistz256-armv4.pl +++ b/crypto/ec/asm/ecp_nistz256-armv4.pl @@ -174,10 +174,7 @@ __ecp_nistz256_mul_by_2: adcs $a6,$a6,$a6 mov $ff,#0 adcs $a7,$a7,$a7 -#ifdef __thumb2__ - it cs -#endif - movcs $ff,#-1 @ $ff = carry ? -1 : 0 + adc $ff,$ff,#0 b .Lreduce_by_sub .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 @@ -228,35 +225,45 @@ __ecp_nistz256_add: adcs $a6,$a6,$t2 mov $ff,#0 adcs $a7,$a7,$t3 -#ifdef __thumb2__ - it cs -#endif - movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry + adc $ff,$ff,#0 ldr lr,[sp],#4 @ pop lr .Lreduce_by_sub: - @ if a+b carries, subtract modulus. + @ if a+b >= modulus, subtract modulus. @ + @ But since comparison implies subtraction, we subtract + @ modulus and then add it back if subraction borrowed. + + subs $a0,$a0,#-1 + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + @ Note that because mod has special form, i.e. consists of @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by - @ using value of broadcasted carry as a whole or extracting - @ single bit. Follow $ff register... + @ using value of borrow as a whole or extracting single bit. + @ Follow $ff register... 
- subs $a0,$a0,$ff @ subtract synthesized modulus - sbcs $a1,$a1,$ff + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff str $a0,[$r_ptr,#0] - sbcs $a2,$a2,$ff + adcs $a2,$a2,$ff str $a1,[$r_ptr,#4] - sbcs $a3,$a3,#0 + adcs $a3,$a3,#0 str $a2,[$r_ptr,#8] - sbcs $a4,$a4,#0 + adcs $a4,$a4,#0 str $a3,[$r_ptr,#12] - sbcs $a5,$a5,#0 + adcs $a5,$a5,#0 str $a4,[$r_ptr,#16] - sbcs $a6,$a6,$ff,lsr#31 + adcs $a6,$a6,$ff,lsr#31 str $a5,[$r_ptr,#20] - sbcs $a7,$a7,$ff + adcs $a7,$a7,$ff str $a6,[$r_ptr,#24] str $a7,[$r_ptr,#28] @@ -304,26 +311,29 @@ __ecp_nistz256_mul_by_3: adcs $a6,$a6,$a6 mov $ff,#0 adcs $a7,$a7,$a7 -#ifdef __thumb2__ - it cs -#endif - movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry - - subs $a0,$a0,$ff @ subtract synthesized modulus, see - @ .Lreduce_by_sub for details, except - @ that we don't write anything to - @ memory, but keep intermediate - @ results in registers... - sbcs $a1,$a1,$ff - sbcs $a2,$a2,$ff + adc $ff,$ff,#0 + + subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 sbcs $a3,$a3,#0 sbcs $a4,$a4,#0 - ldr $b_ptr,[$a_ptr,#0] sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + adcs $a2,$a2,$ff + adcs $a3,$a3,#0 + adcs $a4,$a4,#0 + ldr $b_ptr,[$a_ptr,#0] + adcs $a5,$a5,#0 ldr $t1,[$a_ptr,#4] - sbcs $a6,$a6,$ff,lsr#31 + adcs $a6,$a6,$ff,lsr#31 ldr $t2,[$a_ptr,#8] - sbcs $a7,$a7,$ff + adc $a7,$a7,$ff ldr $t0,[$a_ptr,#12] adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7] @@ -339,10 +349,7 @@ __ecp_nistz256_mul_by_3: adcs $a6,$a6,$t2 mov $ff,#0 adcs $a7,$a7,$t3 -#ifdef __thumb2__ - it cs -#endif - movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry + adc $ff,$ff,#0 ldr lr,[sp],#4 @ pop lr b .Lreduce_by_sub @@ -1210,25 +1217,42 @@ __ecp_nistz256_add_self: adcs $a6,$a6,$a6 mov $ff,#0 adcs $a7,$a7,$a7 -#ifdef __thumb2__ - it cs -#endif - movcs $ff,#-1 @ $ff = carry ? 
-1 : 0 + adc $ff,$ff,#0 + + @ if a+b >= modulus, subtract modulus. + @ + @ But since comparison implies subtraction, we subtract + @ modulus and then add it back if subraction borrowed. + + subs $a0,$a0,#-1 + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 - subs $a0,$a0,$ff @ subtract synthesized modulus - sbcs $a1,$a1,$ff + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ using value of borrow as a whole or extracting single bit. + @ Follow $ff register... + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff str $a0,[$r_ptr,#0] - sbcs $a2,$a2,$ff + adcs $a2,$a2,$ff str $a1,[$r_ptr,#4] - sbcs $a3,$a3,#0 + adcs $a3,$a3,#0 str $a2,[$r_ptr,#8] - sbcs $a4,$a4,#0 + adcs $a4,$a4,#0 str $a3,[$r_ptr,#12] - sbcs $a5,$a5,#0 + adcs $a5,$a5,#0 str $a4,[$r_ptr,#16] - sbcs $a6,$a6,$ff,lsr#31 + adcs $a6,$a6,$ff,lsr#31 str $a5,[$r_ptr,#20] - sbcs $a7,$a7,$ff + adcs $a7,$a7,$ff str $a6,[$r_ptr,#24] str $a7,[$r_ptr,#28] diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl index c5c1280755..13625861e8 100644 --- a/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -583,14 +583,14 @@ __ecp_nistz256_add: adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr - sbc $t3,$acc3,$poly3 - cmp $ap,xzr // did addition carry? + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? - csel $acc0,$acc0,$t0,eq // ret = carry ? ret-modulus : ret - csel $acc1,$acc1,$t1,eq - csel $acc2,$acc2,$t2,eq + csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] - csel $acc3,$acc3,$t3,eq + csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret diff --git a/crypto/ec/asm/ecp_nistz256-sparcv9.pl b/crypto/ec/asm/ecp_nistz256-sparcv9.pl index 3f39088661..3c7ff502b9 100755 --- a/crypto/ec/asm/ecp_nistz256-sparcv9.pl +++ b/crypto/ec/asm/ecp_nistz256-sparcv9.pl @@ -406,33 +406,44 @@ __ecp_nistz256_add: addccc @acc[5],$t5,@acc[5] addccc @acc[6],$t6,@acc[6] addccc @acc[7],$t7,@acc[7] - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry .Lreduce_by_sub: - ! if a+b carries, subtract modulus. + ! if a+b >= modulus, subtract modulus. ! + ! But since comparison implies subtraction, we subtract + ! modulus and then add it back if subraction borrowed. + + subcc @acc[0],-1,@acc[0] + subccc @acc[1],-1,@acc[1] + subccc @acc[2],-1,@acc[2] + subccc @acc[3], 0,@acc[3] + subccc @acc[4], 0,@acc[4] + subccc @acc[5], 0,@acc[5] + subccc @acc[6], 1,@acc[6] + subccc @acc[7],-1,@acc[7] + subc $carry,0,$carry + ! Note that because mod has special form, i.e. consists of ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by - ! using value of broadcasted borrow and the borrow bit itself. - ! To minimize dependency chain we first broadcast and then - ! extract the bit by negating (follow $bi). + ! using value of borrow and its negative. - subcc @acc[0],$carry,@acc[0] ! subtract synthesized modulus - subccc @acc[1],$carry,@acc[1] + addcc @acc[0],$carry,@acc[0] ! 
add synthesized modulus + addccc @acc[1],$carry,@acc[1] neg $carry,$bi st @acc[0],[$rp] - subccc @acc[2],$carry,@acc[2] + addccc @acc[2],$carry,@acc[2] st @acc[1],[$rp+4] - subccc @acc[3],0,@acc[3] + addccc @acc[3],0,@acc[3] st @acc[2],[$rp+8] - subccc @acc[4],0,@acc[4] + addccc @acc[4],0,@acc[4] st @acc[3],[$rp+12] - subccc @acc[5],0,@acc[5] + addccc @acc[5],0,@acc[5] st @acc[4],[$rp+16] - subccc @acc[6],$bi,@acc[6] + addccc @acc[6],$bi,@acc[6] st @acc[5],[$rp+20] - subc @acc[7],$carry,@acc[7] + addc @acc[7],$carry,@acc[7] st @acc[6],[$rp+24] retl st @acc[7],[$rp+28] @@ -469,7 +480,7 @@ __ecp_nistz256_mul_by_2: addccc @acc[6],@acc[6],@acc[6] addccc @acc[7],@acc[7],@acc[7] b .Lreduce_by_sub - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry .type __ecp_nistz256_mul_by_2,#function .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 @@ -502,17 +513,27 @@ __ecp_nistz256_mul_by_3: addccc @acc[5],@acc[5],$t5 addccc @acc[6],@acc[6],$t6 addccc @acc[7],@acc[7],$t7 - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry - subcc $t0,$carry,$t0 ! .Lreduce_by_sub but without stores + subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores + subccc $t1,-1,$t1 + subccc $t2,-1,$t2 + subccc $t3, 0,$t3 + subccc $t4, 0,$t4 + subccc $t5, 0,$t5 + subccc $t6, 1,$t6 + subccc $t7,-1,$t7 + subc $carry,0,$carry + + addcc $t0,$carry,$t0 ! add synthesized modulus + addccc $t1,$carry,$t1 neg $carry,$bi - subccc $t1,$carry,$t1 - subccc $t2,$carry,$t2 - subccc $t3,0,$t3 - subccc $t4,0,$t4 - subccc $t5,0,$t5 - subccc $t6,$bi,$t6 - subc $t7,$carry,$t7 + addccc $t2,$carry,$t2 + addccc $t3,0,$t3 + addccc $t4,0,$t4 + addccc $t5,0,$t5 + addccc $t6,$bi,$t6 + addc $t7,$carry,$t7 addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a addccc $t1,@acc[1],@acc[1] @@ -523,7 +544,7 @@ __ecp_nistz256_mul_by_3: addccc $t6,@acc[6],@acc[6] addccc $t7,@acc[7],@acc[7] b .Lreduce_by_sub - subc %g0,%g0,$carry ! 
broadcast carry bit + addc %g0,%g0,$carry .type __ecp_nistz256_mul_by_3,#function .size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 @@ -1662,14 +1683,15 @@ __ecp_nistz256_add_noload_vis3: addcc $acc0,1,$t0 ! add -modulus, i.e. subtract addxccc $acc1,$poly1,$t1 addxccc $acc2,$minus1,$t2 - addxc $acc3,$poly3,$t3 + addxccc $acc3,$poly3,$t3 + addxc $acc4,$minus1,$acc4 - movrnz $acc4,$t0,$acc0 ! if a+b carried, ret = ret-mod - movrnz $acc4,$t1,$acc1 + movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus + movrz $acc4,$t1,$acc1 stx $acc0,[$rp] - movrnz $acc4,$t2,$acc2 + movrz $acc4,$t2,$acc2 stx $acc1,[$rp+8] - movrnz $acc4,$t3,$acc3 + movrz $acc4,$t3,$acc3 stx $acc2,[$rp+16] retl stx $acc3,[$rp+24] diff --git a/crypto/ec/asm/ecp_nistz256-x86.pl b/crypto/ec/asm/ecp_nistz256-x86.pl index e9fa038a0b..b96b1aa017 100755 --- a/crypto/ec/asm/ecp_nistz256-x86.pl +++ b/crypto/ec/asm/ecp_nistz256-x86.pl @@ -284,18 +284,41 @@ for(1..37) { &mov (&DWP(16,"edi"),"eax"); &adc ("ecx",&DWP(24,"ebp")); &mov (&DWP(20,"edi"),"ebx"); + &mov ("esi",0); &adc ("edx",&DWP(28,"ebp")); &mov (&DWP(24,"edi"),"ecx"); - &sbb ("esi","esi"); # broadcast carry bit + &adc ("esi",0); &mov (&DWP(28,"edi"),"edx"); - # if a+b carries, subtract modulus. + # if a+b >= modulus, subtract modulus. # + # But since comparison implies subtraction, we subtract modulus + # to see if it borrows, and then subtract it for real if + # subtraction didn't borrow. + + &mov ("eax",&DWP(0,"edi")); + &mov ("ebx",&DWP(4,"edi")); + &mov ("ecx",&DWP(8,"edi")); + &sub ("eax",-1); + &mov ("edx",&DWP(12,"edi")); + &sbb ("ebx",-1); + &mov ("eax",&DWP(16,"edi")); + &sbb ("ecx",-1); + &mov ("ebx",&DWP(20,"edi")); + &sbb ("edx",0); + &mov ("ecx",&DWP(24,"edi")); + &sbb ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &sbb ("ebx",0); + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("esi",0); + # Note that because mod has special form, i.e. 
consists of # 0xffffffff, 1 and 0s, we can conditionally synthesize it by - # assigning carry bit to one register, %ebp, and its negative - # to another, %esi. But we started by calculating %esi... + # by using borrow. + &not ("esi"); &mov ("eax",&DWP(0,"edi")); &mov ("ebp","esi"); &mov ("ebx",&DWP(4,"edi"));