ec/asm/ecp_nistz256-*.pl: addition to perform stricter reduction.
author Andy Polyakov <appro@openssl.org>
Sat, 20 Aug 2016 20:10:24 +0000 (22:10 +0200)
committer Matt Caswell <matt@openssl.org>
Wed, 24 Aug 2016 09:44:56 +0000 (10:44 +0100)
Addition was not preserving inputs' property of being fully reduced.

Thanks to Brian Smith for reporting this.

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/ec/asm/ecp_nistz256-armv4.pl
crypto/ec/asm/ecp_nistz256-armv8.pl
crypto/ec/asm/ecp_nistz256-sparcv9.pl
crypto/ec/asm/ecp_nistz256-x86.pl

index 73b7a55..de3cd5c 100755 (executable)
@@ -174,10 +174,7 @@ __ecp_nistz256_mul_by_2:
        adcs    $a6,$a6,$a6
        mov     $ff,#0
        adcs    $a7,$a7,$a7
-#ifdef __thumb2__
-       it      cs
-#endif
-       movcs   $ff,#-1                 @ $ff = carry ? -1 : 0
+       adc     $ff,$ff,#0
 
        b       .Lreduce_by_sub
 .size  __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
@@ -228,35 +225,45 @@ __ecp_nistz256_add:
        adcs    $a6,$a6,$t2
        mov     $ff,#0
        adcs    $a7,$a7,$t3
-#ifdef __thumb2__
-       it      cs
-#endif
-       movcs   $ff,#-1                 @ $ff = carry ? -1 : 0, "broadcast" carry
+       adc     $ff,$ff,#0
        ldr     lr,[sp],#4              @ pop lr
 
 .Lreduce_by_sub:
 
-       @ if a+b carries, subtract modulus.
+       @ if a+b >= modulus, subtract modulus.
        @
+       @ But since comparison implies subtraction, we subtract
+       @ modulus and then add it back if subtraction borrowed.
+
+       subs    $a0,$a0,#-1
+       sbcs    $a1,$a1,#-1
+       sbcs    $a2,$a2,#-1
+       sbcs    $a3,$a3,#0
+       sbcs    $a4,$a4,#0
+       sbcs    $a5,$a5,#0
+       sbcs    $a6,$a6,#1
+       sbcs    $a7,$a7,#-1
+       sbc     $ff,$ff,#0
+
        @ Note that because mod has special form, i.e. consists of
        @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
-       @ using value of broadcasted carry as a whole or extracting
-       @ single bit. Follow $ff register...
+       @ using value of borrow as a whole or extracting single bit.
+       @ Follow $ff register...
 
-       subs    $a0,$a0,$ff             @ subtract synthesized modulus
-       sbcs    $a1,$a1,$ff
+       adds    $a0,$a0,$ff             @ add synthesized modulus
+       adcs    $a1,$a1,$ff
        str     $a0,[$r_ptr,#0]
-       sbcs    $a2,$a2,$ff
+       adcs    $a2,$a2,$ff
        str     $a1,[$r_ptr,#4]
-       sbcs    $a3,$a3,#0
+       adcs    $a3,$a3,#0
        str     $a2,[$r_ptr,#8]
-       sbcs    $a4,$a4,#0
+       adcs    $a4,$a4,#0
        str     $a3,[$r_ptr,#12]
-       sbcs    $a5,$a5,#0
+       adcs    $a5,$a5,#0
        str     $a4,[$r_ptr,#16]
-       sbcs    $a6,$a6,$ff,lsr#31
+       adcs    $a6,$a6,$ff,lsr#31
        str     $a5,[$r_ptr,#20]
-       sbcs    $a7,$a7,$ff
+       adcs    $a7,$a7,$ff
        str     $a6,[$r_ptr,#24]
        str     $a7,[$r_ptr,#28]
 
@@ -304,26 +311,29 @@ __ecp_nistz256_mul_by_3:
        adcs    $a6,$a6,$a6
        mov     $ff,#0
        adcs    $a7,$a7,$a7
-#ifdef __thumb2__
-       it      cs
-#endif
-       movcs   $ff,#-1                 @ $ff = carry ? -1 : 0, "broadcast" carry
-
-       subs    $a0,$a0,$ff             @ subtract synthesized modulus, see
-                                       @ .Lreduce_by_sub for details, except
-                                       @ that we don't write anything to
-                                       @ memory, but keep intermediate
-                                       @ results in registers...
-       sbcs    $a1,$a1,$ff
-       sbcs    $a2,$a2,$ff
+       adc     $ff,$ff,#0
+
+       subs    $a0,$a0,#-1             @ .Lreduce_by_sub but without stores
+       sbcs    $a1,$a1,#-1
+       sbcs    $a2,$a2,#-1
        sbcs    $a3,$a3,#0
        sbcs    $a4,$a4,#0
-        ldr    $b_ptr,[$a_ptr,#0]
        sbcs    $a5,$a5,#0
+       sbcs    $a6,$a6,#1
+       sbcs    $a7,$a7,#-1
+       sbc     $ff,$ff,#0
+
+       adds    $a0,$a0,$ff             @ add synthesized modulus
+       adcs    $a1,$a1,$ff
+       adcs    $a2,$a2,$ff
+       adcs    $a3,$a3,#0
+       adcs    $a4,$a4,#0
+        ldr    $b_ptr,[$a_ptr,#0]
+       adcs    $a5,$a5,#0
         ldr    $t1,[$a_ptr,#4]
-       sbcs    $a6,$a6,$ff,lsr#31
+       adcs    $a6,$a6,$ff,lsr#31
         ldr    $t2,[$a_ptr,#8]
-       sbcs    $a7,$a7,$ff
+       adc     $a7,$a7,$ff
 
        ldr     $t0,[$a_ptr,#12]
        adds    $a0,$a0,$b_ptr          @ 2*a[0:7]+=a[0:7]
@@ -339,10 +349,7 @@ __ecp_nistz256_mul_by_3:
        adcs    $a6,$a6,$t2
        mov     $ff,#0
        adcs    $a7,$a7,$t3
-#ifdef __thumb2__
-       it      cs
-#endif
-       movcs   $ff,#-1                 @ $ff = carry ? -1 : 0, "broadcast" carry
+       adc     $ff,$ff,#0
        ldr     lr,[sp],#4              @ pop lr
 
        b       .Lreduce_by_sub
@@ -1210,25 +1217,42 @@ __ecp_nistz256_add_self:
        adcs    $a6,$a6,$a6
        mov     $ff,#0
        adcs    $a7,$a7,$a7
-#ifdef __thumb2__
-       it      cs
-#endif
-       movcs   $ff,#-1                 @ $ff = carry ? -1 : 0
+       adc     $ff,$ff,#0
+
+       @ if a+b >= modulus, subtract modulus.
+       @
+       @ But since comparison implies subtraction, we subtract
+       @ modulus and then add it back if subtraction borrowed.
+
+       subs    $a0,$a0,#-1
+       sbcs    $a1,$a1,#-1
+       sbcs    $a2,$a2,#-1
+       sbcs    $a3,$a3,#0
+       sbcs    $a4,$a4,#0
+       sbcs    $a5,$a5,#0
+       sbcs    $a6,$a6,#1
+       sbcs    $a7,$a7,#-1
+       sbc     $ff,$ff,#0
 
-       subs    $a0,$a0,$ff             @ subtract synthesized modulus
-       sbcs    $a1,$a1,$ff
+       @ Note that because mod has special form, i.e. consists of
+       @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
+       @ using value of borrow as a whole or extracting single bit.
+       @ Follow $ff register...
+
+       adds    $a0,$a0,$ff             @ add synthesized modulus
+       adcs    $a1,$a1,$ff
        str     $a0,[$r_ptr,#0]
-       sbcs    $a2,$a2,$ff
+       adcs    $a2,$a2,$ff
        str     $a1,[$r_ptr,#4]
-       sbcs    $a3,$a3,#0
+       adcs    $a3,$a3,#0
        str     $a2,[$r_ptr,#8]
-       sbcs    $a4,$a4,#0
+       adcs    $a4,$a4,#0
        str     $a3,[$r_ptr,#12]
-       sbcs    $a5,$a5,#0
+       adcs    $a5,$a5,#0
        str     $a4,[$r_ptr,#16]
-       sbcs    $a6,$a6,$ff,lsr#31
+       adcs    $a6,$a6,$ff,lsr#31
        str     $a5,[$r_ptr,#20]
-       sbcs    $a7,$a7,$ff
+       adcs    $a7,$a7,$ff
        str     $a6,[$r_ptr,#24]
        str     $a7,[$r_ptr,#28]
 
index c5c1280..1362586 100644 (file)
@@ -583,14 +583,14 @@ __ecp_nistz256_add:
        adds    $t0,$acc0,#1            // subs $t0,$a0,#-1 // tmp = ret-modulus
        sbcs    $t1,$acc1,$poly1
        sbcs    $t2,$acc2,xzr
-       sbc     $t3,$acc3,$poly3
-       cmp     $ap,xzr                 // did addition carry?
+       sbcs    $t3,$acc3,$poly3
+       sbcs    xzr,$ap,xzr             // did subtraction borrow?
 
-       csel    $acc0,$acc0,$t0,eq      // ret = carry ? ret-modulus : ret
-       csel    $acc1,$acc1,$t1,eq
-       csel    $acc2,$acc2,$t2,eq
+       csel    $acc0,$acc0,$t0,lo      // ret = borrow ? ret : ret-modulus
+       csel    $acc1,$acc1,$t1,lo
+       csel    $acc2,$acc2,$t2,lo
        stp     $acc0,$acc1,[$rp]
-       csel    $acc3,$acc3,$t3,eq
+       csel    $acc3,$acc3,$t3,lo
        stp     $acc2,$acc3,[$rp,#16]
 
        ret
index 3f39088..3c7ff50 100755 (executable)
@@ -406,33 +406,44 @@ __ecp_nistz256_add:
        addccc  @acc[5],$t5,@acc[5]
        addccc  @acc[6],$t6,@acc[6]
        addccc  @acc[7],$t7,@acc[7]
-       subc    %g0,%g0,$carry          ! broadcast carry bit
+       addc    %g0,%g0,$carry
 
 .Lreduce_by_sub:
 
-       ! if a+b carries, subtract modulus.
+       ! if a+b >= modulus, subtract modulus.
        !
+       ! But since comparison implies subtraction, we subtract
+       ! modulus and then add it back if subtraction borrowed.
+
+       subcc   @acc[0],-1,@acc[0]
+       subccc  @acc[1],-1,@acc[1]
+       subccc  @acc[2],-1,@acc[2]
+       subccc  @acc[3], 0,@acc[3]
+       subccc  @acc[4], 0,@acc[4]
+       subccc  @acc[5], 0,@acc[5]
+       subccc  @acc[6], 1,@acc[6]
+       subccc  @acc[7],-1,@acc[7]
+       subc    $carry,0,$carry
+
        ! Note that because mod has special form, i.e. consists of
        ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
-       ! using value of broadcasted borrow and the borrow bit itself.
-       ! To minimize dependency chain we first broadcast and then
-       ! extract the bit by negating (follow $bi).
+       ! using value of borrow and its negative.
 
-       subcc   @acc[0],$carry,@acc[0]  ! subtract synthesized modulus
-       subccc  @acc[1],$carry,@acc[1]
+       addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
+       addccc  @acc[1],$carry,@acc[1]
        neg     $carry,$bi
        st      @acc[0],[$rp]
-       subccc  @acc[2],$carry,@acc[2]
+       addccc  @acc[2],$carry,@acc[2]
        st      @acc[1],[$rp+4]
-       subccc  @acc[3],0,@acc[3]
+       addccc  @acc[3],0,@acc[3]
        st      @acc[2],[$rp+8]
-       subccc  @acc[4],0,@acc[4]
+       addccc  @acc[4],0,@acc[4]
        st      @acc[3],[$rp+12]
-       subccc  @acc[5],0,@acc[5]
+       addccc  @acc[5],0,@acc[5]
        st      @acc[4],[$rp+16]
-       subccc  @acc[6],$bi,@acc[6]
+       addccc  @acc[6],$bi,@acc[6]
        st      @acc[5],[$rp+20]
-       subc    @acc[7],$carry,@acc[7]
+       addc    @acc[7],$carry,@acc[7]
        st      @acc[6],[$rp+24]
        retl
        st      @acc[7],[$rp+28]
@@ -469,7 +480,7 @@ __ecp_nistz256_mul_by_2:
        addccc  @acc[6],@acc[6],@acc[6]
        addccc  @acc[7],@acc[7],@acc[7]
        b       .Lreduce_by_sub
-       subc    %g0,%g0,$carry          ! broadcast carry bit
+       addc    %g0,%g0,$carry
 .type  __ecp_nistz256_mul_by_2,#function
 .size  __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
 
@@ -502,17 +513,27 @@ __ecp_nistz256_mul_by_3:
        addccc  @acc[5],@acc[5],$t5
        addccc  @acc[6],@acc[6],$t6
        addccc  @acc[7],@acc[7],$t7
-       subc    %g0,%g0,$carry          ! broadcast carry bit
+       addc    %g0,%g0,$carry
 
-       subcc   $t0,$carry,$t0          ! .Lreduce_by_sub but without stores
+       subcc   $t0,-1,$t0              ! .Lreduce_by_sub but without stores
+       subccc  $t1,-1,$t1
+       subccc  $t2,-1,$t2
+       subccc  $t3, 0,$t3
+       subccc  $t4, 0,$t4
+       subccc  $t5, 0,$t5
+       subccc  $t6, 1,$t6
+       subccc  $t7,-1,$t7
+       subc    $carry,0,$carry
+
+       addcc   $t0,$carry,$t0          ! add synthesized modulus
+       addccc  $t1,$carry,$t1
        neg     $carry,$bi
-       subccc  $t1,$carry,$t1
-       subccc  $t2,$carry,$t2
-       subccc  $t3,0,$t3
-       subccc  $t4,0,$t4
-       subccc  $t5,0,$t5
-       subccc  $t6,$bi,$t6
-       subc    $t7,$carry,$t7
+       addccc  $t2,$carry,$t2
+       addccc  $t3,0,$t3
+       addccc  $t4,0,$t4
+       addccc  $t5,0,$t5
+       addccc  $t6,$bi,$t6
+       addc    $t7,$carry,$t7
 
        addcc   $t0,@acc[0],@acc[0]     ! 2*a+a=3*a
        addccc  $t1,@acc[1],@acc[1]
@@ -523,7 +544,7 @@ __ecp_nistz256_mul_by_3:
        addccc  $t6,@acc[6],@acc[6]
        addccc  $t7,@acc[7],@acc[7]
        b       .Lreduce_by_sub
-       subc    %g0,%g0,$carry          ! broadcast carry bit
+       addc    %g0,%g0,$carry
 .type  __ecp_nistz256_mul_by_3,#function
 .size  __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
 
@@ -1662,14 +1683,15 @@ __ecp_nistz256_add_noload_vis3:
        addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
        addxccc $acc1,$poly1,$t1
        addxccc $acc2,$minus1,$t2
-       addxc   $acc3,$poly3,$t3
+       addxccc $acc3,$poly3,$t3
+       addxc   $acc4,$minus1,$acc4
 
-       movrnz  $acc4,$t0,$acc0         ! if a+b carried, ret = ret-mod
-       movrnz  $acc4,$t1,$acc1
+       movrz   $acc4,$t0,$acc0         ! ret = borrow ? ret : ret-modulus
+       movrz   $acc4,$t1,$acc1
        stx     $acc0,[$rp]
-       movrnz  $acc4,$t2,$acc2
+       movrz   $acc4,$t2,$acc2
        stx     $acc1,[$rp+8]
-       movrnz  $acc4,$t3,$acc3
+       movrz   $acc4,$t3,$acc3
        stx     $acc2,[$rp+16]
        retl
        stx     $acc3,[$rp+24]
index e9fa038..b96b1aa 100755 (executable)
@@ -284,18 +284,41 @@ for(1..37) {
        &mov    (&DWP(16,"edi"),"eax");
        &adc    ("ecx",&DWP(24,"ebp"));
        &mov    (&DWP(20,"edi"),"ebx");
+       &mov    ("esi",0);
        &adc    ("edx",&DWP(28,"ebp"));
        &mov    (&DWP(24,"edi"),"ecx");
-       &sbb    ("esi","esi");                  # broadcast carry bit
+       &adc    ("esi",0);
        &mov    (&DWP(28,"edi"),"edx");
 
-       # if a+b carries, subtract modulus.
+       # if a+b >= modulus, subtract modulus.
        #
+       # But since comparison implies subtraction, we subtract modulus
+       # to see if it borrows, and then subtract it for real if
+       # subtraction didn't borrow.
+
+       &mov    ("eax",&DWP(0,"edi"));
+       &mov    ("ebx",&DWP(4,"edi"));
+       &mov    ("ecx",&DWP(8,"edi"));
+       &sub    ("eax",-1);
+       &mov    ("edx",&DWP(12,"edi"));
+       &sbb    ("ebx",-1);
+       &mov    ("eax",&DWP(16,"edi"));
+       &sbb    ("ecx",-1);
+       &mov    ("ebx",&DWP(20,"edi"));
+       &sbb    ("edx",0);
+       &mov    ("ecx",&DWP(24,"edi"));
+       &sbb    ("eax",0);
+       &mov    ("edx",&DWP(28,"edi"));
+       &sbb    ("ebx",0);
+       &sbb    ("ecx",1);
+       &sbb    ("edx",-1);
+       &sbb    ("esi",0);
+
        # Note that because mod has special form, i.e. consists of
        # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
-       # assigning carry bit to one register, %ebp, and its negative
-       # to another, %esi. But we started by calculating %esi...
+       # using borrow.
 
+       &not    ("esi");
        &mov    ("eax",&DWP(0,"edi"));
        &mov    ("ebp","esi");
        &mov    ("ebx",&DWP(4,"edi"));