ec/asm/ecp_nistz256-*.pl: get corner case logic right.
author Andy Polyakov <appro@openssl.org>
Sun, 21 Feb 2016 20:04:26 +0000 (21:04 +0100)
committer Andy Polyakov <appro@openssl.org>
Tue, 23 Feb 2016 20:22:30 +0000 (21:22 +0100)
RT#4284

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/ec/asm/ecp_nistz256-armv4.pl
crypto/ec/asm/ecp_nistz256-armv8.pl
crypto/ec/asm/ecp_nistz256-x86.pl

index 3a636eae6cb61d8ab7830c6b7770db4d4e7d619f..ab11a8782e298e81ebad6fda5c15794394499818 100755 (executable)
@@ -1252,6 +1252,7 @@ ecp_nistz256_point_double:
        stmdb   sp!,{r0-r12,lr}         @ push from r0, unusual, but intentional
        sub     sp,sp,#32*5
 
+.Lpoint_double_shortcut:
        add     r3,sp,#$in_x
        ldmia   $a_ptr!,{r4-r11}        @ copy in_x
        stmia   r3,{r4-r11}
@@ -1371,7 +1372,7 @@ $code.=<<___;
 .align 5
 ecp_nistz256_point_add:
        stmdb   sp!,{r0-r12,lr}         @ push from r0, unusual, but intentional
-       sub     sp,sp,#32*18
+       sub     sp,sp,#32*18+16
 
        ldmia   $b_ptr!,{r4-r11}        @ copy in2
        add     r3,sp,#$in2_x
@@ -1504,9 +1505,9 @@ ecp_nistz256_point_add:
        tst     $t0,$t1
        beq     .Ladd_proceed           @ (in1infty || in2infty)?
        tst     $t2,$t2
-       beq     .Ladd_proceed           @ is_equal(S1,S2)?
+       beq     .Ladd_double            @ is_equal(S1,S2)?
 
-       ldr     $r_ptr,[sp,#32*18]
+       ldr     $r_ptr,[sp,#32*18+16]
        eor     r4,r4,r4
        eor     r5,r5,r5
        eor     r6,r6,r6
@@ -1520,6 +1521,12 @@ ecp_nistz256_point_add:
        stmia   $r_ptr!,{r4-r11}
        b       .Ladd_done
 
+.align 4
+.Ladd_double:
+       ldr     $a_ptr,[sp,#32*18+20]
+       add     sp,sp,#32*(18-5)+16     @ difference in frame sizes
+       b       .Lpoint_double_shortcut
+
 .align 4
 .Ladd_proceed:
        add     $a_ptr,sp,#$R
@@ -1588,7 +1595,7 @@ ecp_nistz256_point_add:
        add     r3,sp,#$in1_x
        and     r11,r11,r12
        mvn     r12,r12
-       ldr     $r_ptr,[sp,#32*18]
+       ldr     $r_ptr,[sp,#32*18+16]
 ___
 for($i=0;$i<96;$i+=8) {                        # conditional moves
 $code.=<<___;
@@ -1610,7 +1617,7 @@ ___
 }
 $code.=<<___;
 .Ladd_done:
-       add     sp,sp,#32*18+16         @ +16 means "skip even over saved r0-r3"
+       add     sp,sp,#32*18+16+16      @ +16 means "skip even over saved r0-r3"
 #if __ARM_ARCH__>=5 || defined(__thumb__)
        ldmia   sp!,{r4-r12,pc}
 #else
index ce6b69e8daf17f5587f7406d27ef9632764dcf42..4b2e925434ad0549e0432dc48567e86213fc59f1 100644 (file)
@@ -691,12 +691,13 @@ $code.=<<___;
 .type  ecp_nistz256_point_double,%function
 .align 5
 ecp_nistz256_point_double:
-       stp     x29,x30,[sp,#-48]!
+       stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        sub     sp,sp,#32*4
 
+.Ldouble_shortcut:
        ldp     $acc0,$acc1,[$ap,#32]
         mov    $rp_real,$rp
        ldp     $acc2,$acc3,[$ap,#48]
@@ -823,7 +824,7 @@ ecp_nistz256_point_double:
        add     sp,x29,#0               // destroy frame
        ldp     x19,x20,[x29,#16]
        ldp     x21,x22,[x29,#32]
-       ldp     x29,x30,[sp],#48
+       ldp     x29,x30,[sp],#80
        ret
 .size  ecp_nistz256_point_double,.-ecp_nistz256_point_double
 ___
@@ -963,7 +964,7 @@ ecp_nistz256_point_add:
        b.eq    .Ladd_proceed           // (in1infty || in2infty)?
 
        tst     $temp,$temp
-       b.eq    .Ladd_proceed           // is_equal(S1,S2)?
+       b.eq    .Ladd_double            // is_equal(S1,S2)?
 
        eor     $a0,$a0,$a0
        eor     $a1,$a1,$a1
@@ -975,6 +976,15 @@ ecp_nistz256_point_add:
        stp     $a0,$a1,[$rp_real,#80]
        b       .Ladd_done
 
+.align 4
+.Ladd_double:
+       mov     $ap,$ap_real
+       mov     $rp,$rp_real
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       add     sp,sp,#32*(12-4)        // difference in stack frames
+       b       .Ldouble_shortcut
+
 .align 4
 .Ladd_proceed:
        add     $rp,sp,#$Rsqr
index 421ac0b34da0b310f87032d1cc6550e32347c8aa..4d55f82ef8efae4b7fb2af8dcdbe4d9d6eae4035 100755 (executable)
@@ -1197,6 +1197,7 @@ for ($i=0;$i<7;$i++) {
 ########################################################################
 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
 #
+&static_label("point_double_shortcut");
 &function_begin("ecp_nistz256_point_double");
 {   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
 
@@ -1212,6 +1213,7 @@ for ($i=0;$i<7;$i++) {
        &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
        &mov    ("ebp",&DWP(0,"edx"));          }
 
+&set_label("point_double_shortcut");
        &mov    ("eax",&DWP(0,"esi"));          # copy in_x
        &mov    ("ebx",&DWP(4,"esi"));
        &mov    ("ecx",&DWP(8,"esi"));
@@ -1491,7 +1493,7 @@ for ($i=0;$i<7;$i++) {
        &mov    ("ebx",&DWP(32*18+8,"esp"));
        &jz     (&label("add_proceed"));        # (in1infty || in2infty)?
        &test   ("ebx","ebx");
-       &jz     (&label("add_proceed"));        # is_equal(S1,S2)?
+       &jz     (&label("add_double"));         # is_equal(S1,S2)?
 
        &mov    ("edi",&wparam(0));
        &xor    ("eax","eax");
@@ -1499,6 +1501,12 @@ for ($i=0;$i<7;$i++) {
        &data_byte(0xfc,0xf3,0xab);             # cld; stosd
        &jmp    (&label("add_done"));
 
+&set_label("add_double",16);
+       &mov    ("esi",&wparam(1));
+       &mov    ("ebp",&DWP(32*18+12,"esp"));   # OPENSSL_ia32cap_P copy
+       &add    ("esp",4*((8*18+5)-(8*5+1)));   # difference in frame sizes
+       &jmp    (&label("point_double_shortcut"));
+
 &set_label("add_proceed",16);
        &mov    ("eax",&DWP(32*18+12,"esp"));   # OPENSSL_ia32cap_P copy
        &lea    ("esi",&DWP($R,"esp"));