ec/asm/ecp_nistz256-armv8.pl: add optimized inversion.
author Andy Polyakov <appro@openssl.org>
Sat, 30 Dec 2017 14:11:25 +0000 (15:11 +0100)
committer Andy Polyakov <appro@openssl.org>
Sun, 7 Jan 2018 20:32:37 +0000 (21:32 +0100)
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)

crypto/ec/asm/ecp_nistz256-armv8.pl
crypto/ec/ecp_nistz256.c

index d93c4fe..2f1eb21 100644 (file)
 # http://eprint.iacr.org/2013/816.
 #
 #                      with/without -DECP_NISTZ256_ASM
-# Apple A7             +120-360%
-# Cortex-A53           +120-400%
-# Cortex-A57           +120-350%
-# X-Gene               +200-330%
-# Denver               +140-400%
+# Apple A7             +190-360%
+# Cortex-A53           +190-400%
+# Cortex-A57           +190-350%
+# Denver               +230-400%
 #
 # Ranges denote minimum and maximum improvement coefficients depending
 # on benchmark. Lower coefficients are for ECDSA sign, server-side
@@ -109,6 +108,10 @@ $code.=<<___;
 .quad  0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
 .Lone:
 .quad  1,0,0,0
+.Lord:
+.quad  0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+.LordK:
+.quad  0xccd1c8aaee00bc4f
 .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 
 // void        ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
@@ -1309,6 +1312,302 @@ $code.=<<___;
        ret
 .size  ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
 ___
+}
+if (1) {
+my ($ord0,$ord1) = ($poly1,$poly3);
+my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
+my $acc7 = $bi;
+
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+.type  ecp_nistz256_ord_mul_mont,%function
+.align 4
+ecp_nistz256_ord_mul_mont:
+       stp     x29,x30,[sp,#-64]!
+       add     x29,sp,#0
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+
+       adr     $ordk,.Lord
+       ldr     $bi,[$bp]               // bp[0]
+       ldp     $a0,$a1,[$ap]
+       ldp     $a2,$a3,[$ap,#16]
+
+       ldp     $ord0,$ord1,[$ordk,#0]
+       ldp     $ord2,$ord3,[$ordk,#16]
+       ldr     $ordk,[$ordk,#32]
+
+       mul     $acc0,$a0,$bi           // a[0]*b[0]
+       umulh   $t0,$a0,$bi
+
+       mul     $acc1,$a1,$bi           // a[1]*b[0]
+       umulh   $t1,$a1,$bi
+
+       mul     $acc2,$a2,$bi           // a[2]*b[0]
+       umulh   $t2,$a2,$bi
+
+       mul     $acc3,$a3,$bi           // a[3]*b[0]
+       umulh   $acc4,$a3,$bi
+
+       mul     $t4,$acc0,$ordk
+
+       adds    $acc1,$acc1,$t0         // accumulate high parts of multiplication
+       adcs    $acc2,$acc2,$t1
+       adcs    $acc3,$acc3,$t2
+       adc     $acc4,$acc4,xzr
+       mov     $acc5,xzr
+___
+for ($i=1;$i<4;$i++) {
+       ################################################################
+       #            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
+       # *                                     abcdefgh
+       # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+       #
+       # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+       # rewrite above as:
+       #
+       #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+       # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
+       # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
+$code.=<<___;
+       ldr     $bi,[$bp,#8*$i]         // b[i]
+
+       lsl     $t0,$t4,#32
+       subs    $acc2,$acc2,$t4
+       lsr     $t1,$t4,#32
+       sbcs    $acc3,$acc3,$t0
+       sbcs    $acc4,$acc4,$t1
+       sbc     $acc5,$acc5,xzr
+
+       subs    xzr,$acc0,#1
+       umulh   $t1,$ord0,$t4
+       mul     $t2,$ord1,$t4
+       umulh   $t3,$ord1,$t4
+
+       adcs    $t2,$t2,$t1
+        mul    $t0,$a0,$bi
+       adc     $t3,$t3,xzr
+        mul    $t1,$a1,$bi
+
+       adds    $acc0,$acc1,$t2
+        mul    $t2,$a2,$bi
+       adcs    $acc1,$acc2,$t3
+        mul    $t3,$a3,$bi
+       adcs    $acc2,$acc3,$t4
+       adcs    $acc3,$acc4,$t4
+       adc     $acc4,$acc5,xzr
+
+       adds    $acc0,$acc0,$t0         // accumulate low parts
+       umulh   $t0,$a0,$bi
+       adcs    $acc1,$acc1,$t1
+       umulh   $t1,$a1,$bi
+       adcs    $acc2,$acc2,$t2
+       umulh   $t2,$a2,$bi
+       adcs    $acc3,$acc3,$t3
+       umulh   $t3,$a3,$bi
+       adc     $acc4,$acc4,xzr
+       mul     $t4,$acc0,$ordk
+       adds    $acc1,$acc1,$t0         // accumulate high parts
+       adcs    $acc2,$acc2,$t1
+       adcs    $acc3,$acc3,$t2
+       adcs    $acc4,$acc4,$t3
+       adc     $acc5,xzr,xzr
+___
+}
+$code.=<<___;
+       lsl     $t0,$t4,#32             // last reduction
+       subs    $acc2,$acc2,$t4
+       lsr     $t1,$t4,#32
+       sbcs    $acc3,$acc3,$t0
+       sbcs    $acc4,$acc4,$t1
+       sbc     $acc5,$acc5,xzr
+
+       subs    xzr,$acc0,#1
+       umulh   $t1,$ord0,$t4
+       mul     $t2,$ord1,$t4
+       umulh   $t3,$ord1,$t4
+
+       adcs    $t2,$t2,$t1
+       adc     $t3,$t3,xzr
+
+       adds    $acc0,$acc1,$t2
+       adcs    $acc1,$acc2,$t3
+       adcs    $acc2,$acc3,$t4
+       adcs    $acc3,$acc4,$t4
+       adc     $acc4,$acc5,xzr
+
+       subs    $t0,$acc0,$ord0         // ret -= modulus
+       sbcs    $t1,$acc1,$ord1
+       sbcs    $t2,$acc2,$ord2
+       sbcs    $t3,$acc3,$ord3
+       sbcs    xzr,$acc4,xzr
+
+       csel    $acc0,$acc0,$t0,lo      // ret = borrow ? ret : ret-modulus
+       csel    $acc1,$acc1,$t1,lo
+       csel    $acc2,$acc2,$t2,lo
+       stp     $acc0,$acc1,[$rp]
+       csel    $acc3,$acc3,$t3,lo
+       stp     $acc2,$acc3,[$rp,#16]
+
+       ldp     x19,x20,[sp,#16]
+       ldp     x21,x22,[sp,#32]
+       ldp     x23,x24,[sp,#48]
+       ldr     x29,[sp],#64
+       ret
+.size  ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+//                                int rep);
+.globl ecp_nistz256_ord_sqr_mont
+.type  ecp_nistz256_ord_sqr_mont,%function
+.align 4
+ecp_nistz256_ord_sqr_mont:
+       stp     x29,x30,[sp,#-64]!
+       add     x29,sp,#0
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+
+       adr     $ordk,.Lord
+       ldp     $a0,$a1,[$ap]
+       ldp     $a2,$a3,[$ap,#16]
+
+       ldp     $ord0,$ord1,[$ordk,#0]
+       ldp     $ord2,$ord3,[$ordk,#16]
+       ldr     $ordk,[$ordk,#32]
+       b       .Loop_ord_sqr
+
+.align 4
+.Loop_ord_sqr:
+       sub     $bp,$bp,#1
+       ////////////////////////////////////////////////////////////////
+       //  |  |  |  |  |  |a1*a0|  |
+       //  |  |  |  |  |a2*a0|  |  |
+       //  |  |a3*a2|a3*a0|  |  |  |
+       //  |  |  |  |a2*a1|  |  |  |
+       //  |  |  |a3*a1|  |  |  |  |
+       // *|  |  |  |  |  |  |  | 2|
+       // +|a3*a3|a2*a2|a1*a1|a0*a0|
+       //  |--+--+--+--+--+--+--+--|
+       //  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+       //
+       //  "can't overflow" below marks carrying into the high part of
+       //  a multiplication result, which can't overflow, because it
+       //  can never be all ones.
+
+       mul     $acc1,$a1,$a0           // a[1]*a[0]
+       umulh   $t1,$a1,$a0
+       mul     $acc2,$a2,$a0           // a[2]*a[0]
+       umulh   $t2,$a2,$a0
+       mul     $acc3,$a3,$a0           // a[3]*a[0]
+       umulh   $acc4,$a3,$a0
+
+       adds    $acc2,$acc2,$t1         // accumulate high parts of multiplication
+        mul    $t0,$a2,$a1             // a[2]*a[1]
+        umulh  $t1,$a2,$a1
+       adcs    $acc3,$acc3,$t2
+        mul    $t2,$a3,$a1             // a[3]*a[1]
+        umulh  $t3,$a3,$a1
+       adc     $acc4,$acc4,xzr         // can't overflow
+
+       mul     $acc5,$a3,$a2           // a[3]*a[2]
+       umulh   $acc6,$a3,$a2
+
+       adds    $t1,$t1,$t2             // accumulate high parts of multiplication
+        mul    $acc0,$a0,$a0           // a[0]*a[0]
+       adc     $t2,$t3,xzr             // can't overflow
+
+       adds    $acc3,$acc3,$t0         // accumulate low parts of multiplication
+        umulh  $a0,$a0,$a0
+       adcs    $acc4,$acc4,$t1
+        mul    $t1,$a1,$a1             // a[1]*a[1]
+       adcs    $acc5,$acc5,$t2
+        umulh  $a1,$a1,$a1
+       adc     $acc6,$acc6,xzr         // can't overflow
+
+       adds    $acc1,$acc1,$acc1       // acc[1-6]*=2
+        mul    $t2,$a2,$a2             // a[2]*a[2]
+       adcs    $acc2,$acc2,$acc2
+        umulh  $a2,$a2,$a2
+       adcs    $acc3,$acc3,$acc3
+        mul    $t3,$a3,$a3             // a[3]*a[3]
+       adcs    $acc4,$acc4,$acc4
+        umulh  $a3,$a3,$a3
+       adcs    $acc5,$acc5,$acc5
+       adcs    $acc6,$acc6,$acc6
+       adc     $acc7,xzr,xzr
+
+       adds    $acc1,$acc1,$a0         // +a[i]*a[i]
+        mul    $t4,$acc0,$ordk
+       adcs    $acc2,$acc2,$t1
+       adcs    $acc3,$acc3,$a1
+       adcs    $acc4,$acc4,$t2
+       adcs    $acc5,$acc5,$a2
+       adcs    $acc6,$acc6,$t3
+       adc     $acc7,$acc7,$a3
+___
+for($i=0; $i<4; $i++) {                        # reductions
+$code.=<<___;
+       subs    xzr,$acc0,#1
+       umulh   $t1,$ord0,$t4
+       mul     $t2,$ord1,$t4
+       umulh   $t3,$ord1,$t4
+
+       adcs    $t2,$t2,$t1
+       adc     $t3,$t3,xzr
+
+       adds    $acc0,$acc1,$t2
+       adcs    $acc1,$acc2,$t3
+       adcs    $acc2,$acc3,$t4
+       adc     $acc3,xzr,$t4           // can't overflow
+___
+$code.=<<___   if ($i<3);
+       mul     $t3,$acc0,$ordk
+___
+$code.=<<___;
+       lsl     $t0,$t4,#32
+       subs    $acc1,$acc1,$t4
+       lsr     $t1,$t4,#32
+       sbcs    $acc2,$acc2,$t0
+       sbc     $acc3,$acc3,$t1         // can't borrow
+___
+       ($t3,$t4) = ($t4,$t3);
+}
+$code.=<<___;
+       adds    $acc0,$acc0,$acc4       // accumulate upper half
+       adcs    $acc1,$acc1,$acc5
+       adcs    $acc2,$acc2,$acc6
+       adcs    $acc3,$acc3,$acc7
+       adc     $acc4,xzr,xzr
+
+       subs    $t0,$acc0,$ord0         // ret -= modulus
+       sbcs    $t1,$acc1,$ord1
+       sbcs    $t2,$acc2,$ord2
+       sbcs    $t3,$acc3,$ord3
+       sbcs    xzr,$acc4,xzr
+
+       csel    $a0,$acc0,$t0,lo        // ret = borrow ? ret : ret-modulus
+       csel    $a1,$acc1,$t1,lo
+       csel    $a2,$acc2,$t2,lo
+       csel    $a3,$acc3,$t3,lo
+
+       cbnz    $bp,.Loop_ord_sqr
+
+       stp     $a0,$a1,[$rp]
+       stp     $a2,$a3,[$rp,#16]
+
+       ldp     x19,x20,[sp,#16]
+       ldp     x21,x22,[sp,#32]
+       ldp     x23,x24,[sp,#48]
+       ldr     x29,[sp],#64
+       ret
+.size  ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
 }      }
 
 ########################################################################
index 6c85884..6bae3d1 100644 (file)
@@ -1499,7 +1499,8 @@ static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
 
 #if defined(__x86_64) || defined(__x86_64__) || \
     defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__powerpc64__) || defined(_ARCH_PP64)
+    defined(__powerpc64__) || defined(_ARCH_PP64) || \
+    defined(__aarch64__)
 /*
  * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P)
  */