X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fec%2Fasm%2Fecp_nistz256-armv8.pl;h=887ddfb1ea9b1eb8f9655535353efc1ea3f9f86f;hp=5a60f0b2fa144b32816c1b321f59c3e1954ef39c;hb=72a7a7021fa8bc82a11bc08bac1b0241a92143d0;hpb=a5aa63a456fb6e7129e3ba048b17d157df5a1bab diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl index 5a60f0b2fa..887ddfb1ea 100644 --- a/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -15,11 +22,10 @@ # http://eprint.iacr.org/2013/816. # # with/without -DECP_NISTZ256_ASM -# Apple A7 +120-360% -# Cortex-A53 +120-400% -# Cortex-A57 +120-350% -# X-Gene +200-330% -# Denver +140-400% +# Apple A7 +190-360% +# Cortex-A53 +190-400% +# Cortex-A57 +190-350% +# Denver +230-400% # # Ranges denote minimum and maximum improvement coefficients depending # on benchmark. Lower coefficients are for ECDSA sign, server-side @@ -102,6 +108,10 @@ $code.=<<___; .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe .Lone: .quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by " // void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); @@ -109,6 +119,7 @@ $code.=<<___; .type ecp_nistz256_to_mont,%function .align 6 ecp_nistz256_to_mont: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -124,6 +135,7 @@ ecp_nistz256_to_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont @@ -132,6 +144,7 @@ ecp_nistz256_to_mont: .type ecp_nistz256_from_mont,%function .align 4 ecp_nistz256_from_mont: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -147,6 +160,7 @@ ecp_nistz256_from_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont @@ -156,6 +170,7 @@ ecp_nistz256_from_mont: .type ecp_nistz256_mul_mont,%function .align 4 ecp_nistz256_mul_mont: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -170,6 +185,7 @@ ecp_nistz256_mul_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont @@ -178,6 +194,7 @@ ecp_nistz256_mul_mont: .type ecp_nistz256_sqr_mont,%function .align 4 ecp_nistz256_sqr_mont: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -191,6 +208,7 @@ ecp_nistz256_sqr_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont @@ -200,6 +218,7 @@ ecp_nistz256_sqr_mont: .type ecp_nistz256_add,%function .align 4 ecp_nistz256_add: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -213,6 +232,7 @@ ecp_nistz256_add: bl __ecp_nistz256_add ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_add,.-ecp_nistz256_add @@ -221,6 +241,7 @@ ecp_nistz256_add: .type ecp_nistz256_div_by_2,%function .align 4 ecp_nistz256_div_by_2: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -232,6 +253,7 @@ ecp_nistz256_div_by_2: bl __ecp_nistz256_div_by_2 ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 @@ -240,6 +262,7 @@ ecp_nistz256_div_by_2: .type ecp_nistz256_mul_by_2,%function .align 4 ecp_nistz256_mul_by_2: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -255,6 +278,7 @@ ecp_nistz256_mul_by_2: bl __ecp_nistz256_add // ret = a+a // 2*a ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 @@ -263,6 +287,7 @@ ecp_nistz256_mul_by_2: .type ecp_nistz256_mul_by_3,%function .align 4 ecp_nistz256_mul_by_3: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -289,6 +314,7 @@ ecp_nistz256_mul_by_3: bl __ecp_nistz256_add // ret += a // 2*a+a=3*a ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 @@ -298,6 +324,7 @@ ecp_nistz256_mul_by_3: .type ecp_nistz256_sub,%function .align 4 ecp_nistz256_sub: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -309,6 +336,7 @@ ecp_nistz256_sub: bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_sub,.-ecp_nistz256_sub @@ -317,6 +345,7 @@ ecp_nistz256_sub: .type ecp_nistz256_neg,%function .align 4 ecp_nistz256_neg: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -331,6 +360,7 @@ ecp_nistz256_neg: bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_neg,.-ecp_nistz256_neg @@ -576,14 +606,14 @@ __ecp_nistz256_add: adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr - sbc $t3,$acc3,$poly3 - cmp $ap,xzr // did addition carry? + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? - csel $acc0,$acc0,$t0,eq // ret = carry ? ret-modulus : ret - csel $acc1,$acc1,$t1,eq - csel $acc2,$acc2,$t2,eq + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] - csel $acc3,$acc3,$t3,eq + csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret @@ -653,7 +683,7 @@ __ecp_nistz256_div_by_2: adc $ap,xzr,xzr // zap $ap tst $acc0,#1 // is a even? - csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus + csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq csel $acc3,$acc3,$t3,eq @@ -674,7 +704,7 @@ __ecp_nistz256_div_by_2: .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 ___ ######################################################################## -# following subroutines are "literal" implemetation of those found in +# following subroutines are "literal" implementation of those found in # ecp_nistz256.c # ######################################################################## @@ -691,6 +721,7 @@ $code.=<<___; .type ecp_nistz256_point_double,%function .align 5 ecp_nistz256_point_double: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -825,6 +856,7 @@ ecp_nistz256_point_double: ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#80 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double ___ @@ -847,6 +879,7 @@ $code.=<<___; .type ecp_nistz256_point_add,%function .align 5 ecp_nistz256_point_add: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -855,46 +888,28 @@ ecp_nistz256_point_add: stp x25,x26,[sp,#64] sub sp,sp,#32*12 - ldp $a0,$a1,[$bp] - ldp $a2,$a3,[$bp,#16] - ldp $t0,$t1,[$bp,#32] - ldp $t2,$t3,[$bp,#48] + ldp $a0,$a1,[$bp,#64] // in2_z + ldp $a2,$a3,[$bp,#64+16] mov $rp_real,$rp mov $ap_real,$ap mov $bp_real,$bp - orr $a0,$a0,$a1 - orr $a2,$a2,$a3 - ldp $acc0,$acc1,[$ap] - orr $t0,$t0,$t1 - orr $t2,$t2,$t3 - ldp $acc2,$acc3,[$ap,#16] - orr $a0,$a0,$a2 - orr $t2,$t0,$t2 - ldp $t0,$t1,[$ap,#32] - orr $in2infty,$a0,$t2 - cmp $in2infty,#0 - ldp $t2,$t3,[$ap,#48] - csetm $in2infty,ne // !in2infty - - ldp $a0,$a1,[$bp_real,#64] // forward load for p256_sqr_mont - orr $acc0,$acc0,$acc1 - orr $acc2,$acc2,$acc3 - ldp $a2,$a3,[$bp_real,#64+16] - orr $t0,$t0,$t1 - orr $t2,$t2,$t3 - orr $acc0,$acc0,$acc2 - orr $t0,$t0,$t2 - orr $in1infty,$acc0,$t0 - cmp $in1infty,#0 ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 - csetm $in1infty,ne // !in1infty - + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in2infty,$t0,$t2 + cmp $in2infty,#0 + csetm $in2infty,ne // !in2infty add $rp,sp,#$Z2sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); - ldp $a0,$a1,[$ap_real,#64] + ldp $a0,$a1,[$ap_real,#64] // in1_z ldp $a2,$a3,[$ap_real,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // !in1infty add $rp,sp,#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); @@ -1102,12 +1117,13 @@ $code.=<<___; stp $acc2,$acc3,[$rp_real,#$i+16] .Ladd_done: - add sp,x29,#0 // destroy frame + add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add ___ @@ -1129,6 +1145,7 @@ $code.=<<___; .type ecp_nistz256_point_add_affine,%function .align 5 ecp_nistz256_point_add_affine: + .inst 0xd503233f // paciasp stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -1143,36 +1160,28 @@ ecp_nistz256_point_add_affine: ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 - ldp $a0,$a1,[$ap] - ldp $a2,$a3,[$ap,#16] - ldp $t0,$t1,[$ap,#32] - ldp $t2,$t3,[$ap,#48] - orr $a0,$a0,$a1 - orr $a2,$a2,$a3 - orr $t0,$t0,$t1 - orr $t2,$t2,$t3 - orr $a0,$a0,$a2 - orr $t0,$t0,$t2 - orr $in1infty,$a0,$t0 + ldp $a0,$a1,[$ap,#64] // in1_z + ldp $a2,$a3,[$ap,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 cmp $in1infty,#0 csetm $in1infty,ne // !in1infty - ldp $a0,$a1,[$bp] - ldp $a2,$a3,[$bp,#16] - ldp $t0,$t1,[$bp,#32] + ldp $acc0,$acc1,[$bp] // in2_x + ldp $acc2,$acc3,[$bp,#16] + ldp $t0,$t1,[$bp,#32] // in2_y ldp $t2,$t3,[$bp,#48] - orr $a0,$a0,$a1 - orr $a2,$a2,$a3 + orr $acc0,$acc0,$acc1 + orr $acc2,$acc2,$acc3 orr $t0,$t0,$t1 orr $t2,$t2,$t3 - orr $a0,$a0,$a2 + orr $acc0,$acc0,$acc2 orr $t0,$t0,$t2 - orr $in2infty,$a0,$t0 + orr $in2infty,$acc0,$t0 cmp $in2infty,#0 csetm $in2infty,ne // !in2infty - ldp $a0,$a1,[$ap_real,#64] - ldp $a2,$a3,[$ap_real,#64+16] add $rp,sp,#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); @@ -1325,9 +1334,306 @@ $code.=<<___; ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 + .inst 0xd50323bf // autiasp ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine ___ +} +if (1) { +my ($ord0,$ord1) = ($poly1,$poly3); +my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); +my $acc7 = $bi; + +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $acc4,$a3,$bi + + mul $t4,$acc0,$ordk + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adc $acc4,$acc4,xzr + mov $acc5,xzr +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ldr $bi,[$bp,#8*$i] // b[i] + + lsl $t0,$t4,#32 + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + mul $t0,$a0,$bi + adc $t3,$t3,xzr + mul $t1,$a1,$bi + + adds $acc0,$acc1,$t2 + mul $t2,$a2,$bi + adcs $acc1,$acc2,$t3 + mul $t3,$a3,$bi + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts + umulh $t0,$a0,$bi + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,$acc4,xzr + mul $t4,$acc0,$ordk + adds $acc1,$acc1,$t0 // accumulate high parts + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + lsl $t0,$t4,#32 // last reduction + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// int rep); +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub $bp,$bp,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + mul $t4,$acc0,$ordk + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + adcs $acc6,$acc6,$t3 + adc $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adc $acc3,xzr,$t4 // can't overflow +___ +$code.=<<___ if ($i<3); + mul $t3,$acc0,$ordk +___ +$code.=<<___; + lsl $t0,$t4,#32 + subs $acc1,$acc1,$t4 + lsr $t1,$t4,#32 + sbcs $acc2,$acc2,$t0 + sbc $acc3,$acc3,$t1 // can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $a1,$acc1,$t1,lo + csel $a2,$acc2,$t2,lo + csel $a3,$acc3,$t3,lo + + cbnz $bp,.Loop_ord_sqr + + stp $a0,$a1,[$rp] + stp $a2,$a3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ } } ######################################################################## @@ -1496,21 +1802,21 @@ ecp_nistz256_scatter_w7: prfm pstl1strm,[$out,#4096+64*5] prfm pstl1strm,[$out,#4096+64*6] prfm pstl1strm,[$out,#4096+64*7] - strb w3,[$out,#64*0-1] + strb w3,[$out,#64*0] lsr x3,x3,#8 - strb w3,[$out,#64*1-1] + strb w3,[$out,#64*1] lsr x3,x3,#8 - strb w3,[$out,#64*2-1] + strb w3,[$out,#64*2] lsr x3,x3,#8 - strb w3,[$out,#64*3-1] + strb w3,[$out,#64*3] lsr x3,x3,#8 - strb w3,[$out,#64*4-1] + strb w3,[$out,#64*4] lsr x3,x3,#8 - strb w3,[$out,#64*5-1] + strb w3,[$out,#64*5] lsr x3,x3,#8 - strb w3,[$out,#64*6-1] + strb w3,[$out,#64*6] lsr x3,x3,#8 - strb w3,[$out,#64*7-1] + strb w3,[$out,#64*7] add $out,$out,#64*8 b.ne .Loop_scatter_w7