Optimize RSA on armv8
author     fangming.fang <fangming.fang@arm.com>
           Fri, 19 Mar 2021 06:45:57 +0000 (06:45 +0000)
committer  Pauli <pauli@openssl.org>
           Sun, 9 May 2021 13:15:07 +0000 (23:15 +1000)
Add a Neon path for RSA on armv8. This optimisation targets Cortex-A72
and Neoverse N1, two cores that are widely deployed in infrastructure.
Other platforms are not impacted.

A72
                        old        new   improved
rsa   512 sign       9828.6     9738.7        -1%
rsa   512 verify   121497.2   122367.7         1%
rsa  1024 sign         1818     1816.9         0%
rsa  1024 verify    37175.6    37161.3         0%
rsa  2048 sign        267.3      267.4         0%
rsa  2048 verify    10127.6    10119.6         0%
rsa  3072 sign         86.8         87         0%
rsa  3072 verify     4604.2     4956.2         8%
rsa  4096 sign         38.3       38.5         1%
rsa  4096 verify     2619.8     2972.1        13%
rsa  7680 sign            5          7        40%
rsa  7680 verify        756      929.4        23%
rsa 15360 sign          0.8          1        25%
rsa 15360 verify      190.4        246        29%

N1
                        old        new   improved
rsa   512 sign      12599.2    12596.7         0%
rsa   512 verify   148636.1   148656.2         0%
rsa  1024 sign       2150.6     2148.9         0%
rsa  1024 verify    42353.5    42265.2         0%
rsa  2048 sign        305.5      305.3         0%
rsa  2048 verify    11209.7    11205.2         0%
rsa  3072 sign         97.8       98.2         0%
rsa  3072 verify     5061.3     5990.7        18%
rsa  4096 sign         42.8         43         0%
rsa  4096 verify     2867.6     3509.8        22%
rsa  7680 sign          5.5        8.4        53%
rsa  7680 verify      823.5     1058.3        29%
rsa 15360 sign          0.9        1.1        22%
rsa 15360 verify        207      273.9        32%
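
The "improved" column is the relative throughput change between the two builds
(the figures are presumably sign/verify operations per second as reported by
"openssl speed rsa"); for example, for the N1 rsa 15360 verify row:

    improved = (new - old) / old = (273.9 - 207) / 207 ≈ 0.32 → 32%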

CustomizedGitHooks: yes
Change-Id: I01c732cc429d793c4eb5ffd27ccd30ff9cebf8af
Jira: SECLIB-540

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14761)

crypto/armcap.c
crypto/bn/asm/armv8-mont.pl
crypto/bn/build.info

diff --git a/crypto/armcap.c b/crypto/armcap.c
index 0e7c0842adb78d2629f366b743e8c57646f20e9b..dc2326f8f6573d54cdef688530db3718b8ef0877 100644
@@ -19,6 +19,7 @@
 
 unsigned int OPENSSL_armcap_P = 0;
 unsigned int OPENSSL_arm_midr = 0;
+unsigned int OPENSSL_armv8_rsa_neonized = 0;
 
 #if __ARM_MAX_ARCH__<7
 void OPENSSL_cpuid_setup(void)
@@ -237,6 +238,12 @@ void OPENSSL_cpuid_setup(void)
 # ifdef __aarch64__
     if (OPENSSL_armcap_P & ARMV8_CPUID)
         OPENSSL_arm_midr = _armv8_cpuid_probe();
+
+    if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
+         MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
+        (OPENSSL_armcap_P & ARMV7_NEON)) {
+            OPENSSL_armv8_rsa_neonized = 1;
+    }
 # endif
 }
 #endif
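
The MIDR_IS_CPU_MODEL() and ARM_CPU_* helpers above come from OpenSSL's
arm_arch.h. For readers unfamiliar with them, here is a standalone sketch of
the equivalent check against the architectural MIDR_EL1 layout (implementer in
bits [31:24], part number in bits [15:4]); the macro names are invented, the
part numbers are the published IDs for Cortex-A72 and Neoverse N1.

    /* Standalone illustration; the committed code uses MIDR_IS_CPU_MODEL()
     * from arm_arch.h instead of open-coded masks. */
    #include <stdint.h>

    #define MIDR_IMPLEMENTER(m)  (((m) >> 24) & 0xff)
    #define MIDR_PARTNUM(m)      (((m) >> 4) & 0xfff)

    #define IMPLEMENTER_ARM      0x41    /* 'A' */
    #define PART_CORTEX_A72      0xd08
    #define PART_NEOVERSE_N1     0xd0c

    /* Mirrors: neonized = (core is A72 or N1) && NEON is available */
    static int rsa_neon_wanted(uint32_t midr, int have_neon)
    {
        if (!have_neon || MIDR_IMPLEMENTER(midr) != IMPLEMENTER_ARM)
            return 0;
        return MIDR_PARTNUM(midr) == PART_CORTEX_A72
            || MIDR_PARTNUM(midr) == PART_NEOVERSE_N1;
    }
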
diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
index e8bdfa3bb852ab3d8251e3523246ab3337a1c1d5..0867ccabeeb0554f8a01ea355c69b69746e8676e 100755
@@ -67,16 +67,34 @@ $n0="x4";   # const BN_ULONG *n0,
 $num="x5";     # int num);
 
 $code.=<<___;
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armv8_rsa_neonized
+.hidden OPENSSL_armv8_rsa_neonized
+#endif
 .text
 
 .globl bn_mul_mont
 .type  bn_mul_mont,%function
 .align 5
 bn_mul_mont:
+.Lbn_mul_mont:
+       tst     $num,#3
+       b.ne    .Lmul_mont
+       cmp     $num,#32
+       b.le    .Lscalar_impl
+#ifndef        __KERNEL__
+       adrp    x17,OPENSSL_armv8_rsa_neonized
+       ldr     w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+       cbnz    w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
        tst     $num,#7
        b.eq    __bn_sqr8x_mont
        tst     $num,#3
        b.eq    __bn_mul4x_mont
+
 .Lmul_mont:
        stp     x29,x30,[sp,#-64]!
        add     x29,sp,#0
@@ -274,6 +292,369 @@ bn_mul_mont:
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
+my ($Z,$Temp)=("v4.16b","v5");
+my @ACC=map("v$_",(6..13));
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
+my $sBi="s28";
+my $sM0="s30";
+my $zero="v14";
+my $temp="v15";
+my $ACCTemp="v16";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
+
+$code.=<<___;
+.type  bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+       stp     x29,x30,[sp,#-80]!
+       mov     x16,sp
+       stp     d8,d9,[sp,#16]
+       stp     d10,d11,[sp,#32]
+       stp     d12,d13,[sp,#48]
+       stp     d14,d15,[sp,#64]
+       lsl     $num,$num,#1
+       eor     $zero.16b,$zero.16b,$zero.16b
+
+.align 4
+.LNEON_8n:
+       eor     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
+       sub     $toutptr,sp,#128
+       eor     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
+       sub     $toutptr,$toutptr,$num,lsl#4
+       eor     @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
+       and     $toutptr,$toutptr,#-64
+       eor     @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
+       mov     sp,$toutptr             // alloca
+       eor     @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
+       add     $toutptr,$toutptr,#256
+       eor     @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
+       sub     $inner,$num,#8
+       eor     @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
+       eor     @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
+
+.LNEON_8n_init:
+       st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+       subs    $inner,$inner,#8
+       st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+       st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+       st1     {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
+       bne     .LNEON_8n_init
+
+       add     $tinptr,sp,#256
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       add     $bnptr,sp,#8
+       ldr     $sM0,[$n0],#4
+       mov     $outer,$num
+       b       .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+       ldr     $sBi,[$bptr],#4   // *b++
+       uxtl    $Bi.4s,$Bi.4h
+       add     $toutptr,sp,#128
+       ld1     {$N0.4s,$N1.4s},[$nptr],#32
+
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       shl     $Ni.2d,@ACC[0].2d,#16
+       ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       add     $Ni.2d,$Ni.2d,@ACC[0].2d
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       mul     $Ni.2s,$Ni.2s,$M0.2s
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       st1     {$Bi.2s},[sp]           // put aside smashed b[8*i+0]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       uxtl    $Ni.4s,$Ni.4h
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+       ldr     $sBi,[$bptr],#4   // *b++
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       uxtl    $Bi.4s,$Bi.4h
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       ushr    $temp.2d,@ACC[0].2d,#16
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       ushr    @ACC[0].2d,@ACC[0].2d,#16
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       add     $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
+       ins     @ACC[1].d[0],$ACCTemp.d[0]
+       st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
+___
+       push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr],#16
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       shl     $Ni.2d,@ACC[0].2d,#16
+       ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       add     $Ni.2d,$Ni.2d,@ACC[0].2d
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       mul     $Ni.2s,$Ni.2s,$M0.2s
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       st1     {$Bi.2s},[$bnptr],#8    // put aside smashed b[8*i+$i]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       uxtl    $Ni.4s,$Ni.4h
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+       ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $Temp.2d,$Temp.2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       add     @ACC[0].2d,@ACC[0].2d,$Temp.2d
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       ushr    @ACC[0].2d,@ACC[0].2d,#16
+       eor     $temp.16b,$temp.16b,$temp.16b
+       ins     @ACC[0].d[1],$temp.d[0]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       add     @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
+       st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
+       add     $bnptr,sp,#8            // rewind
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       sub     $inner,$num,#8
+       b       .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+       subs    $inner,$inner,#8
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+0]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       ld1     {$N0.4s,$N1.4s},[$nptr],#32
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       b.eq    .LInner_jump
+       add     $tinptr,$tinptr,#16     // don't advance in last iteration
+.LInner_jump:
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       ld1     {$Bi.2s},[$bnptr],#8    // pull smashed b[8*i+$i]
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       st1     {@ACC[0].2d},[$toutptr],#16
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+$i]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       b.eq    .LInner_jump$i
+       add     $tinptr,$tinptr,#16     // don't advance in last iteration
+.LInner_jump$i:
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+       b.ne    .LInner_after_rewind$i
+       sub     $aptr,$aptr,$num,lsl#2  // rewind
+.LInner_after_rewind$i:
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       add     $bnptr,sp,#8            // rewind
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       st1     {@ACC[0].2d},[$toutptr],#16
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+
+       bne     .LNEON_8n_inner
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       add     $tinptr,sp,#128
+       st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+       eor     $N0.16b,$N0.16b,$N0.16b // $N0
+       st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+       eor     $N1.16b,$N1.16b,$N1.16b // $N1
+       st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+       st1     {@ACC[6].2d},[$toutptr]
+
+       subs    $outer,$outer,#8
+       ld1     {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
+       ld1     {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
+       ld1     {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
+       ld1     {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
+
+       b.eq    .LInner_8n_jump_2steps
+       sub     $nptr,$nptr,$num,lsl#2  // rewind
+       b       .LNEON_8n_outer
+
+.LInner_8n_jump_2steps:
+       add     $toutptr,sp,#128
+       st1     {$N0.2d,$N1.2d}, [sp],#32       // start wiping stack frame
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       ushr    $temp.2d,@ACC[0].2d,#16
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
+       ins     $temp.d[1],$zero.d[0]
+
+       mov     $inner,$num
+       b       .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       ld1     {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       ld1     {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ld1     {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
+       zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
+       ins     $temp.d[1],$zero.d[0]
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       add     @ACC[1].2d,@ACC[1].2d,$temp.2d
+       st1     {@ACC[0].s}[0], [$toutptr],#4
+       ushr    $temp.2d,@ACC[1].2d,#16
+       mov     $Temp.16b,@ACC[1].16b
+       ext     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
+       add     @ACC[1].2d,@ACC[1].2d,$temp.2d
+       ushr    $temp.2d,@ACC[1].2d,#16
+       zip1    @ACC[1].4h,$Temp.4h,@ACC[1].4h
+       ins     $temp.d[1],$zero.d[0]
+___
+       push(@ACC,shift(@ACC));
+}
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       ld1     {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
+       subs    $inner,$inner,#8
+       st1     {@ACC[7].s}[0], [$toutptr],#4
+       bne     .LNEON_tail
+
+       st1     {$temp.s}[0], [$toutptr],#4     // top-most bit
+       sub     $nptr,$nptr,$num,lsl#2          // rewind $nptr
+       subs    $aptr,sp,#0                     // clear carry flag
+       add     $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       ldp     w8,w9,[$nptr],#8
+       ldp     w10,w11,[$nptr],#8
+       sbcs    w8,w4,w8
+       sbcs    w9,w5,w9
+       sbcs    w10,w6,w10
+       sbcs    w11,w7,w11
+       sub     x17,$bptr,$aptr
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       cbnz    x17,.LNEON_sub
+
+       ldr     w10, [$aptr]            // load top-most bit
+       mov     x11,sp
+       eor     v0.16b,v0.16b,v0.16b
+       sub     x11,$bptr,x11           // this is num*4
+       eor     v1.16b,v1.16b,v1.16b
+       mov     $aptr,sp
+       sub     $rptr,$rptr,x11         // rewind $rptr
+       mov     $nptr,$bptr             // second 3/4th of frame
+       sbcs    w10,w10,wzr             // result is carry flag
+
+.LNEON_copy_n_zap:
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       ldp     w8,w9,[$rptr],#8
+       ldp     w10,w11,[$rptr]
+       sub     $rptr,$rptr,#8
+       b.cs    .LCopy_1
+       mov     w8,w4
+       mov     w9,w5
+       mov     w10,w6
+       mov     w11,w7
+.LCopy_1:
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       sub     $aptr,$aptr,#32
+       ldp     w8,w9,[$rptr],#8
+       ldp     w10,w11,[$rptr]
+       sub     $rptr,$rptr,#8
+       b.cs    .LCopy_2
+       mov     w8, w4
+       mov     w9, w5
+       mov     w10, w6
+       mov     w11, w7
+.LCopy_2:
+       st1     {v0.2d,v1.2d}, [$aptr],#32              // wipe
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       sub     x17,$bptr,$aptr         // preserves carry
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       cbnz    x17,.LNEON_copy_n_zap
+
+       mov     sp,x16
+       ldp     d14,d15,[sp,#64]
+       ldp     d12,d13,[sp,#48]
+       ldp     d10,d11,[sp,#32]
+       ldp     d8,d9,[sp,#16]
+       ldr     x29,[sp],#80
+       ret                     // bx lr
+
+.size  bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+___
+}
+{
 ########################################################################
 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
 
diff --git a/crypto/bn/build.info b/crypto/bn/build.info
index 3c32e830672c03452bbdec5983e8b5121dbf70c6..d0c1034bde38a598e9ef2ef0a55e7ae1df784bdc 100644
@@ -177,3 +177,4 @@ INCLUDE[armv4-mont.o]=..
 GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl
 INCLUDE[armv4-gf2m.o]=..
 GENERATE[armv8-mont.S]=asm/armv8-mont.pl
+INCLUDE[armv8-mont.o]=..