$num="x5"; # int num);
$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armv8_rsa_neonized
+.hidden OPENSSL_armv8_rsa_neonized
+#endif
.text
.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
+.Lbn_mul_mont:
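+	// Dispatch: sizes that are not a multiple of 4 use the generic
+	// scalar code at .Lmul_mont; multiples of 4 larger than 32 may
+	// use the NEON code when OPENSSL_armv8_rsa_neonized is set at
+	// run time; everything else takes the scalar sqr8x/mul4x paths.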
+ tst $num,#3
+ b.ne .Lmul_mont
+ cmp $num,#32
+ b.le .Lscalar_impl
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armv8_rsa_neonized
+ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+ cbnz w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
tst $num,#7
b.eq __bn_sqr8x_mont
tst $num,#3
b.eq __bn_mul4x_mont
+
.Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
.size bn_mul_mont,.-bn_mul_mont
___
{
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
+my ($Z,$Temp)=("v4.16b","v5");
+my @ACC=map("v$_",(6..13));
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
+my $sBi="s28";
+my $sM0="s30";
+my $zero="v14";
+my $temp="v15";
+my $ACCTemp="v16";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
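+# Arguments arrive in x0-x5 as in the scalar code above:
+# x0=rp, x1=ap, x2=bp, x3=np, x4=n0 (pointer), x5=num (64-bit limbs on entry).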
+
+$code.=<<___;
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ stp x29,x30,[sp,#-80]!
+ mov x16,sp
+ stp d8,d9,[sp,#16]
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
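+	// num arrives as the count of 64-bit limbs; the NEON code below
+	// works on 32-bit words, so double it.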
+ lsl $num,$num,#1
+ eor $zero.16b,$zero.16b,$zero.16b
+
+.align 4
+.LNEON_8n:
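+	// Zero the eight vector accumulators and carve out a 64-byte
+	// aligned scratch frame below sp for intermediate results
+	// (sp itself is restored from x16 on exit).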
+ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
+ sub $toutptr,sp,#128
+ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
+ sub $toutptr,$toutptr,$num,lsl#4
+ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
+ and $toutptr,$toutptr,#-64
+ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
+ mov sp,$toutptr // alloca
+ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
+ add $toutptr,$toutptr,#256
+ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
+ sub $inner,$num,#8
+ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
+ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
+
+.LNEON_8n_init:
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+ subs $inner,$inner,#8
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
+ bne .LNEON_8n_init
+
+ add $tinptr,sp,#256
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ add $bnptr,sp,#8
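+	// only the low 32 bits of n0 are needed for 32-bit reduction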
+ ldr $sM0,[$n0],#4
+ mov $outer,$num
+ b .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+ ldr $sBi,[$bptr],#4 // *b++
+ uxtl $Bi.4s,$Bi.4h
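+	// Each 32-bit b[] word is "smashed" into two 16-bit halves held
+	// in separate 32-bit lanes, so the umlal-s below accumulate
+	// 16x32-bit partial products in 64-bit lanes, deferring carries.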
+ add $toutptr,sp,#128
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
+
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
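+	// Interleaved below: fold the two halves of the low accumulator
+	// into a 32-bit word, multiply by n0 to get the Montgomery
+	// factor, and smash it into 16-bit halves as well.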
+ shl $Ni.2d,@ACC[0].2d,#16
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ mul $Ni.2s,$Ni.2s,$M0.2s
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ uxtl $Ni.4s,$Ni.4h
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+ ldr $sBi,[$bptr],#4 // *b++
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ uxtl $Bi.4s,$Bi.4h
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ ushr $temp.2d,@ACC[0].2d,#16
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ ushr @ACC[0].2d,@ACC[0].2d,#16
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
+ ins @ACC[1].d[0],$ACCTemp.d[0]
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
+___
+ push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr],#16
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ shl $Ni.2d,@ACC[0].2d,#16
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ mul $Ni.2s,$Ni.2s,$M0.2s
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ uxtl $Ni.4s,$Ni.4h
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ mov $Temp.16b,@ACC[0].16b
+ ushr $Temp.2d,$Temp.2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ add @ACC[0].2d,@ACC[0].2d,$Temp.2d
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ ushr @ACC[0].2d,@ACC[0].2d,#16
+ eor $temp.16b,$temp.16b,$temp.16b
+ ins @ACC[0].d[1],$temp.d[0]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
+ add $bnptr,sp,#8 // rewind
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ sub $inner,$num,#8
+ b .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
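+	// Each pass consumes the next 8 words of a[] and n[], replaying
+	// the saved (smashed) b[] and m[] words against them and
+	// accumulating into the scratch frame.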
+ subs $inner,$inner,#8
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ b.eq .LInner_jump
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
+.LInner_jump:
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ st1 {@ACC[0].2d},[$toutptr],#16
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ b.eq .LInner_jump$i
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
+.LInner_jump$i:
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+ b.ne .LInner_after_rewind$i
+ sub $aptr,$aptr,$num,lsl#2 // rewind
+.LInner_after_rewind$i:
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ add $bnptr,sp,#8 // rewind
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ st1 {@ACC[0].2d},[$toutptr],#16
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+
+ bne .LNEON_8n_inner
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ add $tinptr,sp,#128
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+	eor	$N0.16b,$N0.16b,$N0.16b		// zero $N0, used below to wipe the frame
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+	eor	$N1.16b,$N1.16b,$N1.16b		// zero $N1, used below to wipe the frame
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+ st1 {@ACC[6].2d},[$toutptr]
+
+ subs $outer,$outer,#8
+ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
+ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
+ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
+ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
+
+ b.eq .LInner_8n_jump_2steps
+ sub $nptr,$nptr,$num,lsl#2 // rewind
+ b .LNEON_8n_outer
+
+.LInner_8n_jump_2steps:
+ add $toutptr,sp,#128
+ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
+ mov $Temp.16b,@ACC[0].16b
+ ushr $temp.2d,@ACC[0].2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ ushr $temp.2d,@ACC[0].2d,#16
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
+ ins $temp.d[1],$zero.d[0]
+
+ mov $inner,$num
+ b .LNEON_tail_entry
+
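+	// Tail: propagate carries through the redundant accumulators and
+	// store one 32-bit result word per step into the scratch frame.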
+.align 4
+.LNEON_tail:
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ mov $Temp.16b,@ACC[0].16b
+ ushr $temp.2d,@ACC[0].2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
+ ushr $temp.2d,@ACC[0].2d,#16
+ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
+ ins $temp.d[1],$zero.d[0]
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
+ st1 {@ACC[0].s}[0], [$toutptr],#4
+ ushr $temp.2d,@ACC[1].2d,#16
+ mov $Temp.16b,@ACC[1].16b
+ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
+ ushr $temp.2d,@ACC[1].2d,#16
+ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
+ ins $temp.d[1],$zero.d[0]
+___
+ push(@ACC,shift(@ACC));
+}
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
+ subs $inner,$inner,#8
+ st1 {@ACC[7].s}[0], [$toutptr],#4
+ bne .LNEON_tail
+
+ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
+ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
+	subs	$aptr,sp,#0		// clear borrow (sets carry) for the sbcs chain
+ add $bptr,sp,$num,lsl#2
+
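+	// Conditional subtraction of the modulus: compute tmp - n with a
+	// running borrow; the final borrow (folded with the top word)
+	// decides which value gets copied out below.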
+.LNEON_sub:
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ ldp w8,w9,[$nptr],#8
+ ldp w10,w11,[$nptr],#8
+ sbcs w8,w4,w8
+ sbcs w9,w5,w9
+ sbcs w10,w6,w10
+ sbcs w11,w7,w11
+ sub x17,$bptr,$aptr
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ cbnz x17,.LNEON_sub
+
+ ldr w10, [$aptr] // load top-most bit
+ mov x11,sp
+ eor v0.16b,v0.16b,v0.16b
+ sub x11,$bptr,x11 // this is num*4
+ eor v1.16b,v1.16b,v1.16b
+ mov $aptr,sp
+ sub $rptr,$rptr,x11 // rewind $rptr
+ mov $nptr,$bptr // second 3/4th of frame
+ sbcs w10,w10,wzr // result is carry flag
+
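+	// Copy either the subtracted value or the original one to rp,
+	// depending on the borrow, wiping the scratch frame as we go.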
+.LNEON_copy_n_zap:
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ ldp w8,w9,[$rptr],#8
+ ldp w10,w11,[$rptr]
+ sub $rptr,$rptr,#8
+ b.cs .LCopy_1
+ mov w8,w4
+ mov w9,w5
+ mov w10,w6
+ mov w11,w7
+.LCopy_1:
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ sub $aptr,$aptr,#32
+ ldp w8,w9,[$rptr],#8
+ ldp w10,w11,[$rptr]
+ sub $rptr,$rptr,#8
+ b.cs .LCopy_2
+ mov w8, w4
+ mov w9, w5
+ mov w10, w6
+ mov w11, w7
+.LCopy_2:
+ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ sub x17,$bptr,$aptr // preserves carry
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ cbnz x17,.LNEON_copy_n_zap
+
+ mov sp,x16
+ ldp d14,d15,[sp,#64]
+ ldp d12,d13,[sp,#48]
+ ldp d10,d11,[sp,#32]
+ ldp d8,d9,[sp,#16]
+ ldr x29,[sp],#80
+ ret // bx lr
+
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+___
+}
+{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.