+___
+{
+########################################################################
+# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
+
+my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
+my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
+my ($cnt,$carry,$topmost)=("x27","x28","x30");
+my ($tp,$ap_end,$na0)=($bp,$np,$carry);
+
+$code.=<<___;
+.type __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+ cmp $ap,$bp
+ b.ne __bn_mul4x_mont
+.Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp $rp,$np,[sp,#96] // offload rp and np
+
+ ldp $a0,$a1,[$ap,#8*0]
+ ldp $a2,$a3,[$ap,#8*2]
+ ldp $a4,$a5,[$ap,#8*4]
+ ldp $a6,$a7,[$ap,#8*6]
+
+ sub $tp,sp,$num,lsl#4
+ lsl $num,$num,#3
+ ldr $n0,[$n0] // *n0
+ mov sp,$tp // alloca
+ sub $cnt,$num,#8*8
+ b .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+ sub $cnt,$cnt,#8*8
+ stp xzr,xzr,[$tp,#8*0]
+ stp xzr,xzr,[$tp,#8*2]
+ stp xzr,xzr,[$tp,#8*4]
+ stp xzr,xzr,[$tp,#8*6]
+.Lsqr8x_zero_start:
+ stp xzr,xzr,[$tp,#8*8]
+ stp xzr,xzr,[$tp,#8*10]
+ stp xzr,xzr,[$tp,#8*12]
+ stp xzr,xzr,[$tp,#8*14]
+ add $tp,$tp,#8*16
+ cbnz $cnt,.Lsqr8x_zero
+
+ add $ap_end,$ap,$num
+ add $ap,$ap,#8*8
+ mov $acc0,xzr
+ mov $acc1,xzr
+ mov $acc2,xzr
+ mov $acc3,xzr
+ mov $acc4,xzr
+ mov $acc5,xzr
+ mov $acc6,xzr
+ mov $acc7,xzr
+ mov $tp,sp
+ str $n0,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
+ mul $t1,$a2,$a0
+ mul $t2,$a3,$a0
+ mul $t3,$a4,$a0
+ adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
+ mul $t0,$a5,$a0
+ adcs $acc2,$acc2,$t1
+ mul $t1,$a6,$a0
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a7,$a0
+ adcs $acc4,$acc4,$t3
+ umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
+ adcs $acc5,$acc5,$t0
+ umulh $t0,$a2,$a0
+ adcs $acc6,$acc6,$t1
+ umulh $t1,$a3,$a0
+ adcs $acc7,$acc7,$t2
+ umulh $t2,$a4,$a0
+ stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
+ adc $acc0,xzr,xzr // t[8]
+ adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
+ umulh $t3,$a5,$a0
+ adcs $acc3,$acc3,$t0
+ umulh $t0,$a6,$a0
+ adcs $acc4,$acc4,$t1
+ umulh $t1,$a7,$a0
+ adcs $acc5,$acc5,$t2
+ mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
+ adcs $acc6,$acc6,$t3
+ mul $t3,$a3,$a1
+ adcs $acc7,$acc7,$t0
+ mul $t0,$a4,$a1
+ adc $acc0,$acc0,$t1
+
+ mul $t1,$a5,$a1
+ adds $acc3,$acc3,$t2
+ mul $t2,$a6,$a1
+ adcs $acc4,$acc4,$t3
+ mul $t3,$a7,$a1
+ adcs $acc5,$acc5,$t0
+ umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
+ adcs $acc6,$acc6,$t1
+ umulh $t1,$a3,$a1
+ adcs $acc7,$acc7,$t2
+ umulh $t2,$a4,$a1
+ adcs $acc0,$acc0,$t3
+ umulh $t3,$a5,$a1
+ stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
+ adc $acc1,xzr,xzr // t[9]
+ adds $acc4,$acc4,$t0
+ umulh $t0,$a6,$a1
+ adcs $acc5,$acc5,$t1
+ umulh $t1,$a7,$a1
+ adcs $acc6,$acc6,$t2
+ mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
+ adcs $acc7,$acc7,$t3
+ mul $t3,$a4,$a2
+ adcs $acc0,$acc0,$t0
+ mul $t0,$a5,$a2
+ adc $acc1,$acc1,$t1
+
+ mul $t1,$a6,$a2
+ adds $acc5,$acc5,$t2
+ mul $t2,$a7,$a2
+ adcs $acc6,$acc6,$t3
+ umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
+ adcs $acc7,$acc7,$t0
+ umulh $t0,$a4,$a2
+ adcs $acc0,$acc0,$t1
+ umulh $t1,$a5,$a2
+ adcs $acc1,$acc1,$t2
+ umulh $t2,$a6,$a2
+ stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
+ adc $acc2,xzr,xzr // t[10]
+ adds $acc6,$acc6,$t3
+ umulh $t3,$a7,$a2
+ adcs $acc7,$acc7,$t0
+ mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
+ adcs $acc0,$acc0,$t1
+ mul $t1,$a5,$a3
+ adcs $acc1,$acc1,$t2
+ mul $t2,$a6,$a3
+ adc $acc2,$acc2,$t3
+
+ mul $t3,$a7,$a3
+ adds $acc7,$acc7,$t0
+ umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
+ adcs $acc0,$acc0,$t1
+ umulh $t1,$a5,$a3
+ adcs $acc1,$acc1,$t2
+ umulh $t2,$a6,$a3
+ adcs $acc2,$acc2,$t3
+ umulh $t3,$a7,$a3
+ stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
+ adc $acc3,xzr,xzr // t[11]
+ adds $acc0,$acc0,$t0
+ mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
+ adcs $acc1,$acc1,$t1
+ mul $t1,$a6,$a4
+ adcs $acc2,$acc2,$t2
+ mul $t2,$a7,$a4
+ adc $acc3,$acc3,$t3
+
+ umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
+ adds $acc1,$acc1,$t0
+ umulh $t0,$a6,$a4
+ adcs $acc2,$acc2,$t1
+ umulh $t1,$a7,$a4
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
+ adc $acc4,xzr,xzr // t[12]
+ adds $acc2,$acc2,$t3
+ mul $t3,$a7,$a5
+ adcs $acc3,$acc3,$t0
+ umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
+ adc $acc4,$acc4,$t1
+
+ umulh $t1,$a7,$a5
+ adds $acc3,$acc3,$t2
+ mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
+ adcs $acc4,$acc4,$t3
+ umulh $t3,$a7,$a6 // hi(a[7]*a[6])
+ adc $acc5,xzr,xzr // t[13]
+ adds $acc4,$acc4,$t0
+ sub $cnt,$ap_end,$ap // done yet?
+ adc $acc5,$acc5,$t1
+
+ adds $acc5,$acc5,$t2
+ sub $t0,$ap_end,$num // rewinded ap
+ adc $acc6,xzr,xzr // t[14]
+ add $acc6,$acc6,$t3
+
+ cbz $cnt,.Lsqr8x_outer_break
+
+ mov $n0,$a0
+ ldp $a0,$a1,[$tp,#8*0]
+ ldp $a2,$a3,[$tp,#8*2]
+ ldp $a4,$a5,[$tp,#8*4]
+ ldp $a6,$a7,[$tp,#8*6]
+ adds $acc0,$acc0,$a0
+ adcs $acc1,$acc1,$a1
+ ldp $a0,$a1,[$ap,#8*0]
+ adcs $acc2,$acc2,$a2
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$ap,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$ap,#8*4]
+ adcs $acc6,$acc6,$a6
+ mov $rp,$ap
+ adcs $acc7,xzr,$a7
+ ldp $a6,$a7,[$ap,#8*6]
+ add $ap,$ap,#8*8
+ //adc $carry,xzr,xzr // moved below
+ mov $cnt,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+.Lsqr8x_mul:
+ mul $t0,$a0,$n0
+ adc $carry,xzr,xzr // carry bit, modulo-scheduled
+ mul $t1,$a1,$n0
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$n0
+ mul $t3,$a3,$n0
+ adds $acc0,$acc0,$t0
+ mul $t0,$a4,$n0
+ adcs $acc1,$acc1,$t1
+ mul $t1,$a5,$n0
+ adcs $acc2,$acc2,$t2
+ mul $t2,$a6,$n0
+ adcs $acc3,$acc3,$t3
+ mul $t3,$a7,$n0
+ adcs $acc4,$acc4,$t0
+ umulh $t0,$a0,$n0
+ adcs $acc5,$acc5,$t1
+ umulh $t1,$a1,$n0
+ adcs $acc6,$acc6,$t2
+ umulh $t2,$a2,$n0
+ adcs $acc7,$acc7,$t3
+ umulh $t3,$a3,$n0
+ adc $carry,$carry,xzr
+ str $acc0,[$tp],#8
+ adds $acc0,$acc1,$t0
+ umulh $t0,$a4,$n0
+ adcs $acc1,$acc2,$t1
+ umulh $t1,$a5,$n0
+ adcs $acc2,$acc3,$t2
+ umulh $t2,$a6,$n0
+ adcs $acc3,$acc4,$t3
+ umulh $t3,$a7,$n0
+ ldr $n0,[$rp,$cnt]
+ adcs $acc4,$acc5,$t0
+ adcs $acc5,$acc6,$t1
+ adcs $acc6,$acc7,$t2
+ adcs $acc7,$carry,$t3
+ //adc $carry,xzr,xzr // moved above
+ cbnz $cnt,.Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp $ap,$ap_end // done yet?
+ b.eq .Lsqr8x_break
+
+ ldp $a0,$a1,[$tp,#8*0]
+ ldp $a2,$a3,[$tp,#8*2]
+ ldp $a4,$a5,[$tp,#8*4]
+ ldp $a6,$a7,[$tp,#8*6]
+ adds $acc0,$acc0,$a0
+ ldr $n0,[$rp,#-8*8]
+ adcs $acc1,$acc1,$a1
+ ldp $a0,$a1,[$ap,#8*0]
+ adcs $acc2,$acc2,$a2
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$ap,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$ap,#8*4]
+ adcs $acc6,$acc6,$a6
+ mov $cnt,#-8*8
+ adcs $acc7,$acc7,$a7
+ ldp $a6,$a7,[$ap,#8*6]
+ add $ap,$ap,#8*8
+ //adc $carry,xzr,xzr // moved above
+ b .Lsqr8x_mul
+
+.align 4
+.Lsqr8x_break:
+ ldp $a0,$a1,[$rp,#8*0]
+ add $ap,$rp,#8*8
+ ldp $a2,$a3,[$rp,#8*2]
+ sub $t0,$ap_end,$ap // is it last iteration?
+ ldp $a4,$a5,[$rp,#8*4]
+ sub $t1,$tp,$t0
+ ldp $a6,$a7,[$rp,#8*6]
+ cbz $t0,.Lsqr8x_outer_loop
+
+ stp $acc0,$acc1,[$tp,#8*0]
+ ldp $acc0,$acc1,[$t1,#8*0]
+ stp $acc2,$acc3,[$tp,#8*2]
+ ldp $acc2,$acc3,[$t1,#8*2]
+ stp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc4,$acc5,[$t1,#8*4]
+ stp $acc6,$acc7,[$tp,#8*6]
+ mov $tp,$t1
+ ldp $acc6,$acc7,[$t1,#8*6]
+ b .Lsqr8x_outer_loop
+
+.align 4
+.Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
+ ldp $t1,$t2,[sp,#8*1]
+ ldp $a5,$a7,[$t0,#8*2]
+ add $ap,$t0,#8*4
+ ldp $t3,$t0,[sp,#8*3]
+
+ stp $acc0,$acc1,[$tp,#8*0]
+ mul $acc0,$a1,$a1
+ stp $acc2,$acc3,[$tp,#8*2]
+ umulh $a1,$a1,$a1
+ stp $acc4,$acc5,[$tp,#8*4]
+ mul $a2,$a3,$a3
+ stp $acc6,$acc7,[$tp,#8*6]
+ mov $tp,sp
+ umulh $a3,$a3,$a3
+ adds $acc1,$a1,$t1,lsl#1
+ extr $t1,$t2,$t1,#63
+ sub $cnt,$num,#8*4
+
+.Lsqr4x_shift_n_add:
+ adcs $acc2,$a2,$t1
+ extr $t2,$t3,$t2,#63
+ sub $cnt,$cnt,#8*4
+ adcs $acc3,$a3,$t2
+ ldp $t1,$t2,[$tp,#8*5]
+ mul $a4,$a5,$a5
+ ldp $a1,$a3,[$ap],#8*2
+ umulh $a5,$a5,$a5
+ mul $a6,$a7,$a7
+ umulh $a7,$a7,$a7
+ extr $t3,$t0,$t3,#63
+ stp $acc0,$acc1,[$tp,#8*0]
+ adcs $acc4,$a4,$t3
+ extr $t0,$t1,$t0,#63
+ stp $acc2,$acc3,[$tp,#8*2]
+ adcs $acc5,$a5,$t0
+ ldp $t3,$t0,[$tp,#8*7]
+ extr $t1,$t2,$t1,#63
+ adcs $acc6,$a6,$t1
+ extr $t2,$t3,$t2,#63
+ adcs $acc7,$a7,$t2
+ ldp $t1,$t2,[$tp,#8*9]
+ mul $a0,$a1,$a1
+ ldp $a5,$a7,[$ap],#8*2
+ umulh $a1,$a1,$a1
+ mul $a2,$a3,$a3
+ umulh $a3,$a3,$a3
+ stp $acc4,$acc5,[$tp,#8*4]
+ extr $t3,$t0,$t3,#63
+ stp $acc6,$acc7,[$tp,#8*6]
+ add $tp,$tp,#8*8
+ adcs $acc0,$a0,$t3
+ extr $t0,$t1,$t0,#63
+ adcs $acc1,$a1,$t0
+ ldp $t3,$t0,[$tp,#8*3]
+ extr $t1,$t2,$t1,#63
+ cbnz $cnt,.Lsqr4x_shift_n_add
+___
+my ($np,$np_end)=($ap,$ap_end);
+$code.=<<___;
+ ldp $np,$n0,[x29,#104] // pull np and n0
+
+ adcs $acc2,$a2,$t1
+ extr $t2,$t3,$t2,#63
+ adcs $acc3,$a3,$t2
+ ldp $t1,$t2,[$tp,#8*5]
+ mul $a4,$a5,$a5
+ umulh $a5,$a5,$a5
+ stp $acc0,$acc1,[$tp,#8*0]
+ mul $a6,$a7,$a7
+ umulh $a7,$a7,$a7
+ stp $acc2,$acc3,[$tp,#8*2]
+ extr $t3,$t0,$t3,#63
+ adcs $acc4,$a4,$t3
+ extr $t0,$t1,$t0,#63
+ ldp $acc0,$acc1,[sp,#8*0]
+ adcs $acc5,$a5,$t0
+ extr $t1,$t2,$t1,#63
+ ldp $a0,$a1,[$np,#8*0]
+ adcs $acc6,$a6,$t1
+ extr $t2,xzr,$t2,#63
+ ldp $a2,$a3,[$np,#8*2]
+ adc $acc7,$a7,$t2
+ ldp $a4,$a5,[$np,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul $na0,$n0,$acc0 // t[0]*n0
+ ldp $a6,$a7,[$np,#8*6]
+ add $np_end,$np,$num
+ ldp $acc2,$acc3,[sp,#8*2]
+ stp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc4,$acc5,[sp,#8*4]
+ stp $acc6,$acc7,[$tp,#8*6]
+ ldp $acc6,$acc7,[sp,#8*6]
+ add $np,$np,#8*8
+ mov $topmost,xzr // initial top-most carry
+ mov $tp,sp
+ mov $cnt,#8
+
+.Lsqr8x_reduction:
+ // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
+ mul $t1,$a1,$na0
+ sub $cnt,$cnt,#1
+ mul $t2,$a2,$na0
+ str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
+ mul $t3,$a3,$na0
+ // (*) adds xzr,$acc0,$t0
+ subs xzr,$acc0,#1 // (*)
+ mul $t0,$a4,$na0
+ adcs $acc0,$acc1,$t1
+ mul $t1,$a5,$na0
+ adcs $acc1,$acc2,$t2
+ mul $t2,$a6,$na0
+ adcs $acc2,$acc3,$t3
+ mul $t3,$a7,$na0
+ adcs $acc3,$acc4,$t0
+ umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
+ adcs $acc4,$acc5,$t1
+ umulh $t1,$a1,$na0
+ adcs $acc5,$acc6,$t2
+ umulh $t2,$a2,$na0
+ adcs $acc6,$acc7,$t3
+ umulh $t3,$a3,$na0
+ adc $acc7,xzr,xzr
+ adds $acc0,$acc0,$t0
+ umulh $t0,$a4,$na0
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a5,$na0
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a6,$na0
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a7,$na0
+ mul $na0,$n0,$acc0 // next t[0]*n0
+ adcs $acc4,$acc4,$t0
+ adcs $acc5,$acc5,$t1
+ adcs $acc6,$acc6,$t2
+ adc $acc7,$acc7,$t3
+ cbnz $cnt,.Lsqr8x_reduction
+
+ ldp $t0,$t1,[$tp,#8*0]
+ ldp $t2,$t3,[$tp,#8*2]
+ mov $rp,$tp
+ sub $cnt,$np_end,$np // done yet?
+ adds $acc0,$acc0,$t0
+ adcs $acc1,$acc1,$t1
+ ldp $t0,$t1,[$tp,#8*4]
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ ldp $t2,$t3,[$tp,#8*6]
+ adcs $acc4,$acc4,$t0
+ adcs $acc5,$acc5,$t1
+ adcs $acc6,$acc6,$t2
+ adcs $acc7,$acc7,$t3
+ //adc $carry,xzr,xzr // moved below
+ cbz $cnt,.Lsqr8x8_post_condition
+
+ ldr $n0,[$tp,#-8*8]
+ ldp $a0,$a1,[$np,#8*0]
+ ldp $a2,$a3,[$np,#8*2]
+ ldp $a4,$a5,[$np,#8*4]
+ mov $cnt,#-8*8
+ ldp $a6,$a7,[$np,#8*6]
+ add $np,$np,#8*8
+
+.Lsqr8x_tail:
+ mul $t0,$a0,$n0
+ adc $carry,xzr,xzr // carry bit, modulo-scheduled
+ mul $t1,$a1,$n0
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$n0
+ mul $t3,$a3,$n0
+ adds $acc0,$acc0,$t0
+ mul $t0,$a4,$n0
+ adcs $acc1,$acc1,$t1
+ mul $t1,$a5,$n0
+ adcs $acc2,$acc2,$t2
+ mul $t2,$a6,$n0
+ adcs $acc3,$acc3,$t3
+ mul $t3,$a7,$n0
+ adcs $acc4,$acc4,$t0
+ umulh $t0,$a0,$n0
+ adcs $acc5,$acc5,$t1
+ umulh $t1,$a1,$n0
+ adcs $acc6,$acc6,$t2
+ umulh $t2,$a2,$n0
+ adcs $acc7,$acc7,$t3
+ umulh $t3,$a3,$n0
+ adc $carry,$carry,xzr
+ str $acc0,[$tp],#8
+ adds $acc0,$acc1,$t0
+ umulh $t0,$a4,$n0
+ adcs $acc1,$acc2,$t1
+ umulh $t1,$a5,$n0
+ adcs $acc2,$acc3,$t2
+ umulh $t2,$a6,$n0
+ adcs $acc3,$acc4,$t3
+ umulh $t3,$a7,$n0
+ ldr $n0,[$rp,$cnt]
+ adcs $acc4,$acc5,$t0
+ adcs $acc5,$acc6,$t1
+ adcs $acc6,$acc7,$t2
+ adcs $acc7,$carry,$t3
+ //adc $carry,xzr,xzr // moved above
+ cbnz $cnt,.Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp $a0,$a1,[$tp,#8*0]
+ sub $cnt,$np_end,$np // done yet?
+ sub $t2,$np_end,$num // rewinded np
+ ldp $a2,$a3,[$tp,#8*2]
+ ldp $a4,$a5,[$tp,#8*4]
+ ldp $a6,$a7,[$tp,#8*6]
+ cbz $cnt,.Lsqr8x_tail_break
+
+ ldr $n0,[$rp,#-8*8]
+ adds $acc0,$acc0,$a0
+ adcs $acc1,$acc1,$a1
+ ldp $a0,$a1,[$np,#8*0]
+ adcs $acc2,$acc2,$a2
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$np,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$np,#8*4]
+ adcs $acc6,$acc6,$a6
+ mov $cnt,#-8*8
+ adcs $acc7,$acc7,$a7
+ ldp $a6,$a7,[$np,#8*6]
+ add $np,$np,#8*8
+ //adc $carry,xzr,xzr // moved above
+ b .Lsqr8x_tail
+
+.align 4
+.Lsqr8x_tail_break:
+ ldr $n0,[x29,#112] // pull n0
+ add $cnt,$tp,#8*8 // end of current t[num] window
+
+ subs xzr,$topmost,#1 // "move" top-most carry to carry bit
+ adcs $t0,$acc0,$a0
+ adcs $t1,$acc1,$a1
+ ldp $acc0,$acc1,[$rp,#8*0]
+ adcs $acc2,$acc2,$a2
+ ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$t2,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$t2,#8*4]
+ adcs $acc6,$acc6,$a6
+ adcs $acc7,$acc7,$a7
+ ldp $a6,$a7,[$t2,#8*6]
+ add $np,$t2,#8*8
+ adc $topmost,xzr,xzr // top-most carry
+ mul $na0,$n0,$acc0
+ stp $t0,$t1,[$tp,#8*0]
+ stp $acc2,$acc3,[$tp,#8*2]
+ ldp $acc2,$acc3,[$rp,#8*2]
+ stp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc4,$acc5,[$rp,#8*4]
+ cmp $cnt,x29 // did we hit the bottom?
+ stp $acc6,$acc7,[$tp,#8*6]
+ mov $tp,$rp // slide the window
+ ldp $acc6,$acc7,[$rp,#8*6]
+ mov $cnt,#8
+ b.ne .Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr $rp,[x29,#96] // pull rp
+ add $tp,$tp,#8*8
+ subs $t0,$acc0,$a0
+ sbcs $t1,$acc1,$a1
+ sub $cnt,$num,#8*8
+ mov $ap_end,$rp // $rp copy
+
+.Lsqr8x_sub:
+ sbcs $t2,$acc2,$a2
+ ldp $a0,$a1,[$np,#8*0]
+ sbcs $t3,$acc3,$a3
+ stp $t0,$t1,[$rp,#8*0]
+ sbcs $t0,$acc4,$a4
+ ldp $a2,$a3,[$np,#8*2]
+ sbcs $t1,$acc5,$a5
+ stp $t2,$t3,[$rp,#8*2]
+ sbcs $t2,$acc6,$a6
+ ldp $a4,$a5,[$np,#8*4]
+ sbcs $t3,$acc7,$a7
+ ldp $a6,$a7,[$np,#8*6]
+ add $np,$np,#8*8
+ ldp $acc0,$acc1,[$tp,#8*0]
+ sub $cnt,$cnt,#8*8
+ ldp $acc2,$acc3,[$tp,#8*2]
+ ldp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc6,$acc7,[$tp,#8*6]
+ add $tp,$tp,#8*8
+ stp $t0,$t1,[$rp,#8*4]
+ sbcs $t0,$acc0,$a0
+ stp $t2,$t3,[$rp,#8*6]
+ add $rp,$rp,#8*8
+ sbcs $t1,$acc1,$a1
+ cbnz $cnt,.Lsqr8x_sub
+
+ sbcs $t2,$acc2,$a2
+ mov $tp,sp
+ add $ap,sp,$num
+ ldp $a0,$a1,[$ap_end,#8*0]
+ sbcs $t3,$acc3,$a3
+ stp $t0,$t1,[$rp,#8*0]
+ sbcs $t0,$acc4,$a4
+ ldp $a2,$a3,[$ap_end,#8*2]
+ sbcs $t1,$acc5,$a5
+ stp $t2,$t3,[$rp,#8*2]
+ sbcs $t2,$acc6,$a6
+ ldp $acc0,$acc1,[$ap,#8*0]
+ sbcs $t3,$acc7,$a7
+ ldp $acc2,$acc3,[$ap,#8*2]
+ sbcs xzr,$topmost,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp $t0,$t1,[$rp,#8*4]
+ stp $t2,$t3,[$rp,#8*6]
+
+ sub $cnt,$num,#8*4
+.Lsqr4x_cond_copy:
+ sub $cnt,$cnt,#8*4
+ csel $t0,$acc0,$a0,lo
+ stp xzr,xzr,[$tp,#8*0]
+ csel $t1,$acc1,$a1,lo
+ ldp $a0,$a1,[$ap_end,#8*4]
+ ldp $acc0,$acc1,[$ap,#8*4]
+ csel $t2,$acc2,$a2,lo
+ stp xzr,xzr,[$tp,#8*2]
+ add $tp,$tp,#8*4
+ csel $t3,$acc3,$a3,lo
+ ldp $a2,$a3,[$ap_end,#8*6]
+ ldp $acc2,$acc3,[$ap,#8*6]
+ add $ap,$ap,#8*4
+ stp $t0,$t1,[$ap_end,#8*0]
+ stp $t2,$t3,[$ap_end,#8*2]
+ add $ap_end,$ap_end,#8*4
+ stp xzr,xzr,[$ap,#8*0]
+ stp xzr,xzr,[$ap,#8*2]
+ cbnz $cnt,.Lsqr4x_cond_copy
+
+ csel $t0,$acc0,$a0,lo
+ stp xzr,xzr,[$tp,#8*0]
+ csel $t1,$acc1,$a1,lo
+ stp xzr,xzr,[$tp,#8*2]
+ csel $t2,$acc2,$a2,lo
+ csel $t3,$acc3,$a3,lo
+ stp $t0,$t1,[$ap_end,#8*0]
+ stp $t2,$t3,[$ap_end,#8*2]
+
+ b .Lsqr8x_done
+
+.align 4
+.Lsqr8x8_post_condition:
+ adc $carry,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // $acc0-7,$carry hold result, $a0-7 hold modulus
+ subs $a0,$acc0,$a0
+ ldr $ap,[x29,#96] // pull rp
+ sbcs $a1,$acc1,$a1
+ stp xzr,xzr,[sp,#8*0]
+ sbcs $a2,$acc2,$a2
+ stp xzr,xzr,[sp,#8*2]
+ sbcs $a3,$acc3,$a3
+ stp xzr,xzr,[sp,#8*4]
+ sbcs $a4,$acc4,$a4
+ stp xzr,xzr,[sp,#8*6]
+ sbcs $a5,$acc5,$a5
+ stp xzr,xzr,[sp,#8*8]
+ sbcs $a6,$acc6,$a6
+ stp xzr,xzr,[sp,#8*10]
+ sbcs $a7,$acc7,$a7
+ stp xzr,xzr,[sp,#8*12]
+ sbcs $carry,$carry,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // $a0-7 hold result-modulus
+ csel $a0,$acc0,$a0,lo
+ csel $a1,$acc1,$a1,lo
+ csel $a2,$acc2,$a2,lo
+ csel $a3,$acc3,$a3,lo
+ stp $a0,$a1,[$ap,#8*0]
+ csel $a4,$acc4,$a4,lo
+ csel $a5,$acc5,$a5,lo
+ stp $a2,$a3,[$ap,#8*2]
+ csel $a6,$acc6,$a6,lo
+ csel $a7,$acc7,$a7,lo
+ stp $a4,$a5,[$ap,#8*4]
+ stp $a6,$a7,[$ap,#8*6]