diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
index 0bf9bf376d293660f613f28a5308867556e95531..21ab12bdf07e9b357656390ec720739ed7c5fddf 100755
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # umulh and therefore uses same amount of multiplication instructions
 # to do the job. Assembly's edge is to minimize number of "collateral"
 # instructions and of course instruction scheduling.
+#
+# April 2015
+#
+# Squaring procedure that handles lengths divisible by 8 improves
+# RSA/DSA performance by 25-40-60% depending on processor and key
+# length. Overall improvement coefficients are always positive in
+# comparison to compiler-generated code. On Cortex-A57 improvement
+# is still modest on longest key lengths, while others exhibit e.g.
+# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
+# on Cortex-A57 and ~60-100% faster on others.
 
-$flavour = shift;
-$output  = shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
 die "can't locate arm-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
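
The preamble hunks above are the standard perlasm harness: the script takes a flavour and an output file, e.g. `perl armv8-mont.pl linux64 armv8-mont.S`, and pipes everything it prints through arm-xlate.pl, which rewrites the perlasm into syntax for the target assembler.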
@@ -47,12 +67,36 @@ $n0="x4";   # const BN_ULONG *n0,
 $num="x5";     # int num);
 
 $code.=<<___;
+#include "arm_arch.h"
+#ifndef        __KERNEL__
+.extern OPENSSL_armv8_rsa_neonized
+.hidden OPENSSL_armv8_rsa_neonized
+#endif
 .text
 
 .globl bn_mul_mont
 .type  bn_mul_mont,%function
 .align 5
 bn_mul_mont:
+       AARCH64_SIGN_LINK_REGISTER
+.Lbn_mul_mont:
+       tst     $num,#3
+       b.ne    .Lmul_mont
+       cmp     $num,#32
+       b.le    .Lscalar_impl
+#ifndef        __KERNEL__
+       adrp    x17,OPENSSL_armv8_rsa_neonized
+       ldr     w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+       cbnz    w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
+       tst     $num,#7
+       b.eq    __bn_sqr8x_mont
+       tst     $num,#3
+       b.eq    __bn_mul4x_mont
+
+.Lmul_mont:
        stp     x29,x30,[sp,#-64]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
@@ -76,10 +120,20 @@ bn_mul_mont:
        mul     $m1,$lo0,$n0            // "tp[0]"*n0
        mov     sp,$tp                  // alloca
 
-       mul     $lo1,$hi1,$m1           // np[0]*m1
+       // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
        umulh   $hi1,$hi1,$m1
        mul     $nlo,$nj,$m1            // np[1]*m1
-       adds    $lo1,$lo1,$lo0          // discarded
+       // (*)  adds    $lo1,$lo1,$lo0  // discarded
+       // (*)  On the removal of the first multiplication and addition
+       //      instructions: the outcome of the first addition is
+       //      guaranteed to be zero, which leaves two computationally
+       //      significant outcomes: it either carries or it doesn't.
+       //      So when does it carry? Following the operations, the
+       //      condition for carry turns out to be simple: $lo0 being
+       //      non-zero. The carry can therefore be calculated by
+       //      adding -1 to $lo0, which is what the next instruction
+       //      does.
+       subs    xzr,$lo0,#1             // (*)
        umulh   $nhi,$nj,$m1
        adc     $hi1,$hi1,xzr
        cbz     $j,.L1st_skip
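
The carry trick described in the comment above is easy to check in isolation. Below is a minimal C sketch (not part of the patch; the modulus word and the test range are made up) verifying that, with n0 = -np0^-1 mod 2^64, the discarded addition always produces zero and its carry equals the carry flag left by `subs xzr,$lo0,#1`:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t np0 = 0xfffffffffffffffbULL;   /* hypothetical odd n[0] */
        uint64_t n0 = 1;
        for (int i = 0; i < 6; i++)             /* Newton: np0^-1 mod 2^64 */
            n0 *= 2 - np0 * n0;
        n0 = 0 - n0;                            /* n0 = -np0^-1 mod 2^64 */

        for (uint64_t lo0 = 0; lo0 < 0x1000000; lo0 += 0x10001) {
            uint64_t m1  = lo0 * n0;            /* "tp[0]"*n0 */
            uint64_t lo1 = np0 * m1;            /* lo(np[0]*m1) */
            uint64_t sum = lo1 + lo0;           /* the discarded addition */
            int carry_add = sum < lo1;          /* carry of adds lo1,lo1,lo0 */
            int carry_sub = lo0 != 0;           /* carry of subs xzr,lo0,#1 */
            if (sum != 0 || carry_add != carry_sub) {
                printf("mismatch at lo0=%#llx\n", (unsigned long long)lo0);
                return 1;
            }
        }
        printf("carry trick holds\n");
        return 0;
    }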
@@ -137,10 +191,11 @@ bn_mul_mont:
        mul     $m1,$lo0,$n0
        sub     $i,$i,#8                // i--
 
-       mul     $lo1,$hi1,$m1           // np[0]*m1
+       // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
        umulh   $hi1,$hi1,$m1
        mul     $nlo,$nj,$m1            // np[1]*m1
-       adds    $lo1,$lo1,$lo0
+       // (*)  adds    $lo1,$lo1,$lo0
+       subs    xzr,$lo0,#1             // (*)
        umulh   $nhi,$nj,$m1
        cbz     $j,.Linner_skip
 
@@ -164,7 +219,7 @@ bn_mul_mont:
        mul     $nlo,$nj,$m1            // np[j]*m1
        adds    $lo1,$lo1,$lo0
        umulh   $nhi,$nj,$m1
-       str     $lo1,[$tp,#-16]         // tp[j-1]
+       stur    $lo1,[$tp,#-16]         // tp[j-1]
        cbnz    $j,.Linner
 
 .Linner_skip:
@@ -176,14 +231,15 @@ bn_mul_mont:
 
        adds    $lo1,$nlo,$hi1
        sub     $np,$np,$num            // rewind $np
-       adc     $hi1,$nhi,$ovf
+       adcs    $hi1,$nhi,$ovf
+       adc     $ovf,xzr,xzr
 
        adds    $lo0,$lo0,$tj
        adc     $hi0,$hi0,xzr
 
        adds    $lo1,$lo1,$lo0
        adcs    $hi1,$hi1,$hi0
-       adc     $ovf,xzr,xzr            // upmost overflow bit
+       adc     $ovf,$ovf,xzr           // upmost overflow bit
        stp     $lo1,$hi1,[$tp,#-16]
 
        cbnz    $i,.Louter
@@ -191,7 +247,7 @@ bn_mul_mont:
        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
-       // and conditionally copy original value. 
+       // and conditionally copy original value.
        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8
        ldr     $nj,[$np],#8            // np[0]
@@ -216,29 +272,1636 @@ bn_mul_mont:
        nop
 .Lcond_copy:
        sub     $num,$num,#8            // num--
-       csel    $nj,$aj,$tj,cs          // did it borrow?
+       csel    $nj,$tj,$aj,lo          // did it borrow?
        ldr     $tj,[$tp],#8
        ldr     $aj,[$rp],#8
-       str     xzr,[$tp,#-16]          // wipe tp
-       str     $nj,[$rp,#-16]
+       stur    xzr,[$tp,#-16]          // wipe tp
+       stur    $nj,[$rp,#-16]
        cbnz    $num,.Lcond_copy
 
-       csel    $nj,$aj,$tj,cs
-       str     xzr,[$tp,#-8]           // wipe tp
-       str     $nj,[$rp,#-8]
+       csel    $nj,$tj,$aj,lo
+       stur    xzr,[$tp,#-8]           // wipe tp
+       stur    $nj,[$rp,#-8]
 
        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
+       mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldr     x29,[sp],#64
+       AARCH64_VALIDATE_LINK_REGISTER
        ret
 .size  bn_mul_mont,.-bn_mul_mont
+___
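
For orientation, here is a word-level reference sketch in C of the Montgomery multiplication the scalar code above implements, including the final subtract-then-conditionally-copy step from the comments. It assumes n0 = -np[0]^-1 mod 2^64 and R = 2^(64*num); it is a readability aid, not OpenSSL's implementation, and unlike the assembly it makes no constant-time guarantees:

    #include <stdint.h>
    #include <string.h>

    typedef unsigned __int128 u128;          /* GCC/Clang extension */

    /* rp[] = ap[] * bp[] * R^-1 mod np[],  R = 2^(64*num) */
    static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             uint64_t n0, int num)
    {
        uint64_t t[num + 2];                 /* C99 VLA, for brevity only */
        memset(t, 0, sizeof(t));

        for (int i = 0; i < num; i++) {
            uint64_t c = 0;
            u128 s;
            for (int j = 0; j < num; j++) {  /* t += a[] * b[i] */
                s = (u128)ap[j] * bp[i] + t[j] + c;
                t[j] = (uint64_t)s; c = (uint64_t)(s >> 64);
            }
            s = (u128)t[num] + c;
            t[num] = (uint64_t)s; t[num + 1] = (uint64_t)(s >> 64);

            uint64_t m1 = t[0] * n0;         /* t[0] + m1*np[0] == 0 mod 2^64 */
            s = (u128)m1 * np[0] + t[0];     /* low word vanishes by choice of m1 */
            c = (uint64_t)(s >> 64);
            for (int j = 1; j < num; j++) {  /* t += m1 * n[], then t >>= 64 */
                s = (u128)m1 * np[j] + t[j] + c;
                t[j - 1] = (uint64_t)s; c = (uint64_t)(s >> 64);
            }
            s = (u128)t[num] + c;
            t[num - 1] = (uint64_t)s;
            t[num] = t[num + 1] + (uint64_t)(s >> 64);
        }

        /* final step: subtract modulus, keep original if it borrowed */
        uint64_t sub[num], b = 0;
        for (int j = 0; j < num; j++) {
            u128 d = (u128)t[j] - np[j] - b;
            sub[j] = (uint64_t)d; b = (uint64_t)(d >> 64) & 1;
        }
        int borrowed = t[num] < b;           /* top-most overflow word decides */
        for (int j = 0; j < num; j++)        /* the assembly uses csel here to
                                                stay constant-time; this doesn't */
            rp[j] = borrowed ? t[j] : sub[j];
    }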
+{
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
+my ($Z,$Temp)=("v4.16b","v5");
+my @ACC=map("v$_",(6..13));
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
+my $sBi="s28";
+my $sM0="s30";
+my $zero="v14";
+my $temp="v15";
+my $ACCTemp="v16";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
+
+$code.=<<___;
+.type  bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+       // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
+       // only from bn_mul_mont which has already signed the return address.
+       stp     x29,x30,[sp,#-80]!
+       mov     x16,sp
+       stp     d8,d9,[sp,#16]
+       stp     d10,d11,[sp,#32]
+       stp     d12,d13,[sp,#48]
+       stp     d14,d15,[sp,#64]
+       lsl     $num,$num,#1
+       eor     $zero.16b,$zero.16b,$zero.16b
+
+.align 4
+.LNEON_8n:
+       eor     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
+       sub     $toutptr,sp,#128
+       eor     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
+       sub     $toutptr,$toutptr,$num,lsl#4
+       eor     @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
+       and     $toutptr,$toutptr,#-64
+       eor     @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
+       mov     sp,$toutptr             // alloca
+       eor     @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
+       add     $toutptr,$toutptr,#256
+       eor     @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
+       sub     $inner,$num,#8
+       eor     @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
+       eor     @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
+
+.LNEON_8n_init:
+       st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+       subs    $inner,$inner,#8
+       st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+       st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+       st1     {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
+       bne     .LNEON_8n_init
+
+       add     $tinptr,sp,#256
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       add     $bnptr,sp,#8
+       ldr     $sM0,[$n0],#4
+       mov     $outer,$num
+       b       .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+       ldr     $sBi,[$bptr],#4   // *b++
+       uxtl    $Bi.4s,$Bi.4h
+       add     $toutptr,sp,#128
+       ld1     {$N0.4s,$N1.4s},[$nptr],#32
+
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       shl     $Ni.2d,@ACC[0].2d,#16
+       ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       add     $Ni.2d,$Ni.2d,@ACC[0].2d
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       mul     $Ni.2s,$Ni.2s,$M0.2s
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       st1     {$Bi.2s},[sp]           // put aside smashed b[8*i+0]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       uxtl    $Ni.4s,$Ni.4h
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+       ldr     $sBi,[$bptr],#4   // *b++
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       uxtl    $Bi.4s,$Bi.4h
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       ushr    $temp.2d,@ACC[0].2d,#16
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       ushr    @ACC[0].2d,@ACC[0].2d,#16
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       add     $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
+       ins     @ACC[1].d[0],$ACCTemp.d[0]
+       st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
+___
+       push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr],#16
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       shl     $Ni.2d,@ACC[0].2d,#16
+       ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       add     $Ni.2d,$Ni.2d,@ACC[0].2d
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       mul     $Ni.2s,$Ni.2s,$M0.2s
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       st1     {$Bi.2s},[$bnptr],#8    // put aside smashed b[8*i+$i]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       uxtl    $Ni.4s,$Ni.4h
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+       ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $Temp.2d,$Temp.2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       add     @ACC[0].2d,@ACC[0].2d,$Temp.2d
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       ushr    @ACC[0].2d,@ACC[0].2d,#16
+       eor     $temp.16b,$temp.16b,$temp.16b
+       ins     @ACC[0].d[1],$temp.d[0]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       add     @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
+       st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
+       add     $bnptr,sp,#8            // rewind
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       sub     $inner,$num,#8
+       b       .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+       subs    $inner,$inner,#8
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+0]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       ld1     {$N0.4s,$N1.4s},[$nptr],#32
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       b.eq    .LInner_jump
+       add     $tinptr,$tinptr,#16     // don't advance in last iteration
+.LInner_jump:
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       ld1     {$Bi.2s},[$bnptr],#8    // pull smashed b[8*i+$i]
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       st1     {@ACC[0].2d},[$toutptr],#16
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+$i]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       b.eq    .LInner_jump$i
+       add     $tinptr,$tinptr,#16     // don't advance in last iteration
+.LInner_jump$i:
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+       b.ne    .LInner_after_rewind$i
+       sub     $aptr,$aptr,$num,lsl#2  // rewind
+.LInner_after_rewind$i:
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       add     $bnptr,sp,#8            // rewind
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       st1     {@ACC[0].2d},[$toutptr],#16
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+
+       bne     .LNEON_8n_inner
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       add     $tinptr,sp,#128
+       st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+       eor     $N0.16b,$N0.16b,$N0.16b // $N0
+       st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+       eor     $N1.16b,$N1.16b,$N1.16b // $N1
+       st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+       st1     {@ACC[6].2d},[$toutptr]
+
+       subs    $outer,$outer,#8
+       ld1     {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
+       ld1     {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
+       ld1     {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
+       ld1     {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
+
+       b.eq    .LInner_8n_jump_2steps
+       sub     $nptr,$nptr,$num,lsl#2  // rewind
+       b       .LNEON_8n_outer
+
+.LInner_8n_jump_2steps:
+       add     $toutptr,sp,#128
+       st1     {$N0.2d,$N1.2d}, [sp],#32       // start wiping stack frame
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       ushr    $temp.2d,@ACC[0].2d,#16
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
+       ins     $temp.d[1],$zero.d[0]
+
+       mov     $inner,$num
+       b       .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       ld1     {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       ld1     {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ld1     {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
+       zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
+       ins     $temp.d[1],$zero.d[0]
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       add     @ACC[1].2d,@ACC[1].2d,$temp.2d
+       st1     {@ACC[0].s}[0], [$toutptr],#4
+       ushr    $temp.2d,@ACC[1].2d,#16
+       mov     $Temp.16b,@ACC[1].16b
+       ext     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
+       add     @ACC[1].2d,@ACC[1].2d,$temp.2d
+       ushr    $temp.2d,@ACC[1].2d,#16
+       zip1    @ACC[1].4h,$Temp.4h,@ACC[1].4h
+       ins     $temp.d[1],$zero.d[0]
+___
+       push(@ACC,shift(@ACC));
+}
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       ld1     {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
+       subs    $inner,$inner,#8
+       st1     {@ACC[7].s}[0], [$toutptr],#4
+       bne     .LNEON_tail
+
+       st1     {$temp.s}[0], [$toutptr],#4     // top-most bit
+       sub     $nptr,$nptr,$num,lsl#2          // rewind $nptr
+       subs    $aptr,sp,#0                     // clear carry flag
+       add     $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       ldp     w8,w9,[$nptr],#8
+       ldp     w10,w11,[$nptr],#8
+       sbcs    w8,w4,w8
+       sbcs    w9,w5,w9
+       sbcs    w10,w6,w10
+       sbcs    w11,w7,w11
+       sub     x17,$bptr,$aptr
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       cbnz    x17,.LNEON_sub
+
+       ldr     w10, [$aptr]            // load top-most bit
+       mov     x11,sp
+       eor     v0.16b,v0.16b,v0.16b
+       sub     x11,$bptr,x11           // this is num*4
+       eor     v1.16b,v1.16b,v1.16b
+       mov     $aptr,sp
+       sub     $rptr,$rptr,x11         // rewind $rptr
+       mov     $nptr,$bptr             // second 3/4th of frame
+       sbcs    w10,w10,wzr             // result is carry flag
+
+.LNEON_copy_n_zap:
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       ldp     w8,w9,[$rptr],#8
+       ldp     w10,w11,[$rptr]
+       sub     $rptr,$rptr,#8
+       b.cs    .LCopy_1
+       mov     w8,w4
+       mov     w9,w5
+       mov     w10,w6
+       mov     w11,w7
+.LCopy_1:
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       sub     $aptr,$aptr,#32
+       ldp     w8,w9,[$rptr],#8
+       ldp     w10,w11,[$rptr]
+       sub     $rptr,$rptr,#8
+       b.cs    .LCopy_2
+       mov     w8, w4
+       mov     w9, w5
+       mov     w10, w6
+       mov     w11, w7
+.LCopy_2:
+       st1     {v0.2d,v1.2d}, [$aptr],#32              // wipe
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       sub     x17,$bptr,$aptr         // preserves carry
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       cbnz    x17,.LNEON_copy_n_zap
+
+       mov     sp,x16
+       ldp     d14,d15,[sp,#64]
+       ldp     d12,d13,[sp,#48]
+       ldp     d10,d11,[sp,#32]
+       ldp     d8,d9,[sp,#16]
+       ldr     x29,[sp],#80
+       AARCH64_VALIDATE_LINK_REGISTER
+       ret                     // bx lr
+
+.size  bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+___
+}
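
One structural note on the NEON path above: it operates on 32-bit words (hence the `lsl $num,$num,#1` doubling the word count and the 4-byte `ldr $sBi` loads), and the `uxtl`/`ushr` shuffling suggests carries are propagated in 16-bit digits. Its workhorse is `umlal`, a per-lane widening multiply-accumulate. A scalar C model of one such instruction, with made-up values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t a[2]   = { 0x89abcdefu, 0x01234567u }; /* A0.2s lanes */
        uint32_t b      = 0xbeefu;                      /* broadcast B.s[0] */
        uint64_t acc[2] = { 1, 2 };                     /* ACC.2d accumulators */

        /* umlal ACC.2d, A0.2s, B.s[0]: independent 32x32+64 MACs per lane */
        for (int lane = 0; lane < 2; lane++)
            acc[lane] += (uint64_t)a[lane] * b;

        printf("%016llx %016llx\n",
               (unsigned long long)acc[0], (unsigned long long)acc[1]);
        return 0;
    }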
+{
+########################################################################
+# The following is an ARMv8 adaptation of sqrx8x_mont from the x86_64-mont5 module.
+
+my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
+my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
+my ($cnt,$carry,$topmost)=("x27","x28","x30");
+my ($tp,$ap_end,$na0)=($bp,$np,$carry);
+
+$code.=<<___;
+.type  __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+       cmp     $ap,$bp
+       b.ne    __bn_mul4x_mont
+.Lsqr8x_mont:
+       // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+       // only from bn_mul_mont which has already signed the return address.
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       stp     $rp,$np,[sp,#96]        // offload rp and np
+
+       ldp     $a0,$a1,[$ap,#8*0]
+       ldp     $a2,$a3,[$ap,#8*2]
+       ldp     $a4,$a5,[$ap,#8*4]
+       ldp     $a6,$a7,[$ap,#8*6]
+
+       sub     $tp,sp,$num,lsl#4
+       lsl     $num,$num,#3
+       ldr     $n0,[$n0]               // *n0
+       mov     sp,$tp                  // alloca
+       sub     $cnt,$num,#8*8
+       b       .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+       sub     $cnt,$cnt,#8*8
+       stp     xzr,xzr,[$tp,#8*0]
+       stp     xzr,xzr,[$tp,#8*2]
+       stp     xzr,xzr,[$tp,#8*4]
+       stp     xzr,xzr,[$tp,#8*6]
+.Lsqr8x_zero_start:
+       stp     xzr,xzr,[$tp,#8*8]
+       stp     xzr,xzr,[$tp,#8*10]
+       stp     xzr,xzr,[$tp,#8*12]
+       stp     xzr,xzr,[$tp,#8*14]
+       add     $tp,$tp,#8*16
+       cbnz    $cnt,.Lsqr8x_zero
+
+       add     $ap_end,$ap,$num
+       add     $ap,$ap,#8*8
+       mov     $acc0,xzr
+       mov     $acc1,xzr
+       mov     $acc2,xzr
+       mov     $acc3,xzr
+       mov     $acc4,xzr
+       mov     $acc5,xzr
+       mov     $acc6,xzr
+       mov     $acc7,xzr
+       mov     $tp,sp
+       str     $n0,[x29,#112]          // offload n0
+
+       // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]    (i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]            (ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]                    (iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]                            (iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]                                    (v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]                                            (vi)
+        //     a[7]a[5]
+        // a[7]a[6]                                                    (vii)
+
+       mul     $t0,$a1,$a0             // lo(a[1..7]*a[0])             (i)
+       mul     $t1,$a2,$a0
+       mul     $t2,$a3,$a0
+       mul     $t3,$a4,$a0
+       adds    $acc1,$acc1,$t0         // t[1]+lo(a[1]*a[0])
+       mul     $t0,$a5,$a0
+       adcs    $acc2,$acc2,$t1
+       mul     $t1,$a6,$a0
+       adcs    $acc3,$acc3,$t2
+       mul     $t2,$a7,$a0
+       adcs    $acc4,$acc4,$t3
+       umulh   $t3,$a1,$a0             // hi(a[1..7]*a[0])
+       adcs    $acc5,$acc5,$t0
+       umulh   $t0,$a2,$a0
+       adcs    $acc6,$acc6,$t1
+       umulh   $t1,$a3,$a0
+       adcs    $acc7,$acc7,$t2
+       umulh   $t2,$a4,$a0
+       stp     $acc0,$acc1,[$tp],#8*2  // t[0..1]
+       adc     $acc0,xzr,xzr           // t[8]
+       adds    $acc2,$acc2,$t3         // t[2]+lo(a[1]*a[0])
+       umulh   $t3,$a5,$a0
+       adcs    $acc3,$acc3,$t0
+       umulh   $t0,$a6,$a0
+       adcs    $acc4,$acc4,$t1
+       umulh   $t1,$a7,$a0
+       adcs    $acc5,$acc5,$t2
+        mul    $t2,$a2,$a1             // lo(a[2..7]*a[1])             (ii)
+       adcs    $acc6,$acc6,$t3
+        mul    $t3,$a3,$a1
+       adcs    $acc7,$acc7,$t0
+        mul    $t0,$a4,$a1
+       adc     $acc0,$acc0,$t1
+
+       mul     $t1,$a5,$a1
+       adds    $acc3,$acc3,$t2
+       mul     $t2,$a6,$a1
+       adcs    $acc4,$acc4,$t3
+       mul     $t3,$a7,$a1
+       adcs    $acc5,$acc5,$t0
+       umulh   $t0,$a2,$a1             // hi(a[2..7]*a[1])
+       adcs    $acc6,$acc6,$t1
+       umulh   $t1,$a3,$a1
+       adcs    $acc7,$acc7,$t2
+       umulh   $t2,$a4,$a1
+       adcs    $acc0,$acc0,$t3
+       umulh   $t3,$a5,$a1
+       stp     $acc2,$acc3,[$tp],#8*2  // t[2..3]
+       adc     $acc1,xzr,xzr           // t[9]
+       adds    $acc4,$acc4,$t0
+       umulh   $t0,$a6,$a1
+       adcs    $acc5,$acc5,$t1
+       umulh   $t1,$a7,$a1
+       adcs    $acc6,$acc6,$t2
+        mul    $t2,$a3,$a2             // lo(a[3..7]*a[2])             (iii)
+       adcs    $acc7,$acc7,$t3
+        mul    $t3,$a4,$a2
+       adcs    $acc0,$acc0,$t0
+        mul    $t0,$a5,$a2
+       adc     $acc1,$acc1,$t1
+
+       mul     $t1,$a6,$a2
+       adds    $acc5,$acc5,$t2
+       mul     $t2,$a7,$a2
+       adcs    $acc6,$acc6,$t3
+       umulh   $t3,$a3,$a2             // hi(a[3..7]*a[2])
+       adcs    $acc7,$acc7,$t0
+       umulh   $t0,$a4,$a2
+       adcs    $acc0,$acc0,$t1
+       umulh   $t1,$a5,$a2
+       adcs    $acc1,$acc1,$t2
+       umulh   $t2,$a6,$a2
+       stp     $acc4,$acc5,[$tp],#8*2  // t[4..5]
+       adc     $acc2,xzr,xzr           // t[10]
+       adds    $acc6,$acc6,$t3
+       umulh   $t3,$a7,$a2
+       adcs    $acc7,$acc7,$t0
+        mul    $t0,$a4,$a3             // lo(a[4..7]*a[3])             (iv)
+       adcs    $acc0,$acc0,$t1
+        mul    $t1,$a5,$a3
+       adcs    $acc1,$acc1,$t2
+        mul    $t2,$a6,$a3
+       adc     $acc2,$acc2,$t3
+
+       mul     $t3,$a7,$a3
+       adds    $acc7,$acc7,$t0
+       umulh   $t0,$a4,$a3             // hi(a[4..7]*a[3])
+       adcs    $acc0,$acc0,$t1
+       umulh   $t1,$a5,$a3
+       adcs    $acc1,$acc1,$t2
+       umulh   $t2,$a6,$a3
+       adcs    $acc2,$acc2,$t3
+       umulh   $t3,$a7,$a3
+       stp     $acc6,$acc7,[$tp],#8*2  // t[6..7]
+       adc     $acc3,xzr,xzr           // t[11]
+       adds    $acc0,$acc0,$t0
+        mul    $t0,$a5,$a4             // lo(a[5..7]*a[4])             (v)
+       adcs    $acc1,$acc1,$t1
+        mul    $t1,$a6,$a4
+       adcs    $acc2,$acc2,$t2
+        mul    $t2,$a7,$a4
+       adc     $acc3,$acc3,$t3
+
+       umulh   $t3,$a5,$a4             // hi(a[5..7]*a[4])
+       adds    $acc1,$acc1,$t0
+       umulh   $t0,$a6,$a4
+       adcs    $acc2,$acc2,$t1
+       umulh   $t1,$a7,$a4
+       adcs    $acc3,$acc3,$t2
+        mul    $t2,$a6,$a5             // lo(a[6..7]*a[5])             (vi)
+       adc     $acc4,xzr,xzr           // t[12]
+       adds    $acc2,$acc2,$t3
+        mul    $t3,$a7,$a5
+       adcs    $acc3,$acc3,$t0
+        umulh  $t0,$a6,$a5             // hi(a[6..7]*a[5])
+       adc     $acc4,$acc4,$t1
+
+       umulh   $t1,$a7,$a5
+       adds    $acc3,$acc3,$t2
+        mul    $t2,$a7,$a6             // lo(a[7]*a[6])                (vii)
+       adcs    $acc4,$acc4,$t3
+        umulh  $t3,$a7,$a6             // hi(a[7]*a[6])
+       adc     $acc5,xzr,xzr           // t[13]
+       adds    $acc4,$acc4,$t0
+       sub     $cnt,$ap_end,$ap        // done yet?
+       adc     $acc5,$acc5,$t1
+
+       adds    $acc5,$acc5,$t2
+       sub     $t0,$ap_end,$num        // rewound ap
+       adc     $acc6,xzr,xzr           // t[14]
+       add     $acc6,$acc6,$t3
+
+       cbz     $cnt,.Lsqr8x_outer_break
+
+       mov     $n0,$a0
+       ldp     $a0,$a1,[$tp,#8*0]
+       ldp     $a2,$a3,[$tp,#8*2]
+       ldp     $a4,$a5,[$tp,#8*4]
+       ldp     $a6,$a7,[$tp,#8*6]
+       adds    $acc0,$acc0,$a0
+       adcs    $acc1,$acc1,$a1
+       ldp     $a0,$a1,[$ap,#8*0]
+       adcs    $acc2,$acc2,$a2
+       adcs    $acc3,$acc3,$a3
+       ldp     $a2,$a3,[$ap,#8*2]
+       adcs    $acc4,$acc4,$a4
+       adcs    $acc5,$acc5,$a5
+       ldp     $a4,$a5,[$ap,#8*4]
+       adcs    $acc6,$acc6,$a6
+       mov     $rp,$ap
+       adcs    $acc7,xzr,$a7
+       ldp     $a6,$a7,[$ap,#8*6]
+       add     $ap,$ap,#8*8
+       //adc   $carry,xzr,xzr          // moved below
+       mov     $cnt,#-8*8
+
+       //                                                         a[8]a[0]
+       //                                                     a[9]a[0]
+       //                                                 a[a]a[0]
+       //                                             a[b]a[0]
+       //                                         a[c]a[0]
+       //                                     a[d]a[0]
+       //                                 a[e]a[0]
+       //                             a[f]a[0]
+       //                                                     a[8]a[1]
+       //                         a[f]a[1]........................
+       //                                                 a[8]a[2]
+       //                     a[f]a[2]........................
+       //                                             a[8]a[3]
+       //                 a[f]a[3]........................
+       //                                         a[8]a[4]
+       //             a[f]a[4]........................
+       //                                     a[8]a[5]
+       //         a[f]a[5]........................
+       //                                 a[8]a[6]
+       //     a[f]a[6]........................
+       //                             a[8]a[7]
+       // a[f]a[7]........................
+.Lsqr8x_mul:
+       mul     $t0,$a0,$n0
+       adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
+       mul     $t1,$a1,$n0
+       add     $cnt,$cnt,#8
+       mul     $t2,$a2,$n0
+       mul     $t3,$a3,$n0
+       adds    $acc0,$acc0,$t0
+       mul     $t0,$a4,$n0
+       adcs    $acc1,$acc1,$t1
+       mul     $t1,$a5,$n0
+       adcs    $acc2,$acc2,$t2
+       mul     $t2,$a6,$n0
+       adcs    $acc3,$acc3,$t3
+       mul     $t3,$a7,$n0
+       adcs    $acc4,$acc4,$t0
+       umulh   $t0,$a0,$n0
+       adcs    $acc5,$acc5,$t1
+       umulh   $t1,$a1,$n0
+       adcs    $acc6,$acc6,$t2
+       umulh   $t2,$a2,$n0
+       adcs    $acc7,$acc7,$t3
+       umulh   $t3,$a3,$n0
+       adc     $carry,$carry,xzr
+       str     $acc0,[$tp],#8
+       adds    $acc0,$acc1,$t0
+       umulh   $t0,$a4,$n0
+       adcs    $acc1,$acc2,$t1
+       umulh   $t1,$a5,$n0
+       adcs    $acc2,$acc3,$t2
+       umulh   $t2,$a6,$n0
+       adcs    $acc3,$acc4,$t3
+       umulh   $t3,$a7,$n0
+       ldr     $n0,[$rp,$cnt]
+       adcs    $acc4,$acc5,$t0
+       adcs    $acc5,$acc6,$t1
+       adcs    $acc6,$acc7,$t2
+       adcs    $acc7,$carry,$t3
+       //adc   $carry,xzr,xzr          // moved above
+       cbnz    $cnt,.Lsqr8x_mul
+                                       // note that carry flag is guaranteed
+                                       // to be zero at this point
+       cmp     $ap,$ap_end             // done yet?
+       b.eq    .Lsqr8x_break
+
+       ldp     $a0,$a1,[$tp,#8*0]
+       ldp     $a2,$a3,[$tp,#8*2]
+       ldp     $a4,$a5,[$tp,#8*4]
+       ldp     $a6,$a7,[$tp,#8*6]
+       adds    $acc0,$acc0,$a0
+       ldur    $n0,[$rp,#-8*8]
+       adcs    $acc1,$acc1,$a1
+       ldp     $a0,$a1,[$ap,#8*0]
+       adcs    $acc2,$acc2,$a2
+       adcs    $acc3,$acc3,$a3
+       ldp     $a2,$a3,[$ap,#8*2]
+       adcs    $acc4,$acc4,$a4
+       adcs    $acc5,$acc5,$a5
+       ldp     $a4,$a5,[$ap,#8*4]
+       adcs    $acc6,$acc6,$a6
+       mov     $cnt,#-8*8
+       adcs    $acc7,$acc7,$a7
+       ldp     $a6,$a7,[$ap,#8*6]
+       add     $ap,$ap,#8*8
+       //adc   $carry,xzr,xzr          // moved above
+       b       .Lsqr8x_mul
+
+.align 4
+.Lsqr8x_break:
+       ldp     $a0,$a1,[$rp,#8*0]
+       add     $ap,$rp,#8*8
+       ldp     $a2,$a3,[$rp,#8*2]
+       sub     $t0,$ap_end,$ap         // is it last iteration?
+       ldp     $a4,$a5,[$rp,#8*4]
+       sub     $t1,$tp,$t0
+       ldp     $a6,$a7,[$rp,#8*6]
+       cbz     $t0,.Lsqr8x_outer_loop
+
+       stp     $acc0,$acc1,[$tp,#8*0]
+       ldp     $acc0,$acc1,[$t1,#8*0]
+       stp     $acc2,$acc3,[$tp,#8*2]
+       ldp     $acc2,$acc3,[$t1,#8*2]
+       stp     $acc4,$acc5,[$tp,#8*4]
+       ldp     $acc4,$acc5,[$t1,#8*4]
+       stp     $acc6,$acc7,[$tp,#8*6]
+       mov     $tp,$t1
+       ldp     $acc6,$acc7,[$t1,#8*6]
+       b       .Lsqr8x_outer_loop
+
+.align 4
+.Lsqr8x_outer_break:
+       // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+       ldp     $a1,$a3,[$t0,#8*0]      // recall that $t0 is &a[0]
+       ldp     $t1,$t2,[sp,#8*1]
+       ldp     $a5,$a7,[$t0,#8*2]
+       add     $ap,$t0,#8*4
+       ldp     $t3,$t0,[sp,#8*3]
 
+       stp     $acc0,$acc1,[$tp,#8*0]
+       mul     $acc0,$a1,$a1
+       stp     $acc2,$acc3,[$tp,#8*2]
+       umulh   $a1,$a1,$a1
+       stp     $acc4,$acc5,[$tp,#8*4]
+       mul     $a2,$a3,$a3
+       stp     $acc6,$acc7,[$tp,#8*6]
+       mov     $tp,sp
+       umulh   $a3,$a3,$a3
+       adds    $acc1,$a1,$t1,lsl#1
+       extr    $t1,$t2,$t1,#63
+       sub     $cnt,$num,#8*4
+
+.Lsqr4x_shift_n_add:
+       adcs    $acc2,$a2,$t1
+       extr    $t2,$t3,$t2,#63
+       sub     $cnt,$cnt,#8*4
+       adcs    $acc3,$a3,$t2
+       ldp     $t1,$t2,[$tp,#8*5]
+       mul     $a4,$a5,$a5
+       ldp     $a1,$a3,[$ap],#8*2
+       umulh   $a5,$a5,$a5
+       mul     $a6,$a7,$a7
+       umulh   $a7,$a7,$a7
+       extr    $t3,$t0,$t3,#63
+       stp     $acc0,$acc1,[$tp,#8*0]
+       adcs    $acc4,$a4,$t3
+       extr    $t0,$t1,$t0,#63
+       stp     $acc2,$acc3,[$tp,#8*2]
+       adcs    $acc5,$a5,$t0
+       ldp     $t3,$t0,[$tp,#8*7]
+       extr    $t1,$t2,$t1,#63
+       adcs    $acc6,$a6,$t1
+       extr    $t2,$t3,$t2,#63
+       adcs    $acc7,$a7,$t2
+       ldp     $t1,$t2,[$tp,#8*9]
+       mul     $a0,$a1,$a1
+       ldp     $a5,$a7,[$ap],#8*2
+       umulh   $a1,$a1,$a1
+       mul     $a2,$a3,$a3
+       umulh   $a3,$a3,$a3
+       stp     $acc4,$acc5,[$tp,#8*4]
+       extr    $t3,$t0,$t3,#63
+       stp     $acc6,$acc7,[$tp,#8*6]
+       add     $tp,$tp,#8*8
+       adcs    $acc0,$a0,$t3
+       extr    $t0,$t1,$t0,#63
+       adcs    $acc1,$a1,$t0
+       ldp     $t3,$t0,[$tp,#8*3]
+       extr    $t1,$t2,$t1,#63
+       cbnz    $cnt,.Lsqr4x_shift_n_add
+___
+my ($np,$np_end)=($ap,$ap_end);
+$code.=<<___;
+        ldp    $np,$n0,[x29,#104]      // pull np and n0
+
+       adcs    $acc2,$a2,$t1
+       extr    $t2,$t3,$t2,#63
+       adcs    $acc3,$a3,$t2
+       ldp     $t1,$t2,[$tp,#8*5]
+       mul     $a4,$a5,$a5
+       umulh   $a5,$a5,$a5
+       stp     $acc0,$acc1,[$tp,#8*0]
+       mul     $a6,$a7,$a7
+       umulh   $a7,$a7,$a7
+       stp     $acc2,$acc3,[$tp,#8*2]
+       extr    $t3,$t0,$t3,#63
+       adcs    $acc4,$a4,$t3
+       extr    $t0,$t1,$t0,#63
+        ldp    $acc0,$acc1,[sp,#8*0]
+       adcs    $acc5,$a5,$t0
+       extr    $t1,$t2,$t1,#63
+        ldp    $a0,$a1,[$np,#8*0]
+       adcs    $acc6,$a6,$t1
+       extr    $t2,xzr,$t2,#63
+        ldp    $a2,$a3,[$np,#8*2]
+       adc     $acc7,$a7,$t2
+        ldp    $a4,$a5,[$np,#8*4]
+
+       // Reduce by 512 bits per iteration
+       mul     $na0,$n0,$acc0          // t[0]*n0
+       ldp     $a6,$a7,[$np,#8*6]
+       add     $np_end,$np,$num
+       ldp     $acc2,$acc3,[sp,#8*2]
+       stp     $acc4,$acc5,[$tp,#8*4]
+       ldp     $acc4,$acc5,[sp,#8*4]
+       stp     $acc6,$acc7,[$tp,#8*6]
+       ldp     $acc6,$acc7,[sp,#8*6]
+       add     $np,$np,#8*8
+       mov     $topmost,xzr            // initial top-most carry
+       mov     $tp,sp
+       mov     $cnt,#8
+
+.Lsqr8x_reduction:
+       // (*)  mul     $t0,$a0,$na0    // lo(n[0-7])*lo(t[0]*n0)
+       mul     $t1,$a1,$na0
+       sub     $cnt,$cnt,#1
+       mul     $t2,$a2,$na0
+       str     $na0,[$tp],#8           // put aside t[0]*n0 for tail processing
+       mul     $t3,$a3,$na0
+       // (*)  adds    xzr,$acc0,$t0
+       subs    xzr,$acc0,#1            // (*)
+       mul     $t0,$a4,$na0
+       adcs    $acc0,$acc1,$t1
+       mul     $t1,$a5,$na0
+       adcs    $acc1,$acc2,$t2
+       mul     $t2,$a6,$na0
+       adcs    $acc2,$acc3,$t3
+       mul     $t3,$a7,$na0
+       adcs    $acc3,$acc4,$t0
+       umulh   $t0,$a0,$na0            // hi(n[0-7])*lo(t[0]*n0)
+       adcs    $acc4,$acc5,$t1
+       umulh   $t1,$a1,$na0
+       adcs    $acc5,$acc6,$t2
+       umulh   $t2,$a2,$na0
+       adcs    $acc6,$acc7,$t3
+       umulh   $t3,$a3,$na0
+       adc     $acc7,xzr,xzr
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$a4,$na0
+       adcs    $acc1,$acc1,$t1
+       umulh   $t1,$a5,$na0
+       adcs    $acc2,$acc2,$t2
+       umulh   $t2,$a6,$na0
+       adcs    $acc3,$acc3,$t3
+       umulh   $t3,$a7,$na0
+       mul     $na0,$n0,$acc0          // next t[0]*n0
+       adcs    $acc4,$acc4,$t0
+       adcs    $acc5,$acc5,$t1
+       adcs    $acc6,$acc6,$t2
+       adc     $acc7,$acc7,$t3
+       cbnz    $cnt,.Lsqr8x_reduction
+
+       ldp     $t0,$t1,[$tp,#8*0]
+       ldp     $t2,$t3,[$tp,#8*2]
+       mov     $rp,$tp
+       sub     $cnt,$np_end,$np        // done yet?
+       adds    $acc0,$acc0,$t0
+       adcs    $acc1,$acc1,$t1
+       ldp     $t0,$t1,[$tp,#8*4]
+       adcs    $acc2,$acc2,$t2
+       adcs    $acc3,$acc3,$t3
+       ldp     $t2,$t3,[$tp,#8*6]
+       adcs    $acc4,$acc4,$t0
+       adcs    $acc5,$acc5,$t1
+       adcs    $acc6,$acc6,$t2
+       adcs    $acc7,$acc7,$t3
+       //adc   $carry,xzr,xzr          // moved below
+       cbz     $cnt,.Lsqr8x8_post_condition
+
+       ldur    $n0,[$tp,#-8*8]
+       ldp     $a0,$a1,[$np,#8*0]
+       ldp     $a2,$a3,[$np,#8*2]
+       ldp     $a4,$a5,[$np,#8*4]
+       mov     $cnt,#-8*8
+       ldp     $a6,$a7,[$np,#8*6]
+       add     $np,$np,#8*8
+
+.Lsqr8x_tail:
+       mul     $t0,$a0,$n0
+       adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
+       mul     $t1,$a1,$n0
+       add     $cnt,$cnt,#8
+       mul     $t2,$a2,$n0
+       mul     $t3,$a3,$n0
+       adds    $acc0,$acc0,$t0
+       mul     $t0,$a4,$n0
+       adcs    $acc1,$acc1,$t1
+       mul     $t1,$a5,$n0
+       adcs    $acc2,$acc2,$t2
+       mul     $t2,$a6,$n0
+       adcs    $acc3,$acc3,$t3
+       mul     $t3,$a7,$n0
+       adcs    $acc4,$acc4,$t0
+       umulh   $t0,$a0,$n0
+       adcs    $acc5,$acc5,$t1
+       umulh   $t1,$a1,$n0
+       adcs    $acc6,$acc6,$t2
+       umulh   $t2,$a2,$n0
+       adcs    $acc7,$acc7,$t3
+       umulh   $t3,$a3,$n0
+       adc     $carry,$carry,xzr
+       str     $acc0,[$tp],#8
+       adds    $acc0,$acc1,$t0
+       umulh   $t0,$a4,$n0
+       adcs    $acc1,$acc2,$t1
+       umulh   $t1,$a5,$n0
+       adcs    $acc2,$acc3,$t2
+       umulh   $t2,$a6,$n0
+       adcs    $acc3,$acc4,$t3
+       umulh   $t3,$a7,$n0
+       ldr     $n0,[$rp,$cnt]
+       adcs    $acc4,$acc5,$t0
+       adcs    $acc5,$acc6,$t1
+       adcs    $acc6,$acc7,$t2
+       adcs    $acc7,$carry,$t3
+       //adc   $carry,xzr,xzr          // moved above
+       cbnz    $cnt,.Lsqr8x_tail
+                                       // note that carry flag is guaranteed
+                                       // to be zero at this point
+       ldp     $a0,$a1,[$tp,#8*0]
+       sub     $cnt,$np_end,$np        // done yet?
+       sub     $t2,$np_end,$num        // rewound np
+       ldp     $a2,$a3,[$tp,#8*2]
+       ldp     $a4,$a5,[$tp,#8*4]
+       ldp     $a6,$a7,[$tp,#8*6]
+       cbz     $cnt,.Lsqr8x_tail_break
+
+       ldur    $n0,[$rp,#-8*8]
+       adds    $acc0,$acc0,$a0
+       adcs    $acc1,$acc1,$a1
+       ldp     $a0,$a1,[$np,#8*0]
+       adcs    $acc2,$acc2,$a2
+       adcs    $acc3,$acc3,$a3
+       ldp     $a2,$a3,[$np,#8*2]
+       adcs    $acc4,$acc4,$a4
+       adcs    $acc5,$acc5,$a5
+       ldp     $a4,$a5,[$np,#8*4]
+       adcs    $acc6,$acc6,$a6
+       mov     $cnt,#-8*8
+       adcs    $acc7,$acc7,$a7
+       ldp     $a6,$a7,[$np,#8*6]
+       add     $np,$np,#8*8
+       //adc   $carry,xzr,xzr          // moved above
+       b       .Lsqr8x_tail
+
+.align 4
+.Lsqr8x_tail_break:
+       ldr     $n0,[x29,#112]          // pull n0
+       add     $cnt,$tp,#8*8           // end of current t[num] window
+
+       subs    xzr,$topmost,#1         // "move" top-most carry to carry bit
+       adcs    $t0,$acc0,$a0
+       adcs    $t1,$acc1,$a1
+       ldp     $acc0,$acc1,[$rp,#8*0]
+       adcs    $acc2,$acc2,$a2
+       ldp     $a0,$a1,[$t2,#8*0]      // recall that $t2 is &n[0]
+       adcs    $acc3,$acc3,$a3
+       ldp     $a2,$a3,[$t2,#8*2]
+       adcs    $acc4,$acc4,$a4
+       adcs    $acc5,$acc5,$a5
+       ldp     $a4,$a5,[$t2,#8*4]
+       adcs    $acc6,$acc6,$a6
+       adcs    $acc7,$acc7,$a7
+       ldp     $a6,$a7,[$t2,#8*6]
+       add     $np,$t2,#8*8
+       adc     $topmost,xzr,xzr        // top-most carry
+       mul     $na0,$n0,$acc0
+       stp     $t0,$t1,[$tp,#8*0]
+       stp     $acc2,$acc3,[$tp,#8*2]
+       ldp     $acc2,$acc3,[$rp,#8*2]
+       stp     $acc4,$acc5,[$tp,#8*4]
+       ldp     $acc4,$acc5,[$rp,#8*4]
+       cmp     $cnt,x29                // did we hit the bottom?
+       stp     $acc6,$acc7,[$tp,#8*6]
+       mov     $tp,$rp                 // slide the window
+       ldp     $acc6,$acc7,[$rp,#8*6]
+       mov     $cnt,#8
+       b.ne    .Lsqr8x_reduction
+
+       // Final step. We see if result is larger than modulus, and
+       // if it is, subtract the modulus. But comparison implies
+       // subtraction. So we subtract modulus, see if it borrowed,
+       // and conditionally copy original value.
+       ldr     $rp,[x29,#96]           // pull rp
+       add     $tp,$tp,#8*8
+       subs    $t0,$acc0,$a0
+       sbcs    $t1,$acc1,$a1
+       sub     $cnt,$num,#8*8
+       mov     $ap_end,$rp             // $rp copy
+
+.Lsqr8x_sub:
+       sbcs    $t2,$acc2,$a2
+       ldp     $a0,$a1,[$np,#8*0]
+       sbcs    $t3,$acc3,$a3
+       stp     $t0,$t1,[$rp,#8*0]
+       sbcs    $t0,$acc4,$a4
+       ldp     $a2,$a3,[$np,#8*2]
+       sbcs    $t1,$acc5,$a5
+       stp     $t2,$t3,[$rp,#8*2]
+       sbcs    $t2,$acc6,$a6
+       ldp     $a4,$a5,[$np,#8*4]
+       sbcs    $t3,$acc7,$a7
+       ldp     $a6,$a7,[$np,#8*6]
+       add     $np,$np,#8*8
+       ldp     $acc0,$acc1,[$tp,#8*0]
+       sub     $cnt,$cnt,#8*8
+       ldp     $acc2,$acc3,[$tp,#8*2]
+       ldp     $acc4,$acc5,[$tp,#8*4]
+       ldp     $acc6,$acc7,[$tp,#8*6]
+       add     $tp,$tp,#8*8
+       stp     $t0,$t1,[$rp,#8*4]
+       sbcs    $t0,$acc0,$a0
+       stp     $t2,$t3,[$rp,#8*6]
+       add     $rp,$rp,#8*8
+       sbcs    $t1,$acc1,$a1
+       cbnz    $cnt,.Lsqr8x_sub
+
+       sbcs    $t2,$acc2,$a2
+        mov    $tp,sp
+        add    $ap,sp,$num
+        ldp    $a0,$a1,[$ap_end,#8*0]
+       sbcs    $t3,$acc3,$a3
+       stp     $t0,$t1,[$rp,#8*0]
+       sbcs    $t0,$acc4,$a4
+        ldp    $a2,$a3,[$ap_end,#8*2]
+       sbcs    $t1,$acc5,$a5
+       stp     $t2,$t3,[$rp,#8*2]
+       sbcs    $t2,$acc6,$a6
+        ldp    $acc0,$acc1,[$ap,#8*0]
+       sbcs    $t3,$acc7,$a7
+        ldp    $acc2,$acc3,[$ap,#8*2]
+       sbcs    xzr,$topmost,xzr        // did it borrow?
+       ldr     x30,[x29,#8]            // pull return address
+       stp     $t0,$t1,[$rp,#8*4]
+       stp     $t2,$t3,[$rp,#8*6]
+
+       sub     $cnt,$num,#8*4
+.Lsqr4x_cond_copy:
+       sub     $cnt,$cnt,#8*4
+       csel    $t0,$acc0,$a0,lo
+        stp    xzr,xzr,[$tp,#8*0]
+       csel    $t1,$acc1,$a1,lo
+       ldp     $a0,$a1,[$ap_end,#8*4]
+       ldp     $acc0,$acc1,[$ap,#8*4]
+       csel    $t2,$acc2,$a2,lo
+        stp    xzr,xzr,[$tp,#8*2]
+        add    $tp,$tp,#8*4
+       csel    $t3,$acc3,$a3,lo
+       ldp     $a2,$a3,[$ap_end,#8*6]
+       ldp     $acc2,$acc3,[$ap,#8*6]
+       add     $ap,$ap,#8*4
+       stp     $t0,$t1,[$ap_end,#8*0]
+       stp     $t2,$t3,[$ap_end,#8*2]
+       add     $ap_end,$ap_end,#8*4
+        stp    xzr,xzr,[$ap,#8*0]
+        stp    xzr,xzr,[$ap,#8*2]
+       cbnz    $cnt,.Lsqr4x_cond_copy
+
+       csel    $t0,$acc0,$a0,lo
+        stp    xzr,xzr,[$tp,#8*0]
+       csel    $t1,$acc1,$a1,lo
+        stp    xzr,xzr,[$tp,#8*2]
+       csel    $t2,$acc2,$a2,lo
+       csel    $t3,$acc3,$a3,lo
+       stp     $t0,$t1,[$ap_end,#8*0]
+       stp     $t2,$t3,[$ap_end,#8*2]
+
+       b       .Lsqr8x_done
+
+.align 4
+.Lsqr8x8_post_condition:
+       adc     $carry,xzr,xzr
+       ldr     x30,[x29,#8]            // pull return address
+       // $acc0-7,$carry hold result, $a0-7 hold modulus
+       subs    $a0,$acc0,$a0
+       ldr     $ap,[x29,#96]           // pull rp
+       sbcs    $a1,$acc1,$a1
+        stp    xzr,xzr,[sp,#8*0]
+       sbcs    $a2,$acc2,$a2
+        stp    xzr,xzr,[sp,#8*2]
+       sbcs    $a3,$acc3,$a3
+        stp    xzr,xzr,[sp,#8*4]
+       sbcs    $a4,$acc4,$a4
+        stp    xzr,xzr,[sp,#8*6]
+       sbcs    $a5,$acc5,$a5
+        stp    xzr,xzr,[sp,#8*8]
+       sbcs    $a6,$acc6,$a6
+        stp    xzr,xzr,[sp,#8*10]
+       sbcs    $a7,$acc7,$a7
+        stp    xzr,xzr,[sp,#8*12]
+       sbcs    $carry,$carry,xzr       // did it borrow?
+        stp    xzr,xzr,[sp,#8*14]
+
+       // $a0-7 hold result-modulus
+       csel    $a0,$acc0,$a0,lo
+       csel    $a1,$acc1,$a1,lo
+       csel    $a2,$acc2,$a2,lo
+       csel    $a3,$acc3,$a3,lo
+       stp     $a0,$a1,[$ap,#8*0]
+       csel    $a4,$acc4,$a4,lo
+       csel    $a5,$acc5,$a5,lo
+       stp     $a2,$a3,[$ap,#8*2]
+       csel    $a6,$acc6,$a6,lo
+       csel    $a7,$acc7,$a7,lo
+       stp     $a4,$a5,[$ap,#8*4]
+       stp     $a6,$a7,[$ap,#8*6]
+
+.Lsqr8x_done:
+       ldp     x19,x20,[x29,#16]
+       mov     sp,x29
+       ldp     x21,x22,[x29,#32]
+       mov     x0,#1
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldr     x29,[sp],#128
+       // x30 is loaded earlier
+       AARCH64_VALIDATE_LINK_REGISTER
+       ret
+.size  __bn_sqr8x_mont,.-__bn_sqr8x_mont
+___
+}
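
The structure of the squaring path is easier to see in C: first the triangle of cross products a[i]*a[j], i<j (the schedule drawn in the comments above), then the `.Lsqr4x_shift_n_add` pass that doubles it and adds the diagonal squares, after which Montgomery reduction proceeds word by word much as in the multiplication sketch earlier. A reference sketch of the pre-reduction part (readability aid only, no constant-time claims):

    #include <stdint.h>
    #include <string.h>

    typedef unsigned __int128 u128;

    /* t[0..2*num-1] = a[]^2: doubled cross products plus diagonal squares */
    static void sqr_ref(uint64_t *t, const uint64_t *a, int num)
    {
        memset(t, 0, 2 * num * sizeof(*t));

        for (int i = 0; i < num; i++) {        /* the triangle (i), (ii), ... */
            uint64_t c = 0;
            for (int j = i + 1; j < num; j++) {
                u128 s = (u128)a[i] * a[j] + t[i + j] + c;
                t[i + j] = (uint64_t)s; c = (uint64_t)(s >> 64);
            }
            t[i + num] = c;
        }

        for (int k = 2 * num - 1; k > 0; k--)  /* multiply by 2 (shift_n_add) */
            t[k] = (t[k] << 1) | (t[k - 1] >> 63);
        t[0] <<= 1;

        uint64_t c = 0;
        for (int i = 0; i < num; i++) {        /* add a[i]*a[i] at word 2*i */
            u128 s  = (u128)a[i] * a[i];
            u128 lo = (u128)t[2 * i] + (uint64_t)s + c;
            u128 hi = (u128)t[2 * i + 1] + (uint64_t)(s >> 64)
                      + (uint64_t)(lo >> 64);
            t[2 * i]     = (uint64_t)lo;
            t[2 * i + 1] = (uint64_t)hi;
            c = (uint64_t)(hi >> 64);
        }                                      /* Montgomery reduction follows */
    }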
+
+{
+########################################################################
+# Even though this might look like an ARMv8 adaptation of mulx4x_mont from
+# the x86_64-mont5 module, it differs in that it performs reduction
+# 256 bits at a time.
+
+my ($a0,$a1,$a2,$a3,
+    $t0,$t1,$t2,$t3,
+    $m0,$m1,$m2,$m3,
+    $acc0,$acc1,$acc2,$acc3,$acc4,
+    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
+my  $bp_end=$rp;
+my  ($carry,$topmost) = ($rp,"x30");
+
+$code.=<<___;
+.type  __bn_mul4x_mont,%function
+.align 5
+__bn_mul4x_mont:
+       // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+       // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+
+       sub     $tp,sp,$num,lsl#3
+       lsl     $num,$num,#3
+       ldr     $n0,[$n0]               // *n0
+       sub     sp,$tp,#8*4             // alloca
+
+       add     $t0,$bp,$num
+       add     $ap_end,$ap,$num
+       stp     $rp,$t0,[x29,#96]       // offload rp and &b[num]
+
+       ldr     $bi,[$bp,#8*0]          // b[0]
+       ldp     $a0,$a1,[$ap,#8*0]      // a[0..3]
+       ldp     $a2,$a3,[$ap,#8*2]
+       add     $ap,$ap,#8*4
+       mov     $acc0,xzr
+       mov     $acc1,xzr
+       mov     $acc2,xzr
+       mov     $acc3,xzr
+       ldp     $m0,$m1,[$np,#8*0]      // n[0..3]
+       ldp     $m2,$m3,[$np,#8*2]
+       adds    $np,$np,#8*4            // clear carry bit
+       mov     $carry,xzr
+       mov     $cnt,#0
+       mov     $tp,sp
+
+.Loop_mul4x_1st_reduction:
+       mul     $t0,$a0,$bi             // lo(a[0..3]*b[0])
+       adc     $carry,$carry,xzr       // modulo-scheduled
+       mul     $t1,$a1,$bi
+       add     $cnt,$cnt,#8
+       mul     $t2,$a2,$bi
+       and     $cnt,$cnt,#31
+       mul     $t3,$a3,$bi
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$a0,$bi             // hi(a[0..3]*b[0])
+       adcs    $acc1,$acc1,$t1
+       mul     $mi,$acc0,$n0           // t[0]*n0
+       adcs    $acc2,$acc2,$t2
+       umulh   $t1,$a1,$bi
+       adcs    $acc3,$acc3,$t3
+       umulh   $t2,$a2,$bi
+       adc     $acc4,xzr,xzr
+       umulh   $t3,$a3,$bi
+       ldr     $bi,[$bp,$cnt]          // next b[i] (or b[0])
+       adds    $acc1,$acc1,$t0
+       // (*)  mul     $t0,$m0,$mi     // lo(n[0..3]*t[0]*n0)
+       str     $mi,[$tp],#8            // put aside t[0]*n0 for tail processing
+       adcs    $acc2,$acc2,$t1
+       mul     $t1,$m1,$mi
+       adcs    $acc3,$acc3,$t2
+       mul     $t2,$m2,$mi
+       adc     $acc4,$acc4,$t3         // can't overflow
+       mul     $t3,$m3,$mi
+       // (*)  adds    xzr,$acc0,$t0
+       subs    xzr,$acc0,#1            // (*)
+       umulh   $t0,$m0,$mi             // hi(n[0..3]*t[0]*n0)
+       adcs    $acc0,$acc1,$t1
+       umulh   $t1,$m1,$mi
+       adcs    $acc1,$acc2,$t2
+       umulh   $t2,$m2,$mi
+       adcs    $acc2,$acc3,$t3
+       umulh   $t3,$m3,$mi
+       adcs    $acc3,$acc4,$carry
+       adc     $carry,xzr,xzr
+       adds    $acc0,$acc0,$t0
+       sub     $t0,$ap_end,$ap
+       adcs    $acc1,$acc1,$t1
+       adcs    $acc2,$acc2,$t2
+       adcs    $acc3,$acc3,$t3
+       //adc   $carry,$carry,xzr
+       cbnz    $cnt,.Loop_mul4x_1st_reduction
+
+       cbz     $t0,.Lmul4x4_post_condition
+
+       ldp     $a0,$a1,[$ap,#8*0]      // a[4..7]
+       ldp     $a2,$a3,[$ap,#8*2]
+       add     $ap,$ap,#8*4
+       ldr     $mi,[sp]                // a[0]*n0
+       ldp     $m0,$m1,[$np,#8*0]      // n[4..7]
+       ldp     $m2,$m3,[$np,#8*2]
+       add     $np,$np,#8*4
+
+.Loop_mul4x_1st_tail:
+       mul     $t0,$a0,$bi             // lo(a[4..7]*b[i])
+       adc     $carry,$carry,xzr       // modulo-scheduled
+       mul     $t1,$a1,$bi
+       add     $cnt,$cnt,#8
+       mul     $t2,$a2,$bi
+       and     $cnt,$cnt,#31
+       mul     $t3,$a3,$bi
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$a0,$bi             // hi(a[4..7]*b[i])
+       adcs    $acc1,$acc1,$t1
+       umulh   $t1,$a1,$bi
+       adcs    $acc2,$acc2,$t2
+       umulh   $t2,$a2,$bi
+       adcs    $acc3,$acc3,$t3
+       umulh   $t3,$a3,$bi
+       adc     $acc4,xzr,xzr
+       ldr     $bi,[$bp,$cnt]          // next b[i] (or b[0])
+       adds    $acc1,$acc1,$t0
+       mul     $t0,$m0,$mi             // lo(n[4..7]*a[0]*n0)
+       adcs    $acc2,$acc2,$t1
+       mul     $t1,$m1,$mi
+       adcs    $acc3,$acc3,$t2
+       mul     $t2,$m2,$mi
+       adc     $acc4,$acc4,$t3         // can't overflow
+       mul     $t3,$m3,$mi
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$m0,$mi             // hi(n[4..7]*a[0]*n0)
+       adcs    $acc1,$acc1,$t1
+       umulh   $t1,$m1,$mi
+       adcs    $acc2,$acc2,$t2
+       umulh   $t2,$m2,$mi
+       adcs    $acc3,$acc3,$t3
+       adcs    $acc4,$acc4,$carry
+       umulh   $t3,$m3,$mi
+       adc     $carry,xzr,xzr
+       ldr     $mi,[sp,$cnt]           // next t[0]*n0
+       str     $acc0,[$tp],#8          // result!!!
+       adds    $acc0,$acc1,$t0
+       sub     $t0,$ap_end,$ap         // done yet?
+       adcs    $acc1,$acc2,$t1
+       adcs    $acc2,$acc3,$t2
+       adcs    $acc3,$acc4,$t3
+       //adc   $carry,$carry,xzr
+       cbnz    $cnt,.Loop_mul4x_1st_tail
+
+       sub     $t1,$ap_end,$num        // rewound $ap
+       cbz     $t0,.Lmul4x_proceed
+
+       ldp     $a0,$a1,[$ap,#8*0]
+       ldp     $a2,$a3,[$ap,#8*2]
+       add     $ap,$ap,#8*4
+       ldp     $m0,$m1,[$np,#8*0]
+       ldp     $m2,$m3,[$np,#8*2]
+       add     $np,$np,#8*4
+       b       .Loop_mul4x_1st_tail
+
+.align 5
+.Lmul4x_proceed:
+       ldr     $bi,[$bp,#8*4]!         // *++b
+       adc     $topmost,$carry,xzr
+       ldp     $a0,$a1,[$t1,#8*0]      // a[0..3]
+       sub     $np,$np,$num            // rewind np
+       ldp     $a2,$a3,[$t1,#8*2]
+       add     $ap,$t1,#8*4
+
+       stp     $acc0,$acc1,[$tp,#8*0]  // result!!!
+       ldp     $acc0,$acc1,[sp,#8*4]   // t[0..3]
+       stp     $acc2,$acc3,[$tp,#8*2]  // result!!!
+       ldp     $acc2,$acc3,[sp,#8*6]
+
+       ldp     $m0,$m1,[$np,#8*0]      // n[0..3]
+       mov     $tp,sp
+       ldp     $m2,$m3,[$np,#8*2]
+       adds    $np,$np,#8*4            // clear carry bit
+       mov     $carry,xzr
+
+.align 4
+.Loop_mul4x_reduction:
+       mul     $t0,$a0,$bi             // lo(a[0..3]*b[4])
+       adc     $carry,$carry,xzr       // modulo-scheduled
+       mul     $t1,$a1,$bi
+       add     $cnt,$cnt,#8
+       mul     $t2,$a2,$bi
+       and     $cnt,$cnt,#31
+       mul     $t3,$a3,$bi
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$a0,$bi             // hi(a[0..3]*b[4])
+       adcs    $acc1,$acc1,$t1
+       mul     $mi,$acc0,$n0           // t[0]*n0
+       adcs    $acc2,$acc2,$t2
+       umulh   $t1,$a1,$bi
+       adcs    $acc3,$acc3,$t3
+       umulh   $t2,$a2,$bi
+       adc     $acc4,xzr,xzr
+       umulh   $t3,$a3,$bi
+       ldr     $bi,[$bp,$cnt]          // next b[i]
+       adds    $acc1,$acc1,$t0
+       // (*)  mul     $t0,$m0,$mi
+       str     $mi,[$tp],#8            // put aside t[0]*n0 for tail processing
+       adcs    $acc2,$acc2,$t1
+       mul     $t1,$m1,$mi             // lo(n[0..3]*t[0]*n0)
+       adcs    $acc3,$acc3,$t2
+       mul     $t2,$m2,$mi
+       adc     $acc4,$acc4,$t3         // can't overflow
+       mul     $t3,$m3,$mi
+       // (*)  adds    xzr,$acc0,$t0
+       subs    xzr,$acc0,#1            // (*)
+       umulh   $t0,$m0,$mi             // hi(n[0..3]*t[0]*n0)
+       adcs    $acc0,$acc1,$t1
+       umulh   $t1,$m1,$mi
+       adcs    $acc1,$acc2,$t2
+       umulh   $t2,$m2,$mi
+       adcs    $acc2,$acc3,$t3
+       umulh   $t3,$m3,$mi
+       adcs    $acc3,$acc4,$carry
+       adc     $carry,xzr,xzr
+       adds    $acc0,$acc0,$t0
+       adcs    $acc1,$acc1,$t1
+       adcs    $acc2,$acc2,$t2
+       adcs    $acc3,$acc3,$t3
+       //adc   $carry,$carry,xzr
+       cbnz    $cnt,.Loop_mul4x_reduction
+
+       adc     $carry,$carry,xzr
+       ldp     $t0,$t1,[$tp,#8*4]      // t[4..7]
+       ldp     $t2,$t3,[$tp,#8*6]
+       ldp     $a0,$a1,[$ap,#8*0]      // a[4..7]
+       ldp     $a2,$a3,[$ap,#8*2]
+       add     $ap,$ap,#8*4
+       adds    $acc0,$acc0,$t0
+       adcs    $acc1,$acc1,$t1
+       adcs    $acc2,$acc2,$t2
+       adcs    $acc3,$acc3,$t3
+       //adc   $carry,$carry,xzr
+
+       ldr     $mi,[sp]                // t[0]*n0
+       ldp     $m0,$m1,[$np,#8*0]      // n[4..7]
+       ldp     $m2,$m3,[$np,#8*2]
+       add     $np,$np,#8*4
+
+.align 4
+.Loop_mul4x_tail:
+       mul     $t0,$a0,$bi             // lo(a[4..7]*b[4])
+       adc     $carry,$carry,xzr       // modulo-scheduled
+       mul     $t1,$a1,$bi
+       add     $cnt,$cnt,#8
+       mul     $t2,$a2,$bi
+       and     $cnt,$cnt,#31
+       mul     $t3,$a3,$bi
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$a0,$bi             // hi(a[4..7]*b[4])
+       adcs    $acc1,$acc1,$t1
+       umulh   $t1,$a1,$bi
+       adcs    $acc2,$acc2,$t2
+       umulh   $t2,$a2,$bi
+       adcs    $acc3,$acc3,$t3
+       umulh   $t3,$a3,$bi
+       adc     $acc4,xzr,xzr
+       ldr     $bi,[$bp,$cnt]          // next b[i]
+       adds    $acc1,$acc1,$t0
+       mul     $t0,$m0,$mi             // lo(n[4..7]*t[0]*n0)
+       adcs    $acc2,$acc2,$t1
+       mul     $t1,$m1,$mi
+       adcs    $acc3,$acc3,$t2
+       mul     $t2,$m2,$mi
+       adc     $acc4,$acc4,$t3         // can't overflow
+       mul     $t3,$m3,$mi
+       adds    $acc0,$acc0,$t0
+       umulh   $t0,$m0,$mi             // hi(n[4..7]*t[0]*n0)
+       adcs    $acc1,$acc1,$t1
+       umulh   $t1,$m1,$mi
+       adcs    $acc2,$acc2,$t2
+       umulh   $t2,$m2,$mi
+       adcs    $acc3,$acc3,$t3
+       umulh   $t3,$m3,$mi
+       adcs    $acc4,$acc4,$carry
+       ldr     $mi,[sp,$cnt]           // next a[0]*n0
+       adc     $carry,xzr,xzr
+       str     $acc0,[$tp],#8          // result!!!
+       adds    $acc0,$acc1,$t0
+       sub     $t0,$ap_end,$ap         // done yet?
+       adcs    $acc1,$acc2,$t1
+       adcs    $acc2,$acc3,$t2
+       adcs    $acc3,$acc4,$t3
+       //adc   $carry,$carry,xzr
+       cbnz    $cnt,.Loop_mul4x_tail
+
+       sub     $t1,$np,$num            // rewound np?
+       adc     $carry,$carry,xzr
+       cbz     $t0,.Loop_mul4x_break
+
+       ldp     $t0,$t1,[$tp,#8*4]
+       ldp     $t2,$t3,[$tp,#8*6]
+       ldp     $a0,$a1,[$ap,#8*0]
+       ldp     $a2,$a3,[$ap,#8*2]
+       add     $ap,$ap,#8*4
+       adds    $acc0,$acc0,$t0
+       adcs    $acc1,$acc1,$t1
+       adcs    $acc2,$acc2,$t2
+       adcs    $acc3,$acc3,$t3
+       //adc   $carry,$carry,xzr
+       ldp     $m0,$m1,[$np,#8*0]
+       ldp     $m2,$m3,[$np,#8*2]
+       add     $np,$np,#8*4
+       b       .Loop_mul4x_tail
+
+.align 4
+.Loop_mul4x_break:
+       ldp     $t2,$t3,[x29,#96]       // pull rp and &b[num]
+       adds    $acc0,$acc0,$topmost
+       add     $bp,$bp,#8*4            // bp++
+       adcs    $acc1,$acc1,xzr
+       sub     $ap,$ap,$num            // rewind ap
+       adcs    $acc2,$acc2,xzr
+       stp     $acc0,$acc1,[$tp,#8*0]  // result!!!
+       adcs    $acc3,$acc3,xzr
+       ldp     $acc0,$acc1,[sp,#8*4]   // t[0..3]
+       adc     $topmost,$carry,xzr
+       stp     $acc2,$acc3,[$tp,#8*2]  // result!!!
+       cmp     $bp,$t3                 // done yet?
+       ldp     $acc2,$acc3,[sp,#8*6]
+       ldp     $m0,$m1,[$t1,#8*0]      // n[0..3]
+       ldp     $m2,$m3,[$t1,#8*2]
+       add     $np,$t1,#8*4
+       b.eq    .Lmul4x_post
+
+       ldr     $bi,[$bp]
+       ldp     $a0,$a1,[$ap,#8*0]      // a[0..3]
+       ldp     $a2,$a3,[$ap,#8*2]
+       adds    $ap,$ap,#8*4            // clear carry bit
+       mov     $carry,xzr
+       mov     $tp,sp
+       b       .Loop_mul4x_reduction
+
+.align 4
+.Lmul4x_post:
+       // Final step. We see if result is larger than modulus, and
+       // if it is, subtract the modulus. But comparison implies
+       // subtraction. So we subtract modulus, see if it borrowed,
+       // and conditionally copy original value.
+       mov     $rp,$t2
+       mov     $ap_end,$t2             // $rp copy
+       subs    $t0,$acc0,$m0
+       add     $tp,sp,#8*8
+       sbcs    $t1,$acc1,$m1
+       sub     $cnt,$num,#8*4
+
+.Lmul4x_sub:
+       sbcs    $t2,$acc2,$m2
+       ldp     $m0,$m1,[$np,#8*0]
+       sub     $cnt,$cnt,#8*4
+       ldp     $acc0,$acc1,[$tp,#8*0]
+       sbcs    $t3,$acc3,$m3
+       ldp     $m2,$m3,[$np,#8*2]
+       add     $np,$np,#8*4
+       ldp     $acc2,$acc3,[$tp,#8*2]
+       add     $tp,$tp,#8*4
+       stp     $t0,$t1,[$rp,#8*0]
+       sbcs    $t0,$acc0,$m0
+       stp     $t2,$t3,[$rp,#8*2]
+       add     $rp,$rp,#8*4
+       sbcs    $t1,$acc1,$m1
+       cbnz    $cnt,.Lmul4x_sub
+
+       sbcs    $t2,$acc2,$m2
+        mov    $tp,sp
+        add    $ap,sp,#8*4
+        ldp    $a0,$a1,[$ap_end,#8*0]
+       sbcs    $t3,$acc3,$m3
+       stp     $t0,$t1,[$rp,#8*0]
+        ldp    $a2,$a3,[$ap_end,#8*2]
+       stp     $t2,$t3,[$rp,#8*2]
+        ldp    $acc0,$acc1,[$ap,#8*0]
+        ldp    $acc2,$acc3,[$ap,#8*2]
+       sbcs    xzr,$topmost,xzr        // did it borrow?
+       ldr     x30,[x29,#8]            // pull return address
+
+       sub     $cnt,$num,#8*4
+.Lmul4x_cond_copy:
+       sub     $cnt,$cnt,#8*4
+       csel    $t0,$acc0,$a0,lo
+        stp    xzr,xzr,[$tp,#8*0]
+       csel    $t1,$acc1,$a1,lo
+       ldp     $a0,$a1,[$ap_end,#8*4]
+       ldp     $acc0,$acc1,[$ap,#8*4]
+       csel    $t2,$acc2,$a2,lo
+        stp    xzr,xzr,[$tp,#8*2]
+        add    $tp,$tp,#8*4
+       csel    $t3,$acc3,$a3,lo
+       ldp     $a2,$a3,[$ap_end,#8*6]
+       ldp     $acc2,$acc3,[$ap,#8*6]
+       add     $ap,$ap,#8*4
+       stp     $t0,$t1,[$ap_end,#8*0]
+       stp     $t2,$t3,[$ap_end,#8*2]
+       add     $ap_end,$ap_end,#8*4
+       cbnz    $cnt,.Lmul4x_cond_copy
+
+       csel    $t0,$acc0,$a0,lo
+        stp    xzr,xzr,[$tp,#8*0]
+       csel    $t1,$acc1,$a1,lo
+        stp    xzr,xzr,[$tp,#8*2]
+       csel    $t2,$acc2,$a2,lo
+        stp    xzr,xzr,[$tp,#8*3]
+       csel    $t3,$acc3,$a3,lo
+        stp    xzr,xzr,[$tp,#8*4]
+       stp     $t0,$t1,[$ap_end,#8*0]
+       stp     $t2,$t3,[$ap_end,#8*2]
+
+       b       .Lmul4x_done
+
+.align 4
+.Lmul4x4_post_condition:
+       adc     $carry,$carry,xzr
+       ldr     $ap,[x29,#96]           // pull rp
+       // $acc0-3,$carry hold result, $m0-3 hold modulus
+       subs    $a0,$acc0,$m0
+       ldr     x30,[x29,#8]            // pull return address
+       sbcs    $a1,$acc1,$m1
+        stp    xzr,xzr,[sp,#8*0]
+       sbcs    $a2,$acc2,$m2
+        stp    xzr,xzr,[sp,#8*2]
+       sbcs    $a3,$acc3,$m3
+        stp    xzr,xzr,[sp,#8*4]
+       sbcs    xzr,$carry,xzr          // did it borrow?
+        stp    xzr,xzr,[sp,#8*6]
+
+       // $a0-3 hold result-modulus
+       csel    $a0,$acc0,$a0,lo
+       csel    $a1,$acc1,$a1,lo
+       csel    $a2,$acc2,$a2,lo
+       csel    $a3,$acc3,$a3,lo
+       stp     $a0,$a1,[$ap,#8*0]
+       stp     $a2,$a3,[$ap,#8*2]
+
+.Lmul4x_done:
+       ldp     x19,x20,[x29,#16]
+       mov     sp,x29
+       ldp     x21,x22,[x29,#32]
+       mov     x0,#1
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldr     x29,[sp],#128
+       // x30 loaded earlier
+       AARCH64_VALIDATE_LINK_REGISTER
+       ret
+.size  __bn_mul4x_mont,.-__bn_mul4x_mont
+___
+}
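
Stepping back, the dispatch at `.Lbn_mul_mont` selects among four implementations. A hypothetical C rendering of it follows; the helper names stand in for internal labels and are not real symbols, and the C type of `OPENSSL_armv8_rsa_neonized` (set at library initialization from CPU detection) is an assumption:

    #include <stdint.h>

    typedef uint64_t BN_ULONG;

    extern unsigned int OPENSSL_armv8_rsa_neonized;  /* assumed type */

    /* hypothetical names standing in for the internal labels */
    int bn_mul_mont_scalar(BN_ULONG *, const BN_ULONG *, const BN_ULONG *,
                           const BN_ULONG *, const BN_ULONG *, int);
    int bn_mul8x_mont_neon(BN_ULONG *, const BN_ULONG *, const BN_ULONG *,
                           const BN_ULONG *, const BN_ULONG *, int);
    int bn_sqr8x_mont(BN_ULONG *, const BN_ULONG *, const BN_ULONG *,
                      const BN_ULONG *, const BN_ULONG *, int);
    int bn_mul4x_mont(BN_ULONG *, const BN_ULONG *, const BN_ULONG *,
                      const BN_ULONG *, const BN_ULONG *, int);

    int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                    const BN_ULONG *np, const BN_ULONG *n0, int num)
    {
        if (num % 4 != 0)           /* tst $num,#3; b.ne .Lmul_mont */
            return bn_mul_mont_scalar(rp, ap, bp, np, n0, num);
        if (num > 32 && OPENSSL_armv8_rsa_neonized)
            return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
        if (num % 8 == 0)           /* squares when ap == bp, else falls
                                       through to __bn_mul4x_mont */
            return bn_sqr8x_mont(rp, ap, bp, np, n0, num);
        return bn_mul4x_mont(rp, ap, bp, np, n0, num);
    }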
+$code.=<<___;
 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align 4
 ___
 
 print $code;
 
-close STDOUT;
+close STDOUT or die "error closing STDOUT: $!";