aarch64: support BTI and pointer authentication in assembly
crypto/bn/asm/armv8-mont.pl
#! /usr/bin/env perl
# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on the processor in question,
# meaning that a dedicated squaring procedure is a must. Well, actually
# all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement coefficients in
# comparison to compiler-generated code. Recall that the compiler is
# instructed to use umulh and therefore uses the same number of
# multiplication instructions to do the job. Assembly's edge is to
# minimize the number of "collateral" instructions and of course
# instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while others exhibit
# e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";       # BN_ULONG *rp,
$ap="x1";       # const BN_ULONG *ap,
$bp="x2";       # const BN_ULONG *bp,
$np="x3";       # const BN_ULONG *np,
$n0="x4";       # const BN_ULONG *n0,
$num="x5";      # int num);

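# Informal reference model (an illustrative sketch, not used by the
# generated code): the scalar paths below compute word-serial Montgomery
# multiplication, roughly the following, with w=64 and
# n0 == -np[0]^-1 mod 2^w:
#
#       for (i=0; i<num; i++) {
#               t += ap[] * bp[i];              # num+1-word accumulator
#               m = (t[0]*n0) mod 2^w;
#               t = (t + np[]*m) >> w;          # t[0] becomes zero
#       }
#       rp[] = (t >= np[]) ? t - np[] : t;      # final conditional subtraction
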
$code.=<<___;
#include "arm_arch.h"
#ifndef __KERNEL__
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl  bn_mul_mont
.type   bn_mul_mont,%function
.align  5
bn_mul_mont:
        AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
        tst     $num,#3
        b.ne    .Lmul_mont
        cmp     $num,#32
        b.le    .Lscalar_impl
#ifndef __KERNEL__
        adrp    x17,OPENSSL_armv8_rsa_neonized
        ldr     w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
        cbnz    w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
        tst     $num,#7
        b.eq    __bn_sqr8x_mont
        tst     $num,#3
        b.eq    __bn_mul4x_mont
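        // Dispatch summary (informal): num%4 != 0 takes the generic
        // loop at .Lmul_mont below; num%4 == 0 with num > 32 may take
        // the NEON path when OPENSSL_armv8_rsa_neonized is set;
        // otherwise num%8 == 0 goes to __bn_sqr8x_mont (which falls
        // back to __bn_mul4x_mont unless ap == bp), and the remaining
        // multiples of 4 go to __bn_mul4x_mont.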

.Lmul_mont:
        stp     x29,x30,[sp,#-64]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]

        ldr     $m0,[$bp],#8            // bp[0]
        sub     $tp,sp,$num,lsl#3
        ldp     $hi0,$aj,[$ap],#16      // ap[0..1]
        lsl     $num,$num,#3
        ldr     $n0,[$n0]               // *n0
        and     $tp,$tp,#-16            // ABI says so
        ldp     $hi1,$nj,[$np],#16      // np[0..1]

        mul     $lo0,$hi0,$m0           // ap[0]*bp[0]
        sub     $j,$num,#16             // j=num-2
        umulh   $hi0,$hi0,$m0
        mul     $alo,$aj,$m0            // ap[1]*bp[0]
        umulh   $ahi,$aj,$m0

        mul     $m1,$lo0,$n0            // "tp[0]"*n0
        mov     sp,$tp                  // alloca

        // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
        umulh   $hi1,$hi1,$m1
        mul     $nlo,$nj,$m1            // np[1]*m1
        // (*)  adds    $lo1,$lo1,$lo0  // discarded
        // (*)  As for the removal of the first multiplication and
        //      addition instructions: the outcome of the first addition
        //      is guaranteed to be zero, which leaves two computationally
        //      significant outcomes: it either carries or it doesn't.
        //      The question then is when does it carry, and is there an
        //      alternative way to deduce it? If you follow the
        //      operations, you can observe that the condition for carry
        //      is quite simple: $lo0 being non-zero. So the carry can be
        //      calculated by adding -1 to $lo0. That's what the next
        //      instruction does.
        subs    xzr,$lo0,#1             // (*)
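        // (Illustrative check of the trick: by choice of n0,
        // lo(np[0]*m1) == 2^64 - $lo0 mod 2^64, so the discarded
        // addition carries exactly when $lo0 != 0; and the subs above
        // sets C precisely when $lo0 >= 1, i.e. when $lo0 != 0.)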
        umulh   $nhi,$nj,$m1
        adc     $hi1,$hi1,xzr
        cbz     $j,.L1st_skip

.L1st:
        ldr     $aj,[$ap],#8
        adds    $lo0,$alo,$hi0
        sub     $j,$j,#8                // j--
        adc     $hi0,$ahi,xzr

        ldr     $nj,[$np],#8
        adds    $lo1,$nlo,$hi1
        mul     $alo,$aj,$m0            // ap[j]*bp[0]
        adc     $hi1,$nhi,xzr
        umulh   $ahi,$aj,$m0

        adds    $lo1,$lo1,$lo0
        mul     $nlo,$nj,$m1            // np[j]*m1
        adc     $hi1,$hi1,xzr
        umulh   $nhi,$nj,$m1
        str     $lo1,[$tp],#8           // tp[j-1]
        cbnz    $j,.L1st

.L1st_skip:
        adds    $lo0,$alo,$hi0
        sub     $ap,$ap,$num            // rewind $ap
        adc     $hi0,$ahi,xzr

        adds    $lo1,$nlo,$hi1
        sub     $np,$np,$num            // rewind $np
        adc     $hi1,$nhi,xzr

        adds    $lo1,$lo1,$lo0
        sub     $i,$num,#8              // i=num-1
        adcs    $hi1,$hi1,$hi0

        adc     $ovf,xzr,xzr            // upmost overflow bit
        stp     $lo1,$hi1,[$tp]

.Louter:
        ldr     $m0,[$bp],#8            // bp[i]
        ldp     $hi0,$aj,[$ap],#16
        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8

        mul     $lo0,$hi0,$m0           // ap[0]*bp[i]
        sub     $j,$num,#16             // j=num-2
        umulh   $hi0,$hi0,$m0
        ldp     $hi1,$nj,[$np],#16
        mul     $alo,$aj,$m0            // ap[1]*bp[i]
        adds    $lo0,$lo0,$tj
        umulh   $ahi,$aj,$m0
        adc     $hi0,$hi0,xzr

        mul     $m1,$lo0,$n0
        sub     $i,$i,#8                // i--

        // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
        umulh   $hi1,$hi1,$m1
        mul     $nlo,$nj,$m1            // np[1]*m1
        // (*)  adds    $lo1,$lo1,$lo0
        subs    xzr,$lo0,#1             // (*)
        umulh   $nhi,$nj,$m1
        cbz     $j,.Linner_skip

.Linner:
        ldr     $aj,[$ap],#8
        adc     $hi1,$hi1,xzr
        ldr     $tj,[$tp],#8            // tp[j]
        adds    $lo0,$alo,$hi0
        sub     $j,$j,#8                // j--
        adc     $hi0,$ahi,xzr

        adds    $lo1,$nlo,$hi1
        ldr     $nj,[$np],#8
        adc     $hi1,$nhi,xzr

        mul     $alo,$aj,$m0            // ap[j]*bp[i]
        adds    $lo0,$lo0,$tj
        umulh   $ahi,$aj,$m0
        adc     $hi0,$hi0,xzr

        mul     $nlo,$nj,$m1            // np[j]*m1
        adds    $lo1,$lo1,$lo0
        umulh   $nhi,$nj,$m1
        stur    $lo1,[$tp,#-16]         // tp[j-1]
        cbnz    $j,.Linner

.Linner_skip:
        ldr     $tj,[$tp],#8            // tp[j]
        adc     $hi1,$hi1,xzr
        adds    $lo0,$alo,$hi0
        sub     $ap,$ap,$num            // rewind $ap
        adc     $hi0,$ahi,xzr

        adds    $lo1,$nlo,$hi1
        sub     $np,$np,$num            // rewind $np
        adcs    $hi1,$nhi,$ovf
        adc     $ovf,xzr,xzr

        adds    $lo0,$lo0,$tj
        adc     $hi0,$hi0,xzr

        adds    $lo1,$lo1,$lo0
        adcs    $hi1,$hi1,$hi0
        adc     $ovf,$ovf,xzr           // upmost overflow bit
        stp     $lo1,$hi1,[$tp,#-16]

        cbnz    $i,.Louter

        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
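        // The copy is performed with csel rather than a conditional
        // branch, so the final reduction takes the same instruction
        // path whether or not the subtraction borrowed.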
        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8
        ldr     $nj,[$np],#8            // np[0]
        subs    $j,$num,#8              // j=num-1 and clear borrow
        mov     $ap,$rp
.Lsub:
        sbcs    $aj,$tj,$nj             // tp[j]-np[j]
        ldr     $tj,[$tp],#8
        sub     $j,$j,#8                // j--
        ldr     $nj,[$np],#8
        str     $aj,[$ap],#8            // rp[j]=tp[j]-np[j]
        cbnz    $j,.Lsub

        sbcs    $aj,$tj,$nj
        sbcs    $ovf,$ovf,xzr           // did it borrow?
        str     $aj,[$ap],#8            // rp[num-1]

        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8
        ldr     $aj,[$rp],#8            // rp[0]
        sub     $num,$num,#8            // num--
        nop
.Lcond_copy:
        sub     $num,$num,#8            // num--
        csel    $nj,$tj,$aj,lo          // did it borrow?
        ldr     $tj,[$tp],#8
        ldr     $aj,[$rp],#8
        stur    xzr,[$tp,#-16]          // wipe tp
        stur    $nj,[$rp,#-16]
        cbnz    $num,.Lcond_copy

        csel    $nj,$tj,$aj,lo
        stur    xzr,[$tp,#-8]           // wipe tp
        stur    $nj,[$rp,#-8]

        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldr     x29,[sp],#64
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));

$code.=<<___;
.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
        // only from bn_mul_mont which has already signed the return address.
        stp     x29,x30,[sp,#-80]!
        mov     x16,sp
        stp     d8,d9,[sp,#16]
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]
        lsl     $num,$num,#1
        eor     $zero.16b,$zero.16b,$zero.16b

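        // Informal note on the representation used below: b[] words and
        // the Montgomery factors m are "smashed" into 16-bit digits
        // (the uxtl after each 32-bit load), while a[] and n[] stay
        // 32-bit. Each umlal thus adds a product of at most 48 bits
        // into a 64-bit lane, leaving 16 bits of headroom to defer
        // carry propagation, which is done by the interleaved
        // shl/ext/ushr sequences.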
.align  4
.LNEON_8n:
        eor     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
        sub     $toutptr,sp,#128
        eor     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
        sub     $toutptr,$toutptr,$num,lsl#4
        eor     @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
        and     $toutptr,$toutptr,#-64
        eor     @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
        mov     sp,$toutptr             // alloca
        eor     @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
        add     $toutptr,$toutptr,#256
        eor     @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
        sub     $inner,$num,#8
        eor     @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
        eor     @ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
        st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
        subs    $inner,$inner,#8
        st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
        st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
        st1     {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
        bne     .LNEON_8n_init

        add     $tinptr,sp,#256
        ld1     {$A0.4s,$A1.4s},[$aptr],#32
        add     $bnptr,sp,#8
        ldr     $sM0,[$n0],#4
        mov     $outer,$num
        b       .LNEON_8n_outer

.align  4
.LNEON_8n_outer:
        ldr     $sBi,[$bptr],#4   // *b++
        uxtl    $Bi.4s,$Bi.4h
        add     $toutptr,sp,#128
        ld1     {$N0.4s,$N1.4s},[$nptr],#32

        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        shl     $Ni.2d,@ACC[0].2d,#16
        ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        add     $Ni.2d,$Ni.2d,@ACC[0].2d
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        mul     $Ni.2s,$Ni.2s,$M0.2s
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        st1     {$Bi.2s},[sp]           // put aside smashed b[8*i+0]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        uxtl    $Ni.4s,$Ni.4h
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=0; $i<7;) {
$code.=<<___;
        ldr     $sBi,[$bptr],#4   // *b++
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        uxtl    $Bi.4s,$Bi.4h
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        ushr    $temp.2d,@ACC[0].2d,#16
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        ushr    @ACC[0].2d,@ACC[0].2d,#16
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
        add     $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
        ins     @ACC[1].d[0],$ACCTemp.d[0]
        st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
___
        push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1     {@ACC[7].2d},[$tinptr],#16
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        shl     $Ni.2d,@ACC[0].2d,#16
        ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        add     $Ni.2d,$Ni.2d,@ACC[0].2d
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        mul     $Ni.2s,$Ni.2s,$M0.2s
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        st1     {$Bi.2s},[$bnptr],#8    // put aside smashed b[8*i+$i]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        uxtl    $Ni.4s,$Ni.4h
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
        ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        ld1     {$A0.4s,$A1.4s},[$aptr],#32
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        mov     $Temp.16b,@ACC[0].16b
        ushr    $Temp.2d,$Temp.2d,#16
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        add     @ACC[0].2d,@ACC[0].2d,$Temp.2d
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        ushr    @ACC[0].2d,@ACC[0].2d,#16
        eor     $temp.16b,$temp.16b,$temp.16b
        ins     @ACC[0].d[1],$temp.d[0]
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
        add     @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
        st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
        add     $bnptr,sp,#8            // rewind
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        sub     $inner,$num,#8
        b       .LNEON_8n_inner

.align  4
.LNEON_8n_inner:
        subs    $inner,$inner,#8
        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1     {@ACC[7].2d},[$tinptr]
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+0]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        ld1     {$N0.4s,$N1.4s},[$nptr],#32
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        b.eq    .LInner_jump
        add     $tinptr,$tinptr,#16     // don't advance in last iteration
.LInner_jump:
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        ld1     {$Bi.2s},[$bnptr],#8    // pull smashed b[8*i+$i]
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
        st1     {@ACC[0].2d},[$toutptr],#16
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1     {@ACC[7].2d},[$tinptr]
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+$i]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        b.eq    .LInner_jump$i
        add     $tinptr,$tinptr,#16     // don't advance in last iteration
.LInner_jump$i:
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
        b.ne    .LInner_after_rewind$i
        sub     $aptr,$aptr,$num,lsl#2  // rewind
.LInner_after_rewind$i:
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        ld1     {$A0.4s,$A1.4s},[$aptr],#32
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        add     $bnptr,sp,#8            // rewind
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        st1     {@ACC[0].2d},[$toutptr],#16
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]

        bne     .LNEON_8n_inner
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        add     $tinptr,sp,#128
        st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
        eor     $N0.16b,$N0.16b,$N0.16b // $N0
        st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
        eor     $N1.16b,$N1.16b,$N1.16b // $N1
        st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
        st1     {@ACC[6].2d},[$toutptr]

        subs    $outer,$outer,#8
        ld1     {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
        ld1     {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
        ld1     {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
        ld1     {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

        b.eq    .LInner_8n_jump_2steps
        sub     $nptr,$nptr,$num,lsl#2  // rewind
        b       .LNEON_8n_outer

.LInner_8n_jump_2steps:
        add     $toutptr,sp,#128
        st1     {$N0.2d,$N1.2d}, [sp],#32       // start wiping stack frame
        mov     $Temp.16b,@ACC[0].16b
        ushr    $temp.2d,@ACC[0].2d,#16
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        st1     {$N0.2d,$N1.2d}, [sp],#32
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        st1     {$N0.2d,$N1.2d}, [sp],#32
        ushr    $temp.2d,@ACC[0].2d,#16
        st1     {$N0.2d,$N1.2d}, [sp],#32
        zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
        ins     $temp.d[1],$zero.d[0]

        mov     $inner,$num
        b       .LNEON_tail_entry

.align  4
.LNEON_tail:
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        mov     $Temp.16b,@ACC[0].16b
        ushr    $temp.2d,@ACC[0].2d,#16
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        ld1     {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        ld1     {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
        ushr    $temp.2d,@ACC[0].2d,#16
        ld1     {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
        zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
        ins     $temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        add     @ACC[1].2d,@ACC[1].2d,$temp.2d
        st1     {@ACC[0].s}[0], [$toutptr],#4
        ushr    $temp.2d,@ACC[1].2d,#16
        mov     $Temp.16b,@ACC[1].16b
        ext     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
        add     @ACC[1].2d,@ACC[1].2d,$temp.2d
        ushr    $temp.2d,@ACC[1].2d,#16
        zip1    @ACC[1].4h,$Temp.4h,@ACC[1].4h
        ins     $temp.d[1],$zero.d[0]
___
        push(@ACC,shift(@ACC));
}
        push(@ACC,shift(@ACC));
$code.=<<___;
        ld1     {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
        subs    $inner,$inner,#8
        st1     {@ACC[7].s}[0], [$toutptr],#4
        bne     .LNEON_tail

        st1     {$temp.s}[0], [$toutptr],#4     // top-most bit
        sub     $nptr,$nptr,$num,lsl#2          // rewind $nptr
        subs    $aptr,sp,#0                     // clear carry flag
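        // (The subs above sets C to 1, which on AArch64 means "no
        // borrow", the required initial state for the sbcs chain in
        // .LNEON_sub below.)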
        add     $bptr,sp,$num,lsl#2

.LNEON_sub:
        ldp     w4,w5,[$aptr],#8
        ldp     w6,w7,[$aptr],#8
        ldp     w8,w9,[$nptr],#8
        ldp     w10,w11,[$nptr],#8
        sbcs    w8,w4,w8
        sbcs    w9,w5,w9
        sbcs    w10,w6,w10
        sbcs    w11,w7,w11
        sub     x17,$bptr,$aptr
        stp     w8,w9,[$rptr],#8
        stp     w10,w11,[$rptr],#8
        cbnz    x17,.LNEON_sub

        ldr     w10, [$aptr]            // load top-most bit
        mov     x11,sp
        eor     v0.16b,v0.16b,v0.16b
        sub     x11,$bptr,x11           // this is num*4
        eor     v1.16b,v1.16b,v1.16b
        mov     $aptr,sp
        sub     $rptr,$rptr,x11         // rewind $rptr
        mov     $nptr,$bptr             // second 3/4th of frame
        sbcs    w10,w10,wzr             // result is carry flag

.LNEON_copy_n_zap:
        ldp     w4,w5,[$aptr],#8
        ldp     w6,w7,[$aptr],#8
        ldp     w8,w9,[$rptr],#8
        ldp     w10,w11,[$rptr]
        sub     $rptr,$rptr,#8
        b.cs    .LCopy_1
        mov     w8,w4
        mov     w9,w5
        mov     w10,w6
        mov     w11,w7
.LCopy_1:
        st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
        st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
        ldp     w4,w5,[$aptr],#8
        ldp     w6,w7,[$aptr],#8
        stp     w8,w9,[$rptr],#8
        stp     w10,w11,[$rptr],#8
        sub     $aptr,$aptr,#32
        ldp     w8,w9,[$rptr],#8
        ldp     w10,w11,[$rptr]
        sub     $rptr,$rptr,#8
        b.cs    .LCopy_2
        mov     w8, w4
        mov     w9, w5
        mov     w10, w6
        mov     w11, w7
.LCopy_2:
        st1     {v0.2d,v1.2d}, [$aptr],#32              // wipe
        st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
        sub     x17,$bptr,$aptr         // preserves carry
        stp     w8,w9,[$rptr],#8
        stp     w10,w11,[$rptr],#8
        cbnz    x17,.LNEON_copy_n_zap

        mov     sp,x16
        ldp     d14,d15,[sp,#64]
        ldp     d12,d13,[sp,#48]
        ldp     d10,d11,[sp,#32]
        ldp     d8,d9,[sp,#16]
        ldr     x29,[sp],#80
        AARCH64_VALIDATE_LINK_REGISTER
        ret                     // bx lr

.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type   __bn_sqr8x_mont,%function
.align  5
__bn_sqr8x_mont:
        cmp     $ap,$bp
        b.ne    __bn_mul4x_mont
.Lsqr8x_mont:
        // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
        // only from bn_mul_mont which has already signed the return address.
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        stp     $rp,$np,[sp,#96]        // offload rp and np

        ldp     $a0,$a1,[$ap,#8*0]
        ldp     $a2,$a3,[$ap,#8*2]
        ldp     $a4,$a5,[$ap,#8*4]
        ldp     $a6,$a7,[$ap,#8*6]

        sub     $tp,sp,$num,lsl#4
        lsl     $num,$num,#3
        ldr     $n0,[$n0]               // *n0
        mov     sp,$tp                  // alloca
        sub     $cnt,$num,#8*8
        b       .Lsqr8x_zero_start

.Lsqr8x_zero:
        sub     $cnt,$cnt,#8*8
        stp     xzr,xzr,[$tp,#8*0]
        stp     xzr,xzr,[$tp,#8*2]
        stp     xzr,xzr,[$tp,#8*4]
        stp     xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
        stp     xzr,xzr,[$tp,#8*8]
        stp     xzr,xzr,[$tp,#8*10]
        stp     xzr,xzr,[$tp,#8*12]
        stp     xzr,xzr,[$tp,#8*14]
        add     $tp,$tp,#8*16
        cbnz    $cnt,.Lsqr8x_zero

        add     $ap_end,$ap,$num
        add     $ap,$ap,#8*8
        mov     $acc0,xzr
        mov     $acc1,xzr
        mov     $acc2,xzr
        mov     $acc3,xzr
        mov     $acc4,xzr
        mov     $acc5,xzr
        mov     $acc6,xzr
        mov     $acc7,xzr
        mov     $tp,sp
        str     $n0,[x29,#112]          // offload n0

        // Multiply everything but a[i]*a[i]
.align  4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]     (i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]             (ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]                     (iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]                             (iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]                                     (v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]                                             (vi)
        //     a[7]a[5]
        // a[7]a[6]                                                     (vii)

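        // The diagram above lists the cross products a[i]*a[j], i>j,
        // accumulated by this loop; they are doubled and the squares
        // a[i]*a[i] added in the shift-and-add pass after
        // .Lsqr8x_outer_break, since (sum a[i]*2^(64*i))^2 =
        // sum a[i]^2*2^(128*i) + 2*sum_{i>j} a[i]*a[j]*2^(64*(i+j)).
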
        mul     $t0,$a1,$a0             // lo(a[1..7]*a[0])             (i)
        mul     $t1,$a2,$a0
        mul     $t2,$a3,$a0
        mul     $t3,$a4,$a0
        adds    $acc1,$acc1,$t0         // t[1]+lo(a[1]*a[0])
        mul     $t0,$a5,$a0
        adcs    $acc2,$acc2,$t1
        mul     $t1,$a6,$a0
        adcs    $acc3,$acc3,$t2
        mul     $t2,$a7,$a0
        adcs    $acc4,$acc4,$t3
        umulh   $t3,$a1,$a0             // hi(a[1..7]*a[0])
        adcs    $acc5,$acc5,$t0
        umulh   $t0,$a2,$a0
        adcs    $acc6,$acc6,$t1
        umulh   $t1,$a3,$a0
        adcs    $acc7,$acc7,$t2
        umulh   $t2,$a4,$a0
        stp     $acc0,$acc1,[$tp],#8*2  // t[0..1]
        adc     $acc0,xzr,xzr           // t[8]
        adds    $acc2,$acc2,$t3         // t[2]+lo(a[1]*a[0])
        umulh   $t3,$a5,$a0
        adcs    $acc3,$acc3,$t0
        umulh   $t0,$a6,$a0
        adcs    $acc4,$acc4,$t1
        umulh   $t1,$a7,$a0
        adcs    $acc5,$acc5,$t2
         mul    $t2,$a2,$a1             // lo(a[2..7]*a[1])             (ii)
        adcs    $acc6,$acc6,$t3
         mul    $t3,$a3,$a1
        adcs    $acc7,$acc7,$t0
         mul    $t0,$a4,$a1
        adc     $acc0,$acc0,$t1

        mul     $t1,$a5,$a1
        adds    $acc3,$acc3,$t2
        mul     $t2,$a6,$a1
        adcs    $acc4,$acc4,$t3
        mul     $t3,$a7,$a1
        adcs    $acc5,$acc5,$t0
        umulh   $t0,$a2,$a1             // hi(a[2..7]*a[1])
        adcs    $acc6,$acc6,$t1
        umulh   $t1,$a3,$a1
        adcs    $acc7,$acc7,$t2
        umulh   $t2,$a4,$a1
        adcs    $acc0,$acc0,$t3
        umulh   $t3,$a5,$a1
        stp     $acc2,$acc3,[$tp],#8*2  // t[2..3]
        adc     $acc1,xzr,xzr           // t[9]
        adds    $acc4,$acc4,$t0
        umulh   $t0,$a6,$a1
        adcs    $acc5,$acc5,$t1
        umulh   $t1,$a7,$a1
        adcs    $acc6,$acc6,$t2
         mul    $t2,$a3,$a2             // lo(a[3..7]*a[2])             (iii)
        adcs    $acc7,$acc7,$t3
         mul    $t3,$a4,$a2
        adcs    $acc0,$acc0,$t0
         mul    $t0,$a5,$a2
        adc     $acc1,$acc1,$t1

        mul     $t1,$a6,$a2
        adds    $acc5,$acc5,$t2
        mul     $t2,$a7,$a2
        adcs    $acc6,$acc6,$t3
        umulh   $t3,$a3,$a2             // hi(a[3..7]*a[2])
        adcs    $acc7,$acc7,$t0
        umulh   $t0,$a4,$a2
        adcs    $acc0,$acc0,$t1
        umulh   $t1,$a5,$a2
        adcs    $acc1,$acc1,$t2
        umulh   $t2,$a6,$a2
        stp     $acc4,$acc5,[$tp],#8*2  // t[4..5]
        adc     $acc2,xzr,xzr           // t[10]
        adds    $acc6,$acc6,$t3
        umulh   $t3,$a7,$a2
        adcs    $acc7,$acc7,$t0
         mul    $t0,$a4,$a3             // lo(a[4..7]*a[3])             (iv)
        adcs    $acc0,$acc0,$t1
         mul    $t1,$a5,$a3
        adcs    $acc1,$acc1,$t2
         mul    $t2,$a6,$a3
        adc     $acc2,$acc2,$t3

        mul     $t3,$a7,$a3
        adds    $acc7,$acc7,$t0
        umulh   $t0,$a4,$a3             // hi(a[4..7]*a[3])
        adcs    $acc0,$acc0,$t1
        umulh   $t1,$a5,$a3
        adcs    $acc1,$acc1,$t2
        umulh   $t2,$a6,$a3
        adcs    $acc2,$acc2,$t3
        umulh   $t3,$a7,$a3
        stp     $acc6,$acc7,[$tp],#8*2  // t[6..7]
        adc     $acc3,xzr,xzr           // t[11]
        adds    $acc0,$acc0,$t0
         mul    $t0,$a5,$a4             // lo(a[5..7]*a[4])             (v)
        adcs    $acc1,$acc1,$t1
         mul    $t1,$a6,$a4
        adcs    $acc2,$acc2,$t2
         mul    $t2,$a7,$a4
        adc     $acc3,$acc3,$t3

        umulh   $t3,$a5,$a4             // hi(a[5..7]*a[4])
        adds    $acc1,$acc1,$t0
        umulh   $t0,$a6,$a4
        adcs    $acc2,$acc2,$t1
        umulh   $t1,$a7,$a4
        adcs    $acc3,$acc3,$t2
         mul    $t2,$a6,$a5             // lo(a[6..7]*a[5])             (vi)
        adc     $acc4,xzr,xzr           // t[12]
        adds    $acc2,$acc2,$t3
         mul    $t3,$a7,$a5
        adcs    $acc3,$acc3,$t0
         umulh  $t0,$a6,$a5             // hi(a[6..7]*a[5])
        adc     $acc4,$acc4,$t1

        umulh   $t1,$a7,$a5
        adds    $acc3,$acc3,$t2
         mul    $t2,$a7,$a6             // lo(a[7]*a[6])                (vii)
        adcs    $acc4,$acc4,$t3
         umulh  $t3,$a7,$a6             // hi(a[7]*a[6])
        adc     $acc5,xzr,xzr           // t[13]
        adds    $acc4,$acc4,$t0
        sub     $cnt,$ap_end,$ap        // done yet?
        adc     $acc5,$acc5,$t1

        adds    $acc5,$acc5,$t2
        sub     $t0,$ap_end,$num        // rewound ap
        adc     $acc6,xzr,xzr           // t[14]
        add     $acc6,$acc6,$t3

        cbz     $cnt,.Lsqr8x_outer_break

        mov     $n0,$a0
        ldp     $a0,$a1,[$tp,#8*0]
        ldp     $a2,$a3,[$tp,#8*2]
        ldp     $a4,$a5,[$tp,#8*4]
        ldp     $a6,$a7,[$tp,#8*6]
        adds    $acc0,$acc0,$a0
        adcs    $acc1,$acc1,$a1
        ldp     $a0,$a1,[$ap,#8*0]
        adcs    $acc2,$acc2,$a2
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$ap,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$ap,#8*4]
        adcs    $acc6,$acc6,$a6
        mov     $rp,$ap
        adcs    $acc7,xzr,$a7
        ldp     $a6,$a7,[$ap,#8*6]
        add     $ap,$ap,#8*8
        //adc   $carry,xzr,xzr          // moved below
        mov     $cnt,#-8*8

        //                                                         a[8]a[0]
        //                                                     a[9]a[0]
        //                                                 a[a]a[0]
        //                                             a[b]a[0]
        //                                         a[c]a[0]
        //                                     a[d]a[0]
        //                                 a[e]a[0]
        //                             a[f]a[0]
        //                                                     a[8]a[1]
        //                         a[f]a[1]........................
        //                                                 a[8]a[2]
        //                     a[f]a[2]........................
        //                                             a[8]a[3]
        //                 a[f]a[3]........................
        //                                         a[8]a[4]
        //             a[f]a[4]........................
        //                                     a[8]a[5]
        //         a[f]a[5]........................
        //                                 a[8]a[6]
        //     a[f]a[6]........................
        //                             a[8]a[7]
        // a[f]a[7]........................
.Lsqr8x_mul:
        mul     $t0,$a0,$n0
        adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
        mul     $t1,$a1,$n0
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$n0
        mul     $t3,$a3,$n0
        adds    $acc0,$acc0,$t0
        mul     $t0,$a4,$n0
        adcs    $acc1,$acc1,$t1
        mul     $t1,$a5,$n0
        adcs    $acc2,$acc2,$t2
        mul     $t2,$a6,$n0
        adcs    $acc3,$acc3,$t3
        mul     $t3,$a7,$n0
        adcs    $acc4,$acc4,$t0
        umulh   $t0,$a0,$n0
        adcs    $acc5,$acc5,$t1
        umulh   $t1,$a1,$n0
        adcs    $acc6,$acc6,$t2
        umulh   $t2,$a2,$n0
        adcs    $acc7,$acc7,$t3
        umulh   $t3,$a3,$n0
        adc     $carry,$carry,xzr
        str     $acc0,[$tp],#8
        adds    $acc0,$acc1,$t0
        umulh   $t0,$a4,$n0
        adcs    $acc1,$acc2,$t1
        umulh   $t1,$a5,$n0
        adcs    $acc2,$acc3,$t2
        umulh   $t2,$a6,$n0
        adcs    $acc3,$acc4,$t3
        umulh   $t3,$a7,$n0
        ldr     $n0,[$rp,$cnt]
        adcs    $acc4,$acc5,$t0
        adcs    $acc5,$acc6,$t1
        adcs    $acc6,$acc7,$t2
        adcs    $acc7,$carry,$t3
        //adc   $carry,xzr,xzr          // moved above
        cbnz    $cnt,.Lsqr8x_mul
                                        // note that carry flag is guaranteed
                                        // to be zero at this point
        cmp     $ap,$ap_end             // done yet?
        b.eq    .Lsqr8x_break

        ldp     $a0,$a1,[$tp,#8*0]
        ldp     $a2,$a3,[$tp,#8*2]
        ldp     $a4,$a5,[$tp,#8*4]
        ldp     $a6,$a7,[$tp,#8*6]
        adds    $acc0,$acc0,$a0
        ldur    $n0,[$rp,#-8*8]
        adcs    $acc1,$acc1,$a1
        ldp     $a0,$a1,[$ap,#8*0]
        adcs    $acc2,$acc2,$a2
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$ap,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$ap,#8*4]
        adcs    $acc6,$acc6,$a6
        mov     $cnt,#-8*8
        adcs    $acc7,$acc7,$a7
        ldp     $a6,$a7,[$ap,#8*6]
        add     $ap,$ap,#8*8
        //adc   $carry,xzr,xzr          // moved above
        b       .Lsqr8x_mul

.align  4
.Lsqr8x_break:
        ldp     $a0,$a1,[$rp,#8*0]
        add     $ap,$rp,#8*8
        ldp     $a2,$a3,[$rp,#8*2]
        sub     $t0,$ap_end,$ap         // is it last iteration?
        ldp     $a4,$a5,[$rp,#8*4]
        sub     $t1,$tp,$t0
        ldp     $a6,$a7,[$rp,#8*6]
        cbz     $t0,.Lsqr8x_outer_loop

        stp     $acc0,$acc1,[$tp,#8*0]
        ldp     $acc0,$acc1,[$t1,#8*0]
        stp     $acc2,$acc3,[$tp,#8*2]
        ldp     $acc2,$acc3,[$t1,#8*2]
        stp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc4,$acc5,[$t1,#8*4]
        stp     $acc6,$acc7,[$tp,#8*6]
        mov     $tp,$t1
        ldp     $acc6,$acc7,[$t1,#8*6]
        b       .Lsqr8x_outer_loop

.align  4
.Lsqr8x_outer_break:
        // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
        ldp     $a1,$a3,[$t0,#8*0]      // recall that $t0 is &a[0]
        ldp     $t1,$t2,[sp,#8*1]
        ldp     $a5,$a7,[$t0,#8*2]
        add     $ap,$t0,#8*4
        ldp     $t3,$t0,[sp,#8*3]

        stp     $acc0,$acc1,[$tp,#8*0]
        mul     $acc0,$a1,$a1
        stp     $acc2,$acc3,[$tp,#8*2]
        umulh   $a1,$a1,$a1
        stp     $acc4,$acc5,[$tp,#8*4]
        mul     $a2,$a3,$a3
        stp     $acc6,$acc7,[$tp,#8*6]
        mov     $tp,sp
        umulh   $a3,$a3,$a3
        adds    $acc1,$a1,$t1,lsl#1
        extr    $t1,$t2,$t1,#63
        sub     $cnt,$num,#8*4

.Lsqr4x_shift_n_add:
        adcs    $acc2,$a2,$t1
        extr    $t2,$t3,$t2,#63
        sub     $cnt,$cnt,#8*4
        adcs    $acc3,$a3,$t2
        ldp     $t1,$t2,[$tp,#8*5]
        mul     $a4,$a5,$a5
        ldp     $a1,$a3,[$ap],#8*2
        umulh   $a5,$a5,$a5
        mul     $a6,$a7,$a7
        umulh   $a7,$a7,$a7
        extr    $t3,$t0,$t3,#63
        stp     $acc0,$acc1,[$tp,#8*0]
        adcs    $acc4,$a4,$t3
        extr    $t0,$t1,$t0,#63
        stp     $acc2,$acc3,[$tp,#8*2]
        adcs    $acc5,$a5,$t0
        ldp     $t3,$t0,[$tp,#8*7]
        extr    $t1,$t2,$t1,#63
        adcs    $acc6,$a6,$t1
        extr    $t2,$t3,$t2,#63
        adcs    $acc7,$a7,$t2
        ldp     $t1,$t2,[$tp,#8*9]
        mul     $a0,$a1,$a1
        ldp     $a5,$a7,[$ap],#8*2
        umulh   $a1,$a1,$a1
        mul     $a2,$a3,$a3
        umulh   $a3,$a3,$a3
        stp     $acc4,$acc5,[$tp,#8*4]
        extr    $t3,$t0,$t3,#63
        stp     $acc6,$acc7,[$tp,#8*6]
        add     $tp,$tp,#8*8
        adcs    $acc0,$a0,$t3
        extr    $t0,$t1,$t0,#63
        adcs    $acc1,$a1,$t0
        ldp     $t3,$t0,[$tp,#8*3]
        extr    $t1,$t2,$t1,#63
        cbnz    $cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
         ldp    $np,$n0,[x29,#104]      // pull np and n0

        adcs    $acc2,$a2,$t1
        extr    $t2,$t3,$t2,#63
        adcs    $acc3,$a3,$t2
        ldp     $t1,$t2,[$tp,#8*5]
        mul     $a4,$a5,$a5
        umulh   $a5,$a5,$a5
        stp     $acc0,$acc1,[$tp,#8*0]
        mul     $a6,$a7,$a7
        umulh   $a7,$a7,$a7
        stp     $acc2,$acc3,[$tp,#8*2]
        extr    $t3,$t0,$t3,#63
        adcs    $acc4,$a4,$t3
        extr    $t0,$t1,$t0,#63
         ldp    $acc0,$acc1,[sp,#8*0]
        adcs    $acc5,$a5,$t0
        extr    $t1,$t2,$t1,#63
         ldp    $a0,$a1,[$np,#8*0]
        adcs    $acc6,$a6,$t1
        extr    $t2,xzr,$t2,#63
         ldp    $a2,$a3,[$np,#8*2]
        adc     $acc7,$a7,$t2
         ldp    $a4,$a5,[$np,#8*4]

        // Reduce by 512 bits per iteration
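        // (That is, each pass of the loop below retires eight 64-bit
        // words of t: every iteration folds in n[0..7]*(t[0]*n0) and
        // slides the window by one word, stashing the t[0]*n0 values
        // via $tp for the tail processing that follows.)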
        mul     $na0,$n0,$acc0          // t[0]*n0
        ldp     $a6,$a7,[$np,#8*6]
        add     $np_end,$np,$num
        ldp     $acc2,$acc3,[sp,#8*2]
        stp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc4,$acc5,[sp,#8*4]
        stp     $acc6,$acc7,[$tp,#8*6]
        ldp     $acc6,$acc7,[sp,#8*6]
        add     $np,$np,#8*8
        mov     $topmost,xzr            // initial top-most carry
        mov     $tp,sp
        mov     $cnt,#8

.Lsqr8x_reduction:
        // (*)  mul     $t0,$a0,$na0    // lo(n[0-7])*lo(t[0]*n0)
        mul     $t1,$a1,$na0
        sub     $cnt,$cnt,#1
        mul     $t2,$a2,$na0
        str     $na0,[$tp],#8           // put aside t[0]*n0 for tail processing
        mul     $t3,$a3,$na0
        // (*)  adds    xzr,$acc0,$t0
        subs    xzr,$acc0,#1            // (*)
        mul     $t0,$a4,$na0
        adcs    $acc0,$acc1,$t1
        mul     $t1,$a5,$na0
        adcs    $acc1,$acc2,$t2
        mul     $t2,$a6,$na0
        adcs    $acc2,$acc3,$t3
        mul     $t3,$a7,$na0
        adcs    $acc3,$acc4,$t0
        umulh   $t0,$a0,$na0            // hi(n[0-7])*lo(t[0]*n0)
        adcs    $acc4,$acc5,$t1
        umulh   $t1,$a1,$na0
        adcs    $acc5,$acc6,$t2
        umulh   $t2,$a2,$na0
        adcs    $acc6,$acc7,$t3
        umulh   $t3,$a3,$na0
        adc     $acc7,xzr,xzr
        adds    $acc0,$acc0,$t0
        umulh   $t0,$a4,$na0
        adcs    $acc1,$acc1,$t1
        umulh   $t1,$a5,$na0
        adcs    $acc2,$acc2,$t2
        umulh   $t2,$a6,$na0
        adcs    $acc3,$acc3,$t3
        umulh   $t3,$a7,$na0
        mul     $na0,$n0,$acc0          // next t[0]*n0
        adcs    $acc4,$acc4,$t0
        adcs    $acc5,$acc5,$t1
        adcs    $acc6,$acc6,$t2
        adc     $acc7,$acc7,$t3
        cbnz    $cnt,.Lsqr8x_reduction

        ldp     $t0,$t1,[$tp,#8*0]
        ldp     $t2,$t3,[$tp,#8*2]
        mov     $rp,$tp
        sub     $cnt,$np_end,$np        // done yet?
        adds    $acc0,$acc0,$t0
        adcs    $acc1,$acc1,$t1
        ldp     $t0,$t1,[$tp,#8*4]
        adcs    $acc2,$acc2,$t2
        adcs    $acc3,$acc3,$t3
        ldp     $t2,$t3,[$tp,#8*6]
        adcs    $acc4,$acc4,$t0
        adcs    $acc5,$acc5,$t1
        adcs    $acc6,$acc6,$t2
        adcs    $acc7,$acc7,$t3
        //adc   $carry,xzr,xzr          // moved below
        cbz     $cnt,.Lsqr8x8_post_condition

        ldur    $n0,[$tp,#-8*8]
        ldp     $a0,$a1,[$np,#8*0]
        ldp     $a2,$a3,[$np,#8*2]
        ldp     $a4,$a5,[$np,#8*4]
        mov     $cnt,#-8*8
        ldp     $a6,$a7,[$np,#8*6]
        add     $np,$np,#8*8

.Lsqr8x_tail:
        mul     $t0,$a0,$n0
        adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
        mul     $t1,$a1,$n0
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$n0
        mul     $t3,$a3,$n0
        adds    $acc0,$acc0,$t0
        mul     $t0,$a4,$n0
        adcs    $acc1,$acc1,$t1
        mul     $t1,$a5,$n0
        adcs    $acc2,$acc2,$t2
        mul     $t2,$a6,$n0
        adcs    $acc3,$acc3,$t3
        mul     $t3,$a7,$n0
        adcs    $acc4,$acc4,$t0
        umulh   $t0,$a0,$n0
        adcs    $acc5,$acc5,$t1
        umulh   $t1,$a1,$n0
        adcs    $acc6,$acc6,$t2
        umulh   $t2,$a2,$n0
        adcs    $acc7,$acc7,$t3
        umulh   $t3,$a3,$n0
        adc     $carry,$carry,xzr
        str     $acc0,[$tp],#8
        adds    $acc0,$acc1,$t0
        umulh   $t0,$a4,$n0
        adcs    $acc1,$acc2,$t1
        umulh   $t1,$a5,$n0
        adcs    $acc2,$acc3,$t2
        umulh   $t2,$a6,$n0
        adcs    $acc3,$acc4,$t3
        umulh   $t3,$a7,$n0
        ldr     $n0,[$rp,$cnt]
        adcs    $acc4,$acc5,$t0
        adcs    $acc5,$acc6,$t1
        adcs    $acc6,$acc7,$t2
        adcs    $acc7,$carry,$t3
        //adc   $carry,xzr,xzr          // moved above
        cbnz    $cnt,.Lsqr8x_tail
                                        // note that carry flag is guaranteed
                                        // to be zero at this point
        ldp     $a0,$a1,[$tp,#8*0]
        sub     $cnt,$np_end,$np        // done yet?
        sub     $t2,$np_end,$num        // rewound np
        ldp     $a2,$a3,[$tp,#8*2]
        ldp     $a4,$a5,[$tp,#8*4]
        ldp     $a6,$a7,[$tp,#8*6]
        cbz     $cnt,.Lsqr8x_tail_break

        ldur    $n0,[$rp,#-8*8]
        adds    $acc0,$acc0,$a0
        adcs    $acc1,$acc1,$a1
        ldp     $a0,$a1,[$np,#8*0]
        adcs    $acc2,$acc2,$a2
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$np,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$np,#8*4]
        adcs    $acc6,$acc6,$a6
        mov     $cnt,#-8*8
        adcs    $acc7,$acc7,$a7
        ldp     $a6,$a7,[$np,#8*6]
        add     $np,$np,#8*8
        //adc   $carry,xzr,xzr          // moved above
        b       .Lsqr8x_tail

.align  4
.Lsqr8x_tail_break:
        ldr     $n0,[x29,#112]          // pull n0
        add     $cnt,$tp,#8*8           // end of current t[num] window

        subs    xzr,$topmost,#1         // "move" top-most carry to carry bit
        adcs    $t0,$acc0,$a0
        adcs    $t1,$acc1,$a1
        ldp     $acc0,$acc1,[$rp,#8*0]
        adcs    $acc2,$acc2,$a2
        ldp     $a0,$a1,[$t2,#8*0]      // recall that $t2 is &n[0]
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$t2,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$t2,#8*4]
        adcs    $acc6,$acc6,$a6
        adcs    $acc7,$acc7,$a7
        ldp     $a6,$a7,[$t2,#8*6]
        add     $np,$t2,#8*8
        adc     $topmost,xzr,xzr        // top-most carry
        mul     $na0,$n0,$acc0
        stp     $t0,$t1,[$tp,#8*0]
        stp     $acc2,$acc3,[$tp,#8*2]
        ldp     $acc2,$acc3,[$rp,#8*2]
        stp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc4,$acc5,[$rp,#8*4]
        cmp     $cnt,x29                // did we hit the bottom?
        stp     $acc6,$acc7,[$tp,#8*6]
        mov     $tp,$rp                 // slide the window
        ldp     $acc6,$acc7,[$rp,#8*6]
        mov     $cnt,#8
        b.ne    .Lsqr8x_reduction

        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        ldr     $rp,[x29,#96]           // pull rp
        add     $tp,$tp,#8*8
        subs    $t0,$acc0,$a0
        sbcs    $t1,$acc1,$a1
        sub     $cnt,$num,#8*8
        mov     $ap_end,$rp             // $rp copy

.Lsqr8x_sub:
        sbcs    $t2,$acc2,$a2
        ldp     $a0,$a1,[$np,#8*0]
        sbcs    $t3,$acc3,$a3
        stp     $t0,$t1,[$rp,#8*0]
        sbcs    $t0,$acc4,$a4
        ldp     $a2,$a3,[$np,#8*2]
        sbcs    $t1,$acc5,$a5
        stp     $t2,$t3,[$rp,#8*2]
        sbcs    $t2,$acc6,$a6
        ldp     $a4,$a5,[$np,#8*4]
        sbcs    $t3,$acc7,$a7
        ldp     $a6,$a7,[$np,#8*6]
        add     $np,$np,#8*8
        ldp     $acc0,$acc1,[$tp,#8*0]
        sub     $cnt,$cnt,#8*8
        ldp     $acc2,$acc3,[$tp,#8*2]
        ldp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc6,$acc7,[$tp,#8*6]
        add     $tp,$tp,#8*8
        stp     $t0,$t1,[$rp,#8*4]
        sbcs    $t0,$acc0,$a0
        stp     $t2,$t3,[$rp,#8*6]
        add     $rp,$rp,#8*8
        sbcs    $t1,$acc1,$a1
        cbnz    $cnt,.Lsqr8x_sub

        sbcs    $t2,$acc2,$a2
         mov    $tp,sp
         add    $ap,sp,$num
         ldp    $a0,$a1,[$ap_end,#8*0]
        sbcs    $t3,$acc3,$a3
        stp     $t0,$t1,[$rp,#8*0]
        sbcs    $t0,$acc4,$a4
         ldp    $a2,$a3,[$ap_end,#8*2]
        sbcs    $t1,$acc5,$a5
        stp     $t2,$t3,[$rp,#8*2]
        sbcs    $t2,$acc6,$a6
         ldp    $acc0,$acc1,[$ap,#8*0]
        sbcs    $t3,$acc7,$a7
         ldp    $acc2,$acc3,[$ap,#8*2]
        sbcs    xzr,$topmost,xzr        // did it borrow?
        ldr     x30,[x29,#8]            // pull return address
        stp     $t0,$t1,[$rp,#8*4]
        stp     $t2,$t3,[$rp,#8*6]

        sub     $cnt,$num,#8*4
.Lsqr4x_cond_copy:
        sub     $cnt,$cnt,#8*4
        csel    $t0,$acc0,$a0,lo
         stp    xzr,xzr,[$tp,#8*0]
        csel    $t1,$acc1,$a1,lo
        ldp     $a0,$a1,[$ap_end,#8*4]
        ldp     $acc0,$acc1,[$ap,#8*4]
        csel    $t2,$acc2,$a2,lo
         stp    xzr,xzr,[$tp,#8*2]
         add    $tp,$tp,#8*4
        csel    $t3,$acc3,$a3,lo
        ldp     $a2,$a3,[$ap_end,#8*6]
        ldp     $acc2,$acc3,[$ap,#8*6]
        add     $ap,$ap,#8*4
        stp     $t0,$t1,[$ap_end,#8*0]
        stp     $t2,$t3,[$ap_end,#8*2]
        add     $ap_end,$ap_end,#8*4
         stp    xzr,xzr,[$ap,#8*0]
         stp    xzr,xzr,[$ap,#8*2]
        cbnz    $cnt,.Lsqr4x_cond_copy

        csel    $t0,$acc0,$a0,lo
         stp    xzr,xzr,[$tp,#8*0]
        csel    $t1,$acc1,$a1,lo
         stp    xzr,xzr,[$tp,#8*2]
        csel    $t2,$acc2,$a2,lo
        csel    $t3,$acc3,$a3,lo
        stp     $t0,$t1,[$ap_end,#8*0]
        stp     $t2,$t3,[$ap_end,#8*2]

        b       .Lsqr8x_done

.align  4
.Lsqr8x8_post_condition:
        adc     $carry,xzr,xzr
        ldr     x30,[x29,#8]            // pull return address
        // $acc0-7,$carry hold result, $a0-7 hold modulus
        subs    $a0,$acc0,$a0
        ldr     $ap,[x29,#96]           // pull rp
        sbcs    $a1,$acc1,$a1
         stp    xzr,xzr,[sp,#8*0]
        sbcs    $a2,$acc2,$a2
         stp    xzr,xzr,[sp,#8*2]
        sbcs    $a3,$acc3,$a3
         stp    xzr,xzr,[sp,#8*4]
        sbcs    $a4,$acc4,$a4
         stp    xzr,xzr,[sp,#8*6]
        sbcs    $a5,$acc5,$a5
         stp    xzr,xzr,[sp,#8*8]
        sbcs    $a6,$acc6,$a6
         stp    xzr,xzr,[sp,#8*10]
        sbcs    $a7,$acc7,$a7
         stp    xzr,xzr,[sp,#8*12]
        sbcs    $carry,$carry,xzr       // did it borrow?
         stp    xzr,xzr,[sp,#8*14]

        // $a0-7 hold result-modulus
        csel    $a0,$acc0,$a0,lo
        csel    $a1,$acc1,$a1,lo
        csel    $a2,$acc2,$a2,lo
        csel    $a3,$acc3,$a3,lo
        stp     $a0,$a1,[$ap,#8*0]
        csel    $a4,$acc4,$a4,lo
        csel    $a5,$acc5,$a5,lo
        stp     $a2,$a3,[$ap,#8*2]
        csel    $a6,$acc6,$a6,lo
        csel    $a7,$acc7,$a7,lo
        stp     $a4,$a5,[$ap,#8*4]
        stp     $a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldr     x29,[sp],#128
        // x30 is loaded earlier
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   __bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it is different in the sense that it
# performs reduction 256 bits at a time.

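# Informal sketch of the first reduction loop below (not generated
# code): each iteration retires one word of t using only n[0..3],
#
#       m = (t[0]*n0) mod 2^64
#       t = (t + a[0..3]*b[i] + n[0..3]*m) >> 64
#
# and stashes m on the stack; after four such steps (256 bits) the tail
# loops replay the stashed m values against the remaining words of n[],
# and b[i] against the remaining words of a[].
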
1447 my ($a0,$a1,$a2,$a3,
1448     $t0,$t1,$t2,$t3,
1449     $m0,$m1,$m2,$m3,
1450     $acc0,$acc1,$acc2,$acc3,$acc4,
1451     $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1452 my  $bp_end=$rp;
1453 my  ($carry,$topmost) = ($rp,"x30");
1454
1455 $code.=<<___;
1456 .type   __bn_mul4x_mont,%function
1457 .align  5
1458 __bn_mul4x_mont:
1459         // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1460         // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
1461         stp     x29,x30,[sp,#-128]!
1462         add     x29,sp,#0
1463         stp     x19,x20,[sp,#16]
1464         stp     x21,x22,[sp,#32]
1465         stp     x23,x24,[sp,#48]
1466         stp     x25,x26,[sp,#64]
1467         stp     x27,x28,[sp,#80]
1468
1469         sub     $tp,sp,$num,lsl#3
1470         lsl     $num,$num,#3
1471         ldr     $n0,[$n0]               // *n0
1472         sub     sp,$tp,#8*4             // alloca
1473
1474         add     $t0,$bp,$num
1475         add     $ap_end,$ap,$num
1476         stp     $rp,$t0,[x29,#96]       // offload rp and &b[num]
1477
1478         ldr     $bi,[$bp,#8*0]          // b[0]
1479         ldp     $a0,$a1,[$ap,#8*0]      // a[0..3]
1480         ldp     $a2,$a3,[$ap,#8*2]
1481         add     $ap,$ap,#8*4
1482         mov     $acc0,xzr
1483         mov     $acc1,xzr
1484         mov     $acc2,xzr
1485         mov     $acc3,xzr
1486         ldp     $m0,$m1,[$np,#8*0]      // n[0..3]
1487         ldp     $m2,$m3,[$np,#8*2]
1488         adds    $np,$np,#8*4            // clear carry bit
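         // (adds of a small immediate to a valid pointer cannot carry
         //  out, so this also leaves C clear for the adcs chain below)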
1489         mov     $carry,xzr
1490         mov     $cnt,#0
1491         mov     $tp,sp
1492
1493 .Loop_mul4x_1st_reduction:
1494         mul     $t0,$a0,$bi             // lo(a[0..3]*b[0])
1495         adc     $carry,$carry,xzr       // modulo-scheduled
1496         mul     $t1,$a1,$bi
1497         add     $cnt,$cnt,#8
1498         mul     $t2,$a2,$bi
1499         and     $cnt,$cnt,#31
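         // ($cnt cycles 8,16,24,0, so [$bp,$cnt] -- and [sp,$cnt] in the
         //  tail loops -- replays the same four b[] words and saved
         //  t[0]*n0 values for every 4-word chunk)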
1500         mul     $t3,$a3,$bi
1501         adds    $acc0,$acc0,$t0
1502         umulh   $t0,$a0,$bi             // hi(a[0..3]*b[0])
1503         adcs    $acc1,$acc1,$t1
1504         mul     $mi,$acc0,$n0           // t[0]*n0
1505         adcs    $acc2,$acc2,$t2
1506         umulh   $t1,$a1,$bi
1507         adcs    $acc3,$acc3,$t3
1508         umulh   $t2,$a2,$bi
1509         adc     $acc4,xzr,xzr
1510         umulh   $t3,$a3,$bi
1511         ldr     $bi,[$bp,$cnt]          // next b[i] (or b[0])
1512         adds    $acc1,$acc1,$t0
1513         // (*)  mul     $t0,$m0,$mi     // lo(n[0..3]*t[0]*n0)
1514         str     $mi,[$tp],#8            // put aside t[0]*n0 for tail processing
1515         adcs    $acc2,$acc2,$t1
1516         mul     $t1,$m1,$mi
1517         adcs    $acc3,$acc3,$t2
1518         mul     $t2,$m2,$mi
1519         adc     $acc4,$acc4,$t3         // can't overflow
1520         mul     $t3,$m3,$mi
1521         // (*)  adds    xzr,$acc0,$t0
1522         subs    xzr,$acc0,#1            // (*)
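         // (*)  With $n0 == -n^{-1} mod 2^64, lo(n[0]*$mi) == -$acc0
         //      mod 2^64, so the commented-out adds would give zero with
         //      carry-out set exactly when $acc0 != 0.  subs of #1 sets
         //      C under the same condition and saves a multiplication.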
1523         umulh   $t0,$m0,$mi             // hi(n[0..3]*t[0]*n0)
1524         adcs    $acc0,$acc1,$t1
1525         umulh   $t1,$m1,$mi
1526         adcs    $acc1,$acc2,$t2
1527         umulh   $t2,$m2,$mi
1528         adcs    $acc2,$acc3,$t3
1529         umulh   $t3,$m3,$mi
1530         adcs    $acc3,$acc4,$carry
1531         adc     $carry,xzr,xzr
1532         adds    $acc0,$acc0,$t0
1533         sub     $t0,$ap_end,$ap         // done yet?
1534         adcs    $acc1,$acc1,$t1
1535         adcs    $acc2,$acc2,$t2
1536         adcs    $acc3,$acc3,$t3
1537         //adc   $carry,$carry,xzr
1538         cbnz    $cnt,.Loop_mul4x_1st_reduction
1539
1540         cbz     $t0,.Lmul4x4_post_condition     // num==4
1541
1542         ldp     $a0,$a1,[$ap,#8*0]      // a[4..7]
1543         ldp     $a2,$a3,[$ap,#8*2]
1544         add     $ap,$ap,#8*4
1545         ldr     $mi,[sp]                // a[0]*n0
1546         ldp     $m0,$m1,[$np,#8*0]      // n[4..7]
1547         ldp     $m2,$m3,[$np,#8*2]
1548         add     $np,$np,#8*4
1549
1550 .Loop_mul4x_1st_tail:
1551         mul     $t0,$a0,$bi             // lo(a[4..7]*b[i])
1552         adc     $carry,$carry,xzr       // modulo-scheduled
1553         mul     $t1,$a1,$bi
1554         add     $cnt,$cnt,#8
1555         mul     $t2,$a2,$bi
1556         and     $cnt,$cnt,#31
1557         mul     $t3,$a3,$bi
1558         adds    $acc0,$acc0,$t0
1559         umulh   $t0,$a0,$bi             // hi(a[4..7]*b[i])
1560         adcs    $acc1,$acc1,$t1
1561         umulh   $t1,$a1,$bi
1562         adcs    $acc2,$acc2,$t2
1563         umulh   $t2,$a2,$bi
1564         adcs    $acc3,$acc3,$t3
1565         umulh   $t3,$a3,$bi
1566         adc     $acc4,xzr,xzr
1567         ldr     $bi,[$bp,$cnt]          // next b[i] (or b[0])
1568         adds    $acc1,$acc1,$t0
1569         mul     $t0,$m0,$mi             // lo(n[4..7]*a[0]*n0)
1570         adcs    $acc2,$acc2,$t1
1571         mul     $t1,$m1,$mi
1572         adcs    $acc3,$acc3,$t2
1573         mul     $t2,$m2,$mi
1574         adc     $acc4,$acc4,$t3         // can't overflow
1575         mul     $t3,$m3,$mi
1576         adds    $acc0,$acc0,$t0
1577         umulh   $t0,$m0,$mi             // hi(n[4..7]*a[0]*n0)
1578         adcs    $acc1,$acc1,$t1
1579         umulh   $t1,$m1,$mi
1580         adcs    $acc2,$acc2,$t2
1581         umulh   $t2,$m2,$mi
1582         adcs    $acc3,$acc3,$t3
1583         adcs    $acc4,$acc4,$carry
1584         umulh   $t3,$m3,$mi
1585         adc     $carry,xzr,xzr
1586         ldr     $mi,[sp,$cnt]           // next t[0]*n0
1587         str     $acc0,[$tp],#8          // result!!!
1588         adds    $acc0,$acc1,$t0
1589         sub     $t0,$ap_end,$ap         // done yet?
1590         adcs    $acc1,$acc2,$t1
1591         adcs    $acc2,$acc3,$t2
1592         adcs    $acc3,$acc4,$t3
1593         //adc   $carry,$carry,xzr
1594         cbnz    $cnt,.Loop_mul4x_1st_tail
1595
1596         sub     $t1,$ap_end,$num        // rewound $ap
1597         cbz     $t0,.Lmul4x_proceed
1598
1599         ldp     $a0,$a1,[$ap,#8*0]
1600         ldp     $a2,$a3,[$ap,#8*2]
1601         add     $ap,$ap,#8*4
1602         ldp     $m0,$m1,[$np,#8*0]
1603         ldp     $m2,$m3,[$np,#8*2]
1604         add     $np,$np,#8*4
1605         b       .Loop_mul4x_1st_tail
1606
1607 .align  5
1608 .Lmul4x_proceed:
1609         ldr     $bi,[$bp,#8*4]!         // *++b
1610         adc     $topmost,$carry,xzr
1611         ldp     $a0,$a1,[$t1,#8*0]      // a[0..3]
1612         sub     $np,$np,$num            // rewind np
1613         ldp     $a2,$a3,[$t1,#8*2]
1614         add     $ap,$t1,#8*4
1615
1616         stp     $acc0,$acc1,[$tp,#8*0]  // result!!!
1617         ldp     $acc0,$acc1,[sp,#8*4]   // t[0..3]
1618         stp     $acc2,$acc3,[$tp,#8*2]  // result!!!
1619         ldp     $acc2,$acc3,[sp,#8*6]
1620
1621         ldp     $m0,$m1,[$np,#8*0]      // n[0..3]
1622         mov     $tp,sp
1623         ldp     $m2,$m3,[$np,#8*2]
1624         adds    $np,$np,#8*4            // clear carry bit
1625         mov     $carry,xzr
1626
1627 .align  4
1628 .Loop_mul4x_reduction:
1629         mul     $t0,$a0,$bi             // lo(a[0..3]*b[4])
1630         adc     $carry,$carry,xzr       // modulo-scheduled
1631         mul     $t1,$a1,$bi
1632         add     $cnt,$cnt,#8
1633         mul     $t2,$a2,$bi
1634         and     $cnt,$cnt,#31
1635         mul     $t3,$a3,$bi
1636         adds    $acc0,$acc0,$t0
1637         umulh   $t0,$a0,$bi             // hi(a[0..3]*b[4])
1638         adcs    $acc1,$acc1,$t1
1639         mul     $mi,$acc0,$n0           // t[0]*n0
1640         adcs    $acc2,$acc2,$t2
1641         umulh   $t1,$a1,$bi
1642         adcs    $acc3,$acc3,$t3
1643         umulh   $t2,$a2,$bi
1644         adc     $acc4,xzr,xzr
1645         umulh   $t3,$a3,$bi
1646         ldr     $bi,[$bp,$cnt]          // next b[i]
1647         adds    $acc1,$acc1,$t0
1648         // (*)  mul     $t0,$m0,$mi
1649         str     $mi,[$tp],#8            // put aside t[0]*n0 for tail processing
1650         adcs    $acc2,$acc2,$t1
1651         mul     $t1,$m1,$mi             // lo(n[0..3]*t[0]*n0)
1652         adcs    $acc3,$acc3,$t2
1653         mul     $t2,$m2,$mi
1654         adc     $acc4,$acc4,$t3         // can't overflow
1655         mul     $t3,$m3,$mi
1656         // (*)  adds    xzr,$acc0,$t0
1657         subs    xzr,$acc0,#1            // (*)
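         //      (same carry trick as in .Loop_mul4x_1st_reduction above)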
1658         umulh   $t0,$m0,$mi             // hi(n[0..3]*t[0]*n0)
1659         adcs    $acc0,$acc1,$t1
1660         umulh   $t1,$m1,$mi
1661         adcs    $acc1,$acc2,$t2
1662         umulh   $t2,$m2,$mi
1663         adcs    $acc2,$acc3,$t3
1664         umulh   $t3,$m3,$mi
1665         adcs    $acc3,$acc4,$carry
1666         adc     $carry,xzr,xzr
1667         adds    $acc0,$acc0,$t0
1668         adcs    $acc1,$acc1,$t1
1669         adcs    $acc2,$acc2,$t2
1670         adcs    $acc3,$acc3,$t3
1671         //adc   $carry,$carry,xzr
1672         cbnz    $cnt,.Loop_mul4x_reduction
1673
1674         adc     $carry,$carry,xzr
1675         ldp     $t0,$t1,[$tp,#8*4]      // t[4..7]
1676         ldp     $t2,$t3,[$tp,#8*6]
1677         ldp     $a0,$a1,[$ap,#8*0]      // a[4..7]
1678         ldp     $a2,$a3,[$ap,#8*2]
1679         add     $ap,$ap,#8*4
1680         adds    $acc0,$acc0,$t0
1681         adcs    $acc1,$acc1,$t1
1682         adcs    $acc2,$acc2,$t2
1683         adcs    $acc3,$acc3,$t3
1684         //adc   $carry,$carry,xzr
1685
1686         ldr     $mi,[sp]                // t[0]*n0
1687         ldp     $m0,$m1,[$np,#8*0]      // n[4..7]
1688         ldp     $m2,$m3,[$np,#8*2]
1689         add     $np,$np,#8*4
1690
1691 .align  4
1692 .Loop_mul4x_tail:
1693         mul     $t0,$a0,$bi             // lo(a[4..7]*b[4])
1694         adc     $carry,$carry,xzr       // modulo-scheduled
1695         mul     $t1,$a1,$bi
1696         add     $cnt,$cnt,#8
1697         mul     $t2,$a2,$bi
1698         and     $cnt,$cnt,#31
1699         mul     $t3,$a3,$bi
1700         adds    $acc0,$acc0,$t0
1701         umulh   $t0,$a0,$bi             // hi(a[4..7]*b[4])
1702         adcs    $acc1,$acc1,$t1
1703         umulh   $t1,$a1,$bi
1704         adcs    $acc2,$acc2,$t2
1705         umulh   $t2,$a2,$bi
1706         adcs    $acc3,$acc3,$t3
1707         umulh   $t3,$a3,$bi
1708         adc     $acc4,xzr,xzr
1709         ldr     $bi,[$bp,$cnt]          // next b[i]
1710         adds    $acc1,$acc1,$t0
1711         mul     $t0,$m0,$mi             // lo(n[4..7]*t[0]*n0)
1712         adcs    $acc2,$acc2,$t1
1713         mul     $t1,$m1,$mi
1714         adcs    $acc3,$acc3,$t2
1715         mul     $t2,$m2,$mi
1716         adc     $acc4,$acc4,$t3         // can't overflow
1717         mul     $t3,$m3,$mi
1718         adds    $acc0,$acc0,$t0
1719         umulh   $t0,$m0,$mi             // hi(n[4..7]*t[0]*n0)
1720         adcs    $acc1,$acc1,$t1
1721         umulh   $t1,$m1,$mi
1722         adcs    $acc2,$acc2,$t2
1723         umulh   $t2,$m2,$mi
1724         adcs    $acc3,$acc3,$t3
1725         umulh   $t3,$m3,$mi
1726         adcs    $acc4,$acc4,$carry
1727         ldr     $mi,[sp,$cnt]           // next t[0]*n0
1728         adc     $carry,xzr,xzr
1729         str     $acc0,[$tp],#8          // result!!!
1730         adds    $acc0,$acc1,$t0
1731         sub     $t0,$ap_end,$ap         // done yet?
1732         adcs    $acc1,$acc2,$t1
1733         adcs    $acc2,$acc3,$t2
1734         adcs    $acc3,$acc4,$t3
1735         //adc   $carry,$carry,xzr
1736         cbnz    $cnt,.Loop_mul4x_tail
1737
1738         sub     $t1,$np,$num            // rewound np?
1739         adc     $carry,$carry,xzr
1740         cbz     $t0,.Loop_mul4x_break
1741
1742         ldp     $t0,$t1,[$tp,#8*4]
1743         ldp     $t2,$t3,[$tp,#8*6]
1744         ldp     $a0,$a1,[$ap,#8*0]
1745         ldp     $a2,$a3,[$ap,#8*2]
1746         add     $ap,$ap,#8*4
1747         adds    $acc0,$acc0,$t0
1748         adcs    $acc1,$acc1,$t1
1749         adcs    $acc2,$acc2,$t2
1750         adcs    $acc3,$acc3,$t3
1751         //adc   $carry,$carry,xzr
1752         ldp     $m0,$m1,[$np,#8*0]
1753         ldp     $m2,$m3,[$np,#8*2]
1754         add     $np,$np,#8*4
1755         b       .Loop_mul4x_tail
1756
1757 .align  4
1758 .Loop_mul4x_break:
1759         ldp     $t2,$t3,[x29,#96]       // pull rp and &b[num]
1760         adds    $acc0,$acc0,$topmost
1761         add     $bp,$bp,#8*4            // bp++
1762         adcs    $acc1,$acc1,xzr
1763         sub     $ap,$ap,$num            // rewind ap
1764         adcs    $acc2,$acc2,xzr
1765         stp     $acc0,$acc1,[$tp,#8*0]  // result!!!
1766         adcs    $acc3,$acc3,xzr
1767         ldp     $acc0,$acc1,[sp,#8*4]   // t[0..3]
1768         adc     $topmost,$carry,xzr
1769         stp     $acc2,$acc3,[$tp,#8*2]  // result!!!
1770         cmp     $bp,$t3                 // done yet?
1771         ldp     $acc2,$acc3,[sp,#8*6]
1772         ldp     $m0,$m1,[$t1,#8*0]      // n[0..3]
1773         ldp     $m2,$m3,[$t1,#8*2]
1774         add     $np,$t1,#8*4
1775         b.eq    .Lmul4x_post
1776
1777         ldr     $bi,[$bp]
1778         ldp     $a0,$a1,[$ap,#8*0]      // a[0..3]
1779         ldp     $a2,$a3,[$ap,#8*2]
1780         adds    $ap,$ap,#8*4            // clear carry bit
1781         mov     $carry,xzr
1782         mov     $tp,sp
1783         b       .Loop_mul4x_reduction
1784
1785 .align  4
1786 .Lmul4x_post:
1787         // Final step. If the result is not less than the modulus, the
1788         // modulus must be subtracted. But comparison implies subtraction,
1789         // so we subtract the modulus unconditionally, check whether it
1790         // borrowed, and conditionally copy the original value.
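         // The copy is done with csel rather than a branch, so timing
         // does not depend on whether the subtraction borrowed.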
1791         mov     $rp,$t2
1792         mov     $ap_end,$t2             // $rp copy
1793         subs    $t0,$acc0,$m0
1794         add     $tp,sp,#8*8
1795         sbcs    $t1,$acc1,$m1
1796         sub     $cnt,$num,#8*4
1797
1798 .Lmul4x_sub:
1799         sbcs    $t2,$acc2,$m2
1800         ldp     $m0,$m1,[$np,#8*0]
1801         sub     $cnt,$cnt,#8*4
1802         ldp     $acc0,$acc1,[$tp,#8*0]
1803         sbcs    $t3,$acc3,$m3
1804         ldp     $m2,$m3,[$np,#8*2]
1805         add     $np,$np,#8*4
1806         ldp     $acc2,$acc3,[$tp,#8*2]
1807         add     $tp,$tp,#8*4
1808         stp     $t0,$t1,[$rp,#8*0]
1809         sbcs    $t0,$acc0,$m0
1810         stp     $t2,$t3,[$rp,#8*2]
1811         add     $rp,$rp,#8*4
1812         sbcs    $t1,$acc1,$m1
1813         cbnz    $cnt,.Lmul4x_sub
1814
1815         sbcs    $t2,$acc2,$m2
1816          mov    $tp,sp
1817          add    $ap,sp,#8*4
1818          ldp    $a0,$a1,[$ap_end,#8*0]
1819         sbcs    $t3,$acc3,$m3
1820         stp     $t0,$t1,[$rp,#8*0]
1821          ldp    $a2,$a3,[$ap_end,#8*2]
1822         stp     $t2,$t3,[$rp,#8*2]
1823          ldp    $acc0,$acc1,[$ap,#8*0]
1824          ldp    $acc2,$acc3,[$ap,#8*2]
1825         sbcs    xzr,$topmost,xzr        // did it borrow?
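         // (as in the post conditions, "lo" below selects the original,
         //  unsubtracted words when subtracting the modulus borrowed)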
1826         ldr     x30,[x29,#8]            // pull return address
1827
1828         sub     $cnt,$num,#8*4
1829 .Lmul4x_cond_copy:
1830         sub     $cnt,$cnt,#8*4
1831         csel    $t0,$acc0,$a0,lo
1832          stp    xzr,xzr,[$tp,#8*0]
1833         csel    $t1,$acc1,$a1,lo
1834         ldp     $a0,$a1,[$ap_end,#8*4]
1835         ldp     $acc0,$acc1,[$ap,#8*4]
1836         csel    $t2,$acc2,$a2,lo
1837          stp    xzr,xzr,[$tp,#8*2]
1838          add    $tp,$tp,#8*4
1839         csel    $t3,$acc3,$a3,lo
1840         ldp     $a2,$a3,[$ap_end,#8*6]
1841         ldp     $acc2,$acc3,[$ap,#8*6]
1842         add     $ap,$ap,#8*4
1843         stp     $t0,$t1,[$ap_end,#8*0]
1844         stp     $t2,$t3,[$ap_end,#8*2]
1845         add     $ap_end,$ap_end,#8*4
1846         cbnz    $cnt,.Lmul4x_cond_copy
1847
1848         csel    $t0,$acc0,$a0,lo
1849          stp    xzr,xzr,[$tp,#8*0]
1850         csel    $t1,$acc1,$a1,lo
1851          stp    xzr,xzr,[$tp,#8*2]
1852         csel    $t2,$acc2,$a2,lo
1853          stp    xzr,xzr,[$tp,#8*3]
1854         csel    $t3,$acc3,$a3,lo
1855          stp    xzr,xzr,[$tp,#8*4]
1856         stp     $t0,$t1,[$ap_end,#8*0]
1857         stp     $t2,$t3,[$ap_end,#8*2]
1858
1859         b       .Lmul4x_done
1860
1861 .align  4
1862 .Lmul4x4_post_condition:
1863         adc     $carry,$carry,xzr
1864         ldr     $ap,[x29,#96]           // pull rp
1865         // $acc0-3,$carry hold result, $m0-3 hold modulus
1866         subs    $a0,$acc0,$m0
1867         ldr     x30,[x29,#8]            // pull return address
1868         sbcs    $a1,$acc1,$m1
1869          stp    xzr,xzr,[sp,#8*0]
1870         sbcs    $a2,$acc2,$m2
1871          stp    xzr,xzr,[sp,#8*2]
1872         sbcs    $a3,$acc3,$m3
1873          stp    xzr,xzr,[sp,#8*4]
1874         sbcs    xzr,$carry,xzr          // did it borrow?
1875          stp    xzr,xzr,[sp,#8*6]
1876
1877         // $a0-3 hold result-modulus
1878         csel    $a0,$acc0,$a0,lo
1879         csel    $a1,$acc1,$a1,lo
1880         csel    $a2,$acc2,$a2,lo
1881         csel    $a3,$acc3,$a3,lo
1882         stp     $a0,$a1,[$ap,#8*0]
1883         stp     $a2,$a3,[$ap,#8*2]
1884
1885 .Lmul4x_done:
1886         ldp     x19,x20,[x29,#16]
1887         mov     sp,x29
1888         ldp     x21,x22,[x29,#32]
1889         mov     x0,#1
1890         ldp     x23,x24,[x29,#48]
1891         ldp     x25,x26,[x29,#64]
1892         ldp     x27,x28,[x29,#80]
1893         ldr     x29,[sp],#128
1894         // x30 is loaded earlier
1895         AARCH64_VALIDATE_LINK_REGISTER
1896         ret
1897 .size   __bn_mul4x_mont,.-__bn_mul4x_mont
1898 ___
1899 }
1900 $code.=<<___;
1901 .asciz  "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1902 .align  4
1903 ___
1904
1905 print $code;
1906
1907 close STDOUT or die "error closing STDOUT: $!";