#! /usr/bin/env perl
# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must. In
# fact, all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement coefficients in
# comparison to compiler-generated code. Recall that the compiler is
# instructed to use umulh and therefore uses the same number of
# multiplication instructions to do the job. Assembly's edge is
# minimizing the number of "collateral" instructions and, of course,
# instruction scheduling.
#
# April 2015
#
# The squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";       # BN_ULONG *rp,
$ap="x1";       # const BN_ULONG *ap,
$bp="x2";       # const BN_ULONG *bp,
$np="x3";       # const BN_ULONG *np,
$n0="x4";       # const BN_ULONG *n0,
$num="x5";      # int num);
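
# For reference, bn_mul_mont computes rp[] = ap[]*bp[]/2^(64*num) mod np[],
# the word-level Montgomery product, where *n0 == -np[0]^-1 mod 2^64. A
# schematic outline of what the code below implements (illustration only,
# not part of this module):
#
#       t[] = 0;
#       for (i = 0; i < num; i++) {
#               t[] += ap[] * bp[i];            // num+1-word intermediate
#               m    = (t[0] * n0) % 2^64;
#               t[] += np[] * m;                // forces t[0] to zero
#               t[] >>= 64;                     // exact division by 2^64
#       }
#       rp[] = (t[] >= np[]) ? t[] - np[] : t[];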

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl  bn_mul_mont
.type   bn_mul_mont,%function
.align  5
bn_mul_mont:
.Lbn_mul_mont:
        tst     $num,#3
        b.ne    .Lmul_mont
        cmp     $num,#32
        b.le    .Lscalar_impl
#ifndef __KERNEL__
        adrp    x17,OPENSSL_armv8_rsa_neonized
        ldr     w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
        cbnz    w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
        tst     $num,#7
        b.eq    __bn_sqr8x_mont
        tst     $num,#3
        b.eq    __bn_mul4x_mont
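
        // Dispatch summary: lengths divisible by 8 take __bn_sqr8x_mont
        // (which falls through to __bn_mul4x_mont when ap != bp), the
        // remaining multiples of 4 take __bn_mul4x_mont, and everything
        // else uses the generic scalar loop below. (The NEON path above
        // handles num > 32 when OPENSSL_armv8_rsa_neonized is set.)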

.Lmul_mont:
        stp     x29,x30,[sp,#-64]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]

        ldr     $m0,[$bp],#8            // bp[0]
        sub     $tp,sp,$num,lsl#3
        ldp     $hi0,$aj,[$ap],#16      // ap[0..1]
        lsl     $num,$num,#3
        ldr     $n0,[$n0]               // *n0
        and     $tp,$tp,#-16            // ABI says so
        ldp     $hi1,$nj,[$np],#16      // np[0..1]

        mul     $lo0,$hi0,$m0           // ap[0]*bp[0]
        sub     $j,$num,#16             // j=num-2
        umulh   $hi0,$hi0,$m0
        mul     $alo,$aj,$m0            // ap[1]*bp[0]
        umulh   $ahi,$aj,$m0

        mul     $m1,$lo0,$n0            // "tp[0]"*n0
        mov     sp,$tp                  // alloca

        // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
        umulh   $hi1,$hi1,$m1
        mul     $nlo,$nj,$m1            // np[1]*m1
        // (*)  adds    $lo1,$lo1,$lo0  // discarded
        // (*)  Concerning the removal of the first multiplication and
        //      addition instructions: the outcome of the first addition
        //      is guaranteed to be zero, which leaves two computationally
        //      significant outcomes: it either carries or it doesn't. So
        //      when does it carry? Is there an alternative way to deduce
        //      it? If you follow the operations, you can observe that the
        //      condition for a carry is quite simple: $lo0 being non-zero.
        //      So the carry can be calculated by adding -1 to $lo0, which
        //      is what the next instruction does.
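        //      (Algebraically: $lo1 == -$lo0 mod 2^64, so $lo1+$lo0
        //      carries exactly when $lo0 is non-zero, and "subs
        //      xzr,$lo0,#1" sets the carry flag in precisely that case.)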
        subs    xzr,$lo0,#1             // (*)
        umulh   $nhi,$nj,$m1
        adc     $hi1,$hi1,xzr
        cbz     $j,.L1st_skip

.L1st:
        ldr     $aj,[$ap],#8
        adds    $lo0,$alo,$hi0
        sub     $j,$j,#8                // j--
        adc     $hi0,$ahi,xzr

        ldr     $nj,[$np],#8
        adds    $lo1,$nlo,$hi1
        mul     $alo,$aj,$m0            // ap[j]*bp[0]
        adc     $hi1,$nhi,xzr
        umulh   $ahi,$aj,$m0

        adds    $lo1,$lo1,$lo0
        mul     $nlo,$nj,$m1            // np[j]*m1
        adc     $hi1,$hi1,xzr
        umulh   $nhi,$nj,$m1
        str     $lo1,[$tp],#8           // tp[j-1]
        cbnz    $j,.L1st

.L1st_skip:
        adds    $lo0,$alo,$hi0
        sub     $ap,$ap,$num            // rewind $ap
        adc     $hi0,$ahi,xzr

        adds    $lo1,$nlo,$hi1
        sub     $np,$np,$num            // rewind $np
        adc     $hi1,$nhi,xzr

        adds    $lo1,$lo1,$lo0
        sub     $i,$num,#8              // i=num-1
        adcs    $hi1,$hi1,$hi0

        adc     $ovf,xzr,xzr            // upmost overflow bit
        stp     $lo1,$hi1,[$tp]
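        // First pass done: tp[0..num-1] now holds (ap[]*bp[0]+m1*np[])/2^64
        // with the topmost carry kept in $ovf (a rough summary).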

.Louter:
        ldr     $m0,[$bp],#8            // bp[i]
        ldp     $hi0,$aj,[$ap],#16
        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8

        mul     $lo0,$hi0,$m0           // ap[0]*bp[i]
        sub     $j,$num,#16             // j=num-2
        umulh   $hi0,$hi0,$m0
        ldp     $hi1,$nj,[$np],#16
        mul     $alo,$aj,$m0            // ap[1]*bp[i]
        adds    $lo0,$lo0,$tj
        umulh   $ahi,$aj,$m0
        adc     $hi0,$hi0,xzr

        mul     $m1,$lo0,$n0
        sub     $i,$i,#8                // i--

        // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
        umulh   $hi1,$hi1,$m1
        mul     $nlo,$nj,$m1            // np[1]*m1
        // (*)  adds    $lo1,$lo1,$lo0
        subs    xzr,$lo0,#1             // (*)
        umulh   $nhi,$nj,$m1
        cbz     $j,.Linner_skip

.Linner:
        ldr     $aj,[$ap],#8
        adc     $hi1,$hi1,xzr
        ldr     $tj,[$tp],#8            // tp[j]
        adds    $lo0,$alo,$hi0
        sub     $j,$j,#8                // j--
        adc     $hi0,$ahi,xzr

        adds    $lo1,$nlo,$hi1
        ldr     $nj,[$np],#8
        adc     $hi1,$nhi,xzr

        mul     $alo,$aj,$m0            // ap[j]*bp[i]
        adds    $lo0,$lo0,$tj
        umulh   $ahi,$aj,$m0
        adc     $hi0,$hi0,xzr

        mul     $nlo,$nj,$m1            // np[j]*m1
        adds    $lo1,$lo1,$lo0
        umulh   $nhi,$nj,$m1
        stur    $lo1,[$tp,#-16]         // tp[j-1]
        cbnz    $j,.Linner

.Linner_skip:
        ldr     $tj,[$tp],#8            // tp[j]
        adc     $hi1,$hi1,xzr
        adds    $lo0,$alo,$hi0
        sub     $ap,$ap,$num            // rewind $ap
        adc     $hi0,$ahi,xzr

        adds    $lo1,$nlo,$hi1
        sub     $np,$np,$num            // rewind $np
        adcs    $hi1,$nhi,$ovf
        adc     $ovf,xzr,xzr

        adds    $lo0,$lo0,$tj
        adc     $hi0,$hi0,xzr

        adds    $lo1,$lo1,$lo0
        adcs    $hi1,$hi1,$hi0
        adc     $ovf,$ovf,xzr           // upmost overflow bit
        stp     $lo1,$hi1,[$tp,#-16]

        cbnz    $i,.Louter

        // Final step. We check whether the result is larger than the
        // modulus, and if it is, subtract the modulus. But comparison
        // implies subtraction, so we subtract the modulus, check whether
        // it borrowed, and conditionally copy the original value.
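        // The csel-based copy in .Lcond_copy below keeps that choice
        // branch-free.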
        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8
        ldr     $nj,[$np],#8            // np[0]
        subs    $j,$num,#8              // j=num-1 and clear borrow
        mov     $ap,$rp
.Lsub:
        sbcs    $aj,$tj,$nj             // tp[j]-np[j]
        ldr     $tj,[$tp],#8
        sub     $j,$j,#8                // j--
        ldr     $nj,[$np],#8
        str     $aj,[$ap],#8            // rp[j]=tp[j]-np[j]
        cbnz    $j,.Lsub

        sbcs    $aj,$tj,$nj
        sbcs    $ovf,$ovf,xzr           // did it borrow?
        str     $aj,[$ap],#8            // rp[num-1]

        ldr     $tj,[sp]                // tp[0]
        add     $tp,sp,#8
        ldr     $aj,[$rp],#8            // rp[0]
        sub     $num,$num,#8            // num--
        nop
.Lcond_copy:
        sub     $num,$num,#8            // num--
        csel    $nj,$tj,$aj,lo          // did it borrow?
        ldr     $tj,[$tp],#8
        ldr     $aj,[$rp],#8
        stur    xzr,[$tp,#-16]          // wipe tp
        stur    $nj,[$rp,#-16]
        cbnz    $num,.Lcond_copy

        csel    $nj,$tj,$aj,lo
        stur    xzr,[$tp,#-8]           // wipe tp
        stur    $nj,[$rp,#-8]

        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldr     x29,[sp],#64
        ret
.size   bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
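
# The NEON path below works in a redundant representation: 32-bit words of
# a[] and n[] are multiplied by 16-bit digits of b[] and of the Montgomery
# factor m (the "smashed" values put aside on the stack), so each umlal
# accumulates 48-bit partial products into 64-bit lanes; the excess above
# 16 bits is propagated with shl/ushr as the loops go (a rough summary,
# see the tail code for the final carry resolution).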

$code.=<<___;
.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        stp     x29,x30,[sp,#-80]!
        mov     x16,sp
        stp     d8,d9,[sp,#16]
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]
        lsl     $num,$num,#1
        eor     $zero.16b,$zero.16b,$zero.16b

.align  4
.LNEON_8n:
        eor     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
        sub     $toutptr,sp,#128
        eor     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
        sub     $toutptr,$toutptr,$num,lsl#4
        eor     @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
        and     $toutptr,$toutptr,#-64
        eor     @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
        mov     sp,$toutptr             // alloca
        eor     @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
        add     $toutptr,$toutptr,#256
        eor     @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
        sub     $inner,$num,#8
        eor     @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
        eor     @ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
        st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
        subs    $inner,$inner,#8
        st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
        st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
        st1     {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
        bne     .LNEON_8n_init

        add     $tinptr,sp,#256
        ld1     {$A0.4s,$A1.4s},[$aptr],#32
        add     $bnptr,sp,#8
        ldr     $sM0,[$n0],#4
        mov     $outer,$num
        b       .LNEON_8n_outer

.align  4
.LNEON_8n_outer:
        ldr     $sBi,[$bptr],#4   // *b++
        uxtl    $Bi.4s,$Bi.4h
        add     $toutptr,sp,#128
        ld1     {$N0.4s,$N1.4s},[$nptr],#32

        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        shl     $Ni.2d,@ACC[0].2d,#16
        ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        add     $Ni.2d,$Ni.2d,@ACC[0].2d
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        mul     $Ni.2s,$Ni.2s,$M0.2s
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        st1     {$Bi.2s},[sp]           // put aside smashed b[8*i+0]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        uxtl    $Ni.4s,$Ni.4h
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=0; $i<7;) {
$code.=<<___;
        ldr     $sBi,[$bptr],#4   // *b++
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        uxtl    $Bi.4s,$Bi.4h
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        ushr    $temp.2d,@ACC[0].2d,#16
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        ushr    @ACC[0].2d,@ACC[0].2d,#16
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
        add     $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
        ins     @ACC[1].d[0],$ACCTemp.d[0]
        st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
___
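        # Rotate the accumulator register names: @ACC[0] now denotes what
        # was @ACC[1], and the register just freed re-enters as @ACC[7],
        # so every unrolled step can be expressed identically.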
        push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1     {@ACC[7].2d},[$tinptr],#16
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        shl     $Ni.2d,@ACC[0].2d,#16
        ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        add     $Ni.2d,$Ni.2d,@ACC[0].2d
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        mul     $Ni.2s,$Ni.2s,$M0.2s
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        st1     {$Bi.2s},[$bnptr],#8    // put aside smashed b[8*i+$i]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        uxtl    $Ni.4s,$Ni.4h
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
        ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        ld1     {$A0.4s,$A1.4s},[$aptr],#32
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        mov     $Temp.16b,@ACC[0].16b
        ushr    $Temp.2d,$Temp.2d,#16
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        add     @ACC[0].2d,@ACC[0].2d,$Temp.2d
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        ushr    @ACC[0].2d,@ACC[0].2d,#16
        eor     $temp.16b,$temp.16b,$temp.16b
        ins     @ACC[0].d[1],$temp.d[0]
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
        add     @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
        st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
        add     $bnptr,sp,#8            // rewind
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        sub     $inner,$num,#8
        b       .LNEON_8n_inner

.align  4
.LNEON_8n_inner:
        subs    $inner,$inner,#8
        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1     {@ACC[7].2d},[$tinptr]
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+0]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        ld1     {$N0.4s,$N1.4s},[$nptr],#32
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        b.eq    .LInner_jump
        add     $tinptr,$tinptr,#16     // don't advance in last iteration
.LInner_jump:
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        ld1     {$Bi.2s},[$bnptr],#8    // pull smashed b[8*i+$i]
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
        st1     {@ACC[0].2d},[$toutptr],#16
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1     {@ACC[7].2d},[$tinptr]
        umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
        ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+$i]
        umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
        b.eq    .LInner_jump$i
        add     $tinptr,$tinptr,#16     // don't advance in last iteration
.LInner_jump$i:
        umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
        umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
        umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
        umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
        umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
        b.ne    .LInner_after_rewind$i
        sub     $aptr,$aptr,$num,lsl#2  // rewind
.LInner_after_rewind$i:
        umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
        ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
        umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
        ld1     {$A0.4s,$A1.4s},[$aptr],#32
        umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
        add     $bnptr,sp,#8            // rewind
        umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
        umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
        umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
        st1     {@ACC[0].2d},[$toutptr],#16
        umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]

        bne     .LNEON_8n_inner
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        add     $tinptr,sp,#128
        st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
        eor     $N0.16b,$N0.16b,$N0.16b // $N0
        st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
        eor     $N1.16b,$N1.16b,$N1.16b // $N1
        st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
        st1     {@ACC[6].2d},[$toutptr]

        subs    $outer,$outer,#8
        ld1     {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
        ld1     {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
        ld1     {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
        ld1     {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

        b.eq    .LInner_8n_jump_2steps
        sub     $nptr,$nptr,$num,lsl#2  // rewind
        b       .LNEON_8n_outer

.LInner_8n_jump_2steps:
        add     $toutptr,sp,#128
        st1     {$N0.2d,$N1.2d}, [sp],#32       // start wiping stack frame
        mov     $Temp.16b,@ACC[0].16b
        ushr    $temp.2d,@ACC[0].2d,#16
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        st1     {$N0.2d,$N1.2d}, [sp],#32
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        st1     {$N0.2d,$N1.2d}, [sp],#32
        ushr    $temp.2d,@ACC[0].2d,#16
        st1     {$N0.2d,$N1.2d}, [sp],#32
        zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
        ins     $temp.d[1],$zero.d[0]

        mov     $inner,$num
        b       .LNEON_tail_entry

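        // The tail below resolves the redundant representation: each 64-bit
        // lane still carries bits above its 16-bit digit, so carries are
        // pushed up with ushr/ext and the clean 16-bit digits are zipped
        // back into 32-bit result words (a rough summary).
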
.align  4
.LNEON_tail:
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        mov     $Temp.16b,@ACC[0].16b
        ushr    $temp.2d,@ACC[0].2d,#16
        ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        ld1     {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
        add     @ACC[0].2d,@ACC[0].2d,$temp.2d
        ld1     {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
        ushr    $temp.2d,@ACC[0].2d,#16
        ld1     {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
        zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
        ins     $temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        add     @ACC[1].2d,@ACC[1].2d,$temp.2d
        st1     {@ACC[0].s}[0], [$toutptr],#4
        ushr    $temp.2d,@ACC[1].2d,#16
        mov     $Temp.16b,@ACC[1].16b
        ext     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
        add     @ACC[1].2d,@ACC[1].2d,$temp.2d
        ushr    $temp.2d,@ACC[1].2d,#16
        zip1    @ACC[1].4h,$Temp.4h,@ACC[1].4h
        ins     $temp.d[1],$zero.d[0]
___
        push(@ACC,shift(@ACC));
}
        push(@ACC,shift(@ACC));
$code.=<<___;
        ld1     {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
        subs    $inner,$inner,#8
        st1     {@ACC[7].s}[0], [$toutptr],#4
        bne     .LNEON_tail

        st1     {$temp.s}[0], [$toutptr],#4     // top-most bit
        sub     $nptr,$nptr,$num,lsl#2          // rewind $nptr
        subs    $aptr,sp,#0                     // clear carry flag
        add     $bptr,sp,$num,lsl#2

.LNEON_sub:
        ldp     w4,w5,[$aptr],#8
        ldp     w6,w7,[$aptr],#8
        ldp     w8,w9,[$nptr],#8
        ldp     w10,w11,[$nptr],#8
        sbcs    w8,w4,w8
        sbcs    w9,w5,w9
        sbcs    w10,w6,w10
        sbcs    w11,w7,w11
        sub     x17,$bptr,$aptr
        stp     w8,w9,[$rptr],#8
        stp     w10,w11,[$rptr],#8
        cbnz    x17,.LNEON_sub
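        // (The `sub` above, unlike `subs`, leaves the flags untouched,
        // so the sbcs borrow chain is preserved across iterations.)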

        ldr     w10, [$aptr]            // load top-most bit
        mov     x11,sp
        eor     v0.16b,v0.16b,v0.16b
        sub     x11,$bptr,x11           // this is num*4
        eor     v1.16b,v1.16b,v1.16b
        mov     $aptr,sp
        sub     $rptr,$rptr,x11         // rewind $rptr
        mov     $nptr,$bptr             // second 3/4th of frame
        sbcs    w10,w10,wzr             // result is carry flag

.LNEON_copy_n_zap:
        ldp     w4,w5,[$aptr],#8
        ldp     w6,w7,[$aptr],#8
        ldp     w8,w9,[$rptr],#8
        ldp     w10,w11,[$rptr]
        sub     $rptr,$rptr,#8
        b.cs    .LCopy_1
        mov     w8,w4
        mov     w9,w5
        mov     w10,w6
        mov     w11,w7
.LCopy_1:
        st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
        st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
        ldp     w4,w5,[$aptr],#8
        ldp     w6,w7,[$aptr],#8
        stp     w8,w9,[$rptr],#8
        stp     w10,w11,[$rptr],#8
        sub     $aptr,$aptr,#32
        ldp     w8,w9,[$rptr],#8
        ldp     w10,w11,[$rptr]
        sub     $rptr,$rptr,#8
        b.cs    .LCopy_2
        mov     w8, w4
        mov     w9, w5
        mov     w10, w6
        mov     w11, w7
.LCopy_2:
        st1     {v0.2d,v1.2d}, [$aptr],#32              // wipe
        st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
        sub     x17,$bptr,$aptr         // preserves carry
        stp     w8,w9,[$rptr],#8
        stp     w10,w11,[$rptr],#8
        cbnz    x17,.LNEON_copy_n_zap

        mov     sp,x16
        ldp     d14,d15,[sp,#64]
        ldp     d12,d13,[sp,#48]
        ldp     d10,d11,[sp,#32]
        ldp     d8,d9,[sp,#16]
        ldr     x29,[sp],#80
        ret                     // bx lr

.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
{
########################################################################
# Following is an ARMv8 adaptation of sqrx8x_mont from the x86_64-mont5
# module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);
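
# The squaring path computes each off-diagonal product a[i]*a[j], i<j, only
# once, doubles the accumulated result in a shift-and-add pass, adds the
# diagonal squares a[i]*a[i], and finally Montgomery-reduces 512 bits per
# iteration (a summary of the phases below).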

$code.=<<___;
.type   __bn_sqr8x_mont,%function
.align  5
__bn_sqr8x_mont:
        cmp     $ap,$bp
        b.ne    __bn_mul4x_mont
.Lsqr8x_mont:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        stp     $rp,$np,[sp,#96]        // offload rp and np

        ldp     $a0,$a1,[$ap,#8*0]
        ldp     $a2,$a3,[$ap,#8*2]
        ldp     $a4,$a5,[$ap,#8*4]
        ldp     $a6,$a7,[$ap,#8*6]

        sub     $tp,sp,$num,lsl#4
        lsl     $num,$num,#3
        ldr     $n0,[$n0]               // *n0
        mov     sp,$tp                  // alloca
        sub     $cnt,$num,#8*8
        b       .Lsqr8x_zero_start

.Lsqr8x_zero:
        sub     $cnt,$cnt,#8*8
        stp     xzr,xzr,[$tp,#8*0]
        stp     xzr,xzr,[$tp,#8*2]
        stp     xzr,xzr,[$tp,#8*4]
        stp     xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
        stp     xzr,xzr,[$tp,#8*8]
        stp     xzr,xzr,[$tp,#8*10]
        stp     xzr,xzr,[$tp,#8*12]
        stp     xzr,xzr,[$tp,#8*14]
        add     $tp,$tp,#8*16
        cbnz    $cnt,.Lsqr8x_zero

        add     $ap_end,$ap,$num
        add     $ap,$ap,#8*8
        mov     $acc0,xzr
        mov     $acc1,xzr
        mov     $acc2,xzr
        mov     $acc3,xzr
        mov     $acc4,xzr
        mov     $acc5,xzr
        mov     $acc6,xzr
        mov     $acc7,xzr
        mov     $tp,sp
        str     $n0,[x29,#112]          // offload n0

        // Multiply everything but a[i]*a[i]
.align  4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]     (i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]             (ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]                     (iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]                             (iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]                                     (v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]                                             (vi)
        //     a[7]a[5]
        // a[7]a[6]                                                     (vii)

        mul     $t0,$a1,$a0             // lo(a[1..7]*a[0])             (i)
        mul     $t1,$a2,$a0
        mul     $t2,$a3,$a0
        mul     $t3,$a4,$a0
        adds    $acc1,$acc1,$t0         // t[1]+lo(a[1]*a[0])
        mul     $t0,$a5,$a0
        adcs    $acc2,$acc2,$t1
        mul     $t1,$a6,$a0
        adcs    $acc3,$acc3,$t2
        mul     $t2,$a7,$a0
        adcs    $acc4,$acc4,$t3
        umulh   $t3,$a1,$a0             // hi(a[1..7]*a[0])
        adcs    $acc5,$acc5,$t0
        umulh   $t0,$a2,$a0
        adcs    $acc6,$acc6,$t1
        umulh   $t1,$a3,$a0
        adcs    $acc7,$acc7,$t2
        umulh   $t2,$a4,$a0
        stp     $acc0,$acc1,[$tp],#8*2  // t[0..1]
        adc     $acc0,xzr,xzr           // t[8]
        adds    $acc2,$acc2,$t3         // t[2]+lo(a[1]*a[0])
        umulh   $t3,$a5,$a0
        adcs    $acc3,$acc3,$t0
        umulh   $t0,$a6,$a0
        adcs    $acc4,$acc4,$t1
        umulh   $t1,$a7,$a0
        adcs    $acc5,$acc5,$t2
         mul    $t2,$a2,$a1             // lo(a[2..7]*a[1])             (ii)
        adcs    $acc6,$acc6,$t3
         mul    $t3,$a3,$a1
        adcs    $acc7,$acc7,$t0
         mul    $t0,$a4,$a1
        adc     $acc0,$acc0,$t1

        mul     $t1,$a5,$a1
        adds    $acc3,$acc3,$t2
        mul     $t2,$a6,$a1
        adcs    $acc4,$acc4,$t3
        mul     $t3,$a7,$a1
        adcs    $acc5,$acc5,$t0
        umulh   $t0,$a2,$a1             // hi(a[2..7]*a[1])
        adcs    $acc6,$acc6,$t1
        umulh   $t1,$a3,$a1
        adcs    $acc7,$acc7,$t2
        umulh   $t2,$a4,$a1
        adcs    $acc0,$acc0,$t3
        umulh   $t3,$a5,$a1
        stp     $acc2,$acc3,[$tp],#8*2  // t[2..3]
        adc     $acc1,xzr,xzr           // t[9]
        adds    $acc4,$acc4,$t0
        umulh   $t0,$a6,$a1
        adcs    $acc5,$acc5,$t1
        umulh   $t1,$a7,$a1
        adcs    $acc6,$acc6,$t2
         mul    $t2,$a3,$a2             // lo(a[3..7]*a[2])             (iii)
        adcs    $acc7,$acc7,$t3
         mul    $t3,$a4,$a2
        adcs    $acc0,$acc0,$t0
         mul    $t0,$a5,$a2
        adc     $acc1,$acc1,$t1

        mul     $t1,$a6,$a2
        adds    $acc5,$acc5,$t2
        mul     $t2,$a7,$a2
        adcs    $acc6,$acc6,$t3
        umulh   $t3,$a3,$a2             // hi(a[3..7]*a[2])
        adcs    $acc7,$acc7,$t0
        umulh   $t0,$a4,$a2
        adcs    $acc0,$acc0,$t1
        umulh   $t1,$a5,$a2
        adcs    $acc1,$acc1,$t2
        umulh   $t2,$a6,$a2
        stp     $acc4,$acc5,[$tp],#8*2  // t[4..5]
        adc     $acc2,xzr,xzr           // t[10]
        adds    $acc6,$acc6,$t3
        umulh   $t3,$a7,$a2
        adcs    $acc7,$acc7,$t0
         mul    $t0,$a4,$a3             // lo(a[4..7]*a[3])             (iv)
        adcs    $acc0,$acc0,$t1
         mul    $t1,$a5,$a3
        adcs    $acc1,$acc1,$t2
         mul    $t2,$a6,$a3
        adc     $acc2,$acc2,$t3

        mul     $t3,$a7,$a3
        adds    $acc7,$acc7,$t0
        umulh   $t0,$a4,$a3             // hi(a[4..7]*a[3])
        adcs    $acc0,$acc0,$t1
        umulh   $t1,$a5,$a3
        adcs    $acc1,$acc1,$t2
        umulh   $t2,$a6,$a3
        adcs    $acc2,$acc2,$t3
        umulh   $t3,$a7,$a3
        stp     $acc6,$acc7,[$tp],#8*2  // t[6..7]
        adc     $acc3,xzr,xzr           // t[11]
        adds    $acc0,$acc0,$t0
         mul    $t0,$a5,$a4             // lo(a[5..7]*a[4])             (v)
        adcs    $acc1,$acc1,$t1
         mul    $t1,$a6,$a4
        adcs    $acc2,$acc2,$t2
         mul    $t2,$a7,$a4
        adc     $acc3,$acc3,$t3

        umulh   $t3,$a5,$a4             // hi(a[5..7]*a[4])
        adds    $acc1,$acc1,$t0
        umulh   $t0,$a6,$a4
        adcs    $acc2,$acc2,$t1
        umulh   $t1,$a7,$a4
        adcs    $acc3,$acc3,$t2
         mul    $t2,$a6,$a5             // lo(a[6..7]*a[5])             (vi)
        adc     $acc4,xzr,xzr           // t[12]
        adds    $acc2,$acc2,$t3
         mul    $t3,$a7,$a5
        adcs    $acc3,$acc3,$t0
         umulh  $t0,$a6,$a5             // hi(a[6..7]*a[5])
        adc     $acc4,$acc4,$t1

        umulh   $t1,$a7,$a5
        adds    $acc3,$acc3,$t2
         mul    $t2,$a7,$a6             // lo(a[7]*a[6])                (vii)
        adcs    $acc4,$acc4,$t3
         umulh  $t3,$a7,$a6             // hi(a[7]*a[6])
        adc     $acc5,xzr,xzr           // t[13]
        adds    $acc4,$acc4,$t0
        sub     $cnt,$ap_end,$ap        // done yet?
        adc     $acc5,$acc5,$t1

        adds    $acc5,$acc5,$t2
        sub     $t0,$ap_end,$num        // rewound ap
        adc     $acc6,xzr,xzr           // t[14]
        add     $acc6,$acc6,$t3

        cbz     $cnt,.Lsqr8x_outer_break

        mov     $n0,$a0
        ldp     $a0,$a1,[$tp,#8*0]
        ldp     $a2,$a3,[$tp,#8*2]
        ldp     $a4,$a5,[$tp,#8*4]
        ldp     $a6,$a7,[$tp,#8*6]
        adds    $acc0,$acc0,$a0
        adcs    $acc1,$acc1,$a1
        ldp     $a0,$a1,[$ap,#8*0]
        adcs    $acc2,$acc2,$a2
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$ap,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$ap,#8*4]
        adcs    $acc6,$acc6,$a6
        mov     $rp,$ap
        adcs    $acc7,xzr,$a7
        ldp     $a6,$a7,[$ap,#8*6]
        add     $ap,$ap,#8*8
        //adc   $carry,xzr,xzr          // moved below
        mov     $cnt,#-8*8

        //                                                         a[8]a[0]
        //                                                     a[9]a[0]
        //                                                 a[a]a[0]
        //                                             a[b]a[0]
        //                                         a[c]a[0]
        //                                     a[d]a[0]
        //                                 a[e]a[0]
        //                             a[f]a[0]
        //                                                     a[8]a[1]
        //                         a[f]a[1]........................
        //                                                 a[8]a[2]
        //                     a[f]a[2]........................
        //                                             a[8]a[3]
        //                 a[f]a[3]........................
        //                                         a[8]a[4]
        //             a[f]a[4]........................
        //                                     a[8]a[5]
        //         a[f]a[5]........................
        //                                 a[8]a[6]
        //     a[f]a[6]........................
        //                             a[8]a[7]
        // a[f]a[7]........................
.Lsqr8x_mul:
        mul     $t0,$a0,$n0
        adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
        mul     $t1,$a1,$n0
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$n0
        mul     $t3,$a3,$n0
        adds    $acc0,$acc0,$t0
        mul     $t0,$a4,$n0
        adcs    $acc1,$acc1,$t1
        mul     $t1,$a5,$n0
        adcs    $acc2,$acc2,$t2
        mul     $t2,$a6,$n0
        adcs    $acc3,$acc3,$t3
        mul     $t3,$a7,$n0
        adcs    $acc4,$acc4,$t0
        umulh   $t0,$a0,$n0
        adcs    $acc5,$acc5,$t1
        umulh   $t1,$a1,$n0
        adcs    $acc6,$acc6,$t2
        umulh   $t2,$a2,$n0
        adcs    $acc7,$acc7,$t3
        umulh   $t3,$a3,$n0
        adc     $carry,$carry,xzr
        str     $acc0,[$tp],#8
        adds    $acc0,$acc1,$t0
        umulh   $t0,$a4,$n0
        adcs    $acc1,$acc2,$t1
        umulh   $t1,$a5,$n0
        adcs    $acc2,$acc3,$t2
        umulh   $t2,$a6,$n0
        adcs    $acc3,$acc4,$t3
        umulh   $t3,$a7,$n0
        ldr     $n0,[$rp,$cnt]
        adcs    $acc4,$acc5,$t0
        adcs    $acc5,$acc6,$t1
        adcs    $acc6,$acc7,$t2
        adcs    $acc7,$carry,$t3
        //adc   $carry,xzr,xzr          // moved above
        cbnz    $cnt,.Lsqr8x_mul
                                        // note that carry flag is guaranteed
                                        // to be zero at this point
        cmp     $ap,$ap_end             // done yet?
        b.eq    .Lsqr8x_break

        ldp     $a0,$a1,[$tp,#8*0]
        ldp     $a2,$a3,[$tp,#8*2]
        ldp     $a4,$a5,[$tp,#8*4]
        ldp     $a6,$a7,[$tp,#8*6]
        adds    $acc0,$acc0,$a0
        ldur    $n0,[$rp,#-8*8]
        adcs    $acc1,$acc1,$a1
        ldp     $a0,$a1,[$ap,#8*0]
        adcs    $acc2,$acc2,$a2
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$ap,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$ap,#8*4]
        adcs    $acc6,$acc6,$a6
        mov     $cnt,#-8*8
        adcs    $acc7,$acc7,$a7
        ldp     $a6,$a7,[$ap,#8*6]
        add     $ap,$ap,#8*8
        //adc   $carry,xzr,xzr          // moved above
        b       .Lsqr8x_mul

.align  4
.Lsqr8x_break:
        ldp     $a0,$a1,[$rp,#8*0]
        add     $ap,$rp,#8*8
        ldp     $a2,$a3,[$rp,#8*2]
        sub     $t0,$ap_end,$ap         // is it last iteration?
        ldp     $a4,$a5,[$rp,#8*4]
        sub     $t1,$tp,$t0
        ldp     $a6,$a7,[$rp,#8*6]
        cbz     $t0,.Lsqr8x_outer_loop

        stp     $acc0,$acc1,[$tp,#8*0]
        ldp     $acc0,$acc1,[$t1,#8*0]
        stp     $acc2,$acc3,[$tp,#8*2]
        ldp     $acc2,$acc3,[$t1,#8*2]
        stp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc4,$acc5,[$t1,#8*4]
        stp     $acc6,$acc7,[$tp,#8*6]
        mov     $tp,$t1
        ldp     $acc6,$acc7,[$t1,#8*6]
        b       .Lsqr8x_outer_loop

.align  4
.Lsqr8x_outer_break:
        // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
        ldp     $a1,$a3,[$t0,#8*0]      // recall that $t0 is &a[0]
        ldp     $t1,$t2,[sp,#8*1]
        ldp     $a5,$a7,[$t0,#8*2]
        add     $ap,$t0,#8*4
        ldp     $t3,$t0,[sp,#8*3]

        stp     $acc0,$acc1,[$tp,#8*0]
        mul     $acc0,$a1,$a1
        stp     $acc2,$acc3,[$tp,#8*2]
        umulh   $a1,$a1,$a1
        stp     $acc4,$acc5,[$tp,#8*4]
        mul     $a2,$a3,$a3
        stp     $acc6,$acc7,[$tp,#8*6]
        mov     $tp,sp
        umulh   $a3,$a3,$a3
        adds    $acc1,$a1,$t1,lsl#1
        extr    $t1,$t2,$t1,#63
        sub     $cnt,$num,#8*4

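        // In the loop below "extr Xd,Xn,Xm,#63" computes (Xn<<1)|(Xm>>63),
        // i.e. it doubles a word while pulling in the top bit of the word
        // below it, which is how the off-diagonal sum is multiplied by 2.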
.Lsqr4x_shift_n_add:
        adcs    $acc2,$a2,$t1
        extr    $t2,$t3,$t2,#63
        sub     $cnt,$cnt,#8*4
        adcs    $acc3,$a3,$t2
        ldp     $t1,$t2,[$tp,#8*5]
        mul     $a4,$a5,$a5
        ldp     $a1,$a3,[$ap],#8*2
        umulh   $a5,$a5,$a5
        mul     $a6,$a7,$a7
        umulh   $a7,$a7,$a7
        extr    $t3,$t0,$t3,#63
        stp     $acc0,$acc1,[$tp,#8*0]
        adcs    $acc4,$a4,$t3
        extr    $t0,$t1,$t0,#63
        stp     $acc2,$acc3,[$tp,#8*2]
        adcs    $acc5,$a5,$t0
        ldp     $t3,$t0,[$tp,#8*7]
        extr    $t1,$t2,$t1,#63
        adcs    $acc6,$a6,$t1
        extr    $t2,$t3,$t2,#63
        adcs    $acc7,$a7,$t2
        ldp     $t1,$t2,[$tp,#8*9]
        mul     $a0,$a1,$a1
        ldp     $a5,$a7,[$ap],#8*2
        umulh   $a1,$a1,$a1
        mul     $a2,$a3,$a3
        umulh   $a3,$a3,$a3
        stp     $acc4,$acc5,[$tp,#8*4]
        extr    $t3,$t0,$t3,#63
        stp     $acc6,$acc7,[$tp,#8*6]
        add     $tp,$tp,#8*8
        adcs    $acc0,$a0,$t3
        extr    $t0,$t1,$t0,#63
        adcs    $acc1,$a1,$t0
        ldp     $t3,$t0,[$tp,#8*3]
        extr    $t1,$t2,$t1,#63
        cbnz    $cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
         ldp    $np,$n0,[x29,#104]      // pull np and n0

        adcs    $acc2,$a2,$t1
        extr    $t2,$t3,$t2,#63
        adcs    $acc3,$a3,$t2
        ldp     $t1,$t2,[$tp,#8*5]
        mul     $a4,$a5,$a5
        umulh   $a5,$a5,$a5
        stp     $acc0,$acc1,[$tp,#8*0]
        mul     $a6,$a7,$a7
        umulh   $a7,$a7,$a7
        stp     $acc2,$acc3,[$tp,#8*2]
        extr    $t3,$t0,$t3,#63
        adcs    $acc4,$a4,$t3
        extr    $t0,$t1,$t0,#63
         ldp    $acc0,$acc1,[sp,#8*0]
        adcs    $acc5,$a5,$t0
        extr    $t1,$t2,$t1,#63
         ldp    $a0,$a1,[$np,#8*0]
        adcs    $acc6,$a6,$t1
        extr    $t2,xzr,$t2,#63
         ldp    $a2,$a3,[$np,#8*2]
        adc     $acc7,$a7,$t2
         ldp    $a4,$a5,[$np,#8*4]

        // Reduce by 512 bits per iteration
        mul     $na0,$n0,$acc0          // t[0]*n0
        ldp     $a6,$a7,[$np,#8*6]
        add     $np_end,$np,$num
        ldp     $acc2,$acc3,[sp,#8*2]
        stp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc4,$acc5,[sp,#8*4]
        stp     $acc6,$acc7,[$tp,#8*6]
        ldp     $acc6,$acc7,[sp,#8*6]
        add     $np,$np,#8*8
        mov     $topmost,xzr            // initial top-most carry
        mov     $tp,sp
        mov     $cnt,#8

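        // Each pass below computes m = t[0]*n0 mod 2^64 so that t[] + m*n[]
        // is divisible by 2^64; eight passes retire one 64-bit word each,
        // i.e. 512 bits per trip through this window.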
.Lsqr8x_reduction:
        // (*)  mul     $t0,$a0,$na0    // lo(n[0-7])*lo(t[0]*n0)
        mul     $t1,$a1,$na0
        sub     $cnt,$cnt,#1
        mul     $t2,$a2,$na0
        str     $na0,[$tp],#8           // put aside t[0]*n0 for tail processing
        mul     $t3,$a3,$na0
        // (*)  adds    xzr,$acc0,$t0
        subs    xzr,$acc0,#1            // (*)
        mul     $t0,$a4,$na0
        adcs    $acc0,$acc1,$t1
        mul     $t1,$a5,$na0
        adcs    $acc1,$acc2,$t2
        mul     $t2,$a6,$na0
        adcs    $acc2,$acc3,$t3
        mul     $t3,$a7,$na0
        adcs    $acc3,$acc4,$t0
        umulh   $t0,$a0,$na0            // hi(n[0-7])*lo(t[0]*n0)
        adcs    $acc4,$acc5,$t1
        umulh   $t1,$a1,$na0
        adcs    $acc5,$acc6,$t2
        umulh   $t2,$a2,$na0
        adcs    $acc6,$acc7,$t3
        umulh   $t3,$a3,$na0
        adc     $acc7,xzr,xzr
        adds    $acc0,$acc0,$t0
        umulh   $t0,$a4,$na0
        adcs    $acc1,$acc1,$t1
        umulh   $t1,$a5,$na0
        adcs    $acc2,$acc2,$t2
        umulh   $t2,$a6,$na0
        adcs    $acc3,$acc3,$t3
        umulh   $t3,$a7,$na0
        mul     $na0,$n0,$acc0          // next t[0]*n0
        adcs    $acc4,$acc4,$t0
        adcs    $acc5,$acc5,$t1
        adcs    $acc6,$acc6,$t2
        adc     $acc7,$acc7,$t3
        cbnz    $cnt,.Lsqr8x_reduction

        ldp     $t0,$t1,[$tp,#8*0]
        ldp     $t2,$t3,[$tp,#8*2]
        mov     $rp,$tp
        sub     $cnt,$np_end,$np        // done yet?
        adds    $acc0,$acc0,$t0
        adcs    $acc1,$acc1,$t1
        ldp     $t0,$t1,[$tp,#8*4]
        adcs    $acc2,$acc2,$t2
        adcs    $acc3,$acc3,$t3
        ldp     $t2,$t3,[$tp,#8*6]
        adcs    $acc4,$acc4,$t0
        adcs    $acc5,$acc5,$t1
        adcs    $acc6,$acc6,$t2
        adcs    $acc7,$acc7,$t3
        //adc   $carry,xzr,xzr          // moved below
        cbz     $cnt,.Lsqr8x8_post_condition

        ldur    $n0,[$tp,#-8*8]
        ldp     $a0,$a1,[$np,#8*0]
        ldp     $a2,$a3,[$np,#8*2]
        ldp     $a4,$a5,[$np,#8*4]
        mov     $cnt,#-8*8
        ldp     $a6,$a7,[$np,#8*6]
        add     $np,$np,#8*8

.Lsqr8x_tail:
        mul     $t0,$a0,$n0
        adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
        mul     $t1,$a1,$n0
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$n0
        mul     $t3,$a3,$n0
        adds    $acc0,$acc0,$t0
        mul     $t0,$a4,$n0
        adcs    $acc1,$acc1,$t1
        mul     $t1,$a5,$n0
        adcs    $acc2,$acc2,$t2
        mul     $t2,$a6,$n0
        adcs    $acc3,$acc3,$t3
        mul     $t3,$a7,$n0
        adcs    $acc4,$acc4,$t0
        umulh   $t0,$a0,$n0
        adcs    $acc5,$acc5,$t1
        umulh   $t1,$a1,$n0
        adcs    $acc6,$acc6,$t2
        umulh   $t2,$a2,$n0
        adcs    $acc7,$acc7,$t3
        umulh   $t3,$a3,$n0
        adc     $carry,$carry,xzr
        str     $acc0,[$tp],#8
        adds    $acc0,$acc1,$t0
        umulh   $t0,$a4,$n0
        adcs    $acc1,$acc2,$t1
        umulh   $t1,$a5,$n0
        adcs    $acc2,$acc3,$t2
        umulh   $t2,$a6,$n0
        adcs    $acc3,$acc4,$t3
        umulh   $t3,$a7,$n0
        ldr     $n0,[$rp,$cnt]
        adcs    $acc4,$acc5,$t0
        adcs    $acc5,$acc6,$t1
        adcs    $acc6,$acc7,$t2
        adcs    $acc7,$carry,$t3
        //adc   $carry,xzr,xzr          // moved above
        cbnz    $cnt,.Lsqr8x_tail
                                        // note that carry flag is guaranteed
                                        // to be zero at this point
        ldp     $a0,$a1,[$tp,#8*0]
        sub     $cnt,$np_end,$np        // done yet?
        sub     $t2,$np_end,$num        // rewound np
        ldp     $a2,$a3,[$tp,#8*2]
        ldp     $a4,$a5,[$tp,#8*4]
        ldp     $a6,$a7,[$tp,#8*6]
        cbz     $cnt,.Lsqr8x_tail_break

        ldur    $n0,[$rp,#-8*8]
        adds    $acc0,$acc0,$a0
        adcs    $acc1,$acc1,$a1
        ldp     $a0,$a1,[$np,#8*0]
        adcs    $acc2,$acc2,$a2
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$np,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$np,#8*4]
        adcs    $acc6,$acc6,$a6
        mov     $cnt,#-8*8
        adcs    $acc7,$acc7,$a7
        ldp     $a6,$a7,[$np,#8*6]
        add     $np,$np,#8*8
        //adc   $carry,xzr,xzr          // moved above
        b       .Lsqr8x_tail

.align  4
.Lsqr8x_tail_break:
        ldr     $n0,[x29,#112]          // pull n0
        add     $cnt,$tp,#8*8           // end of current t[num] window

        subs    xzr,$topmost,#1         // "move" top-most carry to carry bit
        adcs    $t0,$acc0,$a0
        adcs    $t1,$acc1,$a1
        ldp     $acc0,$acc1,[$rp,#8*0]
        adcs    $acc2,$acc2,$a2
        ldp     $a0,$a1,[$t2,#8*0]      // recall that $t2 is &n[0]
        adcs    $acc3,$acc3,$a3
        ldp     $a2,$a3,[$t2,#8*2]
        adcs    $acc4,$acc4,$a4
        adcs    $acc5,$acc5,$a5
        ldp     $a4,$a5,[$t2,#8*4]
        adcs    $acc6,$acc6,$a6
        adcs    $acc7,$acc7,$a7
        ldp     $a6,$a7,[$t2,#8*6]
        add     $np,$t2,#8*8
        adc     $topmost,xzr,xzr        // top-most carry
        mul     $na0,$n0,$acc0
        stp     $t0,$t1,[$tp,#8*0]
        stp     $acc2,$acc3,[$tp,#8*2]
        ldp     $acc2,$acc3,[$rp,#8*2]
        stp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc4,$acc5,[$rp,#8*4]
        cmp     $cnt,x29                // did we hit the bottom?
        stp     $acc6,$acc7,[$tp,#8*6]
        mov     $tp,$rp                 // slide the window
        ldp     $acc6,$acc7,[$rp,#8*6]
        mov     $cnt,#8
        b.ne    .Lsqr8x_reduction

        // Final step. We check whether the result is larger than the
        // modulus, and if it is, subtract the modulus. But comparison
        // implies subtraction, so we subtract the modulus, check whether
        // it borrowed, and conditionally copy the original value.
        ldr     $rp,[x29,#96]           // pull rp
        add     $tp,$tp,#8*8
        subs    $t0,$acc0,$a0
        sbcs    $t1,$acc1,$a1
        sub     $cnt,$num,#8*8
        mov     $ap_end,$rp             // $rp copy

.Lsqr8x_sub:
        sbcs    $t2,$acc2,$a2
        ldp     $a0,$a1,[$np,#8*0]
        sbcs    $t3,$acc3,$a3
        stp     $t0,$t1,[$rp,#8*0]
        sbcs    $t0,$acc4,$a4
        ldp     $a2,$a3,[$np,#8*2]
        sbcs    $t1,$acc5,$a5
        stp     $t2,$t3,[$rp,#8*2]
        sbcs    $t2,$acc6,$a6
        ldp     $a4,$a5,[$np,#8*4]
        sbcs    $t3,$acc7,$a7
        ldp     $a6,$a7,[$np,#8*6]
        add     $np,$np,#8*8
        ldp     $acc0,$acc1,[$tp,#8*0]
        sub     $cnt,$cnt,#8*8
        ldp     $acc2,$acc3,[$tp,#8*2]
        ldp     $acc4,$acc5,[$tp,#8*4]
        ldp     $acc6,$acc7,[$tp,#8*6]
        add     $tp,$tp,#8*8
        stp     $t0,$t1,[$rp,#8*4]
        sbcs    $t0,$acc0,$a0
        stp     $t2,$t3,[$rp,#8*6]
        add     $rp,$rp,#8*8
        sbcs    $t1,$acc1,$a1
        cbnz    $cnt,.Lsqr8x_sub

        sbcs    $t2,$acc2,$a2
         mov    $tp,sp
         add    $ap,sp,$num
         ldp    $a0,$a1,[$ap_end,#8*0]
        sbcs    $t3,$acc3,$a3
        stp     $t0,$t1,[$rp,#8*0]
        sbcs    $t0,$acc4,$a4
         ldp    $a2,$a3,[$ap_end,#8*2]
        sbcs    $t1,$acc5,$a5
        stp     $t2,$t3,[$rp,#8*2]
        sbcs    $t2,$acc6,$a6
         ldp    $acc0,$acc1,[$ap,#8*0]
        sbcs    $t3,$acc7,$a7
         ldp    $acc2,$acc3,[$ap,#8*2]
        sbcs    xzr,$topmost,xzr        // did it borrow?
        ldr     x30,[x29,#8]            // pull return address
        stp     $t0,$t1,[$rp,#8*4]
        stp     $t2,$t3,[$rp,#8*6]

        sub     $cnt,$num,#8*4
.Lsqr4x_cond_copy:
        sub     $cnt,$cnt,#8*4
        csel    $t0,$acc0,$a0,lo
         stp    xzr,xzr,[$tp,#8*0]
        csel    $t1,$acc1,$a1,lo
        ldp     $a0,$a1,[$ap_end,#8*4]
        ldp     $acc0,$acc1,[$ap,#8*4]
        csel    $t2,$acc2,$a2,lo
         stp    xzr,xzr,[$tp,#8*2]
         add    $tp,$tp,#8*4
        csel    $t3,$acc3,$a3,lo
        ldp     $a2,$a3,[$ap_end,#8*6]
        ldp     $acc2,$acc3,[$ap,#8*6]
        add     $ap,$ap,#8*4
        stp     $t0,$t1,[$ap_end,#8*0]
        stp     $t2,$t3,[$ap_end,#8*2]
        add     $ap_end,$ap_end,#8*4
         stp    xzr,xzr,[$ap,#8*0]
         stp    xzr,xzr,[$ap,#8*2]
        cbnz    $cnt,.Lsqr4x_cond_copy

        csel    $t0,$acc0,$a0,lo
         stp    xzr,xzr,[$tp,#8*0]
        csel    $t1,$acc1,$a1,lo
         stp    xzr,xzr,[$tp,#8*2]
        csel    $t2,$acc2,$a2,lo
        csel    $t3,$acc3,$a3,lo
        stp     $t0,$t1,[$ap_end,#8*0]
        stp     $t2,$t3,[$ap_end,#8*2]

        b       .Lsqr8x_done

.align  4
.Lsqr8x8_post_condition:
        adc     $carry,xzr,xzr
        ldr     x30,[x29,#8]            // pull return address
        // $acc0-7,$carry hold result, $a0-7 hold modulus
        subs    $a0,$acc0,$a0
        ldr     $ap,[x29,#96]           // pull rp
        sbcs    $a1,$acc1,$a1
         stp    xzr,xzr,[sp,#8*0]
        sbcs    $a2,$acc2,$a2
         stp    xzr,xzr,[sp,#8*2]
        sbcs    $a3,$acc3,$a3
         stp    xzr,xzr,[sp,#8*4]
        sbcs    $a4,$acc4,$a4
         stp    xzr,xzr,[sp,#8*6]
        sbcs    $a5,$acc5,$a5
         stp    xzr,xzr,[sp,#8*8]
        sbcs    $a6,$acc6,$a6
         stp    xzr,xzr,[sp,#8*10]
        sbcs    $a7,$acc7,$a7
         stp    xzr,xzr,[sp,#8*12]
        sbcs    $carry,$carry,xzr       // did it borrow?
         stp    xzr,xzr,[sp,#8*14]

        // $a0-7 hold result-modulus
        csel    $a0,$acc0,$a0,lo
        csel    $a1,$acc1,$a1,lo
        csel    $a2,$acc2,$a2,lo
        csel    $a3,$acc3,$a3,lo
        stp     $a0,$a1,[$ap,#8*0]
        csel    $a4,$acc4,$a4,lo
        csel    $a5,$acc5,$a5,lo
        stp     $a2,$a3,[$ap,#8*2]
        csel    $a6,$acc6,$a6,lo
        csel    $a7,$acc7,$a7,lo
        stp     $a4,$a5,[$ap,#8*4]
        stp     $a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldr     x29,[sp],#128
        .inst   0xd50323bf              // autiasp
        ret
.size   __bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it differs in the sense that it performs
# reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");
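
# Note that x0 ($rp) is reused here: rp itself is offloaded to the stack
# frame right after the prologue, after which x0 serves as $carry/$bp_end
# (see the aliases above).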

$code.=<<___;
.type   __bn_mul4x_mont,%function
.align  5
__bn_mul4x_mont:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]

        sub     $tp,sp,$num,lsl#3
        lsl     $num,$num,#3
        ldr     $n0,[$n0]               // *n0
        sub     sp,$tp,#8*4             // alloca

        add     $t0,$bp,$num
        add     $ap_end,$ap,$num
        stp     $rp,$t0,[x29,#96]       // offload rp and &b[num]

        ldr     $bi,[$bp,#8*0]          // b[0]
        ldp     $a0,$a1,[$ap,#8*0]      // a[0..3]
        ldp     $a2,$a3,[$ap,#8*2]
        add     $ap,$ap,#8*4
        mov     $acc0,xzr
        mov     $acc1,xzr
        mov     $acc2,xzr
        mov     $acc3,xzr
        ldp     $m0,$m1,[$np,#8*0]      // n[0..3]
        ldp     $m2,$m3,[$np,#8*2]
        adds    $np,$np,#8*4            // clear carry bit
        mov     $carry,xzr
        mov     $cnt,#0
        mov     $tp,sp

.Loop_mul4x_1st_reduction:
        mul     $t0,$a0,$bi             // lo(a[0..3]*b[0])
        adc     $carry,$carry,xzr       // modulo-scheduled
        mul     $t1,$a1,$bi
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$bi
        and     $cnt,$cnt,#31
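        // ($cnt cycles through 8,16,24,0, so the ldr below steps
        // through b[1],b[2],b[3] and then wraps to b[0])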
        mul     $t3,$a3,$bi
        adds    $acc0,$acc0,$t0
        umulh   $t0,$a0,$bi             // hi(a[0..3]*b[0])
        adcs    $acc1,$acc1,$t1
        mul     $mi,$acc0,$n0           // t[0]*n0
        adcs    $acc2,$acc2,$t2
        umulh   $t1,$a1,$bi
        adcs    $acc3,$acc3,$t3
        umulh   $t2,$a2,$bi
        adc     $acc4,xzr,xzr
        umulh   $t3,$a3,$bi
        ldr     $bi,[$bp,$cnt]          // next b[i] (or b[0])
        adds    $acc1,$acc1,$t0
        // (*)  mul     $t0,$m0,$mi     // lo(n[0..3]*t[0]*n0)
        str     $mi,[$tp],#8            // put aside t[0]*n0 for tail processing
        adcs    $acc2,$acc2,$t1
        mul     $t1,$m1,$mi
        adcs    $acc3,$acc3,$t2
        mul     $t2,$m2,$mi
        adc     $acc4,$acc4,$t3         // can't overflow
        mul     $t3,$m3,$mi
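        // (*)  since $mi = t[0]*n0 mod 2^64, lo(n[0]*$mi) is -t[0]
        //      mod 2^64 by construction, so adding it to $acc0 would
        //      give zero with a carry-out of (t[0]!=0); the subs below
        //      sets that very carry, saving the multiplication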
        // (*)  adds    xzr,$acc0,$t0
        subs    xzr,$acc0,#1            // (*)
        umulh   $t0,$m0,$mi             // hi(n[0..3]*t[0]*n0)
        adcs    $acc0,$acc1,$t1
        umulh   $t1,$m1,$mi
        adcs    $acc1,$acc2,$t2
        umulh   $t2,$m2,$mi
        adcs    $acc2,$acc3,$t3
        umulh   $t3,$m3,$mi
        adcs    $acc3,$acc4,$carry
        adc     $carry,xzr,xzr
        adds    $acc0,$acc0,$t0
        sub     $t0,$ap_end,$ap
        adcs    $acc1,$acc1,$t1
        adcs    $acc2,$acc2,$t2
        adcs    $acc3,$acc3,$t3
        //adc   $carry,$carry,xzr
        cbnz    $cnt,.Loop_mul4x_1st_reduction

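        // $t0 is zero iff $ap already reached &a[num], i.e. num==4
        // and there are no outer limbs left to process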
        cbz     $t0,.Lmul4x4_post_condition

        ldp     $a0,$a1,[$ap,#8*0]      // a[4..7]
        ldp     $a2,$a3,[$ap,#8*2]
        add     $ap,$ap,#8*4
        ldr     $mi,[sp]                // t[0]*n0
        ldp     $m0,$m1,[$np,#8*0]      // n[4..7]
        ldp     $m2,$m3,[$np,#8*2]
        add     $np,$np,#8*4

.Loop_mul4x_1st_tail:
        mul     $t0,$a0,$bi             // lo(a[4..7]*b[i])
        adc     $carry,$carry,xzr       // modulo-scheduled
        mul     $t1,$a1,$bi
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$bi
        and     $cnt,$cnt,#31
        mul     $t3,$a3,$bi
        adds    $acc0,$acc0,$t0
        umulh   $t0,$a0,$bi             // hi(a[4..7]*b[i])
        adcs    $acc1,$acc1,$t1
        umulh   $t1,$a1,$bi
        adcs    $acc2,$acc2,$t2
        umulh   $t2,$a2,$bi
        adcs    $acc3,$acc3,$t3
        umulh   $t3,$a3,$bi
        adc     $acc4,xzr,xzr
        ldr     $bi,[$bp,$cnt]          // next b[i] (or b[0])
        adds    $acc1,$acc1,$t0
        mul     $t0,$m0,$mi             // lo(n[4..7]*t[0]*n0)
        adcs    $acc2,$acc2,$t1
        mul     $t1,$m1,$mi
        adcs    $acc3,$acc3,$t2
        mul     $t2,$m2,$mi
        adc     $acc4,$acc4,$t3         // can't overflow
        mul     $t3,$m3,$mi
        adds    $acc0,$acc0,$t0
        umulh   $t0,$m0,$mi             // hi(n[4..7]*t[0]*n0)
        adcs    $acc1,$acc1,$t1
        umulh   $t1,$m1,$mi
        adcs    $acc2,$acc2,$t2
        umulh   $t2,$m2,$mi
        adcs    $acc3,$acc3,$t3
        adcs    $acc4,$acc4,$carry
        umulh   $t3,$m3,$mi
        adc     $carry,xzr,xzr
        ldr     $mi,[sp,$cnt]           // next t[0]*n0
        str     $acc0,[$tp],#8          // result!!!
        adds    $acc0,$acc1,$t0
        sub     $t0,$ap_end,$ap         // done yet?
        adcs    $acc1,$acc2,$t1
        adcs    $acc2,$acc3,$t2
        adcs    $acc3,$acc4,$t3
        //adc   $carry,$carry,xzr
        cbnz    $cnt,.Loop_mul4x_1st_tail

        sub     $t1,$ap_end,$num        // rewound $ap
        cbz     $t0,.Lmul4x_proceed

        ldp     $a0,$a1,[$ap,#8*0]
        ldp     $a2,$a3,[$ap,#8*2]
        add     $ap,$ap,#8*4
        ldp     $m0,$m1,[$np,#8*0]
        ldp     $m2,$m3,[$np,#8*2]
        add     $np,$np,#8*4
        b       .Loop_mul4x_1st_tail

.align  5
.Lmul4x_proceed:
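        // advance to the next 4 words of b[], capture the running
        // carry in $topmost and restart with rewound a[] and n[]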
        ldr     $bi,[$bp,#8*4]!         // *++b
        adc     $topmost,$carry,xzr
        ldp     $a0,$a1,[$t1,#8*0]      // a[0..3]
        sub     $np,$np,$num            // rewind np
        ldp     $a2,$a3,[$t1,#8*2]
        add     $ap,$t1,#8*4

        stp     $acc0,$acc1,[$tp,#8*0]  // result!!!
        ldp     $acc0,$acc1,[sp,#8*4]   // t[0..3]
        stp     $acc2,$acc3,[$tp,#8*2]  // result!!!
        ldp     $acc2,$acc3,[sp,#8*6]

        ldp     $m0,$m1,[$np,#8*0]      // n[0..3]
        mov     $tp,sp
        ldp     $m2,$m3,[$np,#8*2]
        adds    $np,$np,#8*4            // clear carry bit
        mov     $carry,xzr

.align  4
.Loop_mul4x_reduction:
        mul     $t0,$a0,$bi             // lo(a[0..3]*b[4])
        adc     $carry,$carry,xzr       // modulo-scheduled
        mul     $t1,$a1,$bi
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$bi
        and     $cnt,$cnt,#31
        mul     $t3,$a3,$bi
        adds    $acc0,$acc0,$t0
        umulh   $t0,$a0,$bi             // hi(a[0..3]*b[4])
        adcs    $acc1,$acc1,$t1
        mul     $mi,$acc0,$n0           // t[0]*n0
        adcs    $acc2,$acc2,$t2
        umulh   $t1,$a1,$bi
        adcs    $acc3,$acc3,$t3
        umulh   $t2,$a2,$bi
        adc     $acc4,xzr,xzr
        umulh   $t3,$a3,$bi
        ldr     $bi,[$bp,$cnt]          // next b[i]
        adds    $acc1,$acc1,$t0
        // (*)  mul     $t0,$m0,$mi
        str     $mi,[$tp],#8            // put aside t[0]*n0 for tail processing
        adcs    $acc2,$acc2,$t1
        mul     $t1,$m1,$mi             // lo(n[0..3]*t[0]*n0)
        adcs    $acc3,$acc3,$t2
        mul     $t2,$m2,$mi
        adc     $acc4,$acc4,$t3         // can't overflow
        mul     $t3,$m3,$mi
        // (*)  adds    xzr,$acc0,$t0
        subs    xzr,$acc0,#1            // (*)
        umulh   $t0,$m0,$mi             // hi(n[0..3]*t[0]*n0)
        adcs    $acc0,$acc1,$t1
        umulh   $t1,$m1,$mi
        adcs    $acc1,$acc2,$t2
        umulh   $t2,$m2,$mi
        adcs    $acc2,$acc3,$t3
        umulh   $t3,$m3,$mi
        adcs    $acc3,$acc4,$carry
        adc     $carry,xzr,xzr
        adds    $acc0,$acc0,$t0
        adcs    $acc1,$acc1,$t1
        adcs    $acc2,$acc2,$t2
        adcs    $acc3,$acc3,$t3
        //adc   $carry,$carry,xzr
        cbnz    $cnt,.Loop_mul4x_reduction

        adc     $carry,$carry,xzr
        ldp     $t0,$t1,[$tp,#8*4]      // t[4..7]
        ldp     $t2,$t3,[$tp,#8*6]
        ldp     $a0,$a1,[$ap,#8*0]      // a[4..7]
        ldp     $a2,$a3,[$ap,#8*2]
        add     $ap,$ap,#8*4
        adds    $acc0,$acc0,$t0
        adcs    $acc1,$acc1,$t1
        adcs    $acc2,$acc2,$t2
        adcs    $acc3,$acc3,$t3
        //adc   $carry,$carry,xzr

        ldr     $mi,[sp]                // t[0]*n0
        ldp     $m0,$m1,[$np,#8*0]      // n[4..7]
        ldp     $m2,$m3,[$np,#8*2]
        add     $np,$np,#8*4

.align  4
.Loop_mul4x_tail:
        mul     $t0,$a0,$bi             // lo(a[4..7]*b[4])
        adc     $carry,$carry,xzr       // modulo-scheduled
        mul     $t1,$a1,$bi
        add     $cnt,$cnt,#8
        mul     $t2,$a2,$bi
        and     $cnt,$cnt,#31
        mul     $t3,$a3,$bi
        adds    $acc0,$acc0,$t0
        umulh   $t0,$a0,$bi             // hi(a[4..7]*b[4])
        adcs    $acc1,$acc1,$t1
        umulh   $t1,$a1,$bi
        adcs    $acc2,$acc2,$t2
        umulh   $t2,$a2,$bi
        adcs    $acc3,$acc3,$t3
        umulh   $t3,$a3,$bi
        adc     $acc4,xzr,xzr
        ldr     $bi,[$bp,$cnt]          // next b[i]
        adds    $acc1,$acc1,$t0
        mul     $t0,$m0,$mi             // lo(n[4..7]*t[0]*n0)
        adcs    $acc2,$acc2,$t1
        mul     $t1,$m1,$mi
        adcs    $acc3,$acc3,$t2
        mul     $t2,$m2,$mi
        adc     $acc4,$acc4,$t3         // can't overflow
        mul     $t3,$m3,$mi
        adds    $acc0,$acc0,$t0
        umulh   $t0,$m0,$mi             // hi(n[4..7]*t[0]*n0)
        adcs    $acc1,$acc1,$t1
        umulh   $t1,$m1,$mi
        adcs    $acc2,$acc2,$t2
        umulh   $t2,$m2,$mi
        adcs    $acc3,$acc3,$t3
        umulh   $t3,$m3,$mi
        adcs    $acc4,$acc4,$carry
        ldr     $mi,[sp,$cnt]           // next t[0]*n0
        adc     $carry,xzr,xzr
        str     $acc0,[$tp],#8          // result!!!
        adds    $acc0,$acc1,$t0
        sub     $t0,$ap_end,$ap         // done yet?
        adcs    $acc1,$acc2,$t1
        adcs    $acc2,$acc3,$t2
        adcs    $acc3,$acc4,$t3
        //adc   $carry,$carry,xzr
        cbnz    $cnt,.Loop_mul4x_tail

        sub     $t1,$np,$num            // rewound np
        adc     $carry,$carry,xzr
        cbz     $t0,.Loop_mul4x_break

        ldp     $t0,$t1,[$tp,#8*4]
        ldp     $t2,$t3,[$tp,#8*6]
        ldp     $a0,$a1,[$ap,#8*0]
        ldp     $a2,$a3,[$ap,#8*2]
        add     $ap,$ap,#8*4
        adds    $acc0,$acc0,$t0
        adcs    $acc1,$acc1,$t1
        adcs    $acc2,$acc2,$t2
        adcs    $acc3,$acc3,$t3
        //adc   $carry,$carry,xzr
        ldp     $m0,$m1,[$np,#8*0]
        ldp     $m2,$m3,[$np,#8*2]
        add     $np,$np,#8*4
        b       .Loop_mul4x_tail

.align  4
.Loop_mul4x_break:
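        // outer-loop bookkeeping: fold the previous top word into
        // t[0..3], advance b[] and check whether all of b[] has
        // been consumed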
        ldp     $t2,$t3,[x29,#96]       // pull rp and &b[num]
        adds    $acc0,$acc0,$topmost
        add     $bp,$bp,#8*4            // bp++
        adcs    $acc1,$acc1,xzr
        sub     $ap,$ap,$num            // rewind ap
        adcs    $acc2,$acc2,xzr
        stp     $acc0,$acc1,[$tp,#8*0]  // result!!!
        adcs    $acc3,$acc3,xzr
        ldp     $acc0,$acc1,[sp,#8*4]   // t[0..3]
        adc     $topmost,$carry,xzr
        stp     $acc2,$acc3,[$tp,#8*2]  // result!!!
        cmp     $bp,$t3                 // done yet?
        ldp     $acc2,$acc3,[sp,#8*6]
        ldp     $m0,$m1,[$t1,#8*0]      // n[0..3]
        ldp     $m2,$m3,[$t1,#8*2]
        add     $np,$t1,#8*4
        b.eq    .Lmul4x_post

        ldr     $bi,[$bp]
        ldp     $a0,$a1,[$ap,#8*0]      // a[0..3]
        ldp     $a2,$a3,[$ap,#8*2]
        adds    $ap,$ap,#8*4            // clear carry bit
        mov     $carry,xzr
        mov     $tp,sp
        b       .Loop_mul4x_reduction

.align  4
.Lmul4x_post:
        // Final step. Check whether the result is larger than the
        // modulus, and if it is, subtract the modulus. But comparison
        // implies subtraction anyway, so we subtract the modulus, see
        // if that borrowed, and conditionally copy the original value.
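        // The interleaved "stp xzr,xzr" stores in the copy loop below
        // also wipe the scratch area on the stack. "lo" after the
        // final sbcs means the subtraction borrowed, in which case
        // csel keeps the original value.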
        mov     $rp,$t2
        mov     $ap_end,$t2             // $rp copy
        subs    $t0,$acc0,$m0
        add     $tp,sp,#8*8
        sbcs    $t1,$acc1,$m1
        sub     $cnt,$num,#8*4

.Lmul4x_sub:
        sbcs    $t2,$acc2,$m2
        ldp     $m0,$m1,[$np,#8*0]
        sub     $cnt,$cnt,#8*4
        ldp     $acc0,$acc1,[$tp,#8*0]
        sbcs    $t3,$acc3,$m3
        ldp     $m2,$m3,[$np,#8*2]
        add     $np,$np,#8*4
        ldp     $acc2,$acc3,[$tp,#8*2]
        add     $tp,$tp,#8*4
        stp     $t0,$t1,[$rp,#8*0]
        sbcs    $t0,$acc0,$m0
        stp     $t2,$t3,[$rp,#8*2]
        add     $rp,$rp,#8*4
        sbcs    $t1,$acc1,$m1
        cbnz    $cnt,.Lmul4x_sub

        sbcs    $t2,$acc2,$m2
         mov    $tp,sp
         add    $ap,sp,#8*4
         ldp    $a0,$a1,[$ap_end,#8*0]
        sbcs    $t3,$acc3,$m3
        stp     $t0,$t1,[$rp,#8*0]
         ldp    $a2,$a3,[$ap_end,#8*2]
        stp     $t2,$t3,[$rp,#8*2]
         ldp    $acc0,$acc1,[$ap,#8*0]
         ldp    $acc2,$acc3,[$ap,#8*2]
        sbcs    xzr,$topmost,xzr        // did it borrow?
        ldr     x30,[x29,#8]            // pull return address

        sub     $cnt,$num,#8*4
.Lmul4x_cond_copy:
        sub     $cnt,$cnt,#8*4
        csel    $t0,$acc0,$a0,lo
         stp    xzr,xzr,[$tp,#8*0]
        csel    $t1,$acc1,$a1,lo
        ldp     $a0,$a1,[$ap_end,#8*4]
        ldp     $acc0,$acc1,[$ap,#8*4]
        csel    $t2,$acc2,$a2,lo
         stp    xzr,xzr,[$tp,#8*2]
         add    $tp,$tp,#8*4
        csel    $t3,$acc3,$a3,lo
        ldp     $a2,$a3,[$ap_end,#8*6]
        ldp     $acc2,$acc3,[$ap,#8*6]
        add     $ap,$ap,#8*4
        stp     $t0,$t1,[$ap_end,#8*0]
        stp     $t2,$t3,[$ap_end,#8*2]
        add     $ap_end,$ap_end,#8*4
        cbnz    $cnt,.Lmul4x_cond_copy

        csel    $t0,$acc0,$a0,lo
         stp    xzr,xzr,[$tp,#8*0]
        csel    $t1,$acc1,$a1,lo
         stp    xzr,xzr,[$tp,#8*2]
        csel    $t2,$acc2,$a2,lo
         stp    xzr,xzr,[$tp,#8*3]
        csel    $t3,$acc3,$a3,lo
         stp    xzr,xzr,[$tp,#8*4]
        stp     $t0,$t1,[$ap_end,#8*0]
        stp     $t2,$t3,[$ap_end,#8*2]

        b       .Lmul4x_done

.align  4
.Lmul4x4_post_condition:
        adc     $carry,$carry,xzr
        ldr     $ap,[x29,#96]           // pull rp
        // $acc0-3,$carry hold result, $m0-3 hold modulus
        subs    $a0,$acc0,$m0
        ldr     x30,[x29,#8]            // pull return address
        sbcs    $a1,$acc1,$m1
         stp    xzr,xzr,[sp,#8*0]
        sbcs    $a2,$acc2,$m2
         stp    xzr,xzr,[sp,#8*2]
        sbcs    $a3,$acc3,$m3
         stp    xzr,xzr,[sp,#8*4]
        sbcs    xzr,$carry,xzr          // did it borrow?
         stp    xzr,xzr,[sp,#8*6]

        // $a0-3 hold result-modulus
        csel    $a0,$acc0,$a0,lo
        csel    $a1,$acc1,$a1,lo
        csel    $a2,$acc2,$a2,lo
        csel    $a3,$acc3,$a3,lo
        stp     $a0,$a1,[$ap,#8*0]
        stp     $a2,$a3,[$ap,#8*2]

.Lmul4x_done:
        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldr     x29,[sp],#128
        .inst   0xd50323bf              // autiasp
        ret
.size   __bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz  "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";