# Update copyright; generated files.
# [openssl.git] / crypto / bn / asm / armv8-mont.pl
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.
#
# April 2015
#
# Squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 improvement
# is still modest on longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# Usage: armv8-mont.pl <flavour> <output-file>
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl pre-processor either next to this script or
# in the shared perlasm directory; all generated code is piped through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Quote both the perl interpreter path ($^X) and the output file name so
# paths containing spaces survive the shell, and fail loudly if the pipe
# to the translator cannot be established -- the original unchecked
# 2-arg open silently ignored failure and produced an empty output file.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Scratch register map for bn_mul_mont: x6-x17 plus callee-saved
# x19-x24 hold the running product limbs ($lo*/$hi*), the current
# operand words ($aj/$nj), the Montgomery multipliers ($m0/$m1), loop
# counters ($i/$j), the overflow word ($ovf) and the temporary-vector
# pointer ($tp) / scratch word ($tj).
47 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
48  $lo1,$hi1,$nj,$m1,$nlo,$nhi,
49  $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
50
# Argument registers x0-x5, named after the C prototype spelled out in
# the trailing comments below.
51 # int bn_mul_mont(
52 $rp="x0";       # BN_ULONG *rp,
53 $ap="x1";       # const BN_ULONG *ap,
54 $bp="x2";       # const BN_ULONG *bp,
55 $np="x3";       # const BN_ULONG *np,
56 $n0="x4";       # const BN_ULONG *n0,
57 $num="x5";      # int num);
# bn_mul_mont: generic word-by-word Montgomery multiplication. The
# entry point dispatches to __bn_sqr8x_mont when num is a multiple of 8
# (tst #7) and to __bn_mul4x_mont when it is a multiple of 4 (tst #3);
# otherwise it allocates a num-word temporary on the stack, runs a
# first pass (.L1st: ap[]*bp[0] with reduction by m1 = tp[0]*n0), then
# the remaining outer iterations (.Louter/.Linner) accumulating
# ap[]*bp[i] into the temporary, and finishes with the conditional
# subtraction of the modulus (.Lsub/.Lcond_copy), wiping the temporary
# as it copies. The heredoc below is a single string and is emitted
# verbatim (after variable interpolation) through arm-xlate.pl; do not
# edit the instruction scheduling casually -- the carry flag is live
# across many instruction pairs.
59 $code.=<<___;
60 .text
61
62 .globl  bn_mul_mont
63 .type   bn_mul_mont,%function
64 .align  5
65 bn_mul_mont:
66         tst     $num,#7
67         b.eq    __bn_sqr8x_mont
68         tst     $num,#3
69         b.eq    __bn_mul4x_mont
70 .Lmul_mont:
71         stp     x29,x30,[sp,#-64]!
72         add     x29,sp,#0
73         stp     x19,x20,[sp,#16]
74         stp     x21,x22,[sp,#32]
75         stp     x23,x24,[sp,#48]
76
77         ldr     $m0,[$bp],#8            // bp[0]
78         sub     $tp,sp,$num,lsl#3
79         ldp     $hi0,$aj,[$ap],#16      // ap[0..1]
80         lsl     $num,$num,#3
81         ldr     $n0,[$n0]               // *n0
82         and     $tp,$tp,#-16            // ABI says so
83         ldp     $hi1,$nj,[$np],#16      // np[0..1]
84
85         mul     $lo0,$hi0,$m0           // ap[0]*bp[0]
86         sub     $j,$num,#16             // j=num-2
87         umulh   $hi0,$hi0,$m0
88         mul     $alo,$aj,$m0            // ap[1]*bp[0]
89         umulh   $ahi,$aj,$m0
90
91         mul     $m1,$lo0,$n0            // "tp[0]"*n0
92         mov     sp,$tp                  // alloca
93
94         // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
95         umulh   $hi1,$hi1,$m1
96         mul     $nlo,$nj,$m1            // np[1]*m1
97         // (*)  adds    $lo1,$lo1,$lo0  // discarded
98         // (*)  As for removal of first multiplication and addition
99         //      instructions. The outcome of first addition is
100         //      guaranteed to be zero, which leaves two computationally
101         //      significant outcomes: it either carries or not. Then
102         //      question is when does it carry? Is there alternative
103         //      way to deduce it? If you follow operations, you can
104         //      observe that condition for carry is quite simple:
105         //      $lo0 being non-zero. So that carry can be calculated
106         //      by adding -1 to $lo0. That's what next instruction does.
107         subs    xzr,$lo0,#1             // (*)
108         umulh   $nhi,$nj,$m1
109         adc     $hi1,$hi1,xzr
110         cbz     $j,.L1st_skip
111
112 .L1st:
113         ldr     $aj,[$ap],#8
114         adds    $lo0,$alo,$hi0
115         sub     $j,$j,#8                // j--
116         adc     $hi0,$ahi,xzr
117
118         ldr     $nj,[$np],#8
119         adds    $lo1,$nlo,$hi1
120         mul     $alo,$aj,$m0            // ap[j]*bp[0]
121         adc     $hi1,$nhi,xzr
122         umulh   $ahi,$aj,$m0
123
124         adds    $lo1,$lo1,$lo0
125         mul     $nlo,$nj,$m1            // np[j]*m1
126         adc     $hi1,$hi1,xzr
127         umulh   $nhi,$nj,$m1
128         str     $lo1,[$tp],#8           // tp[j-1]
129         cbnz    $j,.L1st
130
131 .L1st_skip:
132         adds    $lo0,$alo,$hi0
133         sub     $ap,$ap,$num            // rewind $ap
134         adc     $hi0,$ahi,xzr
135
136         adds    $lo1,$nlo,$hi1
137         sub     $np,$np,$num            // rewind $np
138         adc     $hi1,$nhi,xzr
139
140         adds    $lo1,$lo1,$lo0
141         sub     $i,$num,#8              // i=num-1
142         adcs    $hi1,$hi1,$hi0
143
144         adc     $ovf,xzr,xzr            // upmost overflow bit
145         stp     $lo1,$hi1,[$tp]
146
147 .Louter:
148         ldr     $m0,[$bp],#8            // bp[i]
149         ldp     $hi0,$aj,[$ap],#16
150         ldr     $tj,[sp]                // tp[0]
151         add     $tp,sp,#8
152
153         mul     $lo0,$hi0,$m0           // ap[0]*bp[i]
154         sub     $j,$num,#16             // j=num-2
155         umulh   $hi0,$hi0,$m0
156         ldp     $hi1,$nj,[$np],#16
157         mul     $alo,$aj,$m0            // ap[1]*bp[i]
158         adds    $lo0,$lo0,$tj
159         umulh   $ahi,$aj,$m0
160         adc     $hi0,$hi0,xzr
161
162         mul     $m1,$lo0,$n0
163         sub     $i,$i,#8                // i--
164
165         // (*)  mul     $lo1,$hi1,$m1   // np[0]*m1
166         umulh   $hi1,$hi1,$m1
167         mul     $nlo,$nj,$m1            // np[1]*m1
168         // (*)  adds    $lo1,$lo1,$lo0
169         subs    xzr,$lo0,#1             // (*)
170         umulh   $nhi,$nj,$m1
171         cbz     $j,.Linner_skip
172
173 .Linner:
174         ldr     $aj,[$ap],#8
175         adc     $hi1,$hi1,xzr
176         ldr     $tj,[$tp],#8            // tp[j]
177         adds    $lo0,$alo,$hi0
178         sub     $j,$j,#8                // j--
179         adc     $hi0,$ahi,xzr
180
181         adds    $lo1,$nlo,$hi1
182         ldr     $nj,[$np],#8
183         adc     $hi1,$nhi,xzr
184
185         mul     $alo,$aj,$m0            // ap[j]*bp[i]
186         adds    $lo0,$lo0,$tj
187         umulh   $ahi,$aj,$m0
188         adc     $hi0,$hi0,xzr
189
190         mul     $nlo,$nj,$m1            // np[j]*m1
191         adds    $lo1,$lo1,$lo0
192         umulh   $nhi,$nj,$m1
193         str     $lo1,[$tp,#-16]         // tp[j-1]
194         cbnz    $j,.Linner
195
196 .Linner_skip:
197         ldr     $tj,[$tp],#8            // tp[j]
198         adc     $hi1,$hi1,xzr
199         adds    $lo0,$alo,$hi0
200         sub     $ap,$ap,$num            // rewind $ap
201         adc     $hi0,$ahi,xzr
202
203         adds    $lo1,$nlo,$hi1
204         sub     $np,$np,$num            // rewind $np
205         adcs    $hi1,$nhi,$ovf
206         adc     $ovf,xzr,xzr
207
208         adds    $lo0,$lo0,$tj
209         adc     $hi0,$hi0,xzr
210
211         adds    $lo1,$lo1,$lo0
212         adcs    $hi1,$hi1,$hi0
213         adc     $ovf,$ovf,xzr           // upmost overflow bit
214         stp     $lo1,$hi1,[$tp,#-16]
215
216         cbnz    $i,.Louter
217
218         // Final step. We see if result is larger than modulus, and
219         // if it is, subtract the modulus. But comparison implies
220         // subtraction. So we subtract modulus, see if it borrowed,
221         // and conditionally copy original value.
222         ldr     $tj,[sp]                // tp[0]
223         add     $tp,sp,#8
224         ldr     $nj,[$np],#8            // np[0]
225         subs    $j,$num,#8              // j=num-1 and clear borrow
226         mov     $ap,$rp
227 .Lsub:
228         sbcs    $aj,$tj,$nj             // tp[j]-np[j]
229         ldr     $tj,[$tp],#8
230         sub     $j,$j,#8                // j--
231         ldr     $nj,[$np],#8
232         str     $aj,[$ap],#8            // rp[j]=tp[j]-np[j]
233         cbnz    $j,.Lsub
234
235         sbcs    $aj,$tj,$nj
236         sbcs    $ovf,$ovf,xzr           // did it borrow?
237         str     $aj,[$ap],#8            // rp[num-1]
238
239         ldr     $tj,[sp]                // tp[0]
240         add     $tp,sp,#8
241         ldr     $aj,[$rp],#8            // rp[0]
242         sub     $num,$num,#8            // num--
243         nop
244 .Lcond_copy:
245         sub     $num,$num,#8            // num--
246         csel    $nj,$tj,$aj,lo          // did it borrow?
247         ldr     $tj,[$tp],#8
248         ldr     $aj,[$rp],#8
249         str     xzr,[$tp,#-16]          // wipe tp
250         str     $nj,[$rp,#-16]
251         cbnz    $num,.Lcond_copy
252
253         csel    $nj,$tj,$aj,lo
254         str     xzr,[$tp,#-8]           // wipe tp
255         str     $nj,[$rp,#-8]
256
257         ldp     x19,x20,[x29,#16]
258         mov     sp,x29
259         ldp     x21,x22,[x29,#32]
260         mov     x0,#1
261         ldp     x23,x24,[x29,#48]
262         ldr     x29,[sp],#64
263         ret
264 .size   bn_mul_mont,.-bn_mul_mont
265 ___
266 {
267 ########################################################################
268 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
269
# Squaring-path register map: operand words a[0..7] in x6-x13, scratch
# t0-t3 in x14-x17, the eight accumulator columns in callee-saved
# x19-x26, and loop counter / carry / top-most word in x27 / x28 / x30.
# $tp, $ap_end and $na0 alias the $bp, $np and $carry registers
# respectively -- NOTE(review): presumably those registers are free at
# the points of use (rp and np are offloaded to the frame early in
# __bn_sqr8x_mont); verify before reusing elsewhere.
270 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
271 my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
272 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
273 my ($cnt,$carry,$topmost)=("x27","x28","x30");
274 my ($tp,$ap_end,$na0)=($bp,$np,$carry);
275
# __bn_sqr8x_mont: dedicated squaring path, entered when num is a
# multiple of 8; if ap != bp it is not a squaring and control branches
# to __bn_mul4x_mont. ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module: it zeroes a 2*num-word temporary on the stack,
# accumulates the off-diagonal products a[i]*a[j] (i > j) in 8-word
# column batches (.Lsqr8x_outer_loop / .Lsqr8x_mul / .Lsqr8x_break),
# then doubles that triangle and adds the diagonal squares a[i]*a[i]
# in .Lsqr4x_shift_n_add (extr ...,#63 is the 1-bit left shift across
# limbs). The Montgomery reduction of the double-width result follows
# in the next heredoc. As above, this is one string literal emitted
# through arm-xlate.pl; the carry flag is live across long instruction
# sequences, so ordering must not be disturbed.
276 $code.=<<___;
277 .type   __bn_sqr8x_mont,%function
278 .align  5
279 __bn_sqr8x_mont:
280         cmp     $ap,$bp
281         b.ne    __bn_mul4x_mont
282 .Lsqr8x_mont:
283         stp     x29,x30,[sp,#-128]!
284         add     x29,sp,#0
285         stp     x19,x20,[sp,#16]
286         stp     x21,x22,[sp,#32]
287         stp     x23,x24,[sp,#48]
288         stp     x25,x26,[sp,#64]
289         stp     x27,x28,[sp,#80]
290         stp     $rp,$np,[sp,#96]        // offload rp and np
291
292         ldp     $a0,$a1,[$ap,#8*0]
293         ldp     $a2,$a3,[$ap,#8*2]
294         ldp     $a4,$a5,[$ap,#8*4]
295         ldp     $a6,$a7,[$ap,#8*6]
296
297         sub     $tp,sp,$num,lsl#4
298         lsl     $num,$num,#3
299         ldr     $n0,[$n0]               // *n0
300         mov     sp,$tp                  // alloca
301         sub     $cnt,$num,#8*8
302         b       .Lsqr8x_zero_start
303
304 .Lsqr8x_zero:
305         sub     $cnt,$cnt,#8*8
306         stp     xzr,xzr,[$tp,#8*0]
307         stp     xzr,xzr,[$tp,#8*2]
308         stp     xzr,xzr,[$tp,#8*4]
309         stp     xzr,xzr,[$tp,#8*6]
310 .Lsqr8x_zero_start:
311         stp     xzr,xzr,[$tp,#8*8]
312         stp     xzr,xzr,[$tp,#8*10]
313         stp     xzr,xzr,[$tp,#8*12]
314         stp     xzr,xzr,[$tp,#8*14]
315         add     $tp,$tp,#8*16
316         cbnz    $cnt,.Lsqr8x_zero
317
318         add     $ap_end,$ap,$num
319         add     $ap,$ap,#8*8
320         mov     $acc0,xzr
321         mov     $acc1,xzr
322         mov     $acc2,xzr
323         mov     $acc3,xzr
324         mov     $acc4,xzr
325         mov     $acc5,xzr
326         mov     $acc6,xzr
327         mov     $acc7,xzr
328         mov     $tp,sp
329         str     $n0,[x29,#112]          // offload n0
330
331         // Multiply everything but a[i]*a[i]
332 .align  4
333 .Lsqr8x_outer_loop:
334         //                                                 a[1]a[0]     (i)
335         //                                             a[2]a[0]
336         //                                         a[3]a[0]
337         //                                     a[4]a[0]
338         //                                 a[5]a[0]
339         //                             a[6]a[0]
340         //                         a[7]a[0]
341         //                                         a[2]a[1]             (ii)
342         //                                     a[3]a[1]
343         //                                 a[4]a[1]
344         //                             a[5]a[1]
345         //                         a[6]a[1]
346         //                     a[7]a[1]
347         //                                 a[3]a[2]                     (iii)
348         //                             a[4]a[2]
349         //                         a[5]a[2]
350         //                     a[6]a[2]
351         //                 a[7]a[2]
352         //                         a[4]a[3]                             (iv)
353         //                     a[5]a[3]
354         //                 a[6]a[3]
355         //             a[7]a[3]
356         //                 a[5]a[4]                                     (v)
357         //             a[6]a[4]
358         //         a[7]a[4]
359         //         a[6]a[5]                                             (vi)
360         //     a[7]a[5]
361         // a[7]a[6]                                                     (vii)
362
363         mul     $t0,$a1,$a0             // lo(a[1..7]*a[0])             (i)
364         mul     $t1,$a2,$a0
365         mul     $t2,$a3,$a0
366         mul     $t3,$a4,$a0
367         adds    $acc1,$acc1,$t0         // t[1]+lo(a[1]*a[0])
368         mul     $t0,$a5,$a0
369         adcs    $acc2,$acc2,$t1
370         mul     $t1,$a6,$a0
371         adcs    $acc3,$acc3,$t2
372         mul     $t2,$a7,$a0
373         adcs    $acc4,$acc4,$t3
374         umulh   $t3,$a1,$a0             // hi(a[1..7]*a[0])
375         adcs    $acc5,$acc5,$t0
376         umulh   $t0,$a2,$a0
377         adcs    $acc6,$acc6,$t1
378         umulh   $t1,$a3,$a0
379         adcs    $acc7,$acc7,$t2
380         umulh   $t2,$a4,$a0
381         stp     $acc0,$acc1,[$tp],#8*2  // t[0..1]
382         adc     $acc0,xzr,xzr           // t[8]
383         adds    $acc2,$acc2,$t3         // t[2]+lo(a[1]*a[0])
384         umulh   $t3,$a5,$a0
385         adcs    $acc3,$acc3,$t0
386         umulh   $t0,$a6,$a0
387         adcs    $acc4,$acc4,$t1
388         umulh   $t1,$a7,$a0
389         adcs    $acc5,$acc5,$t2
390          mul    $t2,$a2,$a1             // lo(a[2..7]*a[1])             (ii)
391         adcs    $acc6,$acc6,$t3
392          mul    $t3,$a3,$a1
393         adcs    $acc7,$acc7,$t0
394          mul    $t0,$a4,$a1
395         adc     $acc0,$acc0,$t1
396
397         mul     $t1,$a5,$a1
398         adds    $acc3,$acc3,$t2
399         mul     $t2,$a6,$a1
400         adcs    $acc4,$acc4,$t3
401         mul     $t3,$a7,$a1
402         adcs    $acc5,$acc5,$t0
403         umulh   $t0,$a2,$a1             // hi(a[2..7]*a[1])
404         adcs    $acc6,$acc6,$t1
405         umulh   $t1,$a3,$a1
406         adcs    $acc7,$acc7,$t2
407         umulh   $t2,$a4,$a1
408         adcs    $acc0,$acc0,$t3
409         umulh   $t3,$a5,$a1
410         stp     $acc2,$acc3,[$tp],#8*2  // t[2..3]
411         adc     $acc1,xzr,xzr           // t[9]
412         adds    $acc4,$acc4,$t0
413         umulh   $t0,$a6,$a1
414         adcs    $acc5,$acc5,$t1
415         umulh   $t1,$a7,$a1
416         adcs    $acc6,$acc6,$t2
417          mul    $t2,$a3,$a2             // lo(a[3..7]*a[2])             (iii)
418         adcs    $acc7,$acc7,$t3
419          mul    $t3,$a4,$a2
420         adcs    $acc0,$acc0,$t0
421          mul    $t0,$a5,$a2
422         adc     $acc1,$acc1,$t1
423
424         mul     $t1,$a6,$a2
425         adds    $acc5,$acc5,$t2
426         mul     $t2,$a7,$a2
427         adcs    $acc6,$acc6,$t3
428         umulh   $t3,$a3,$a2             // hi(a[3..7]*a[2])
429         adcs    $acc7,$acc7,$t0
430         umulh   $t0,$a4,$a2
431         adcs    $acc0,$acc0,$t1
432         umulh   $t1,$a5,$a2
433         adcs    $acc1,$acc1,$t2
434         umulh   $t2,$a6,$a2
435         stp     $acc4,$acc5,[$tp],#8*2  // t[4..5]
436         adc     $acc2,xzr,xzr           // t[10]
437         adds    $acc6,$acc6,$t3
438         umulh   $t3,$a7,$a2
439         adcs    $acc7,$acc7,$t0
440          mul    $t0,$a4,$a3             // lo(a[4..7]*a[3])             (iv)
441         adcs    $acc0,$acc0,$t1
442          mul    $t1,$a5,$a3
443         adcs    $acc1,$acc1,$t2
444          mul    $t2,$a6,$a3
445         adc     $acc2,$acc2,$t3
446
447         mul     $t3,$a7,$a3
448         adds    $acc7,$acc7,$t0
449         umulh   $t0,$a4,$a3             // hi(a[4..7]*a[3])
450         adcs    $acc0,$acc0,$t1
451         umulh   $t1,$a5,$a3
452         adcs    $acc1,$acc1,$t2
453         umulh   $t2,$a6,$a3
454         adcs    $acc2,$acc2,$t3
455         umulh   $t3,$a7,$a3
456         stp     $acc6,$acc7,[$tp],#8*2  // t[6..7]
457         adc     $acc3,xzr,xzr           // t[11]
458         adds    $acc0,$acc0,$t0
459          mul    $t0,$a5,$a4             // lo(a[5..7]*a[4])             (v)
460         adcs    $acc1,$acc1,$t1
461          mul    $t1,$a6,$a4
462         adcs    $acc2,$acc2,$t2
463          mul    $t2,$a7,$a4
464         adc     $acc3,$acc3,$t3
465
466         umulh   $t3,$a5,$a4             // hi(a[5..7]*a[4])
467         adds    $acc1,$acc1,$t0
468         umulh   $t0,$a6,$a4
469         adcs    $acc2,$acc2,$t1
470         umulh   $t1,$a7,$a4
471         adcs    $acc3,$acc3,$t2
472          mul    $t2,$a6,$a5             // lo(a[6..7]*a[5])             (vi)
473         adc     $acc4,xzr,xzr           // t[12]
474         adds    $acc2,$acc2,$t3
475          mul    $t3,$a7,$a5
476         adcs    $acc3,$acc3,$t0
477          umulh  $t0,$a6,$a5             // hi(a[6..7]*a[5])
478         adc     $acc4,$acc4,$t1
479
480         umulh   $t1,$a7,$a5
481         adds    $acc3,$acc3,$t2
482          mul    $t2,$a7,$a6             // lo(a[7]*a[6])                (vii)
483         adcs    $acc4,$acc4,$t3
484          umulh  $t3,$a7,$a6             // hi(a[7]*a[6])
485         adc     $acc5,xzr,xzr           // t[13]
486         adds    $acc4,$acc4,$t0
487         sub     $cnt,$ap_end,$ap        // done yet?
488         adc     $acc5,$acc5,$t1
489
490         adds    $acc5,$acc5,$t2
491         sub     $t0,$ap_end,$num        // rewinded ap
492         adc     $acc6,xzr,xzr           // t[14]
493         add     $acc6,$acc6,$t3
494
495         cbz     $cnt,.Lsqr8x_outer_break
496
497         mov     $n0,$a0
498         ldp     $a0,$a1,[$tp,#8*0]
499         ldp     $a2,$a3,[$tp,#8*2]
500         ldp     $a4,$a5,[$tp,#8*4]
501         ldp     $a6,$a7,[$tp,#8*6]
502         adds    $acc0,$acc0,$a0
503         adcs    $acc1,$acc1,$a1
504         ldp     $a0,$a1,[$ap,#8*0]
505         adcs    $acc2,$acc2,$a2
506         adcs    $acc3,$acc3,$a3
507         ldp     $a2,$a3,[$ap,#8*2]
508         adcs    $acc4,$acc4,$a4
509         adcs    $acc5,$acc5,$a5
510         ldp     $a4,$a5,[$ap,#8*4]
511         adcs    $acc6,$acc6,$a6
512         mov     $rp,$ap
513         adcs    $acc7,xzr,$a7
514         ldp     $a6,$a7,[$ap,#8*6]
515         add     $ap,$ap,#8*8
516         //adc   $carry,xzr,xzr          // moved below
517         mov     $cnt,#-8*8
518
519         //                                                         a[8]a[0]
520         //                                                     a[9]a[0]
521         //                                                 a[a]a[0]
522         //                                             a[b]a[0]
523         //                                         a[c]a[0]
524         //                                     a[d]a[0]
525         //                                 a[e]a[0]
526         //                             a[f]a[0]
527         //                                                     a[8]a[1]
528         //                         a[f]a[1]........................
529         //                                                 a[8]a[2]
530         //                     a[f]a[2]........................
531         //                                             a[8]a[3]
532         //                 a[f]a[3]........................
533         //                                         a[8]a[4]
534         //             a[f]a[4]........................
535         //                                     a[8]a[5]
536         //         a[f]a[5]........................
537         //                                 a[8]a[6]
538         //     a[f]a[6]........................
539         //                             a[8]a[7]
540         // a[f]a[7]........................
541 .Lsqr8x_mul:
542         mul     $t0,$a0,$n0
543         adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
544         mul     $t1,$a1,$n0
545         add     $cnt,$cnt,#8
546         mul     $t2,$a2,$n0
547         mul     $t3,$a3,$n0
548         adds    $acc0,$acc0,$t0
549         mul     $t0,$a4,$n0
550         adcs    $acc1,$acc1,$t1
551         mul     $t1,$a5,$n0
552         adcs    $acc2,$acc2,$t2
553         mul     $t2,$a6,$n0
554         adcs    $acc3,$acc3,$t3
555         mul     $t3,$a7,$n0
556         adcs    $acc4,$acc4,$t0
557         umulh   $t0,$a0,$n0
558         adcs    $acc5,$acc5,$t1
559         umulh   $t1,$a1,$n0
560         adcs    $acc6,$acc6,$t2
561         umulh   $t2,$a2,$n0
562         adcs    $acc7,$acc7,$t3
563         umulh   $t3,$a3,$n0
564         adc     $carry,$carry,xzr
565         str     $acc0,[$tp],#8
566         adds    $acc0,$acc1,$t0
567         umulh   $t0,$a4,$n0
568         adcs    $acc1,$acc2,$t1
569         umulh   $t1,$a5,$n0
570         adcs    $acc2,$acc3,$t2
571         umulh   $t2,$a6,$n0
572         adcs    $acc3,$acc4,$t3
573         umulh   $t3,$a7,$n0
574         ldr     $n0,[$rp,$cnt]
575         adcs    $acc4,$acc5,$t0
576         adcs    $acc5,$acc6,$t1
577         adcs    $acc6,$acc7,$t2
578         adcs    $acc7,$carry,$t3
579         //adc   $carry,xzr,xzr          // moved above
580         cbnz    $cnt,.Lsqr8x_mul
581                                         // note that carry flag is guaranteed
582                                         // to be zero at this point
583         cmp     $ap,$ap_end             // done yet?
584         b.eq    .Lsqr8x_break
585
586         ldp     $a0,$a1,[$tp,#8*0]
587         ldp     $a2,$a3,[$tp,#8*2]
588         ldp     $a4,$a5,[$tp,#8*4]
589         ldp     $a6,$a7,[$tp,#8*6]
590         adds    $acc0,$acc0,$a0
591         ldr     $n0,[$rp,#-8*8]
592         adcs    $acc1,$acc1,$a1
593         ldp     $a0,$a1,[$ap,#8*0]
594         adcs    $acc2,$acc2,$a2
595         adcs    $acc3,$acc3,$a3
596         ldp     $a2,$a3,[$ap,#8*2]
597         adcs    $acc4,$acc4,$a4
598         adcs    $acc5,$acc5,$a5
599         ldp     $a4,$a5,[$ap,#8*4]
600         adcs    $acc6,$acc6,$a6
601         mov     $cnt,#-8*8
602         adcs    $acc7,$acc7,$a7
603         ldp     $a6,$a7,[$ap,#8*6]
604         add     $ap,$ap,#8*8
605         //adc   $carry,xzr,xzr          // moved above
606         b       .Lsqr8x_mul
607
608 .align  4
609 .Lsqr8x_break:
610         ldp     $a0,$a1,[$rp,#8*0]
611         add     $ap,$rp,#8*8
612         ldp     $a2,$a3,[$rp,#8*2]
613         sub     $t0,$ap_end,$ap         // is it last iteration?
614         ldp     $a4,$a5,[$rp,#8*4]
615         sub     $t1,$tp,$t0
616         ldp     $a6,$a7,[$rp,#8*6]
617         cbz     $t0,.Lsqr8x_outer_loop
618
619         stp     $acc0,$acc1,[$tp,#8*0]
620         ldp     $acc0,$acc1,[$t1,#8*0]
621         stp     $acc2,$acc3,[$tp,#8*2]
622         ldp     $acc2,$acc3,[$t1,#8*2]
623         stp     $acc4,$acc5,[$tp,#8*4]
624         ldp     $acc4,$acc5,[$t1,#8*4]
625         stp     $acc6,$acc7,[$tp,#8*6]
626         mov     $tp,$t1
627         ldp     $acc6,$acc7,[$t1,#8*6]
628         b       .Lsqr8x_outer_loop
629
630 .align  4
631 .Lsqr8x_outer_break:
632         // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
633         ldp     $a1,$a3,[$t0,#8*0]      // recall that $t0 is &a[0]
634         ldp     $t1,$t2,[sp,#8*1]
635         ldp     $a5,$a7,[$t0,#8*2]
636         add     $ap,$t0,#8*4
637         ldp     $t3,$t0,[sp,#8*3]
638
639         stp     $acc0,$acc1,[$tp,#8*0]
640         mul     $acc0,$a1,$a1
641         stp     $acc2,$acc3,[$tp,#8*2]
642         umulh   $a1,$a1,$a1
643         stp     $acc4,$acc5,[$tp,#8*4]
644         mul     $a2,$a3,$a3
645         stp     $acc6,$acc7,[$tp,#8*6]
646         mov     $tp,sp
647         umulh   $a3,$a3,$a3
648         adds    $acc1,$a1,$t1,lsl#1
649         extr    $t1,$t2,$t1,#63
650         sub     $cnt,$num,#8*4
651
652 .Lsqr4x_shift_n_add:
653         adcs    $acc2,$a2,$t1
654         extr    $t2,$t3,$t2,#63
655         sub     $cnt,$cnt,#8*4
656         adcs    $acc3,$a3,$t2
657         ldp     $t1,$t2,[$tp,#8*5]
658         mul     $a4,$a5,$a5
659         ldp     $a1,$a3,[$ap],#8*2
660         umulh   $a5,$a5,$a5
661         mul     $a6,$a7,$a7
662         umulh   $a7,$a7,$a7
663         extr    $t3,$t0,$t3,#63
664         stp     $acc0,$acc1,[$tp,#8*0]
665         adcs    $acc4,$a4,$t3
666         extr    $t0,$t1,$t0,#63
667         stp     $acc2,$acc3,[$tp,#8*2]
668         adcs    $acc5,$a5,$t0
669         ldp     $t3,$t0,[$tp,#8*7]
670         extr    $t1,$t2,$t1,#63
671         adcs    $acc6,$a6,$t1
672         extr    $t2,$t3,$t2,#63
673         adcs    $acc7,$a7,$t2
674         ldp     $t1,$t2,[$tp,#8*9]
675         mul     $a0,$a1,$a1
676         ldp     $a5,$a7,[$ap],#8*2
677         umulh   $a1,$a1,$a1
678         mul     $a2,$a3,$a3
679         umulh   $a3,$a3,$a3
680         stp     $acc4,$acc5,[$tp,#8*4]
681         extr    $t3,$t0,$t3,#63
682         stp     $acc6,$acc7,[$tp,#8*6]
683         add     $tp,$tp,#8*8
684         adcs    $acc0,$a0,$t3
685         extr    $t0,$t1,$t0,#63
686         adcs    $acc1,$a1,$t0
687         ldp     $t3,$t0,[$tp,#8*3]
688         extr    $t1,$t2,$t1,#63
689         cbnz    $cnt,.Lsqr4x_shift_n_add
690 ___
# The reduction phase that follows walks the modulus with the $ap and
# $ap_end registers, so rename them $np/$np_end for readability; the
# actual np pointer is reloaded from the frame ("pull np") right after.
691 my ($np,$np_end)=($ap,$ap_end);
692 $code.=<<___;
693          ldp    $np,$n0,[x29,#104]      // pull np and n0
694
695         adcs    $acc2,$a2,$t1
696         extr    $t2,$t3,$t2,#63
697         adcs    $acc3,$a3,$t2
698         ldp     $t1,$t2,[$tp,#8*5]
699         mul     $a4,$a5,$a5
700         umulh   $a5,$a5,$a5
701         stp     $acc0,$acc1,[$tp,#8*0]
702         mul     $a6,$a7,$a7
703         umulh   $a7,$a7,$a7
704         stp     $acc2,$acc3,[$tp,#8*2]
705         extr    $t3,$t0,$t3,#63
706         adcs    $acc4,$a4,$t3
707         extr    $t0,$t1,$t0,#63
708          ldp    $acc0,$acc1,[sp,#8*0]
709         adcs    $acc5,$a5,$t0
710         extr    $t1,$t2,$t1,#63
711          ldp    $a0,$a1,[$np,#8*0]
712         adcs    $acc6,$a6,$t1
713         extr    $t2,xzr,$t2,#63
714          ldp    $a2,$a3,[$np,#8*2]
715         adc     $acc7,$a7,$t2
716          ldp    $a4,$a5,[$np,#8*4]
717
718         // Reduce by 512 bits per iteration
719         mul     $na0,$n0,$acc0          // t[0]*n0
720         ldp     $a6,$a7,[$np,#8*6]
721         add     $np_end,$np,$num
722         ldp     $acc2,$acc3,[sp,#8*2]
723         stp     $acc4,$acc5,[$tp,#8*4]
724         ldp     $acc4,$acc5,[sp,#8*4]
725         stp     $acc6,$acc7,[$tp,#8*6]
726         ldp     $acc6,$acc7,[sp,#8*6]
727         add     $np,$np,#8*8
728         mov     $topmost,xzr            // initial top-most carry
729         mov     $tp,sp
730         mov     $cnt,#8
731
732 .Lsqr8x_reduction:
733         // (*)  mul     $t0,$a0,$na0    // lo(n[0-7])*lo(t[0]*n0)
734         mul     $t1,$a1,$na0
735         sub     $cnt,$cnt,#1
736         mul     $t2,$a2,$na0
737         str     $na0,[$tp],#8           // put aside t[0]*n0 for tail processing
738         mul     $t3,$a3,$na0
739         // (*)  adds    xzr,$acc0,$t0
740         subs    xzr,$acc0,#1            // (*)
741         mul     $t0,$a4,$na0
742         adcs    $acc0,$acc1,$t1
743         mul     $t1,$a5,$na0
744         adcs    $acc1,$acc2,$t2
745         mul     $t2,$a6,$na0
746         adcs    $acc2,$acc3,$t3
747         mul     $t3,$a7,$na0
748         adcs    $acc3,$acc4,$t0
749         umulh   $t0,$a0,$na0            // hi(n[0-7])*lo(t[0]*n0)
750         adcs    $acc4,$acc5,$t1
751         umulh   $t1,$a1,$na0
752         adcs    $acc5,$acc6,$t2
753         umulh   $t2,$a2,$na0
754         adcs    $acc6,$acc7,$t3
755         umulh   $t3,$a3,$na0
756         adc     $acc7,xzr,xzr
757         adds    $acc0,$acc0,$t0
758         umulh   $t0,$a4,$na0
759         adcs    $acc1,$acc1,$t1
760         umulh   $t1,$a5,$na0
761         adcs    $acc2,$acc2,$t2
762         umulh   $t2,$a6,$na0
763         adcs    $acc3,$acc3,$t3
764         umulh   $t3,$a7,$na0
765         mul     $na0,$n0,$acc0          // next t[0]*n0
766         adcs    $acc4,$acc4,$t0
767         adcs    $acc5,$acc5,$t1
768         adcs    $acc6,$acc6,$t2
769         adc     $acc7,$acc7,$t3
770         cbnz    $cnt,.Lsqr8x_reduction
771
772         ldp     $t0,$t1,[$tp,#8*0]
773         ldp     $t2,$t3,[$tp,#8*2]
774         mov     $rp,$tp
775         sub     $cnt,$np_end,$np        // done yet?
776         adds    $acc0,$acc0,$t0
777         adcs    $acc1,$acc1,$t1
778         ldp     $t0,$t1,[$tp,#8*4]
779         adcs    $acc2,$acc2,$t2
780         adcs    $acc3,$acc3,$t3
781         ldp     $t2,$t3,[$tp,#8*6]
782         adcs    $acc4,$acc4,$t0
783         adcs    $acc5,$acc5,$t1
784         adcs    $acc6,$acc6,$t2
785         adcs    $acc7,$acc7,$t3
786         //adc   $carry,xzr,xzr          // moved below
787         cbz     $cnt,.Lsqr8x8_post_condition
788
789         ldr     $n0,[$tp,#-8*8]
790         ldp     $a0,$a1,[$np,#8*0]
791         ldp     $a2,$a3,[$np,#8*2]
792         ldp     $a4,$a5,[$np,#8*4]
793         mov     $cnt,#-8*8
794         ldp     $a6,$a7,[$np,#8*6]
795         add     $np,$np,#8*8
796
797 .Lsqr8x_tail:
798         mul     $t0,$a0,$n0
799         adc     $carry,xzr,xzr          // carry bit, modulo-scheduled
800         mul     $t1,$a1,$n0
801         add     $cnt,$cnt,#8
802         mul     $t2,$a2,$n0
803         mul     $t3,$a3,$n0
804         adds    $acc0,$acc0,$t0
805         mul     $t0,$a4,$n0
806         adcs    $acc1,$acc1,$t1
807         mul     $t1,$a5,$n0
808         adcs    $acc2,$acc2,$t2
809         mul     $t2,$a6,$n0
810         adcs    $acc3,$acc3,$t3
811         mul     $t3,$a7,$n0
812         adcs    $acc4,$acc4,$t0
813         umulh   $t0,$a0,$n0
814         adcs    $acc5,$acc5,$t1
815         umulh   $t1,$a1,$n0
816         adcs    $acc6,$acc6,$t2
817         umulh   $t2,$a2,$n0
818         adcs    $acc7,$acc7,$t3
819         umulh   $t3,$a3,$n0
820         adc     $carry,$carry,xzr
821         str     $acc0,[$tp],#8
822         adds    $acc0,$acc1,$t0
823         umulh   $t0,$a4,$n0
824         adcs    $acc1,$acc2,$t1
825         umulh   $t1,$a5,$n0
826         adcs    $acc2,$acc3,$t2
827         umulh   $t2,$a6,$n0
828         adcs    $acc3,$acc4,$t3
829         umulh   $t3,$a7,$n0
830         ldr     $n0,[$rp,$cnt]
831         adcs    $acc4,$acc5,$t0
832         adcs    $acc5,$acc6,$t1
833         adcs    $acc6,$acc7,$t2
834         adcs    $acc7,$carry,$t3
835         //adc   $carry,xzr,xzr          // moved above
836         cbnz    $cnt,.Lsqr8x_tail
837                                         // note that carry flag is guaranteed
838                                         // to be zero at this point
839         ldp     $a0,$a1,[$tp,#8*0]
840         sub     $cnt,$np_end,$np        // done yet?
841         sub     $t2,$np_end,$num        // rewinded np
842         ldp     $a2,$a3,[$tp,#8*2]
843         ldp     $a4,$a5,[$tp,#8*4]
844         ldp     $a6,$a7,[$tp,#8*6]
845         cbz     $cnt,.Lsqr8x_tail_break
846
847         ldr     $n0,[$rp,#-8*8]
848         adds    $acc0,$acc0,$a0
849         adcs    $acc1,$acc1,$a1
850         ldp     $a0,$a1,[$np,#8*0]
851         adcs    $acc2,$acc2,$a2
852         adcs    $acc3,$acc3,$a3
853         ldp     $a2,$a3,[$np,#8*2]
854         adcs    $acc4,$acc4,$a4
855         adcs    $acc5,$acc5,$a5
856         ldp     $a4,$a5,[$np,#8*4]
857         adcs    $acc6,$acc6,$a6
858         mov     $cnt,#-8*8
859         adcs    $acc7,$acc7,$a7
860         ldp     $a6,$a7,[$np,#8*6]
861         add     $np,$np,#8*8
862         //adc   $carry,xzr,xzr          // moved above
863         b       .Lsqr8x_tail
864
865 .align  4
866 .Lsqr8x_tail_break:
867         ldr     $n0,[x29,#112]          // pull n0
868         add     $cnt,$tp,#8*8           // end of current t[num] window
869
870         subs    xzr,$topmost,#1         // "move" top-most carry to carry bit
871         adcs    $t0,$acc0,$a0
872         adcs    $t1,$acc1,$a1
873         ldp     $acc0,$acc1,[$rp,#8*0]
874         adcs    $acc2,$acc2,$a2
875         ldp     $a0,$a1,[$t2,#8*0]      // recall that $t2 is &n[0]
876         adcs    $acc3,$acc3,$a3
877         ldp     $a2,$a3,[$t2,#8*2]
878         adcs    $acc4,$acc4,$a4
879         adcs    $acc5,$acc5,$a5
880         ldp     $a4,$a5,[$t2,#8*4]
881         adcs    $acc6,$acc6,$a6
882         adcs    $acc7,$acc7,$a7
883         ldp     $a6,$a7,[$t2,#8*6]
884         add     $np,$t2,#8*8
885         adc     $topmost,xzr,xzr        // top-most carry
886         mul     $na0,$n0,$acc0
887         stp     $t0,$t1,[$tp,#8*0]
888         stp     $acc2,$acc3,[$tp,#8*2]
889         ldp     $acc2,$acc3,[$rp,#8*2]
890         stp     $acc4,$acc5,[$tp,#8*4]
891         ldp     $acc4,$acc5,[$rp,#8*4]
892         cmp     $cnt,x29                // did we hit the bottom?
893         stp     $acc6,$acc7,[$tp,#8*6]
894         mov     $tp,$rp                 // slide the window
895         ldp     $acc6,$acc7,[$rp,#8*6]
896         mov     $cnt,#8
897         b.ne    .Lsqr8x_reduction
898
899         // Final step. We see if result is larger than modulus, and
900         // if it is, subtract the modulus. But comparison implies
901         // subtraction. So we subtract modulus, see if it borrowed,
902         // and conditionally copy original value.
903         ldr     $rp,[x29,#96]           // pull rp
904         add     $tp,$tp,#8*8
905         subs    $t0,$acc0,$a0
906         sbcs    $t1,$acc1,$a1
907         sub     $cnt,$num,#8*8
908         mov     $ap_end,$rp             // $rp copy
909
910 .Lsqr8x_sub:
911         sbcs    $t2,$acc2,$a2
912         ldp     $a0,$a1,[$np,#8*0]
913         sbcs    $t3,$acc3,$a3
914         stp     $t0,$t1,[$rp,#8*0]
915         sbcs    $t0,$acc4,$a4
916         ldp     $a2,$a3,[$np,#8*2]
917         sbcs    $t1,$acc5,$a5
918         stp     $t2,$t3,[$rp,#8*2]
919         sbcs    $t2,$acc6,$a6
920         ldp     $a4,$a5,[$np,#8*4]
921         sbcs    $t3,$acc7,$a7
922         ldp     $a6,$a7,[$np,#8*6]
923         add     $np,$np,#8*8
924         ldp     $acc0,$acc1,[$tp,#8*0]
925         sub     $cnt,$cnt,#8*8
926         ldp     $acc2,$acc3,[$tp,#8*2]
927         ldp     $acc4,$acc5,[$tp,#8*4]
928         ldp     $acc6,$acc7,[$tp,#8*6]
929         add     $tp,$tp,#8*8
930         stp     $t0,$t1,[$rp,#8*4]
931         sbcs    $t0,$acc0,$a0
932         stp     $t2,$t3,[$rp,#8*6]
933         add     $rp,$rp,#8*8
934         sbcs    $t1,$acc1,$a1
935         cbnz    $cnt,.Lsqr8x_sub
936
937         sbcs    $t2,$acc2,$a2
938          mov    $tp,sp
939          add    $ap,sp,$num
940          ldp    $a0,$a1,[$ap_end,#8*0]
941         sbcs    $t3,$acc3,$a3
942         stp     $t0,$t1,[$rp,#8*0]
943         sbcs    $t0,$acc4,$a4
944          ldp    $a2,$a3,[$ap_end,#8*2]
945         sbcs    $t1,$acc5,$a5
946         stp     $t2,$t3,[$rp,#8*2]
947         sbcs    $t2,$acc6,$a6
948          ldp    $acc0,$acc1,[$ap,#8*0]
949         sbcs    $t3,$acc7,$a7
950          ldp    $acc2,$acc3,[$ap,#8*2]
951         sbcs    xzr,$topmost,xzr        // did it borrow?
952         ldr     x30,[x29,#8]            // pull return address
953         stp     $t0,$t1,[$rp,#8*4]
954         stp     $t2,$t3,[$rp,#8*6]
955
956         sub     $cnt,$num,#8*4
957 .Lsqr4x_cond_copy:
958         sub     $cnt,$cnt,#8*4
959         csel    $t0,$acc0,$a0,lo
960          stp    xzr,xzr,[$tp,#8*0]
961         csel    $t1,$acc1,$a1,lo
962         ldp     $a0,$a1,[$ap_end,#8*4]
963         ldp     $acc0,$acc1,[$ap,#8*4]
964         csel    $t2,$acc2,$a2,lo
965          stp    xzr,xzr,[$tp,#8*2]
966          add    $tp,$tp,#8*4
967         csel    $t3,$acc3,$a3,lo
968         ldp     $a2,$a3,[$ap_end,#8*6]
969         ldp     $acc2,$acc3,[$ap,#8*6]
970         add     $ap,$ap,#8*4
971         stp     $t0,$t1,[$ap_end,#8*0]
972         stp     $t2,$t3,[$ap_end,#8*2]
973         add     $ap_end,$ap_end,#8*4
974          stp    xzr,xzr,[$ap,#8*0]
975          stp    xzr,xzr,[$ap,#8*2]
976         cbnz    $cnt,.Lsqr4x_cond_copy
977
978         csel    $t0,$acc0,$a0,lo
979          stp    xzr,xzr,[$tp,#8*0]
980         csel    $t1,$acc1,$a1,lo
981          stp    xzr,xzr,[$tp,#8*2]
982         csel    $t2,$acc2,$a2,lo
983         csel    $t3,$acc3,$a3,lo
984         stp     $t0,$t1,[$ap_end,#8*0]
985         stp     $t2,$t3,[$ap_end,#8*2]
986
987         b       .Lsqr8x_done
988
989 .align  4
990 .Lsqr8x8_post_condition:
991         adc     $carry,xzr,xzr
992         ldr     x30,[x29,#8]            // pull return address
993         // $acc0-7,$carry hold result, $a0-7 hold modulus
994         subs    $a0,$acc0,$a0
995         ldr     $ap,[x29,#96]           // pull rp
996         sbcs    $a1,$acc1,$a1
997          stp    xzr,xzr,[sp,#8*0]
998         sbcs    $a2,$acc2,$a2
999          stp    xzr,xzr,[sp,#8*2]
1000         sbcs    $a3,$acc3,$a3
1001          stp    xzr,xzr,[sp,#8*4]
1002         sbcs    $a4,$acc4,$a4
1003          stp    xzr,xzr,[sp,#8*6]
1004         sbcs    $a5,$acc5,$a5
1005          stp    xzr,xzr,[sp,#8*8]
1006         sbcs    $a6,$acc6,$a6
1007          stp    xzr,xzr,[sp,#8*10]
1008         sbcs    $a7,$acc7,$a7
1009          stp    xzr,xzr,[sp,#8*12]
1010         sbcs    $carry,$carry,xzr       // did it borrow?
1011          stp    xzr,xzr,[sp,#8*14]
1012
1013         // $a0-7 hold result-modulus
1014         csel    $a0,$acc0,$a0,lo
1015         csel    $a1,$acc1,$a1,lo
1016         csel    $a2,$acc2,$a2,lo
1017         csel    $a3,$acc3,$a3,lo
1018         stp     $a0,$a1,[$ap,#8*0]
1019         csel    $a4,$acc4,$a4,lo
1020         csel    $a5,$acc5,$a5,lo
1021         stp     $a2,$a3,[$ap,#8*2]
1022         csel    $a6,$acc6,$a6,lo
1023         csel    $a7,$acc7,$a7,lo
1024         stp     $a4,$a5,[$ap,#8*4]
1025         stp     $a6,$a7,[$ap,#8*6]
1026
1027 .Lsqr8x_done:
1028         ldp     x19,x20,[x29,#16]
1029         mov     sp,x29
1030         ldp     x21,x22,[x29,#32]
1031         mov     x0,#1
1032         ldp     x23,x24,[x29,#48]
1033         ldp     x25,x26,[x29,#64]
1034         ldp     x27,x28,[x29,#80]
1035         ldr     x29,[sp],#128
1036         ret
1037 .size   __bn_sqr8x_mont,.-__bn_sqr8x_mont
1038 ___
1039 }
1040
{
########################################################################
# Even though this might look as ARMv8 adaptation of mulx4x_mont from
# x86_64-mont5 module, it's different in sense that it performs
# reduction 256 bits at a time.

# Register map for __bn_mul4x_mont: the working set lives in x6-x17 and
# the callee-saved x19-x28 (saved in the 128-byte frame below).
#   $a0-$a3   - four a[] limbs          $m0-$m3 - four n[] (modulus) limbs
#   $t0-$t3   - scratch / partial products
#   $acc0-4   - accumulator window      $bi,$mi - current b[i] and t[0]*n0
#   $tp       - pointer into the t[] window on the stack
#   $ap_end   - &a[num]                 $cnt    - loop counter (mod 32)
# NOTE(review): $carry and $bp_end deliberately alias $rp -- $rp is
# offloaded to [x29,#96] at function entry and pulled back from there
# (.Loop_mul4x_break / .Lmul4x4_post_condition) before it is needed as a
# destination pointer again.  $topmost ("x30") reuses the link register,
# which is likewise restored from [x29,#8] before return.
my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

# The generated code performs word-by-word Montgomery multiplication,
# 4 limbs per inner-loop pass, interleaving the a[]*b[i] products with
# the n[]*(t[0]*n0) reduction; the carry flag is modulo-scheduled across
# loop iterations (see the "adc $carry,$carry,xzr // modulo-scheduled"
# at each loop head and the commented-out "//adc" at each loop tail).
$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewinded $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewinded np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	 mov	$tp,sp
	 add	$ap,sp,#8*4
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	 ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	 ldp	$acc0,$acc1,[$ap,#8*0]
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	 stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-7 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	 stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
# Emit the identification string and flush the accumulated code.
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

# Buffered write errors (e.g. full disk or closed pipe when STDOUT is
# redirected to the output .S file) only surface at close time; an
# unchecked close would let a truncated assembly file go unnoticed.
close STDOUT or die "error closing STDOUT: $!";