2 # Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
20 # work. While it does improve RSA sign performance by 20-30% (less for
21 # longer keys) on most processors, for some reason RSA2048 is not
22 # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23 # instruction issue rate is limited on processor in question, meaning
24 # that dedicated squaring procedure is a must. Well, actually all
25 # contemporary AArch64 processors seem to have limited multiplication
26 # issue rate, i.e. they can't issue multiplication every cycle, which
27 # explains moderate improvement coefficients in comparison to
28 # compiler-generated code. Recall that compiler is instructed to use
29 # umulh and therefore uses same amount of multiplication instructions
30 # to do the job. Assembly's edge is to minimize number of "collateral"
31 # instructions and of course instruction scheduling.
35 # Squaring procedure that handles lengths divisible by 8 improves
36 # RSA/DSA performance by 25-40-60% depending on processor and key
37 # length. Overall improvement coefficients are always positive in
38 # comparison to compiler-generated code. On Cortex-A57 improvement
39 # is still modest on longest key lengths, while others exhibit e.g.
40 # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41 # on Cortex-A57 and ~60-100% faster on others.
# $output is the last argument if it looks like a file (it has an extension);
# $flavour is the first argument if it doesn't look like a file.
my ($output, $flavour);
$output  = pop @ARGV   if @ARGV && $ARGV[-1] =~ m/\.\w+$/;
$flavour = shift @ARGV if @ARGV && $ARGV[0]  !~ m/\./;
# Locate the arm-xlate.pl translator: first next to this script, then in
# the shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated source through the translator.  On failure report the
# OS error via $! — the previous code interpolated $1, a stale regex
# capture left over from the $0 match above, so the real error was lost.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
# Scratch registers for the scalar Montgomery loops, assigned in order
# onto x6..x17 and the callee-saved x19..x24:
#   $lo0/$hi0, $aj, $alo/$ahi - state of the a[]*b[i] column
#   $lo1/$hi1, $nj, $nlo/$nhi - state of the n[]*m1 column
#   $m0 - current b[i] word; $m1 - Montgomery factor "tp[0]"*n0
#   $ovf - upmost overflow word; $i/$j - loop counters
#   $tp - tp[] pointer; $tj - current tp[j] word
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# Argument registers per the AArch64 calling convention (x0..x5):
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
72 .extern OPENSSL_armv8_rsa_neonized
73 .hidden OPENSSL_armv8_rsa_neonized
78 .type bn_mul_mont,%function
81 AARCH64_SIGN_LINK_REGISTER
88 adrp x17,OPENSSL_armv8_rsa_neonized
89 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
90 cbnz w17, bn_mul8x_mont_neon
100 stp x29,x30,[sp,#-64]!
106 ldr $m0,[$bp],#8 // bp[0]
107 sub $tp,sp,$num,lsl#3
108 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
111 and $tp,$tp,#-16 // ABI says so
112 ldp $hi1,$nj,[$np],#16 // np[0..1]
114 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
115 sub $j,$num,#16 // j=num-2
117 mul $alo,$aj,$m0 // ap[1]*bp[0]
120 mul $m1,$lo0,$n0 // "tp[0]"*n0
123 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
125 mul $nlo,$nj,$m1 // np[1]*m1
126 // (*) adds $lo1,$lo1,$lo0 // discarded
127 // (*) As for removal of first multiplication and addition
128 // instructions. The outcome of first addition is
129 // guaranteed to be zero, which leaves two computationally
130 // significant outcomes: it either carries or not. Then
131 // question is when does it carry? Is there alternative
132 // way to deduce it? If you follow operations, you can
133 // observe that condition for carry is quite simple:
134 // $lo0 being non-zero. So that carry can be calculated
135 // by adding -1 to $lo0. That's what next instruction does.
136 subs xzr,$lo0,#1 // (*)
149 mul $alo,$aj,$m0 // ap[j]*bp[0]
154 mul $nlo,$nj,$m1 // np[j]*m1
157 str $lo1,[$tp],#8 // tp[j-1]
162 sub $ap,$ap,$num // rewind $ap
166 sub $np,$np,$num // rewind $np
170 sub $i,$num,#8 // i=num-1
173 adc $ovf,xzr,xzr // upmost overflow bit
177 ldr $m0,[$bp],#8 // bp[i]
178 ldp $hi0,$aj,[$ap],#16
179 ldr $tj,[sp] // tp[0]
182 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
183 sub $j,$num,#16 // j=num-2
185 ldp $hi1,$nj,[$np],#16
186 mul $alo,$aj,$m0 // ap[1]*bp[i]
194 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
196 mul $nlo,$nj,$m1 // np[1]*m1
197 // (*) adds $lo1,$lo1,$lo0
198 subs xzr,$lo0,#1 // (*)
205 ldr $tj,[$tp],#8 // tp[j]
214 mul $alo,$aj,$m0 // ap[j]*bp[i]
219 mul $nlo,$nj,$m1 // np[j]*m1
222 stur $lo1,[$tp,#-16] // tp[j-1]
226 ldr $tj,[$tp],#8 // tp[j]
229 sub $ap,$ap,$num // rewind $ap
233 sub $np,$np,$num // rewind $np
242 adc $ovf,$ovf,xzr // upmost overflow bit
243 stp $lo1,$hi1,[$tp,#-16]
247 // Final step. We see if result is larger than modulus, and
248 // if it is, subtract the modulus. But comparison implies
249 // subtraction. So we subtract modulus, see if it borrowed,
250 // and conditionally copy original value.
251 ldr $tj,[sp] // tp[0]
253 ldr $nj,[$np],#8 // np[0]
254 subs $j,$num,#8 // j=num-1 and clear borrow
257 sbcs $aj,$tj,$nj // tp[j]-np[j]
261 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
265 sbcs $ovf,$ovf,xzr // did it borrow?
266 str $aj,[$ap],#8 // rp[num-1]
268 ldr $tj,[sp] // tp[0]
270 ldr $aj,[$rp],#8 // rp[0]
271 sub $num,$num,#8 // num--
274 sub $num,$num,#8 // num--
275 csel $nj,$tj,$aj,lo // did it borrow?
278 stur xzr,[$tp,#-16] // wipe tp
280 cbnz $num,.Lcond_copy
283 stur xzr,[$tp,#-8] // wipe tp
286 ldp x19,x20,[x29,#16]
288 ldp x21,x22,[x29,#32]
290 ldp x23,x24,[x29,#48]
292 AARCH64_VALIDATE_LINK_REGISTER
294 .size bn_mul_mont,.-bn_mul_mont
# NEON vector register layout for bn_mul8x_mont_neon:
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));	# a[] and n[] input words (as .4s lanes)
my ($Z,$Temp)=("v4.16b","v5");		# zero register / scratch
my @ACC=map("v$_",(6..13));		# eight 2x64-bit accumulators, rotated with push/shift
my ($Bi,$Ni,$M0)=map("v$_",(28..30));	# smashed b[i], m[i]; $M0 presumably n0-derived — TODO confirm
# Function arguments (x0..x5) and internal pointers/counters for the NEON path.
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));	# five names from six regs; x11 left unnamed
311 .type bn_mul8x_mont_neon,%function
314 // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
315 // only from bn_mul_mont which has already signed the return address.
316 stp x29,x30,[sp,#-80]!
323 eor $zero.16b,$zero.16b,$zero.16b
327 eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
329 eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
330 sub $toutptr,$toutptr,$num,lsl#4
331 eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
332 and $toutptr,$toutptr,#-64
333 eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
334 mov sp,$toutptr // alloca
335 eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
336 add $toutptr,$toutptr,#256
337 eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
339 eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
340 eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
343 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
344 subs $inner,$inner,#8
345 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
346 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
347 st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
351 ld1 {$A0.4s,$A1.4s},[$aptr],#32
359 ldr $sBi,[$bptr],#4 // *b++
362 ld1 {$N0.4s,$N1.4s},[$nptr],#32
364 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
365 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
366 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
367 shl $Ni.2d,@ACC[0].2d,#16
368 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
369 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
370 add $Ni.2d,$Ni.2d,@ACC[0].2d
371 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
372 mul $Ni.2s,$Ni.2s,$M0.2s
373 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
374 st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
375 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
377 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
381 ldr $sBi,[$bptr],#4 // *b++
382 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
383 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
385 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
386 ushr $temp.2d,@ACC[0].2d,#16
387 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
388 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
389 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
390 add @ACC[0].2d,@ACC[0].2d,$temp.2d
391 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
392 ushr @ACC[0].2d,@ACC[0].2d,#16
393 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
394 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
395 add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
396 ins @ACC[1].d[0],$ACCTemp.d[0]
397 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
399 push(@ACC,shift(@ACC)); $i++;
401 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
402 ld1 {@ACC[7].2d},[$tinptr],#16
403 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
404 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
405 shl $Ni.2d,@ACC[0].2d,#16
406 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
407 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
408 add $Ni.2d,$Ni.2d,@ACC[0].2d
409 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
410 mul $Ni.2s,$Ni.2s,$M0.2s
411 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
412 st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
413 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
415 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
419 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
420 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
421 ld1 {$A0.4s,$A1.4s},[$aptr],#32
422 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
423 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
424 mov $Temp.16b,@ACC[0].16b
425 ushr $Temp.2d,$Temp.2d,#16
426 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
427 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
428 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
429 add @ACC[0].2d,@ACC[0].2d,$Temp.2d
430 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
431 ushr @ACC[0].2d,@ACC[0].2d,#16
432 eor $temp.16b,$temp.16b,$temp.16b
433 ins @ACC[0].d[1],$temp.d[0]
434 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
435 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
436 add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
437 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
438 add $bnptr,sp,#8 // rewind
440 push(@ACC,shift(@ACC));
447 subs $inner,$inner,#8
448 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
449 ld1 {@ACC[7].2d},[$tinptr]
450 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
451 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
452 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
453 ld1 {$N0.4s,$N1.4s},[$nptr],#32
454 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
456 add $tinptr,$tinptr,#16 // don't advance in last iteration
458 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
459 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
460 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
461 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
463 for ($i=1; $i<8; $i++) {
465 ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
466 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
467 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
468 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
469 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
470 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
471 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
472 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
473 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
474 st1 {@ACC[0].2d},[$toutptr],#16
476 push(@ACC,shift(@ACC));
478 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
479 ld1 {@ACC[7].2d},[$tinptr]
480 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
481 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
482 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
484 add $tinptr,$tinptr,#16 // don't advance in last iteration
486 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
487 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
488 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
489 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
490 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
494 b.ne .LInner_after_rewind$i
495 sub $aptr,$aptr,$num,lsl#2 // rewind
496 .LInner_after_rewind$i:
497 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
498 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
499 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
500 ld1 {$A0.4s,$A1.4s},[$aptr],#32
501 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
502 add $bnptr,sp,#8 // rewind
503 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
504 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
505 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
506 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
507 st1 {@ACC[0].2d},[$toutptr],#16
508 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
512 push(@ACC,shift(@ACC));
515 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
516 eor $N0.16b,$N0.16b,$N0.16b // $N0
517 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
518 eor $N1.16b,$N1.16b,$N1.16b // $N1
519 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
520 st1 {@ACC[6].2d},[$toutptr]
522 subs $outer,$outer,#8
523 ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
524 ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
525 ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
526 ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
528 b.eq .LInner_8n_jump_2steps
529 sub $nptr,$nptr,$num,lsl#2 // rewind
532 .LInner_8n_jump_2steps:
534 st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
535 mov $Temp.16b,@ACC[0].16b
536 ushr $temp.2d,@ACC[0].2d,#16
537 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
538 st1 {$N0.2d,$N1.2d}, [sp],#32
539 add @ACC[0].2d,@ACC[0].2d,$temp.2d
540 st1 {$N0.2d,$N1.2d}, [sp],#32
541 ushr $temp.2d,@ACC[0].2d,#16
542 st1 {$N0.2d,$N1.2d}, [sp],#32
543 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
544 ins $temp.d[1],$zero.d[0]
551 add @ACC[0].2d,@ACC[0].2d,$temp.2d
552 mov $Temp.16b,@ACC[0].16b
553 ushr $temp.2d,@ACC[0].2d,#16
554 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
555 ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
556 add @ACC[0].2d,@ACC[0].2d,$temp.2d
557 ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
558 ushr $temp.2d,@ACC[0].2d,#16
559 ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
560 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
561 ins $temp.d[1],$zero.d[0]
565 for ($i=1; $i<8; $i++) {
567 add @ACC[1].2d,@ACC[1].2d,$temp.2d
568 st1 {@ACC[0].s}[0], [$toutptr],#4
569 ushr $temp.2d,@ACC[1].2d,#16
570 mov $Temp.16b,@ACC[1].16b
571 ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
572 add @ACC[1].2d,@ACC[1].2d,$temp.2d
573 ushr $temp.2d,@ACC[1].2d,#16
574 zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
575 ins $temp.d[1],$zero.d[0]
577 push(@ACC,shift(@ACC));
579 push(@ACC,shift(@ACC));
581 ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
582 subs $inner,$inner,#8
583 st1 {@ACC[7].s}[0], [$toutptr],#4
586 st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
587 sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
588 subs $aptr,sp,#0 // clear carry flag
589 add $bptr,sp,$num,lsl#2
595 ldp w10,w11,[$nptr],#8
602 stp w10,w11,[$rptr],#8
605 ldr w10, [$aptr] // load top-most bit
607 eor v0.16b,v0.16b,v0.16b
608 sub x11,$bptr,x11 // this is num*4
609 eor v1.16b,v1.16b,v1.16b
611 sub $rptr,$rptr,x11 // rewind $rptr
612 mov $nptr,$bptr // second 3/4th of frame
613 sbcs w10,w10,wzr // result is carry flag
627 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
628 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
632 stp w10,w11,[$rptr],#8
643 st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
644 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
645 sub x17,$bptr,$aptr // preserves carry
647 stp w10,w11,[$rptr],#8
648 cbnz x17,.LNEON_copy_n_zap
656 AARCH64_VALIDATE_LINK_REGISTER
659 .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
663 ########################################################################
664 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
# Register map for __bn_sqr8x_mont:
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));	# a[0..7] (later n[0..7]) words
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));			# temporaries
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));	# callee-saved accumulators
my ($cnt,$carry,$topmost)=("x27","x28","x30");			# loop counter, carry word, top-most carry
my ($tp,$ap_end,$na0)=($bp,$np,$carry);				# aliases: $bp/$np/$carry are re-purposed
673 .type __bn_sqr8x_mont,%function
679 // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
680 // only from bn_mul_mont which has already signed the return address.
681 stp x29,x30,[sp,#-128]!
688 stp $rp,$np,[sp,#96] // offload rp and np
690 ldp $a0,$a1,[$ap,#8*0]
691 ldp $a2,$a3,[$ap,#8*2]
692 ldp $a4,$a5,[$ap,#8*4]
693 ldp $a6,$a7,[$ap,#8*6]
695 sub $tp,sp,$num,lsl#4
704 stp xzr,xzr,[$tp,#8*0]
705 stp xzr,xzr,[$tp,#8*2]
706 stp xzr,xzr,[$tp,#8*4]
707 stp xzr,xzr,[$tp,#8*6]
709 stp xzr,xzr,[$tp,#8*8]
710 stp xzr,xzr,[$tp,#8*10]
711 stp xzr,xzr,[$tp,#8*12]
712 stp xzr,xzr,[$tp,#8*14]
714 cbnz $cnt,.Lsqr8x_zero
727 str $n0,[x29,#112] // offload n0
729 // Multiply everything but a[i]*a[i]
761 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
765 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
772 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
779 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
780 adc $acc0,xzr,xzr // t[8]
781 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
788 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
801 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
808 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
809 adc $acc1,xzr,xzr // t[9]
815 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
826 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
833 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
834 adc $acc2,xzr,xzr // t[10]
838 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
847 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
854 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
855 adc $acc3,xzr,xzr // t[11]
857 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
864 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
870 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
871 adc $acc4,xzr,xzr // t[12]
875 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
880 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
882 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
883 adc $acc5,xzr,xzr // t[13]
885 sub $cnt,$ap_end,$ap // done yet?
889 sub $t0,$ap_end,$num // rewinded ap
890 adc $acc6,xzr,xzr // t[14]
893 cbz $cnt,.Lsqr8x_outer_break
896 ldp $a0,$a1,[$tp,#8*0]
897 ldp $a2,$a3,[$tp,#8*2]
898 ldp $a4,$a5,[$tp,#8*4]
899 ldp $a6,$a7,[$tp,#8*6]
902 ldp $a0,$a1,[$ap,#8*0]
905 ldp $a2,$a3,[$ap,#8*2]
908 ldp $a4,$a5,[$ap,#8*4]
912 ldp $a6,$a7,[$ap,#8*6]
914 //adc $carry,xzr,xzr // moved below
926 // a[f]a[1]........................
928 // a[f]a[2]........................
930 // a[f]a[3]........................
932 // a[f]a[4]........................
934 // a[f]a[5]........................
936 // a[f]a[6]........................
938 // a[f]a[7]........................
941 adc $carry,xzr,xzr // carry bit, modulo-scheduled
962 adc $carry,$carry,xzr
976 adcs $acc7,$carry,$t3
977 //adc $carry,xzr,xzr // moved above
978 cbnz $cnt,.Lsqr8x_mul
979 // note that carry flag is guaranteed
980 // to be zero at this point
981 cmp $ap,$ap_end // done yet?
984 ldp $a0,$a1,[$tp,#8*0]
985 ldp $a2,$a3,[$tp,#8*2]
986 ldp $a4,$a5,[$tp,#8*4]
987 ldp $a6,$a7,[$tp,#8*6]
991 ldp $a0,$a1,[$ap,#8*0]
994 ldp $a2,$a3,[$ap,#8*2]
997 ldp $a4,$a5,[$ap,#8*4]
1000 adcs $acc7,$acc7,$a7
1001 ldp $a6,$a7,[$ap,#8*6]
1003 //adc $carry,xzr,xzr // moved above
1008 ldp $a0,$a1,[$rp,#8*0]
1010 ldp $a2,$a3,[$rp,#8*2]
1011 sub $t0,$ap_end,$ap // is it last iteration?
1012 ldp $a4,$a5,[$rp,#8*4]
1014 ldp $a6,$a7,[$rp,#8*6]
1015 cbz $t0,.Lsqr8x_outer_loop
1017 stp $acc0,$acc1,[$tp,#8*0]
1018 ldp $acc0,$acc1,[$t1,#8*0]
1019 stp $acc2,$acc3,[$tp,#8*2]
1020 ldp $acc2,$acc3,[$t1,#8*2]
1021 stp $acc4,$acc5,[$tp,#8*4]
1022 ldp $acc4,$acc5,[$t1,#8*4]
1023 stp $acc6,$acc7,[$tp,#8*6]
1025 ldp $acc6,$acc7,[$t1,#8*6]
1026 b .Lsqr8x_outer_loop
1029 .Lsqr8x_outer_break:
1030 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1031 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
1032 ldp $t1,$t2,[sp,#8*1]
1033 ldp $a5,$a7,[$t0,#8*2]
1035 ldp $t3,$t0,[sp,#8*3]
1037 stp $acc0,$acc1,[$tp,#8*0]
1039 stp $acc2,$acc3,[$tp,#8*2]
1041 stp $acc4,$acc5,[$tp,#8*4]
1043 stp $acc6,$acc7,[$tp,#8*6]
1046 adds $acc1,$a1,$t1,lsl#1
1047 extr $t1,$t2,$t1,#63
1050 .Lsqr4x_shift_n_add:
1052 extr $t2,$t3,$t2,#63
1055 ldp $t1,$t2,[$tp,#8*5]
1057 ldp $a1,$a3,[$ap],#8*2
1061 extr $t3,$t0,$t3,#63
1062 stp $acc0,$acc1,[$tp,#8*0]
1064 extr $t0,$t1,$t0,#63
1065 stp $acc2,$acc3,[$tp,#8*2]
1067 ldp $t3,$t0,[$tp,#8*7]
1068 extr $t1,$t2,$t1,#63
1070 extr $t2,$t3,$t2,#63
1072 ldp $t1,$t2,[$tp,#8*9]
1074 ldp $a5,$a7,[$ap],#8*2
1078 stp $acc4,$acc5,[$tp,#8*4]
1079 extr $t3,$t0,$t3,#63
1080 stp $acc6,$acc7,[$tp,#8*6]
1083 extr $t0,$t1,$t0,#63
1085 ldp $t3,$t0,[$tp,#8*3]
1086 extr $t1,$t2,$t1,#63
1087 cbnz $cnt,.Lsqr4x_shift_n_add
1089 my ($np,$np_end)=($ap,$ap_end);
1091 ldp $np,$n0,[x29,#104] // pull np and n0
1094 extr $t2,$t3,$t2,#63
1096 ldp $t1,$t2,[$tp,#8*5]
1099 stp $acc0,$acc1,[$tp,#8*0]
1102 stp $acc2,$acc3,[$tp,#8*2]
1103 extr $t3,$t0,$t3,#63
1105 extr $t0,$t1,$t0,#63
1106 ldp $acc0,$acc1,[sp,#8*0]
1108 extr $t1,$t2,$t1,#63
1109 ldp $a0,$a1,[$np,#8*0]
1111 extr $t2,xzr,$t2,#63
1112 ldp $a2,$a3,[$np,#8*2]
1114 ldp $a4,$a5,[$np,#8*4]
1116 // Reduce by 512 bits per iteration
1117 mul $na0,$n0,$acc0 // t[0]*n0
1118 ldp $a6,$a7,[$np,#8*6]
1119 add $np_end,$np,$num
1120 ldp $acc2,$acc3,[sp,#8*2]
1121 stp $acc4,$acc5,[$tp,#8*4]
1122 ldp $acc4,$acc5,[sp,#8*4]
1123 stp $acc6,$acc7,[$tp,#8*6]
1124 ldp $acc6,$acc7,[sp,#8*6]
1126 mov $topmost,xzr // initial top-most carry
1131 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
1135 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
1137 // (*) adds xzr,$acc0,$t0
1138 subs xzr,$acc0,#1 // (*)
1140 adcs $acc0,$acc1,$t1
1142 adcs $acc1,$acc2,$t2
1144 adcs $acc2,$acc3,$t3
1146 adcs $acc3,$acc4,$t0
1147 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
1148 adcs $acc4,$acc5,$t1
1150 adcs $acc5,$acc6,$t2
1152 adcs $acc6,$acc7,$t3
1155 adds $acc0,$acc0,$t0
1157 adcs $acc1,$acc1,$t1
1159 adcs $acc2,$acc2,$t2
1161 adcs $acc3,$acc3,$t3
1163 mul $na0,$n0,$acc0 // next t[0]*n0
1164 adcs $acc4,$acc4,$t0
1165 adcs $acc5,$acc5,$t1
1166 adcs $acc6,$acc6,$t2
1168 cbnz $cnt,.Lsqr8x_reduction
1170 ldp $t0,$t1,[$tp,#8*0]
1171 ldp $t2,$t3,[$tp,#8*2]
1173 sub $cnt,$np_end,$np // done yet?
1174 adds $acc0,$acc0,$t0
1175 adcs $acc1,$acc1,$t1
1176 ldp $t0,$t1,[$tp,#8*4]
1177 adcs $acc2,$acc2,$t2
1178 adcs $acc3,$acc3,$t3
1179 ldp $t2,$t3,[$tp,#8*6]
1180 adcs $acc4,$acc4,$t0
1181 adcs $acc5,$acc5,$t1
1182 adcs $acc6,$acc6,$t2
1183 adcs $acc7,$acc7,$t3
1184 //adc $carry,xzr,xzr // moved below
1185 cbz $cnt,.Lsqr8x8_post_condition
1187 ldur $n0,[$tp,#-8*8]
1188 ldp $a0,$a1,[$np,#8*0]
1189 ldp $a2,$a3,[$np,#8*2]
1190 ldp $a4,$a5,[$np,#8*4]
1192 ldp $a6,$a7,[$np,#8*6]
1197 adc $carry,xzr,xzr // carry bit, modulo-scheduled
1202 adds $acc0,$acc0,$t0
1204 adcs $acc1,$acc1,$t1
1206 adcs $acc2,$acc2,$t2
1208 adcs $acc3,$acc3,$t3
1210 adcs $acc4,$acc4,$t0
1212 adcs $acc5,$acc5,$t1
1214 adcs $acc6,$acc6,$t2
1216 adcs $acc7,$acc7,$t3
1218 adc $carry,$carry,xzr
1220 adds $acc0,$acc1,$t0
1222 adcs $acc1,$acc2,$t1
1224 adcs $acc2,$acc3,$t2
1226 adcs $acc3,$acc4,$t3
1229 adcs $acc4,$acc5,$t0
1230 adcs $acc5,$acc6,$t1
1231 adcs $acc6,$acc7,$t2
1232 adcs $acc7,$carry,$t3
1233 //adc $carry,xzr,xzr // moved above
1234 cbnz $cnt,.Lsqr8x_tail
1235 // note that carry flag is guaranteed
1236 // to be zero at this point
1237 ldp $a0,$a1,[$tp,#8*0]
1238 sub $cnt,$np_end,$np // done yet?
1239 sub $t2,$np_end,$num // rewinded np
1240 ldp $a2,$a3,[$tp,#8*2]
1241 ldp $a4,$a5,[$tp,#8*4]
1242 ldp $a6,$a7,[$tp,#8*6]
1243 cbz $cnt,.Lsqr8x_tail_break
1245 ldur $n0,[$rp,#-8*8]
1246 adds $acc0,$acc0,$a0
1247 adcs $acc1,$acc1,$a1
1248 ldp $a0,$a1,[$np,#8*0]
1249 adcs $acc2,$acc2,$a2
1250 adcs $acc3,$acc3,$a3
1251 ldp $a2,$a3,[$np,#8*2]
1252 adcs $acc4,$acc4,$a4
1253 adcs $acc5,$acc5,$a5
1254 ldp $a4,$a5,[$np,#8*4]
1255 adcs $acc6,$acc6,$a6
1257 adcs $acc7,$acc7,$a7
1258 ldp $a6,$a7,[$np,#8*6]
1260 //adc $carry,xzr,xzr // moved above
1265 ldr $n0,[x29,#112] // pull n0
1266 add $cnt,$tp,#8*8 // end of current t[num] window
1268 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
1271 ldp $acc0,$acc1,[$rp,#8*0]
1272 adcs $acc2,$acc2,$a2
1273 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
1274 adcs $acc3,$acc3,$a3
1275 ldp $a2,$a3,[$t2,#8*2]
1276 adcs $acc4,$acc4,$a4
1277 adcs $acc5,$acc5,$a5
1278 ldp $a4,$a5,[$t2,#8*4]
1279 adcs $acc6,$acc6,$a6
1280 adcs $acc7,$acc7,$a7
1281 ldp $a6,$a7,[$t2,#8*6]
1283 adc $topmost,xzr,xzr // top-most carry
1285 stp $t0,$t1,[$tp,#8*0]
1286 stp $acc2,$acc3,[$tp,#8*2]
1287 ldp $acc2,$acc3,[$rp,#8*2]
1288 stp $acc4,$acc5,[$tp,#8*4]
1289 ldp $acc4,$acc5,[$rp,#8*4]
1290 cmp $cnt,x29 // did we hit the bottom?
1291 stp $acc6,$acc7,[$tp,#8*6]
1292 mov $tp,$rp // slide the window
1293 ldp $acc6,$acc7,[$rp,#8*6]
1295 b.ne .Lsqr8x_reduction
1297 // Final step. We see if result is larger than modulus, and
1298 // if it is, subtract the modulus. But comparison implies
1299 // subtraction. So we subtract modulus, see if it borrowed,
1300 // and conditionally copy original value.
1301 ldr $rp,[x29,#96] // pull rp
1306 mov $ap_end,$rp // $rp copy
1310 ldp $a0,$a1,[$np,#8*0]
1312 stp $t0,$t1,[$rp,#8*0]
1314 ldp $a2,$a3,[$np,#8*2]
1316 stp $t2,$t3,[$rp,#8*2]
1318 ldp $a4,$a5,[$np,#8*4]
1320 ldp $a6,$a7,[$np,#8*6]
1322 ldp $acc0,$acc1,[$tp,#8*0]
1324 ldp $acc2,$acc3,[$tp,#8*2]
1325 ldp $acc4,$acc5,[$tp,#8*4]
1326 ldp $acc6,$acc7,[$tp,#8*6]
1328 stp $t0,$t1,[$rp,#8*4]
1330 stp $t2,$t3,[$rp,#8*6]
1333 cbnz $cnt,.Lsqr8x_sub
1338 ldp $a0,$a1,[$ap_end,#8*0]
1340 stp $t0,$t1,[$rp,#8*0]
1342 ldp $a2,$a3,[$ap_end,#8*2]
1344 stp $t2,$t3,[$rp,#8*2]
1346 ldp $acc0,$acc1,[$ap,#8*0]
1348 ldp $acc2,$acc3,[$ap,#8*2]
1349 sbcs xzr,$topmost,xzr // did it borrow?
1350 ldr x30,[x29,#8] // pull return address
1351 stp $t0,$t1,[$rp,#8*4]
1352 stp $t2,$t3,[$rp,#8*6]
1357 csel $t0,$acc0,$a0,lo
1358 stp xzr,xzr,[$tp,#8*0]
1359 csel $t1,$acc1,$a1,lo
1360 ldp $a0,$a1,[$ap_end,#8*4]
1361 ldp $acc0,$acc1,[$ap,#8*4]
1362 csel $t2,$acc2,$a2,lo
1363 stp xzr,xzr,[$tp,#8*2]
1365 csel $t3,$acc3,$a3,lo
1366 ldp $a2,$a3,[$ap_end,#8*6]
1367 ldp $acc2,$acc3,[$ap,#8*6]
1369 stp $t0,$t1,[$ap_end,#8*0]
1370 stp $t2,$t3,[$ap_end,#8*2]
1371 add $ap_end,$ap_end,#8*4
1372 stp xzr,xzr,[$ap,#8*0]
1373 stp xzr,xzr,[$ap,#8*2]
1374 cbnz $cnt,.Lsqr4x_cond_copy
1376 csel $t0,$acc0,$a0,lo
1377 stp xzr,xzr,[$tp,#8*0]
1378 csel $t1,$acc1,$a1,lo
1379 stp xzr,xzr,[$tp,#8*2]
1380 csel $t2,$acc2,$a2,lo
1381 csel $t3,$acc3,$a3,lo
1382 stp $t0,$t1,[$ap_end,#8*0]
1383 stp $t2,$t3,[$ap_end,#8*2]
1388 .Lsqr8x8_post_condition:
1390 ldr x30,[x29,#8] // pull return address
1391 // $acc0-7,$carry hold result, $a0-7 hold modulus
1393 ldr $ap,[x29,#96] // pull rp
1395 stp xzr,xzr,[sp,#8*0]
1397 stp xzr,xzr,[sp,#8*2]
1399 stp xzr,xzr,[sp,#8*4]
1401 stp xzr,xzr,[sp,#8*6]
1403 stp xzr,xzr,[sp,#8*8]
1405 stp xzr,xzr,[sp,#8*10]
1407 stp xzr,xzr,[sp,#8*12]
1408 sbcs $carry,$carry,xzr // did it borrow?
1409 stp xzr,xzr,[sp,#8*14]
1411 // $a0-7 hold result-modulus
1412 csel $a0,$acc0,$a0,lo
1413 csel $a1,$acc1,$a1,lo
1414 csel $a2,$acc2,$a2,lo
1415 csel $a3,$acc3,$a3,lo
1416 stp $a0,$a1,[$ap,#8*0]
1417 csel $a4,$acc4,$a4,lo
1418 csel $a5,$acc5,$a5,lo
1419 stp $a2,$a3,[$ap,#8*2]
1420 csel $a6,$acc6,$a6,lo
1421 csel $a7,$acc7,$a7,lo
1422 stp $a4,$a5,[$ap,#8*4]
1423 stp $a6,$a7,[$ap,#8*6]
1426 ldp x19,x20,[x29,#16]
1428 ldp x21,x22,[x29,#32]
1430 ldp x23,x24,[x29,#48]
1431 ldp x25,x26,[x29,#64]
1432 ldp x27,x28,[x29,#80]
1434 // x30 is loaded earlier
1435 AARCH64_VALIDATE_LINK_REGISTER
1437 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1442 ########################################################################
1443 # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1444 # x86_64-mont5 module, it's different in sense that it performs
1445 # reduction 256 bits at a time.
1447 my ($a0,$a1,$a2,$a3,
1450 $acc0,$acc1,$acc2,$acc3,$acc4,
1451 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1453 my ($carry,$topmost) = ($rp,"x30");
1456 .type __bn_mul4x_mont,%function
1459 // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1460 // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
1461 stp x29,x30,[sp,#-128]!
1463 stp x19,x20,[sp,#16]
1464 stp x21,x22,[sp,#32]
1465 stp x23,x24,[sp,#48]
1466 stp x25,x26,[sp,#64]
1467 stp x27,x28,[sp,#80]
1469 sub $tp,sp,$num,lsl#3
1471 ldr $n0,[$n0] // *n0
1472 sub sp,$tp,#8*4 // alloca
1475 add $ap_end,$ap,$num
1476 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1478 ldr $bi,[$bp,#8*0] // b[0]
1479 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1480 ldp $a2,$a3,[$ap,#8*2]
1486 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1487 ldp $m2,$m3,[$np,#8*2]
1488 adds $np,$np,#8*4 // clear carry bit
1493 .Loop_mul4x_1st_reduction:
1494 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1495 adc $carry,$carry,xzr // modulo-scheduled
1501 adds $acc0,$acc0,$t0
1502 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1503 adcs $acc1,$acc1,$t1
1504 mul $mi,$acc0,$n0 // t[0]*n0
1505 adcs $acc2,$acc2,$t2
1507 adcs $acc3,$acc3,$t3
1511 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1512 adds $acc1,$acc1,$t0
1513 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1514 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1515 adcs $acc2,$acc2,$t1
1517 adcs $acc3,$acc3,$t2
1519 adc $acc4,$acc4,$t3 // can't overflow
1521 // (*) adds xzr,$acc0,$t0
1522 subs xzr,$acc0,#1 // (*)
1523 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1524 adcs $acc0,$acc1,$t1
1526 adcs $acc1,$acc2,$t2
1528 adcs $acc2,$acc3,$t3
1530 adcs $acc3,$acc4,$carry
1532 adds $acc0,$acc0,$t0
1534 adcs $acc1,$acc1,$t1
1535 adcs $acc2,$acc2,$t2
1536 adcs $acc3,$acc3,$t3
1537 //adc $carry,$carry,xzr
1538 cbnz $cnt,.Loop_mul4x_1st_reduction
1540 cbz $t0,.Lmul4x4_post_condition
1542 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1543 ldp $a2,$a3,[$ap,#8*2]
1545 ldr $mi,[sp] // a[0]*n0
1546 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1547 ldp $m2,$m3,[$np,#8*2]
1550 .Loop_mul4x_1st_tail:
1551 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1552 adc $carry,$carry,xzr // modulo-scheduled
1558 adds $acc0,$acc0,$t0
1559 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1560 adcs $acc1,$acc1,$t1
1562 adcs $acc2,$acc2,$t2
1564 adcs $acc3,$acc3,$t3
1567 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1568 adds $acc1,$acc1,$t0
1569 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1570 adcs $acc2,$acc2,$t1
1572 adcs $acc3,$acc3,$t2
1574 adc $acc4,$acc4,$t3 // can't overflow
1576 adds $acc0,$acc0,$t0
1577 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1578 adcs $acc1,$acc1,$t1
1580 adcs $acc2,$acc2,$t2
1582 adcs $acc3,$acc3,$t3
1583 adcs $acc4,$acc4,$carry
1586 ldr $mi,[sp,$cnt] // next t[0]*n0
1587 str $acc0,[$tp],#8 // result!!!
1588 adds $acc0,$acc1,$t0
1589 sub $t0,$ap_end,$ap // done yet?
1590 adcs $acc1,$acc2,$t1
1591 adcs $acc2,$acc3,$t2
1592 adcs $acc3,$acc4,$t3
1593 //adc $carry,$carry,xzr
1594 cbnz $cnt,.Loop_mul4x_1st_tail
1596 sub $t1,$ap_end,$num // rewinded $ap
1597 cbz $t0,.Lmul4x_proceed
1599 ldp $a0,$a1,[$ap,#8*0]
1600 ldp $a2,$a3,[$ap,#8*2]
1602 ldp $m0,$m1,[$np,#8*0]
1603 ldp $m2,$m3,[$np,#8*2]
1605 b .Loop_mul4x_1st_tail
1609 ldr $bi,[$bp,#8*4]! // *++b
1610 adc $topmost,$carry,xzr
1611 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1612 sub $np,$np,$num // rewind np
1613 ldp $a2,$a3,[$t1,#8*2]
1616 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1617 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1618 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1619 ldp $acc2,$acc3,[sp,#8*6]
1621 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1623 ldp $m2,$m3,[$np,#8*2]
1624 adds $np,$np,#8*4 // clear carry bit
1628 .Loop_mul4x_reduction:
1629 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1630 adc $carry,$carry,xzr // modulo-scheduled
1636 adds $acc0,$acc0,$t0
1637 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1638 adcs $acc1,$acc1,$t1
1639 mul $mi,$acc0,$n0 // t[0]*n0
1640 adcs $acc2,$acc2,$t2
1642 adcs $acc3,$acc3,$t3
1646 ldr $bi,[$bp,$cnt] // next b[i]
1647 adds $acc1,$acc1,$t0
1648 // (*) mul $t0,$m0,$mi
1649 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1650 adcs $acc2,$acc2,$t1
1651 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
1652 adcs $acc3,$acc3,$t2
1654 adc $acc4,$acc4,$t3 // can't overflow
1656 // (*) adds xzr,$acc0,$t0
1657 subs xzr,$acc0,#1 // (*)
1658 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
1659 adcs $acc0,$acc1,$t1
1661 adcs $acc1,$acc2,$t2
1663 adcs $acc2,$acc3,$t3
1665 adcs $acc3,$acc4,$carry
1667 adds $acc0,$acc0,$t0
1668 adcs $acc1,$acc1,$t1
1669 adcs $acc2,$acc2,$t2
1670 adcs $acc3,$acc3,$t3
1671 //adc $carry,$carry,xzr
1672 cbnz $cnt,.Loop_mul4x_reduction
// ---------------------------------------------------------------------
// Tail phase: fold the previously stored t[4..7] into the accumulators,
// then walk the upper limbs a[4..7] (and n[4..7]) against the same b
// words, reusing the t[j]*n0 values saved on the stack during the
// reduction loop ($mi reloaded via [sp,$cnt]).  The loop label
// .Loop_mul4x_tail is elided from this excerpt (non-contiguous source-
// line markers); cbnz at 1736 closes it.
// ---------------------------------------------------------------------
1674 adc $carry,$carry,xzr
1675 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1676 ldp $t2,$t3,[$tp,#8*6]
1677 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1678 ldp $a2,$a3,[$ap,#8*2]
1680 adds $acc0,$acc0,$t0
1681 adcs $acc1,$acc1,$t1
1682 adcs $acc2,$acc2,$t2
1683 adcs $acc3,$acc3,$t3
1684 //adc $carry,$carry,xzr
1686 ldr $mi,[sp] // t[0]*n0
1687 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1688 ldp $m2,$m3,[$np,#8*2]
1693 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1694 adc $carry,$carry,xzr // modulo-scheduled
1700 adds $acc0,$acc0,$t0
1701 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1702 adcs $acc1,$acc1,$t1
1704 adcs $acc2,$acc2,$t2
1706 adcs $acc3,$acc3,$t3
1709 ldr $bi,[$bp,$cnt] // next b[i]
1710 adds $acc1,$acc1,$t0
1711 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1712 adcs $acc2,$acc2,$t1
1714 adcs $acc3,$acc3,$t2
1716 adc $acc4,$acc4,$t3 // can't overflow
1718 adds $acc0,$acc0,$t0
1719 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1720 adcs $acc1,$acc1,$t1
1722 adcs $acc2,$acc2,$t2
1724 adcs $acc3,$acc3,$t3
1726 adcs $acc4,$acc4,$carry
1727 ldr $mi,[sp,$cnt] // next a[0]*n0
1729 str $acc0,[$tp],#8 // result!!!
// shift accumulator window down one limb; interleaved with the
// end-of-row test ($ap_end - $ap) to hide latency.
1730 adds $acc0,$acc1,$t0
1731 sub $t0,$ap_end,$ap // done yet?
1732 adcs $acc1,$acc2,$t1
1733 adcs $acc2,$acc3,$t2
1734 adcs $acc3,$acc4,$t3
1735 //adc $carry,$carry,xzr
1736 cbnz $cnt,.Loop_mul4x_tail
// row finished: check whether np has been walked back to its start,
// meaning the whole tail for this b-window is complete.
1738 sub $t1,$np,$num // rewinded np?
1739 adc $carry,$carry,xzr
1740 cbz $t0,.Loop_mul4x_break
// ---------------------------------------------------------------------
// Tail continuation: not done yet, so pull in the next t[4..7]/a/n
// quadruples and fall back into the tail loop (the branch back and the
// .Loop_mul4x_break label itself are elided from this excerpt - the
// embedded source-line markers jump from 1753 to 1759).
// ---------------------------------------------------------------------
1742 ldp $t0,$t1,[$tp,#8*4]
1743 ldp $t2,$t3,[$tp,#8*6]
1744 ldp $a0,$a1,[$ap,#8*0]
1745 ldp $a2,$a3,[$ap,#8*2]
1747 adds $acc0,$acc0,$t0
1748 adcs $acc1,$acc1,$t1
1749 adcs $acc2,$acc2,$t2
1750 adcs $acc3,$acc3,$t3
1751 //adc $carry,$carry,xzr
1752 ldp $m0,$m1,[$np,#8*0]
1753 ldp $m2,$m3,[$np,#8*2]
// --- break path: one b-window fully processed -----------------------
// Absorb the running top word ($topmost), store the last four result
// limbs, advance bp to the next window of b, rewind ap, and decide
// whether to start another .Loop_mul4x_reduction pass (the conditional
// exit branch on the cmp at 1770 is elided from this excerpt).
1759 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1760 adds $acc0,$acc0,$topmost
1761 add $bp,$bp,#8*4 // bp++
1762 adcs $acc1,$acc1,xzr
1763 sub $ap,$ap,$num // rewind ap
1764 adcs $acc2,$acc2,xzr
1765 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1766 adcs $acc3,$acc3,xzr
1767 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1768 adc $topmost,$carry,xzr
1769 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1770 cmp $bp,$t3 // done yet?
1771 ldp $acc2,$acc3,[sp,#8*6]
// $t1 still holds the rewound np from line 1738.
1772 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1773 ldp $m2,$m3,[$t1,#8*2]
1778 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1779 ldp $a2,$a3,[$ap,#8*2]
1780 adds $ap,$ap,#8*4 // clear carry bit
1783 b .Loop_mul4x_reduction
// ---------------------------------------------------------------------
// Final conditional subtraction.  The .Lmul4x_sub loop (label elided
// from this excerpt) computes result - modulus into [$rp]; the borrow
// then selects, limb by limb, between the subtracted copy and the
// original, so the store pattern is identical on both paths (constant-
// time style selection via csel).  The stp xzr,xzr stores zero out the
// temporary area at $tp as we go - NOTE(review): presumably to wipe
// intermediate values from the stack; confirm against the frame setup
// earlier in the procedure.
// ---------------------------------------------------------------------
1787 // Final step. We see if result is larger than modulus, and
1788 // if it is, subtract the modulus. But comparison implies
1789 // subtraction. So we subtract modulus, see if it borrowed,
1790 // and conditionally copy original value.
1792 mov $ap_end,$t2 // $rp copy
// --- .Lmul4x_sub body (label elided): result - modulus into [$rp] ---
1800 ldp $m0,$m1,[$np,#8*0]
1802 ldp $acc0,$acc1,[$tp,#8*0]
1804 ldp $m2,$m3,[$np,#8*2]
1806 ldp $acc2,$acc3,[$tp,#8*2]
1808 stp $t0,$t1,[$rp,#8*0]
1810 stp $t2,$t3,[$rp,#8*2]
1813 cbnz $cnt,.Lmul4x_sub
1818 ldp $a0,$a1,[$ap_end,#8*0]
1820 stp $t0,$t1,[$rp,#8*0]
1821 ldp $a2,$a3,[$ap_end,#8*2]
1822 stp $t2,$t3,[$rp,#8*2]
// --- conditional copy: $a* = original result, $acc* = subtracted ----
1823 ldp $acc0,$acc1,[$ap,#8*0]
1824 ldp $acc2,$acc3,[$ap,#8*2]
1825 sbcs xzr,$topmost,xzr // did it borrow?
1826 ldr x30,[x29,#8] // pull return address
// "lo" (carry clear on AArch64) = subtraction borrowed = result was
// smaller than the modulus, so keep the original ($a*) limbs.
1831 csel $t0,$acc0,$a0,lo
1832 stp xzr,xzr,[$tp,#8*0]
1833 csel $t1,$acc1,$a1,lo
1834 ldp $a0,$a1,[$ap_end,#8*4]
1835 ldp $acc0,$acc1,[$ap,#8*4]
1836 csel $t2,$acc2,$a2,lo
1837 stp xzr,xzr,[$tp,#8*2]
1839 csel $t3,$acc3,$a3,lo
1840 ldp $a2,$a3,[$ap_end,#8*6]
1841 ldp $acc2,$acc3,[$ap,#8*6]
1843 stp $t0,$t1,[$ap_end,#8*0]
1844 stp $t2,$t3,[$ap_end,#8*2]
1845 add $ap_end,$ap_end,#8*4
1846 cbnz $cnt,.Lmul4x_cond_copy
// epilogue of the copy loop: select and store the final four limbs,
// finishing the zeroization of the scratch area.
1848 csel $t0,$acc0,$a0,lo
1849 stp xzr,xzr,[$tp,#8*0]
1850 csel $t1,$acc1,$a1,lo
1851 stp xzr,xzr,[$tp,#8*2]
1852 csel $t2,$acc2,$a2,lo
1853 stp xzr,xzr,[$tp,#8*3]
1854 csel $t3,$acc3,$a3,lo
1855 stp xzr,xzr,[$tp,#8*4]
1856 stp $t0,$t1,[$ap_end,#8*0]
1857 stp $t2,$t3,[$ap_end,#8*2]
// ---------------------------------------------------------------------
// Post-condition for the minimal (4-limb) case: a single conditional
// subtraction of the modulus, zeroization of the stack scratch area,
// store of the 4-limb result to rp, and restoration of the callee-saved
// registers x19-x28 from the frame at x29.  The terminating ret is
// elided from this excerpt (non-contiguous source-line markers).
// ---------------------------------------------------------------------
1862 .Lmul4x4_post_condition:
1863 adc $carry,$carry,xzr
1864 ldr $ap,[x29,#96] // pull rp
1865 // $acc0-3,$carry hold result, $m0-7 hold modulus
1867 ldr x30,[x29,#8] // pull return address
// NOTE(review): the sbcs chain computing result-modulus into $a0-3 is
// elided here; only its final borrow test and the zero-wipe stores of
// the scratch area are visible.
1869 stp xzr,xzr,[sp,#8*0]
1871 stp xzr,xzr,[sp,#8*2]
1873 stp xzr,xzr,[sp,#8*4]
1874 sbcs xzr,$carry,xzr // did it borrow?
1875 stp xzr,xzr,[sp,#8*6]
1877 // $a0-3 hold result-modulus
// "lo" = borrow occurred = result already < modulus: keep $acc0-3,
// otherwise take the subtracted copy (same csel pattern as the large
// case above).
1878 csel $a0,$acc0,$a0,lo
1879 csel $a1,$acc1,$a1,lo
1880 csel $a2,$acc2,$a2,lo
1881 csel $a3,$acc3,$a3,lo
1882 stp $a0,$a1,[$ap,#8*0]
1883 stp $a2,$a3,[$ap,#8*2]
// restore callee-saved registers per AAPCS64.
1886 ldp x19,x20,[x29,#16]
1888 ldp x21,x22,[x29,#32]
1890 ldp x23,x24,[x29,#48]
1891 ldp x25,x26,[x29,#64]
1892 ldp x27,x28,[x29,#80]
1894 // x30 loaded earlier
// branch-protection check macro (pairs with the SIGN macro at entry).
1895 AARCH64_VALIDATE_LINK_REGISTER
1897 .size __bn_mul4x_mont,.-__bn_mul4x_mont
// CRYPTOGAMS attribution string, embedded as data in the object file.
1901 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
# Flush and close STDOUT explicitly: buffered write errors (e.g. a full
# disk while emitting the generated assembly) only surface at close, and
# we must fail loudly rather than produce a truncated .S file.
1907 close STDOUT or die "error closing STDOUT: $!";