2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
20 # work. While it does improve RSA sign performance by 20-30% (less for
21 # longer keys) on most processors, for some reason RSA2048 is not
22 # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23 # instruction issue rate is limited on processor in question, meaning
24 # that dedicated squaring procedure is a must. Well, actually all
25 # contemporary AArch64 processors seem to have limited multiplication
26 # issue rate, i.e. they can't issue multiplication every cycle, which
27 # explains moderate improvement coefficients in comparison to
28 # compiler-generated code. Recall that compiler is instructed to use
29 # umulh and therefore uses same amount of multiplication instructions
30 # to do the job. Assembly's edge is to minimize number of "collateral"
31 # instructions and of course instruction scheduling.
35 # Squaring procedure that handles lengths divisible by 8 improves
36 # RSA/DSA performance by 25-40-60% depending on processor and key
37 # length. Overall improvement coefficients are always positive in
38 # comparison to compiler-generated code. On Cortex-A57 improvement
39 # is still modest on longest key lengths, while others exhibit e.g.
40 # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41 # on Cortex-A57 and ~60-100% faster on others.
# Command line: [flavour] [output]
#   $output  - taken from the END of @ARGV when that argument looks like a
#              file name (i.e. it ends in ".ext"); otherwise undef.
#   $flavour - taken from the FRONT of @ARGV when that argument contains no
#              dot at all; otherwise undef.
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in the directory this script was
# started from, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Start the translator (run with the same perl binary, $^X) as a child
# process fed through a pipe.  On failure report the OS error in $!; the
# capture variable $1 would only repeat the directory matched above.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
# Working registers for bn_mul_mont.  The register list leaves out x18, and
# because only 17 names are bound its last element (x24) goes unused.
($lo0, $hi0, $aj, $m0, $alo, $ahi,
 $lo1, $hi1, $nj, $m1, $nlo, $nhi,
 $ovf, $i, $j, $tp, $tj) = map { "x$_" } (6 .. 17, 19 .. 24);

# Argument registers x0..x5, in parameter order:
#   BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#   const BN_ULONG *np, const BN_ULONG *n0, int num
($rp, $ap, $bp, $np, $n0, $num) = map { "x$_" } (0 .. 5);
73 .type bn_mul_mont,%function
81 stp x29,x30,[sp,#-64]!
87 ldr $m0,[$bp],#8 // bp[0]
89 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
92 and $tp,$tp,#-16 // ABI says so
93 ldp $hi1,$nj,[$np],#16 // np[0..1]
95 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
96 sub $j,$num,#16 // j=num-2
98 mul $alo,$aj,$m0 // ap[1]*bp[0]
101 mul $m1,$lo0,$n0 // "tp[0]"*n0
104 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
106 mul $nlo,$nj,$m1 // np[1]*m1
107 // (*) adds $lo1,$lo1,$lo0 // discarded
108 // (*) As for removal of first multiplication and addition
109 // instructions. The outcome of first addition is
110 // guaranteed to be zero, which leaves two computationally
111 // significant outcomes: it either carries or not. Then
112 // question is when does it carry? Is there alternative
113 // way to deduce it? If you follow operations, you can
114 // observe that condition for carry is quite simple:
115 // $lo0 being non-zero. So that carry can be calculated
116 // by adding -1 to $lo0. That's what next instruction does.
117 subs xzr,$lo0,#1 // (*)
130 mul $alo,$aj,$m0 // ap[j]*bp[0]
135 mul $nlo,$nj,$m1 // np[j]*m1
138 str $lo1,[$tp],#8 // tp[j-1]
143 sub $ap,$ap,$num // rewind $ap
147 sub $np,$np,$num // rewind $np
151 sub $i,$num,#8 // i=num-1
154 adc $ovf,xzr,xzr // upmost overflow bit
158 ldr $m0,[$bp],#8 // bp[i]
159 ldp $hi0,$aj,[$ap],#16
160 ldr $tj,[sp] // tp[0]
163 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
164 sub $j,$num,#16 // j=num-2
166 ldp $hi1,$nj,[$np],#16
167 mul $alo,$aj,$m0 // ap[1]*bp[i]
175 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
177 mul $nlo,$nj,$m1 // np[1]*m1
178 // (*) adds $lo1,$lo1,$lo0
179 subs xzr,$lo0,#1 // (*)
186 ldr $tj,[$tp],#8 // tp[j]
195 mul $alo,$aj,$m0 // ap[j]*bp[i]
200 mul $nlo,$nj,$m1 // np[j]*m1
203 stur $lo1,[$tp,#-16] // tp[j-1]
207 ldr $tj,[$tp],#8 // tp[j]
210 sub $ap,$ap,$num // rewind $ap
214 sub $np,$np,$num // rewind $np
223 adc $ovf,$ovf,xzr // upmost overflow bit
224 stp $lo1,$hi1,[$tp,#-16]
228 // Final step. We see if result is larger than modulus, and
229 // if it is, subtract the modulus. But comparison implies
230 // subtraction. So we subtract modulus, see if it borrowed,
231 // and conditionally copy original value.
232 ldr $tj,[sp] // tp[0]
234 ldr $nj,[$np],#8 // np[0]
235 subs $j,$num,#8 // j=num-1 and clear borrow
238 sbcs $aj,$tj,$nj // tp[j]-np[j]
242 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
246 sbcs $ovf,$ovf,xzr // did it borrow?
247 str $aj,[$ap],#8 // rp[num-1]
249 ldr $tj,[sp] // tp[0]
251 ldr $aj,[$rp],#8 // rp[0]
252 sub $num,$num,#8 // num--
255 sub $num,$num,#8 // num--
256 csel $nj,$tj,$aj,lo // did it borrow?
259 stur xzr,[$tp,#-16] // wipe tp
261 cbnz $num,.Lcond_copy
264 stur xzr,[$tp,#-8] // wipe tp
267 ldp x19,x20,[x29,#16]
269 ldp x21,x22,[x29,#32]
271 ldp x23,x24,[x29,#48]
274 .size bn_mul_mont,.-bn_mul_mont
277 ########################################################################
278 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
# Register map for the 8x squaring code.
#   $a0-$a7     : x6-x13
#   $t0-$t3     : x14-x17
#   $acc0-$acc7 : x19-x26
my ($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { "x$_" } 6 .. 13;
my ($t0, $t1, $t2, $t3) = map { "x$_" } 14 .. 17;
my ($acc0, $acc1, $acc2, $acc3,
    $acc4, $acc5, $acc6, $acc7) = map { "x$_" } 19 .. 26;
my ($cnt, $carry, $topmost) = map { "x$_" } (27, 28, 30);

# These three share registers with existing names: the lexical $tp declared
# here hides the file-level $tp and is bound to $bp's register, $ap_end is
# bound to $np's register, and $na0 is bound to $carry's.
my ($tp, $ap_end, $na0) = ($bp, $np, $carry);
287 .type __bn_sqr8x_mont,%function
293 .inst 0xd503233f // paciasp
294 stp x29,x30,[sp,#-128]!
301 stp $rp,$np,[sp,#96] // offload rp and np
303 ldp $a0,$a1,[$ap,#8*0]
304 ldp $a2,$a3,[$ap,#8*2]
305 ldp $a4,$a5,[$ap,#8*4]
306 ldp $a6,$a7,[$ap,#8*6]
308 sub $tp,sp,$num,lsl#4
317 stp xzr,xzr,[$tp,#8*0]
318 stp xzr,xzr,[$tp,#8*2]
319 stp xzr,xzr,[$tp,#8*4]
320 stp xzr,xzr,[$tp,#8*6]
322 stp xzr,xzr,[$tp,#8*8]
323 stp xzr,xzr,[$tp,#8*10]
324 stp xzr,xzr,[$tp,#8*12]
325 stp xzr,xzr,[$tp,#8*14]
327 cbnz $cnt,.Lsqr8x_zero
340 str $n0,[x29,#112] // offload n0
342 // Multiply everything but a[i]*a[i]
374 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
378 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
385 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
392 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
393 adc $acc0,xzr,xzr // t[8]
394 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
401 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
414 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
421 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
422 adc $acc1,xzr,xzr // t[9]
428 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
439 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
446 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
447 adc $acc2,xzr,xzr // t[10]
451 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
460 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
467 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
468 adc $acc3,xzr,xzr // t[11]
470 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
477 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
483 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
484 adc $acc4,xzr,xzr // t[12]
488 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
493 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
495 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
496 adc $acc5,xzr,xzr // t[13]
498 sub $cnt,$ap_end,$ap // done yet?
502 sub $t0,$ap_end,$num // rewinded ap
503 adc $acc6,xzr,xzr // t[14]
506 cbz $cnt,.Lsqr8x_outer_break
509 ldp $a0,$a1,[$tp,#8*0]
510 ldp $a2,$a3,[$tp,#8*2]
511 ldp $a4,$a5,[$tp,#8*4]
512 ldp $a6,$a7,[$tp,#8*6]
515 ldp $a0,$a1,[$ap,#8*0]
518 ldp $a2,$a3,[$ap,#8*2]
521 ldp $a4,$a5,[$ap,#8*4]
525 ldp $a6,$a7,[$ap,#8*6]
527 //adc $carry,xzr,xzr // moved below
539 // a[f]a[1]........................
541 // a[f]a[2]........................
543 // a[f]a[3]........................
545 // a[f]a[4]........................
547 // a[f]a[5]........................
549 // a[f]a[6]........................
551 // a[f]a[7]........................
554 adc $carry,xzr,xzr // carry bit, modulo-scheduled
575 adc $carry,$carry,xzr
589 adcs $acc7,$carry,$t3
590 //adc $carry,xzr,xzr // moved above
591 cbnz $cnt,.Lsqr8x_mul
592 // note that carry flag is guaranteed
593 // to be zero at this point
594 cmp $ap,$ap_end // done yet?
597 ldp $a0,$a1,[$tp,#8*0]
598 ldp $a2,$a3,[$tp,#8*2]
599 ldp $a4,$a5,[$tp,#8*4]
600 ldp $a6,$a7,[$tp,#8*6]
604 ldp $a0,$a1,[$ap,#8*0]
607 ldp $a2,$a3,[$ap,#8*2]
610 ldp $a4,$a5,[$ap,#8*4]
614 ldp $a6,$a7,[$ap,#8*6]
616 //adc $carry,xzr,xzr // moved above
621 ldp $a0,$a1,[$rp,#8*0]
623 ldp $a2,$a3,[$rp,#8*2]
624 sub $t0,$ap_end,$ap // is it last iteration?
625 ldp $a4,$a5,[$rp,#8*4]
627 ldp $a6,$a7,[$rp,#8*6]
628 cbz $t0,.Lsqr8x_outer_loop
630 stp $acc0,$acc1,[$tp,#8*0]
631 ldp $acc0,$acc1,[$t1,#8*0]
632 stp $acc2,$acc3,[$tp,#8*2]
633 ldp $acc2,$acc3,[$t1,#8*2]
634 stp $acc4,$acc5,[$tp,#8*4]
635 ldp $acc4,$acc5,[$t1,#8*4]
636 stp $acc6,$acc7,[$tp,#8*6]
638 ldp $acc6,$acc7,[$t1,#8*6]
643 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
644 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
645 ldp $t1,$t2,[sp,#8*1]
646 ldp $a5,$a7,[$t0,#8*2]
648 ldp $t3,$t0,[sp,#8*3]
650 stp $acc0,$acc1,[$tp,#8*0]
652 stp $acc2,$acc3,[$tp,#8*2]
654 stp $acc4,$acc5,[$tp,#8*4]
656 stp $acc6,$acc7,[$tp,#8*6]
659 adds $acc1,$a1,$t1,lsl#1
668 ldp $t1,$t2,[$tp,#8*5]
670 ldp $a1,$a3,[$ap],#8*2
675 stp $acc0,$acc1,[$tp,#8*0]
678 stp $acc2,$acc3,[$tp,#8*2]
680 ldp $t3,$t0,[$tp,#8*7]
685 ldp $t1,$t2,[$tp,#8*9]
687 ldp $a5,$a7,[$ap],#8*2
691 stp $acc4,$acc5,[$tp,#8*4]
693 stp $acc6,$acc7,[$tp,#8*6]
698 ldp $t3,$t0,[$tp,#8*3]
700 cbnz $cnt,.Lsqr4x_shift_n_add
# The code that follows addresses the modulus through the registers so far
# named $ap / $ap_end.  The lexical $np declared here hides the file-level
# $np for the rest of the enclosing scope.
my $np     = $ap;
my $np_end = $ap_end;
704 ldp $np,$n0,[x29,#104] // pull np and n0
709 ldp $t1,$t2,[$tp,#8*5]
712 stp $acc0,$acc1,[$tp,#8*0]
715 stp $acc2,$acc3,[$tp,#8*2]
719 ldp $acc0,$acc1,[sp,#8*0]
722 ldp $a0,$a1,[$np,#8*0]
725 ldp $a2,$a3,[$np,#8*2]
727 ldp $a4,$a5,[$np,#8*4]
729 // Reduce by 512 bits per iteration
730 mul $na0,$n0,$acc0 // t[0]*n0
731 ldp $a6,$a7,[$np,#8*6]
733 ldp $acc2,$acc3,[sp,#8*2]
734 stp $acc4,$acc5,[$tp,#8*4]
735 ldp $acc4,$acc5,[sp,#8*4]
736 stp $acc6,$acc7,[$tp,#8*6]
737 ldp $acc6,$acc7,[sp,#8*6]
739 mov $topmost,xzr // initial top-most carry
744 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
748 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
750 // (*) adds xzr,$acc0,$t0
751 subs xzr,$acc0,#1 // (*)
760 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
776 mul $na0,$n0,$acc0 // next t[0]*n0
781 cbnz $cnt,.Lsqr8x_reduction
783 ldp $t0,$t1,[$tp,#8*0]
784 ldp $t2,$t3,[$tp,#8*2]
786 sub $cnt,$np_end,$np // done yet?
789 ldp $t0,$t1,[$tp,#8*4]
792 ldp $t2,$t3,[$tp,#8*6]
797 //adc $carry,xzr,xzr // moved below
798 cbz $cnt,.Lsqr8x8_post_condition
801 ldp $a0,$a1,[$np,#8*0]
802 ldp $a2,$a3,[$np,#8*2]
803 ldp $a4,$a5,[$np,#8*4]
805 ldp $a6,$a7,[$np,#8*6]
810 adc $carry,xzr,xzr // carry bit, modulo-scheduled
831 adc $carry,$carry,xzr
845 adcs $acc7,$carry,$t3
846 //adc $carry,xzr,xzr // moved above
847 cbnz $cnt,.Lsqr8x_tail
848 // note that carry flag is guaranteed
849 // to be zero at this point
850 ldp $a0,$a1,[$tp,#8*0]
851 sub $cnt,$np_end,$np // done yet?
852 sub $t2,$np_end,$num // rewinded np
853 ldp $a2,$a3,[$tp,#8*2]
854 ldp $a4,$a5,[$tp,#8*4]
855 ldp $a6,$a7,[$tp,#8*6]
856 cbz $cnt,.Lsqr8x_tail_break
861 ldp $a0,$a1,[$np,#8*0]
864 ldp $a2,$a3,[$np,#8*2]
867 ldp $a4,$a5,[$np,#8*4]
871 ldp $a6,$a7,[$np,#8*6]
873 //adc $carry,xzr,xzr // moved above
878 ldr $n0,[x29,#112] // pull n0
879 add $cnt,$tp,#8*8 // end of current t[num] window
881 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
884 ldp $acc0,$acc1,[$rp,#8*0]
886 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
888 ldp $a2,$a3,[$t2,#8*2]
891 ldp $a4,$a5,[$t2,#8*4]
894 ldp $a6,$a7,[$t2,#8*6]
896 adc $topmost,xzr,xzr // top-most carry
898 stp $t0,$t1,[$tp,#8*0]
899 stp $acc2,$acc3,[$tp,#8*2]
900 ldp $acc2,$acc3,[$rp,#8*2]
901 stp $acc4,$acc5,[$tp,#8*4]
902 ldp $acc4,$acc5,[$rp,#8*4]
903 cmp $cnt,x29 // did we hit the bottom?
904 stp $acc6,$acc7,[$tp,#8*6]
905 mov $tp,$rp // slide the window
906 ldp $acc6,$acc7,[$rp,#8*6]
908 b.ne .Lsqr8x_reduction
910 // Final step. We see if result is larger than modulus, and
911 // if it is, subtract the modulus. But comparison implies
912 // subtraction. So we subtract modulus, see if it borrowed,
913 // and conditionally copy original value.
914 ldr $rp,[x29,#96] // pull rp
919 mov $ap_end,$rp // $rp copy
923 ldp $a0,$a1,[$np,#8*0]
925 stp $t0,$t1,[$rp,#8*0]
927 ldp $a2,$a3,[$np,#8*2]
929 stp $t2,$t3,[$rp,#8*2]
931 ldp $a4,$a5,[$np,#8*4]
933 ldp $a6,$a7,[$np,#8*6]
935 ldp $acc0,$acc1,[$tp,#8*0]
937 ldp $acc2,$acc3,[$tp,#8*2]
938 ldp $acc4,$acc5,[$tp,#8*4]
939 ldp $acc6,$acc7,[$tp,#8*6]
941 stp $t0,$t1,[$rp,#8*4]
943 stp $t2,$t3,[$rp,#8*6]
946 cbnz $cnt,.Lsqr8x_sub
951 ldp $a0,$a1,[$ap_end,#8*0]
953 stp $t0,$t1,[$rp,#8*0]
955 ldp $a2,$a3,[$ap_end,#8*2]
957 stp $t2,$t3,[$rp,#8*2]
959 ldp $acc0,$acc1,[$ap,#8*0]
961 ldp $acc2,$acc3,[$ap,#8*2]
962 sbcs xzr,$topmost,xzr // did it borrow?
963 ldr x30,[x29,#8] // pull return address
964 stp $t0,$t1,[$rp,#8*4]
965 stp $t2,$t3,[$rp,#8*6]
970 csel $t0,$acc0,$a0,lo
971 stp xzr,xzr,[$tp,#8*0]
972 csel $t1,$acc1,$a1,lo
973 ldp $a0,$a1,[$ap_end,#8*4]
974 ldp $acc0,$acc1,[$ap,#8*4]
975 csel $t2,$acc2,$a2,lo
976 stp xzr,xzr,[$tp,#8*2]
978 csel $t3,$acc3,$a3,lo
979 ldp $a2,$a3,[$ap_end,#8*6]
980 ldp $acc2,$acc3,[$ap,#8*6]
982 stp $t0,$t1,[$ap_end,#8*0]
983 stp $t2,$t3,[$ap_end,#8*2]
984 add $ap_end,$ap_end,#8*4
985 stp xzr,xzr,[$ap,#8*0]
986 stp xzr,xzr,[$ap,#8*2]
987 cbnz $cnt,.Lsqr4x_cond_copy
989 csel $t0,$acc0,$a0,lo
990 stp xzr,xzr,[$tp,#8*0]
991 csel $t1,$acc1,$a1,lo
992 stp xzr,xzr,[$tp,#8*2]
993 csel $t2,$acc2,$a2,lo
994 csel $t3,$acc3,$a3,lo
995 stp $t0,$t1,[$ap_end,#8*0]
996 stp $t2,$t3,[$ap_end,#8*2]
1001 .Lsqr8x8_post_condition:
1003 ldr x30,[x29,#8] // pull return address
1004 // $acc0-7,$carry hold result, $a0-7 hold modulus
1006 ldr $ap,[x29,#96] // pull rp
1008 stp xzr,xzr,[sp,#8*0]
1010 stp xzr,xzr,[sp,#8*2]
1012 stp xzr,xzr,[sp,#8*4]
1014 stp xzr,xzr,[sp,#8*6]
1016 stp xzr,xzr,[sp,#8*8]
1018 stp xzr,xzr,[sp,#8*10]
1020 stp xzr,xzr,[sp,#8*12]
1021 sbcs $carry,$carry,xzr // did it borrow?
1022 stp xzr,xzr,[sp,#8*14]
1024 // $a0-7 hold result-modulus
1025 csel $a0,$acc0,$a0,lo
1026 csel $a1,$acc1,$a1,lo
1027 csel $a2,$acc2,$a2,lo
1028 csel $a3,$acc3,$a3,lo
1029 stp $a0,$a1,[$ap,#8*0]
1030 csel $a4,$acc4,$a4,lo
1031 csel $a5,$acc5,$a5,lo
1032 stp $a2,$a3,[$ap,#8*2]
1033 csel $a6,$acc6,$a6,lo
1034 csel $a7,$acc7,$a7,lo
1035 stp $a4,$a5,[$ap,#8*4]
1036 stp $a6,$a7,[$ap,#8*6]
1039 ldp x19,x20,[x29,#16]
1041 ldp x21,x22,[x29,#32]
1043 ldp x23,x24,[x29,#48]
1044 ldp x25,x26,[x29,#64]
1045 ldp x27,x28,[x29,#80]
1047 .inst 0xd50323bf // autiasp
1049 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1054 ########################################################################
1055 # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1056 # x86_64-mont5 module, it's different in sense that it performs
1057 # reduction 256 bits at a time.
1059 my ($a0,$a1,$a2,$a3,
1062 $acc0,$acc1,$acc2,$acc3,$acc4,
1063 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
# $carry is bound to the same register as $rp; $topmost is x30.
my $carry   = $rp;
my $topmost = "x30";
1068 .type __bn_mul4x_mont,%function
1071 .inst 0xd503233f // paciasp
1072 stp x29,x30,[sp,#-128]!
1074 stp x19,x20,[sp,#16]
1075 stp x21,x22,[sp,#32]
1076 stp x23,x24,[sp,#48]
1077 stp x25,x26,[sp,#64]
1078 stp x27,x28,[sp,#80]
1080 sub $tp,sp,$num,lsl#3
1082 ldr $n0,[$n0] // *n0
1083 sub sp,$tp,#8*4 // alloca
1086 add $ap_end,$ap,$num
1087 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1089 ldr $bi,[$bp,#8*0] // b[0]
1090 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1091 ldp $a2,$a3,[$ap,#8*2]
1097 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1098 ldp $m2,$m3,[$np,#8*2]
1099 adds $np,$np,#8*4 // clear carry bit
1104 .Loop_mul4x_1st_reduction:
1105 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1106 adc $carry,$carry,xzr // modulo-scheduled
1112 adds $acc0,$acc0,$t0
1113 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1114 adcs $acc1,$acc1,$t1
1115 mul $mi,$acc0,$n0 // t[0]*n0
1116 adcs $acc2,$acc2,$t2
1118 adcs $acc3,$acc3,$t3
1122 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1123 adds $acc1,$acc1,$t0
1124 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1125 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1126 adcs $acc2,$acc2,$t1
1128 adcs $acc3,$acc3,$t2
1130 adc $acc4,$acc4,$t3 // can't overflow
1132 // (*) adds xzr,$acc0,$t0
1133 subs xzr,$acc0,#1 // (*)
1134 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1135 adcs $acc0,$acc1,$t1
1137 adcs $acc1,$acc2,$t2
1139 adcs $acc2,$acc3,$t3
1141 adcs $acc3,$acc4,$carry
1143 adds $acc0,$acc0,$t0
1145 adcs $acc1,$acc1,$t1
1146 adcs $acc2,$acc2,$t2
1147 adcs $acc3,$acc3,$t3
1148 //adc $carry,$carry,xzr
1149 cbnz $cnt,.Loop_mul4x_1st_reduction
1151 cbz $t0,.Lmul4x4_post_condition
1153 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1154 ldp $a2,$a3,[$ap,#8*2]
1156 ldr $mi,[sp] // a[0]*n0
1157 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1158 ldp $m2,$m3,[$np,#8*2]
1161 .Loop_mul4x_1st_tail:
1162 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1163 adc $carry,$carry,xzr // modulo-scheduled
1169 adds $acc0,$acc0,$t0
1170 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1171 adcs $acc1,$acc1,$t1
1173 adcs $acc2,$acc2,$t2
1175 adcs $acc3,$acc3,$t3
1178 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1179 adds $acc1,$acc1,$t0
1180 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1181 adcs $acc2,$acc2,$t1
1183 adcs $acc3,$acc3,$t2
1185 adc $acc4,$acc4,$t3 // can't overflow
1187 adds $acc0,$acc0,$t0
1188 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1189 adcs $acc1,$acc1,$t1
1191 adcs $acc2,$acc2,$t2
1193 adcs $acc3,$acc3,$t3
1194 adcs $acc4,$acc4,$carry
1197 ldr $mi,[sp,$cnt] // next t[0]*n0
1198 str $acc0,[$tp],#8 // result!!!
1199 adds $acc0,$acc1,$t0
1200 sub $t0,$ap_end,$ap // done yet?
1201 adcs $acc1,$acc2,$t1
1202 adcs $acc2,$acc3,$t2
1203 adcs $acc3,$acc4,$t3
1204 //adc $carry,$carry,xzr
1205 cbnz $cnt,.Loop_mul4x_1st_tail
1207 sub $t1,$ap_end,$num // rewinded $ap
1208 cbz $t0,.Lmul4x_proceed
1210 ldp $a0,$a1,[$ap,#8*0]
1211 ldp $a2,$a3,[$ap,#8*2]
1213 ldp $m0,$m1,[$np,#8*0]
1214 ldp $m2,$m3,[$np,#8*2]
1216 b .Loop_mul4x_1st_tail
1220 ldr $bi,[$bp,#8*4]! // *++b
1221 adc $topmost,$carry,xzr
1222 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1223 sub $np,$np,$num // rewind np
1224 ldp $a2,$a3,[$t1,#8*2]
1227 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1228 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1229 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1230 ldp $acc2,$acc3,[sp,#8*6]
1232 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1234 ldp $m2,$m3,[$np,#8*2]
1235 adds $np,$np,#8*4 // clear carry bit
1239 .Loop_mul4x_reduction:
1240 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1241 adc $carry,$carry,xzr // modulo-scheduled
1247 adds $acc0,$acc0,$t0
1248 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1249 adcs $acc1,$acc1,$t1
1250 mul $mi,$acc0,$n0 // t[0]*n0
1251 adcs $acc2,$acc2,$t2
1253 adcs $acc3,$acc3,$t3
1257 ldr $bi,[$bp,$cnt] // next b[i]
1258 adds $acc1,$acc1,$t0
1259 // (*) mul $t0,$m0,$mi
1260 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1261 adcs $acc2,$acc2,$t1
1262 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
1263 adcs $acc3,$acc3,$t2
1265 adc $acc4,$acc4,$t3 // can't overflow
1267 // (*) adds xzr,$acc0,$t0
1268 subs xzr,$acc0,#1 // (*)
1269 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
1270 adcs $acc0,$acc1,$t1
1272 adcs $acc1,$acc2,$t2
1274 adcs $acc2,$acc3,$t3
1276 adcs $acc3,$acc4,$carry
1278 adds $acc0,$acc0,$t0
1279 adcs $acc1,$acc1,$t1
1280 adcs $acc2,$acc2,$t2
1281 adcs $acc3,$acc3,$t3
1282 //adc $carry,$carry,xzr
1283 cbnz $cnt,.Loop_mul4x_reduction
1285 adc $carry,$carry,xzr
1286 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1287 ldp $t2,$t3,[$tp,#8*6]
1288 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1289 ldp $a2,$a3,[$ap,#8*2]
1291 adds $acc0,$acc0,$t0
1292 adcs $acc1,$acc1,$t1
1293 adcs $acc2,$acc2,$t2
1294 adcs $acc3,$acc3,$t3
1295 //adc $carry,$carry,xzr
1297 ldr $mi,[sp] // t[0]*n0
1298 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1299 ldp $m2,$m3,[$np,#8*2]
1304 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1305 adc $carry,$carry,xzr // modulo-scheduled
1311 adds $acc0,$acc0,$t0
1312 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1313 adcs $acc1,$acc1,$t1
1315 adcs $acc2,$acc2,$t2
1317 adcs $acc3,$acc3,$t3
1320 ldr $bi,[$bp,$cnt] // next b[i]
1321 adds $acc1,$acc1,$t0
1322 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1323 adcs $acc2,$acc2,$t1
1325 adcs $acc3,$acc3,$t2
1327 adc $acc4,$acc4,$t3 // can't overflow
1329 adds $acc0,$acc0,$t0
1330 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1331 adcs $acc1,$acc1,$t1
1333 adcs $acc2,$acc2,$t2
1335 adcs $acc3,$acc3,$t3
1337 adcs $acc4,$acc4,$carry
1338 ldr $mi,[sp,$cnt] // next a[0]*n0
1340 str $acc0,[$tp],#8 // result!!!
1341 adds $acc0,$acc1,$t0
1342 sub $t0,$ap_end,$ap // done yet?
1343 adcs $acc1,$acc2,$t1
1344 adcs $acc2,$acc3,$t2
1345 adcs $acc3,$acc4,$t3
1346 //adc $carry,$carry,xzr
1347 cbnz $cnt,.Loop_mul4x_tail
1349 sub $t1,$np,$num // rewinded np?
1350 adc $carry,$carry,xzr
1351 cbz $t0,.Loop_mul4x_break
1353 ldp $t0,$t1,[$tp,#8*4]
1354 ldp $t2,$t3,[$tp,#8*6]
1355 ldp $a0,$a1,[$ap,#8*0]
1356 ldp $a2,$a3,[$ap,#8*2]
1358 adds $acc0,$acc0,$t0
1359 adcs $acc1,$acc1,$t1
1360 adcs $acc2,$acc2,$t2
1361 adcs $acc3,$acc3,$t3
1362 //adc $carry,$carry,xzr
1363 ldp $m0,$m1,[$np,#8*0]
1364 ldp $m2,$m3,[$np,#8*2]
1370 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1371 adds $acc0,$acc0,$topmost
1372 add $bp,$bp,#8*4 // bp++
1373 adcs $acc1,$acc1,xzr
1374 sub $ap,$ap,$num // rewind ap
1375 adcs $acc2,$acc2,xzr
1376 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1377 adcs $acc3,$acc3,xzr
1378 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1379 adc $topmost,$carry,xzr
1380 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1381 cmp $bp,$t3 // done yet?
1382 ldp $acc2,$acc3,[sp,#8*6]
1383 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1384 ldp $m2,$m3,[$t1,#8*2]
1389 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1390 ldp $a2,$a3,[$ap,#8*2]
1391 adds $ap,$ap,#8*4 // clear carry bit
1394 b .Loop_mul4x_reduction
1398 // Final step. We see if result is larger than modulus, and
1399 // if it is, subtract the modulus. But comparison implies
1400 // subtraction. So we subtract modulus, see if it borrowed,
1401 // and conditionally copy original value.
1403 mov $ap_end,$t2 // $rp copy
1411 ldp $m0,$m1,[$np,#8*0]
1413 ldp $acc0,$acc1,[$tp,#8*0]
1415 ldp $m2,$m3,[$np,#8*2]
1417 ldp $acc2,$acc3,[$tp,#8*2]
1419 stp $t0,$t1,[$rp,#8*0]
1421 stp $t2,$t3,[$rp,#8*2]
1424 cbnz $cnt,.Lmul4x_sub
1429 ldp $a0,$a1,[$ap_end,#8*0]
1431 stp $t0,$t1,[$rp,#8*0]
1432 ldp $a2,$a3,[$ap_end,#8*2]
1433 stp $t2,$t3,[$rp,#8*2]
1434 ldp $acc0,$acc1,[$ap,#8*0]
1435 ldp $acc2,$acc3,[$ap,#8*2]
1436 sbcs xzr,$topmost,xzr // did it borrow?
1437 ldr x30,[x29,#8] // pull return address
1442 csel $t0,$acc0,$a0,lo
1443 stp xzr,xzr,[$tp,#8*0]
1444 csel $t1,$acc1,$a1,lo
1445 ldp $a0,$a1,[$ap_end,#8*4]
1446 ldp $acc0,$acc1,[$ap,#8*4]
1447 csel $t2,$acc2,$a2,lo
1448 stp xzr,xzr,[$tp,#8*2]
1450 csel $t3,$acc3,$a3,lo
1451 ldp $a2,$a3,[$ap_end,#8*6]
1452 ldp $acc2,$acc3,[$ap,#8*6]
1454 stp $t0,$t1,[$ap_end,#8*0]
1455 stp $t2,$t3,[$ap_end,#8*2]
1456 add $ap_end,$ap_end,#8*4
1457 cbnz $cnt,.Lmul4x_cond_copy
1459 csel $t0,$acc0,$a0,lo
1460 stp xzr,xzr,[$tp,#8*0]
1461 csel $t1,$acc1,$a1,lo
1462 stp xzr,xzr,[$tp,#8*2]
1463 csel $t2,$acc2,$a2,lo
1464 stp xzr,xzr,[$tp,#8*3]
1465 csel $t3,$acc3,$a3,lo
1466 stp xzr,xzr,[$tp,#8*4]
1467 stp $t0,$t1,[$ap_end,#8*0]
1468 stp $t2,$t3,[$ap_end,#8*2]
1473 .Lmul4x4_post_condition:
1474 adc $carry,$carry,xzr
1475 ldr $ap,[x29,#96] // pull rp
1476 // $acc0-3,$carry hold result, $m0-7 hold modulus
1478 ldr x30,[x29,#8] // pull return address
1480 stp xzr,xzr,[sp,#8*0]
1482 stp xzr,xzr,[sp,#8*2]
1484 stp xzr,xzr,[sp,#8*4]
1485 sbcs xzr,$carry,xzr // did it borrow?
1486 stp xzr,xzr,[sp,#8*6]
1488 // $a0-3 hold result-modulus
1489 csel $a0,$acc0,$a0,lo
1490 csel $a1,$acc1,$a1,lo
1491 csel $a2,$acc2,$a2,lo
1492 csel $a3,$acc3,$a3,lo
1493 stp $a0,$a1,[$ap,#8*0]
1494 stp $a2,$a3,[$ap,#8*2]
1497 ldp x19,x20,[x29,#16]
1499 ldp x21,x22,[x29,#32]
1501 ldp x23,x24,[x29,#48]
1502 ldp x25,x26,[x29,#64]
1503 ldp x27,x28,[x29,#80]
1505 .inst 0xd50323bf // autiasp
1507 .size __bn_mul4x_mont,.-__bn_mul4x_mont
1511 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
# Buffered write errors only surface when the handle is closed, so check the
# result and include the OS error ($!) in the diagnostic.
close STDOUT or die "error closing STDOUT: $!";