3 # ====================================================================
4 # Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
5 # <appro@openssl.org>. The module is licensed under 2-clause BSD
6 # license. November 2012. All rights reserved.
7 # ====================================================================
9 ######################################################################
10 # Montgomery squaring-n-multiplication module for SPARC T4.
12 # The module consists of three parts:
14 # 1) collection of "single-op" subroutines that perform single
15 # operation, Montgomery squaring or multiplication, on 512-,
16 # 1024-, 1536- and 2048-bit operands;
17 # 2) collection of "multi-op" subroutines that perform 5 squaring and
18 # 1 multiplication operations on operands of above lengths;
19 # 3) fall-back and helper VIS3 subroutines.
21 # RSA sign is dominated by multi-op subroutine, while RSA verify and
22 # DSA - by single-op. Special note about 4096-bit RSA verify result.
23 # Operands are too long for dedicated hardware and it's handled by
24 # VIS3 code, which is why you don't see any improvement. It's surely
25 # possible to improve it [by deploying 'mpmul' instruction], maybe in
28 # Performance improvement.
30 # 64-bit process, VIS3:
31 # sign verify sign/s verify/s
32 # rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
33 # rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
34 # rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
35 # dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
36 # dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
38 # 64-bit process, this module:
39 # sign verify sign/s verify/s
40 # rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
41 # rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
42 # rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
43 # dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
44 # dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
46 ######################################################################
47 # 32-bit process, VIS3:
48 # sign verify sign/s verify/s
49 # rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
50 # rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
51 # rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
52 # dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
53 # dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
55 # 32-bit process, this module:
56 # sign verify sign/s verify/s
57 # rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
58 # rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
59 # rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
60 # dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
61 # dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
63 # 32-bit code is prone to performance degradation as interrupt rate
64 # dispatched to CPU executing the code grows. This is because in
65 # standard process of handling interrupt in 32-bit process context
66 # upper halves of most integer registers used as input or output are
67 # zeroed. This renders result invalid, and operation has to be re-run.
68 # If CPU is "bothered" with timer interrupts only, the penalty is
69 # hardly measurable. But in order to mitigate this problem for higher
70 # interrupt rates contemporary Linux kernel recognizes biased stack
71 # even in 32-bit process context and preserves full register contents.
72 # See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# Locate this script's own directory so the shared perlasm helpers can
# be found relative to it, regardless of the current working directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
# Redirect STDOUT to the requested output file.  Three-argument open
# prevents mode injection via the filename and the failure is reported
# instead of silently producing no output.
# NOTE(review): $output is expected to be set from the command line on
# a line elided from this view -- confirm before relying on it.
open STDOUT, '>', $output or die "can't open $output: $!";
#include "sparc_arch.h"
.register %g2,#scratch
.register %g3,#scratch
.section ".text",#alloc,#execinstr
########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
# @R - even-numbered floating-point registers that receive the result.
# @N - integer registers holding the modulus np[]; the list is repeated
#      (@N,@N,@N[0..3]) so indices up to 31 wrap onto the same 14 regs.
# @A - the ap[] operand: first 14 limbs in the @N integer registers,
#      the remainder shared with the FP registers of @R.
# @B - the bp[] operand spread over %i, %l and %o registers.
# NOTE(review): the exact register pairing appears to be mandated by
# the T4 montmul/montsqr calling convention -- do not reorder.
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
107 ########################################################################
108 # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
109 # const u64 *np,const BN_ULONG *n0);
sub generate_bn_mul_mont_t4() {
# Emits bn_mul_mont_t4_$NUM into $code: a Montgomery multiplication
# routine for $NUM 64-bit limbs using the T4 montmul/montsqr
# instructions.  Returns 1 on success, 0 when the operation must be
# retried by the caller (the $sentinel checks below detect clobbered
# register windows, e.g. after an interrupt in 32-bit context).
# NOTE(review): $NUM is unpacked from @_ on a line elided from this
# view; the empty () prototype is bypassed by the & call style anyway.
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
.globl bn_mul_mont_t4_$NUM
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
and %g1,SPARCV9_64BIT_STACK,%g1
sllx $sentinel,32,$sentinel
save %sp,-128,%sp ! warm it up
or %g4,$sentinel,$sentinel
! copy arguments to global registers
ld [%i4+0],%f1 ! load *n0
# load ap[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
# First up to 14 limbs of ap[] go to integer registers @A[0..13]; the
# 32-bit high/low halves are loaded separately (the paired low-word
# load is on lines elided from this view) and merged with sllx.
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
ld [$ap+$i*8+4],@A[$i]
sllx @A[$i],32,@A[$i]
# Remaining limbs of ap[] are loaded into the FP registers of @A/@R.
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
# load np[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
be SIZE_T_CC,.Lmsquare_$NUM
# load bp[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
# magic ################################################################
# 0x81b02920+$NUM-1 hand-encodes the montmul instruction (no assembler
# mnemonic support assumed); 0x81b02940 below encodes montsqr.
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Lmabort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
movxtod @A[$i],@R[$i]
and %fp,$sentinel,$sentinel
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Lmdone_$NUM
mov 0,%i0 ! return failure
# Store the result: FP registers are written back to rp[] as 32-bit
# halves (hence the +4 offset for the low word on big-endian SPARC).
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
st @R[$i],[$rp+$i*8+4]
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
mov 1,%i0 ! return success
mov 0,%i0 ! return failure
save %sp,-128,%sp; or $sentinel,%fp,%fp
save %sp,-128,%sp; or $sentinel,%fp,%fp
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
.type bn_mul_mont_t4_$NUM, #function
.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
# Instantiate the routine for 8, 16, 24 and 32 limbs, i.e. the 512-,
# 1024-, 1536- and 2048-bit operand sizes named in the file header.
for ($i=8;$i<=32;$i+=8) {
&generate_bn_mul_mont_t4($i);
########################################################################
# Helper generators for constant-time gather from the power table.
# NOTE(review): the "sub NAME {" opener lines for these helpers are
# elided from this view; judging by the call sites below they are
# load_ccr (computes CCR mask + table offset), load_b_pair (gathers a
# pair of limbs) and load_b (gathers a single limb).
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
sll %o5, 3, %o5 ! offset within first cache line
add %o5, $ptbl, $ptbl ! of the pwrtbl
$code.=<<___ if (!$skip_wr);
# Gathers a pair of 64-bit limbs ($B0,$B1) from $pwrtbl.  All table
# columns are read (ldx from every +N*32 slot) and the wanted one is
# selected with conditional moves -- a constant-time, cache-timing-safe
# access pattern.
my ($pwrtbl,$B0,$B1)=@_;
ldx [$pwrtbl+0*32], $B0
ldx [$pwrtbl+8*32], $B1
ldx [$pwrtbl+1*32], %o4
ldx [$pwrtbl+9*32], %o5
ldx [$pwrtbl+2*32], %o4
ldx [$pwrtbl+10*32],%o5
ldx [$pwrtbl+3*32], %o4
ldx [$pwrtbl+11*32],%o5
movneg %icc, %o4, $B0
ldx [$pwrtbl+4*32], %o4
movneg %icc, %o5, $B1
ldx [$pwrtbl+12*32],%o5
ldx [$pwrtbl+5*32],%o4
ldx [$pwrtbl+13*32],%o5
ldx [$pwrtbl+6*32], %o4
ldx [$pwrtbl+14*32],%o5
ldx [$pwrtbl+7*32], %o4
ldx [$pwrtbl+15*32],%o5
movneg %xcc, %o4, $B0
add $pwrtbl,16*32, $pwrtbl
movneg %xcc, %o5, $B1
# Single-limb variant of the same constant-time gather.
ldx [$pwrtbl+0*32], $Bi
ldx [$pwrtbl+1*32], %o4
ldx [$pwrtbl+2*32], %o5
ldx [$pwrtbl+3*32], %o4
ldx [$pwrtbl+4*32], %o5
movneg %icc, %o4, $Bi
ldx [$pwrtbl+5*32], %o4
ldx [$pwrtbl+6*32], %o5
ldx [$pwrtbl+7*32], %o4
add $pwrtbl,8*32, $pwrtbl
movneg %xcc, %o4, $Bi
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
# const u64 *pwrtbl,int pwr,int stride);
sub generate_bn_pwr5_mont_t4() {
# Emits bn_pwr5_mont_t4_$NUM: the "multi-op" routine from the file
# header -- five Montgomery squarings followed by one multiplication
# by a value gathered from pwrtbl (see the $i<5 loop below).  Returns
# 1 on success, 0 when the $sentinel checks detect clobbered register
# windows and the caller must retry.
# NOTE(review): $NUM is unpacked from @_ on an elided line; the empty
# () prototype is bypassed by the &-style call at the bottom.
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
.globl bn_pwr5_mont_t4_$NUM
bn_pwr5_mont_t4_$NUM:
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
and %g1,SPARCV9_64BIT_STACK,%g1
sllx $sentinel,32,$sentinel
save %sp,-128,%sp ! warm it up
or %g4,$sentinel,$sentinel
! copy arguments to global registers
ld [%i2+0],%f1 ! load *n0
srl %i4,%g0,%i4 ! pack last arguments
# load tp[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
# tp[] limbs are loaded whole with ldx/ldd (unlike the 32-bit-half
# loads in generate_bn_mul_mont_t4).
for($i=0; $i<14 && $i<$NUM; $i++) {
ldx [$tp+$i*8],@A[$i]
for(; $i<$NUM; $i++) {
ldd [$tp+$i*8],@A[$i]
# load np[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
ldx [$np+$i*8],@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) {
ldx [$np+$i*8],@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
ldx [$np+$i*8],@N[$i]
# load pwrtbl[pwr] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
srlx $pwr, 32, %o4 ! unpack $pwr
sllx %o4, 32, $pwr ! re-pack $pwr
&load_ccr("%o7","%o5","%o4");
# Constant-time gather of bp[] from the power table, two limbs at a
# time (see the load_* helpers above).
for($i=0; $i<14 && $i<$NUM; $i+=2) {
&load_b_pair("%o7",@B[$i],@B[$i+1]);
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i+=2) {
&load_b_pair("%i7",@B[$i],@B[$i+1]);
srax $pwr, 32, %o4 ! unpack $pwr
sllx %o4, 32, $pwr ! re-pack $pwr
&load_ccr("%i7","%o5","%o4",1);
# magic ################################################################
# Five montsqr operations, then one montmul (.word opcodes as in the
# single-op generator: 0x81b02940 = montsqr, 0x81b02920 = montmul).
for($i=0; $i<5; $i++) {
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
fbu,pn %fcc3,.Labort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Labort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
brgez %o4,.Lstride_$NUM
brgez %o4,.Lstride_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
movxtod @A[$i],@R[$i]
and %fp,$sentinel,$sentinel
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Ldone_$NUM
mov 0,%i0 ! return failure
for($i=0; $i<$NUM; $i++) {
std @R[$i],[$tp+$i*8]
mov 1,%i0 ! return success
mov 0,%i0 ! return failure
.type bn_pwr5_mont_t4_$NUM, #function
.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
# Instantiate for 8..32 limbs (512..2048-bit operands).
for ($i=8;$i<=32;$i+=8) {
&generate_bn_pwr5_mont_t4($i);
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
# Generic VIS3 Montgomery multiplication used when the operand size has
# no dedicated montmul routine (e.g. 4096-bit RSA, per the header).
# Scratch register assignments for the schoolbook inner/outer loops:
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *bp,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is >=3
.globl bn_mul_mont_t4
add %sp, STACK_BIAS, %g4 ! real top of stack
sll $num, 3, $num ! size in bytes
andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, STACK_FRAME, %g1 ! new top of stack
# +-------------------------------+<----- %sp
# +-------------------------------+<----- aligned at 64 bytes
# +-------------------------------+
# +-------------------------------+<----- aligned at 64 bytes
# After the register-window save (on an elided line) the incoming %o
# arguments are visible as %i registers -- hence the re-mapping below.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, STACK_BIAS+STACK_FRAME, $tp
ldx [$bp+0], $m0 ! m0=bp[0]
# First iteration (i=0): compute tp[] = ap[]*bp[0] + m1*np[].
ldx [$ap+0], $aj ! ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
ldx [$np+0], $nj ! np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
mulx $aj, $m0, $alo ! ap[j]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp ! tp++
sub $cnt, 8, $cnt ! j--
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
# Outer loop (i=1..num-1): tp[] += ap[]*bp[i] + m1*np[].
sub $num, 16, $i ! i=num-2
ldx [$bp+0], $m0 ! m0=bp[i]
sub $ap, $num, $ap ! rewind
ldx [$ap+0], $aj ! ap[0]
ldx [$np+0], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
brnz,pt $cnt, .Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
# Final conditional subtraction: rp[] = tp[] - np[] if tp >= np.
sub $ap, $num, $ap ! rewind
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
subccc $tj, $nj, $t2 ! tp[j]-np[j]
st $t2, [$rp-4] ! reverse order
sub $np, $num, $np ! rewind
subc $ovf, %g0, $ovf ! handle upmost overflow bit
or $np, $ap, $ap ! ap=borrow?tp:rp
.Lcopy: ! copy or in-place refresh
.type bn_mul_mont_t4, #function
.size bn_mul_mont_t4, .-bn_mul_mont_t4
# int bn_mul_mont_gather5(
#
# VIS3 fall-back combining the Montgomery multiplication above with a
# constant-time gather of bp[] from the power table (7th argument is
# the power index).  Structure mirrors bn_mul_mont_t4.
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *pwrtbl,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num, # caller ensures that num is >=3
.globl bn_mul_mont_gather5_t4
bn_mul_mont_gather5_t4:
add %sp, STACK_BIAS, %g4 ! real top of stack
sll $num, 3, $num ! size in bytes
andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, STACK_FRAME, %g1 ! new top of stack
LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
# +-------------------------------+<----- %sp
# +-------------------------------+<----- aligned at 64 bytes
# +-------------------------------+
# +-------------------------------+<----- aligned at 64 bytes
# Re-map arguments to %i registers (visible after the window save on
# an elided line); %l7 additionally tracks the gather CCR mask.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
&load_ccr($bp,"%g4",$ccr);
&load_b($bp,$m0,"%o7"); # m0=bp[0]
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, STACK_BIAS+STACK_FRAME, $tp
# First iteration (i=0): tp[] = ap[]*bp[0] + m1*np[].
ldx [$ap+0], $aj ! ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
ldx [$np+0], $nj ! np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
mulx $aj, $m0, $alo ! ap[j]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st_g5
sub $cnt, 8, $cnt ! j--
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stxa $hi1, [$tp]0xe2
# Outer loop: each bp[i] is gathered constant-time via load_b.
sub $num, 16, $i ! i=num-2
&load_b($bp,$m0); # m0=bp[i]
sub $ap, $num, $ap ! rewind
ldx [$ap+0], $aj ! ap[0]
ldx [$np+0], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
brnz,pt $cnt, .Linner_g5
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
brnz,pt $i, .Louter_g5
# Final conditional subtraction, as in bn_mul_mont_t4.
sub $ap, $num, $ap ! rewind
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
subccc $tj, $nj, $t2 ! tp[j]-np[j]
subccc $tj, $nj, $t3
st $t2, [$rp-4] ! reverse order
brnz,pt $cnt, .Lsub_g5
sub $np, $num, $np ! rewind
subc $ovf, %g0, $ovf ! handle upmost overflow bit
or $np, $ap, $ap ! ap=borrow?tp:rp
.Lcopy_g5: ! copy or in-place refresh
stx %g0, [$tp] ! zap
brnz $cnt, .Lcopy_g5
.type bn_mul_mont_gather5_t4, #function
.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
# Tail of bn_flip_t4 (its entry label is on lines elided from this
# view): loops over %o2 words flipping limb order in place.
brnz %o2, .Loop_flip
.type bn_flip_t4, #function
.size bn_flip_t4, .-bn_flip_t4
.globl bn_flip_n_scatter5_t4
bn_flip_n_scatter5_t4:
# Flips inp[] and scatters it into &pwrtbl[pwr] (stride-32 layout that
# the load_ccr/load_b gather helpers read back constant-time).
add %o3, %o2, %o2 ! &pwrtbl[pwr]
.Loop_flip_n_scatter5:
ld [%o0+0], %o4 ! inp[i]
brnz %o1, .Loop_flip_n_scatter5
.type bn_flip_n_scatter5_t4, #function
.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
.globl bn_gather5_t4
# bn_gather5_t4: constant-time gather from the power table, built from
# the same load_ccr/load_b generator helpers used above.
&load_ccr("%o2","%o3","%g1");
&load_b("%o2","%g1");
brnz %o1, .Loop_gather5
.type bn_gather5_t4, #function
.size bn_gather5_t4, .-bn_gather5_t4
.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"