2 # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by David S. Miller and Andy Polyakov
12 # The module is licensed under 2-clause BSD license.
13 # November 2012. All rights reserved.
14 # ====================================================================
16 ######################################################################
17 # Montgomery squaring-n-multiplication module for SPARC T4.
19 # The module consists of three parts:
21 # 1) collection of "single-op" subroutines that perform single
22 # operation, Montgomery squaring or multiplication, on 512-,
23 # 1024-, 1536- and 2048-bit operands;
24 # 2) collection of "multi-op" subroutines that perform 5 squaring and
25 # 1 multiplication operations on operands of above lengths;
26 # 3) fall-back and helper VIS3 subroutines.
28 # RSA sign is dominated by multi-op subroutine, while RSA verify and
29 # DSA - by single-op. Special note about 4096-bit RSA verify result.
30 # Operands are too long for dedicated hardware and it's handled by
31 # VIS3 code, which is why you don't see any improvement. It's surely
32 # possible to improve it [by deploying 'mpmul' instruction], maybe in
35 # Performance improvement:
37 # 64-bit process, VIS3:
38 # sign verify sign/s verify/s
39 # rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
40 # rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
41 # rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
42 # dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
43 # dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
45 # 64-bit process, this module:
46 # sign verify sign/s verify/s
47 # rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
48 # rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
49 # rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
50 # dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
51 # dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
53 ######################################################################
54 # 32-bit process, VIS3:
55 # sign verify sign/s verify/s
56 # rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
57 # rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
58 # rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
59 # dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
60 # dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
62 # 32-bit process, this module:
63 # sign verify sign/s verify/s
64 # rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
65 # rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
66 # rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
67 # dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
68 # dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
70 # 32-bit code is prone to performance degradation as interrupt rate
71 # dispatched to CPU executing the code grows. This is because in
72 # standard process of handling interrupt in 32-bit process context
73 # upper halves of most integer registers used as input or output are
74 # zeroed. This renders result invalid, and operation has to be re-run.
75 # If CPU is "bothered" with timer interrupts only, the penalty is
76 # hardly measurable. But in order to mitigate this problem for higher
77 # interrupt rates contemporary Linux kernel recognizes biased stack
78 # even in 32-bit process context and preserves full register contents.
79 # See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# Determine the directory this script lives in so the shared perlasm
# helper module can be located relative to it.
82 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
83 push(@INC,"${dir}","${dir}../../perlasm");
84 require "sparcv9_modes.pl";
# The last command-line argument (if any) names the output file;
# redirect STDOUT to it.  NOTE(review): two-arg open with no error
# check -- failure is silent; this follows the upstream perlasm
# convention, but confirm the build harness tolerates it.
86 $output = pop and open STDOUT,">$output";
90 # define __ASSEMBLER__ 1
92 #include "crypto/sparc_arch.h"
95 .register %g2,#scratch
96 .register %g3,#scratch
99 .section ".text",#alloc,#execinstr
106 ########################################################################
107 # Register layout for mont[mul|sqr] instructions.
108 # For details see "Oracle SPARC Architecture 2011" manual at
109 # http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
# @R: even-numbered floating-point registers used for results.
# @N: integer registers holding the modulus np[]; the list is wrapped
#     (@N,@N,@N[0..3]) so indices past 13 alias earlier registers.
# @A: operand ap[] -- first 14 limbs share @N's integer registers,
#     the remainder live in FP registers from @R.
# @B: operand bp[] in %i/%l registers, wrapped the same way.
# NOTE(review): the exact wrap pattern is dictated by the montmul/
# montsqr register-file convention -- verify against the manual above.
111 my @R=map("%f".2*$_,(0..11,30,31,12..29));
112 my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
113 my @A=(@N[0..13],@R[14..31]);
114 my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
# Emit bn_mul_mont_t4_$NUM: single-op Montgomery multiplication (or
# squaring, when bp == ap) for a fixed operand size, built around the
# T4 montmul/montsqr instructions.  Returns 1 on success, 0 when the
# sentinel check detects clobbered registers (see the 32-bit interrupt
# note in the file header) and the caller must retry via VIS3 code.
# NOTE(review): $NUM is presumably taken from @_ in a line not shown
# in this excerpt -- confirm against the full file.
120 sub generate_bn_mul_mont_t4() {
# Arguments travel in globals %g1..%g5; %g5 doubles as the "sentinel"
# used to detect upper-half register clobbering in 32-bit context.
122 my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
125 .globl bn_mul_mont_t4_$NUM
131 #elif defined(SPARCV9_64BIT_STACK)
132 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
133 ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
135 and %g1,SPARCV9_64BIT_STACK,%g1
143 sllx $sentinel,32,$sentinel
146 save %sp,-128,%sp ! warm it up
161 or %g4,$sentinel,$sentinel
163 ! copy arguments to global registers
168 ld [%i4+0],%f1 ! load *n0
173 # load ap[$NUM] ########################################################
175 save %sp,-128,%sp; or $sentinel,%fp,%fp
# First 14 ap limbs go to integer registers; each 64-bit limb is
# assembled from two 32-bit loads (ld + sllx + or, the or not visible
# in this excerpt).
177 for($i=0; $i<14 && $i<$NUM; $i++) {
178 my $lo=$i<13?@A[$i+1]:"%o7";
181 ld [$ap+$i*8+4],@A[$i]
182 sllx @A[$i],32,@A[$i]
# Remaining ap limbs are loaded into floating-point register halves.
186 for(; $i<$NUM; $i++) {
187 my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
194 # load np[$NUM] ########################################################
196 save %sp,-128,%sp; or $sentinel,%fp,%fp
# np is loaded across up to three register windows (a `save` between
# groups of 14, 14 and the remainder), matching the wrapped @N layout.
198 for($i=0; $i<14 && $i<$NUM; $i++) {
199 my $lo=$i<13?@N[$i+1]:"%o7";
202 ld [$np+$i*8+4],@N[$i]
203 sllx @N[$i],32,@N[$i]
208 save %sp,-128,%sp; or $sentinel,%fp,%fp
210 for(; $i<28 && $i<$NUM; $i++) {
211 my $lo=$i<27?@N[$i+1]:"%o7";
214 ld [$np+$i*8+4],@N[$i]
215 sllx @N[$i],32,@N[$i]
220 save %sp,-128,%sp; or $sentinel,%fp,%fp
222 for(; $i<$NUM; $i++) {
223 my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
226 ld [$np+$i*8+4],@N[$i]
227 sllx @N[$i],32,@N[$i]
# Branch to the squaring path; presumably taken when ap == bp -- the
# comparison instruction is not visible in this excerpt.
233 be SIZE_T_CC,.Lmsquare_$NUM
237 # load bp[$NUM] ########################################################
239 save %sp,-128,%sp; or $sentinel,%fp,%fp
241 for($i=0; $i<14 && $i<$NUM; $i++) {
242 my $lo=$i<13?@B[$i+1]:"%o7";
245 ld [$bp+$i*8+4],@B[$i]
246 sllx @B[$i],32,@B[$i]
251 save %sp,-128,%sp; or $sentinel,%fp,%fp
253 for(; $i<$NUM; $i++) {
254 my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
257 ld [$bp+$i*8+4],@B[$i]
258 sllx @B[$i],32,@B[$i]
# The montmul instruction has no assembler mnemonic here, so it is
# emitted as a raw opcode word parameterized by operand length.
262 # magic ################################################################
264 .word 0x81b02920+$NUM-1 ! montmul $NUM-1
266 fbu,pn %fcc3,.Lmabort_$NUM
# The sentinel's upper half survives in %fp only if no interrupt
# zeroed the upper register halves; a cleared sentinel diverts to the
# abort/failure path (retry handled by the caller).
268 and %fp,$sentinel,$sentinel
269 brz,pn $sentinel,.Lmabort_$NUM
279 restore; and %fp,$sentinel,$sentinel
280 restore; and %fp,$sentinel,$sentinel
281 restore; and %fp,$sentinel,$sentinel
282 restore; and %fp,$sentinel,$sentinel
283 brz,pn $sentinel,.Lmabort1_$NUM
288 # save tp[$NUM] ########################################################
289 for($i=0; $i<14 && $i<$NUM; $i++) {
291 movxtod @A[$i],@R[$i]
298 and %fp,$sentinel,$sentinel
301 and %fp,$sentinel,$sentinel
302 srl %fp,0,%fp ! just in case?
303 or %o7,$sentinel,$sentinel
304 brz,a,pn $sentinel,.Lmdone_$NUM
305 mov 0,%i0 ! return failure
# Store the result to rp[] as 32-bit halves of the FP registers; the
# regex extracts the even FP register number to name its odd pair.
308 for($i=0; $i<12 && $i<$NUM; $i++) {
309 @R[$i] =~ /%f([0-9]+)/;
310 my $lo = "%f".($1+1);
313 st @R[$i],[$rp+$i*8+4]
316 for(; $i<$NUM; $i++) {
317 my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
325 mov 1,%i0 ! return success
339 mov 0,%i0 ! return failure
345 save %sp,-128,%sp; or $sentinel,%fp,%fp
346 save %sp,-128,%sp; or $sentinel,%fp,%fp
347 .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
350 .type bn_mul_mont_t4_$NUM, #function
351 .size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
# Instantiate the routine for 512-, 1024-, 1536- and 2048-bit operands
# (8..32 64-bit limbs in steps of 8).
355 for ($i=8;$i<=32;$i+=8) {
356 &generate_bn_mul_mont_t4($i);
359 ########################################################################
# Body of a helper (invoked below as &load_ccr) that turns a power
# index into a condition-code selector plus a byte offset within the
# scattered power table.  NOTE(review): the enclosing `sub` line is
# not visible in this excerpt; name inferred from the call sites.
362 my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
367 sll %o5, 3, %o5 ! offset within first cache line
368 add %o5, $ptbl, $ptbl ! of the pwrtbl
372 $code.=<<___ if (!$skip_wr);
# Body of &load_b_pair: fetches two 64-bit table entries ($B0,$B1).
# All 16 cache-line-strided slots are loaded unconditionally and the
# wanted pair is picked with conditional moves (movneg), apparently so
# the memory access pattern does not depend on the secret selector --
# confirm against the constant-time design of the full file.
377 my ($pwrtbl,$B0,$B1)=@_;
380 ldx [$pwrtbl+0*32], $B0
381 ldx [$pwrtbl+8*32], $B1
382 ldx [$pwrtbl+1*32], %o4
383 ldx [$pwrtbl+9*32], %o5
385 ldx [$pwrtbl+2*32], %o4
387 ldx [$pwrtbl+10*32],%o5
389 ldx [$pwrtbl+3*32], %o4
391 ldx [$pwrtbl+11*32],%o5
392 movneg %icc, %o4, $B0
393 ldx [$pwrtbl+4*32], %o4
394 movneg %icc, %o5, $B1
395 ldx [$pwrtbl+12*32],%o5
397 ldx [$pwrtbl+5*32],%o4
399 ldx [$pwrtbl+13*32],%o5
401 ldx [$pwrtbl+6*32], %o4
403 ldx [$pwrtbl+14*32],%o5
405 ldx [$pwrtbl+7*32], %o4
407 ldx [$pwrtbl+15*32],%o5
408 movneg %xcc, %o4, $B0
409 add $pwrtbl,16*32, $pwrtbl
410 movneg %xcc, %o5, $B1
# Body of &load_b: same unconditional-load / conditional-select gather
# for a single value $Bi from 8 strided slots.
417 ldx [$pwrtbl+0*32], $Bi
418 ldx [$pwrtbl+1*32], %o4
419 ldx [$pwrtbl+2*32], %o5
421 ldx [$pwrtbl+3*32], %o4
423 ldx [$pwrtbl+4*32], %o5
424 movneg %icc, %o4, $Bi
425 ldx [$pwrtbl+5*32], %o4
427 ldx [$pwrtbl+6*32], %o5
429 ldx [$pwrtbl+7*32], %o4
431 add $pwrtbl,8*32, $pwrtbl
432 movneg %xcc, %o4, $Bi
436 ########################################################################
437 # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
438 # const u64 *pwrtbl,int pwr,int stride);
# Emit bn_pwr5_mont_t4_$NUM: the multi-op routine -- per iteration it
# performs 5 Montgomery squarings followed by 1 multiplication against
# a value gathered from pwrtbl (the pattern the file header describes
# as dominating RSA sign).  Returns 1 on success, 0 on the sentinel
# failure path.  NOTE(review): $NUM is presumably taken from @_ in a
# line not shown in this excerpt.
440 sub generate_bn_pwr5_mont_t4() {
442 my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
445 .globl bn_pwr5_mont_t4_$NUM
447 bn_pwr5_mont_t4_$NUM:
451 #elif defined(SPARCV9_64BIT_STACK)
452 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
453 ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
455 and %g1,SPARCV9_64BIT_STACK,%g1
463 sllx $sentinel,32,$sentinel
466 save %sp,-128,%sp ! warm it up
481 or %g4,$sentinel,$sentinel
483 ! copy arguments to global registers
486 ld [%i2+0],%f1 ! load *n0
489 srl %i4,%g0,%i4 ! pack last arguments
495 # load tp[$NUM] ########################################################
497 save %sp,-128,%sp; or $sentinel,%fp,%fp
# tp limbs: full 64-bit loads here (ldx/ldd) -- unlike the 32-bit
# ld+sllx pairs in bn_mul_mont_t4, since tp is produced internally.
499 for($i=0; $i<14 && $i<$NUM; $i++) {
501 ldx [$tp+$i*8],@A[$i]
504 for(; $i<$NUM; $i++) {
506 ldd [$tp+$i*8],@A[$i]
509 # load np[$NUM] ########################################################
511 save %sp,-128,%sp; or $sentinel,%fp,%fp
513 for($i=0; $i<14 && $i<$NUM; $i++) {
515 ldx [$np+$i*8],@N[$i]
519 save %sp,-128,%sp; or $sentinel,%fp,%fp
521 for(; $i<28 && $i<$NUM; $i++) {
523 ldx [$np+$i*8],@N[$i]
527 save %sp,-128,%sp; or $sentinel,%fp,%fp
529 for(; $i<$NUM; $i++) {
531 ldx [$np+$i*8],@N[$i]
534 # load pwrtbl[pwr] ########################################################
536 save %sp,-128,%sp; or $sentinel,%fp,%fp
# $pwr carries two packed 32-bit values (power index and stride);
# they are unpacked/re-packed around the table gather below.
538 srlx $pwr, 32, %o4 ! unpack $pwr
542 sllx %o4, 32, $pwr ! re-pack $pwr
546 &load_ccr("%o7","%o5","%o4");
# Gather bp[] from the power table pairwise via the constant-access
# load_b_pair helper (see its definition above).
553 for($i=0; $i<14 && $i<$NUM; $i+=2) {
554 &load_b_pair("%o7",@B[$i],@B[$i+1]);
557 save %sp,-128,%sp; or $sentinel,%fp,%fp
559 for(; $i<$NUM; $i+=2) {
560 &load_b_pair("%i7",@B[$i],@B[$i+1]);
563 srax $pwr, 32, %o4 ! unpack $pwr
567 sllx %o4, 32, $pwr ! re-pack $pwr
571 &load_ccr("%i7","%o5","%o4",1);
573 # magic ################################################################
# Five montsqr operations, each guarded by the sentinel check, then
# one montmul -- the 5-squarings-plus-1-multiplication pattern.
574 for($i=0; $i<5; $i++) {
576 .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
577 fbu,pn %fcc3,.Labort_$NUM
579 and %fp,$sentinel,$sentinel
580 brz,pn $sentinel,.Labort_$NUM
587 .word 0x81b02920+$NUM-1 ! montmul $NUM-1
588 fbu,pn %fcc3,.Labort_$NUM
590 and %fp,$sentinel,$sentinel
591 brz,pn $sentinel,.Labort_$NUM
596 brgez %o4,.Lstride_$NUM
603 brgez %o4,.Lstride_$NUM
604 restore; and %fp,$sentinel,$sentinel
605 restore; and %fp,$sentinel,$sentinel
606 restore; and %fp,$sentinel,$sentinel
607 restore; and %fp,$sentinel,$sentinel
608 brz,pn $sentinel,.Labort1_$NUM
613 # save tp[$NUM] ########################################################
614 for($i=0; $i<14 && $i<$NUM; $i++) {
616 movxtod @A[$i],@R[$i]
623 and %fp,$sentinel,$sentinel
626 and %fp,$sentinel,$sentinel
627 srl %fp,0,%fp ! just in case?
628 or %o7,$sentinel,$sentinel
629 brz,a,pn $sentinel,.Ldone_$NUM
630 mov 0,%i0 ! return failure
# Result is written back in place over tp[] as full 64-bit stores.
633 for($i=0; $i<$NUM; $i++) {
635 std @R[$i],[$tp+$i*8]
639 mov 1,%i0 ! return success
653 mov 0,%i0 ! return failure
656 .type bn_pwr5_mont_t4_$NUM, #function
657 .size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
# Instantiate for 512-, 1024-, 1536- and 2048-bit operands.
661 for ($i=8;$i<=32;$i+=8) {
662 &generate_bn_pwr5_mont_t4($i);
666 ########################################################################
667 # Fall-back subroutines
669 # copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
# Scratch-register name map for the VIS3 fall-back: globals %g1..%g5
# plus %o0..%o5,%o7 cover n0, the current multiplier words, running
# lo/hi partial products and the loop temporaries.
671 ($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
672 (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# Incoming argument registers before the register-window `save`;
# trailing comments restate the C prototype.
675 $rp="%o0"; # u64 *rp,
676 $ap="%o1"; # const u64 *ap,
677 $bp="%o2"; # const u64 *bp,
678 $np="%o3"; # const u64 *np,
679 $n0p="%o4"; # const BN_ULONG *n0,
680 $num="%o5"; # int num); # caller ensures that num is >=3
682 .globl bn_mul_mont_t4
685 add %sp, STACK_BIAS, %g4 ! real top of stack
686 sll $num, 3, $num ! size in bytes
688 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
690 andn %g1, 63, %g1 ! align at 64 byte
691 sub %g1, STACK_FRAME, %g1 ! new top of stack
696 # +-------------------------------+<----- %sp
698 # +-------------------------------+<----- aligned at 64 bytes
700 # +-------------------------------+
703 # +-------------------------------+<----- aligned at 64 bytes
# After `save`, the same arguments are visible as %i0..%i5; locals
# %l0..%l7 hold temporaries, the loop counter and the tp pointer.
705 ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
706 ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
709 ld [$n0p+0], $t0 ! pull n0[0..1] value
711 add %sp, STACK_BIAS+STACK_FRAME, $tp
712 ldx [$bp+0], $m0 ! m0=bp[0]
717 ldx [$ap+0], $aj ! ap[0]
719 mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
720 umulxhi $aj, $m0, $hi0
722 ldx [$ap+8], $aj ! ap[1]
724 ldx [$np+0], $nj ! np[0]
726 mulx $lo0, $n0, $m1 ! "tp[0]"*n0
728 mulx $aj, $m0, $alo ! ap[1]*bp[0]
729 umulxhi $aj, $m0, $aj ! ahi=aj
731 mulx $nj, $m1, $lo1 ! np[0]*m1
732 umulxhi $nj, $m1, $hi1
734 ldx [$np+8], $nj ! np[1]
736 addcc $lo0, $lo1, $lo1
738 addxc %g0, $hi1, $hi1
740 mulx $nj, $m1, $nlo ! np[1]*m1
741 umulxhi $nj, $m1, $nj ! nhi=nj
744 sub $num, 24, $cnt ! cnt=num-3
748 addcc $alo, $hi0, $lo0
751 ldx [$ap+0], $aj ! ap[j]
752 addcc $nlo, $hi1, $lo1
754 addxc $nj, %g0, $hi1 ! nhi=nj
756 ldx [$np+0], $nj ! np[j]
757 mulx $aj, $m0, $alo ! ap[j]*bp[0]
759 umulxhi $aj, $m0, $aj ! ahi=aj
761 mulx $nj, $m1, $nlo ! np[j]*m1
762 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
763 umulxhi $nj, $m1, $nj ! nhi=nj
764 addxc %g0, $hi1, $hi1
# NOTE(review): stxa with ASI 0xe2 -- apparently a block-init style
# store for the scratch tp[] buffer; confirm the ASI semantics in the
# SPARC T4 documentation before changing.
765 stxa $lo1, [$tp]0xe2 ! tp[j-1]
766 add $tp, 8, $tp ! tp++
769 sub $cnt, 8, $cnt ! j--
771 addcc $alo, $hi0, $lo0
772 addxc $aj, %g0, $hi0 ! ahi=aj
774 addcc $nlo, $hi1, $lo1
776 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
777 addxc %g0, $hi1, $hi1
778 stxa $lo1, [$tp]0xe2 ! tp[j-1]
781 addcc $hi0, $hi1, $hi1
782 addxc %g0, %g0, $ovf ! upmost overflow bit
787 sub $num, 16, $i ! i=num-2
791 ldx [$bp+0], $m0 ! m0=bp[i]
794 sub $ap, $num, $ap ! rewind
798 ldx [$ap+0], $aj ! ap[0]
799 ldx [$np+0], $nj ! np[0]
801 mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
802 ldx [$tp], $tj ! tp[0]
803 umulxhi $aj, $m0, $hi0
804 ldx [$ap+8], $aj ! ap[1]
805 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
806 mulx $aj, $m0, $alo ! ap[1]*bp[i]
807 addxc %g0, $hi0, $hi0
808 mulx $lo0, $n0, $m1 ! tp[0]*n0
809 umulxhi $aj, $m0, $aj ! ahi=aj
810 mulx $nj, $m1, $lo1 ! np[0]*m1
812 umulxhi $nj, $m1, $hi1
813 ldx [$np+8], $nj ! np[1]
815 addcc $lo1, $lo0, $lo1
816 mulx $nj, $m1, $nlo ! np[1]*m1
817 addxc %g0, $hi1, $hi1
818 umulxhi $nj, $m1, $nj ! nhi=nj
821 sub $num, 24, $cnt ! cnt=num-3
824 addcc $alo, $hi0, $lo0
825 ldx [$tp+8], $tj ! tp[j]
826 addxc $aj, %g0, $hi0 ! ahi=aj
827 ldx [$ap+0], $aj ! ap[j]
829 addcc $nlo, $hi1, $lo1
830 mulx $aj, $m0, $alo ! ap[j]*bp[i]
831 addxc $nj, %g0, $hi1 ! nhi=nj
832 ldx [$np+0], $nj ! np[j]
834 umulxhi $aj, $m0, $aj ! ahi=aj
835 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
836 mulx $nj, $m1, $nlo ! np[j]*m1
837 addxc %g0, $hi0, $hi0
838 umulxhi $nj, $m1, $nj ! nhi=nj
839 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
840 addxc %g0, $hi1, $hi1
841 stx $lo1, [$tp] ! tp[j-1]
843 brnz,pt $cnt, .Linner
846 ldx [$tp+8], $tj ! tp[j]
847 addcc $alo, $hi0, $lo0
848 addxc $aj, %g0, $hi0 ! ahi=aj
849 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
850 addxc %g0, $hi0, $hi0
852 addcc $nlo, $hi1, $lo1
853 addxc $nj, %g0, $hi1 ! nhi=nj
854 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
855 addxc %g0, $hi1, $hi1
856 stx $lo1, [$tp] ! tp[j-1]
858 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
859 addxccc $hi1, $hi0, $hi1
867 sub $ap, $num, $ap ! rewind
# Final reduction: conditional subtraction of np from tp, result
# stored to rp[] as 32-bit halves in reverse order, then the
# conditional-copy tail selects between tp and tp-np.
871 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
879 subccc $tj, $nj, $t2 ! tp[j]-np[j]
884 st $t2, [$rp-4] ! reverse order
889 sub $np, $num, $np ! rewind
893 subccc $ovf, %g0, $ovf ! handle upmost overflow bit
898 .Lcopy: ! conditional copy
912 .type bn_mul_mont_t4, #function
913 .size bn_mul_mont_t4, .-bn_mul_mont_t4
916 # int bn_mul_mont_gather5(
# Variant of the VIS3 fall-back above that takes bp as a scattered
# power table plus a 7th "power" argument, gathering each multiplier
# word through the load_b helper (all-slots-loaded, movneg-selected)
# rather than indexing the table directly.
917 $rp="%o0"; # u64 *rp,
918 $ap="%o1"; # const u64 *ap,
919 $bp="%o2"; # const u64 *pwrtbl,
920 $np="%o3"; # const u64 *np,
921 $n0p="%o4"; # const BN_ULONG *n0,
922 $num="%o5"; # int num, # caller ensures that num is >=3
925 .globl bn_mul_mont_gather5_t4
927 bn_mul_mont_gather5_t4:
928 add %sp, STACK_BIAS, %g4 ! real top of stack
929 sll $num, 3, $num ! size in bytes
931 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
933 andn %g1, 63, %g1 ! align at 64 byte
934 sub %g1, STACK_FRAME, %g1 ! new top of stack
936 LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
940 # +-------------------------------+<----- %sp
942 # +-------------------------------+<----- aligned at 64 bytes
944 # +-------------------------------+
947 # +-------------------------------+<----- aligned at 64 bytes
# Post-save register map; %l7 additionally holds the gather selector
# ($ccr) consumed by load_ccr/load_b.
949 ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
950 ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
952 &load_ccr($bp,"%g4",$ccr);
953 &load_b($bp,$m0,"%o7"); # m0=bp[0]
956 ld [$n0p+0], $t0 ! pull n0[0..1] value
958 add %sp, STACK_BIAS+STACK_FRAME, $tp
962 ldx [$ap+0], $aj ! ap[0]
964 mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
965 umulxhi $aj, $m0, $hi0
967 ldx [$ap+8], $aj ! ap[1]
969 ldx [$np+0], $nj ! np[0]
971 mulx $lo0, $n0, $m1 ! "tp[0]"*n0
973 mulx $aj, $m0, $alo ! ap[1]*bp[0]
974 umulxhi $aj, $m0, $aj ! ahi=aj
976 mulx $nj, $m1, $lo1 ! np[0]*m1
977 umulxhi $nj, $m1, $hi1
979 ldx [$np+8], $nj ! np[1]
981 addcc $lo0, $lo1, $lo1
983 addxc %g0, $hi1, $hi1
985 mulx $nj, $m1, $nlo ! np[1]*m1
986 umulxhi $nj, $m1, $nj ! nhi=nj
989 sub $num, 24, $cnt ! cnt=num-3
993 addcc $alo, $hi0, $lo0
996 ldx [$ap+0], $aj ! ap[j]
997 addcc $nlo, $hi1, $lo1
999 addxc $nj, %g0, $hi1 ! nhi=nj
1001 ldx [$np+0], $nj ! np[j]
1002 mulx $aj, $m0, $alo ! ap[j]*bp[0]
1004 umulxhi $aj, $m0, $aj ! ahi=aj
1006 mulx $nj, $m1, $nlo ! np[j]*m1
1007 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
1008 umulxhi $nj, $m1, $nj ! nhi=nj
1009 addxc %g0, $hi1, $hi1
1010 stxa $lo1, [$tp]0xe2 ! tp[j-1]
1011 add $tp, 8, $tp ! tp++
1013 brnz,pt $cnt, .L1st_g5
1014 sub $cnt, 8, $cnt ! j--
1016 addcc $alo, $hi0, $lo0
1017 addxc $aj, %g0, $hi0 ! ahi=aj
1019 addcc $nlo, $hi1, $lo1
1020 addxc $nj, %g0, $hi1
1021 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
1022 addxc %g0, $hi1, $hi1
1023 stxa $lo1, [$tp]0xe2 ! tp[j-1]
1026 addcc $hi0, $hi1, $hi1
1027 addxc %g0, %g0, $ovf ! upmost overflow bit
1028 stxa $hi1, [$tp]0xe2
1032 sub $num, 16, $i ! i=num-2
# Each outer iteration gathers the next multiplier word from the
# power table instead of loading bp[i] directly.
1038 &load_b($bp,$m0); # m0=bp[i]
1040 sub $ap, $num, $ap ! rewind
1044 ldx [$ap+0], $aj ! ap[0]
1045 ldx [$np+0], $nj ! np[0]
1047 mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
1048 ldx [$tp], $tj ! tp[0]
1049 umulxhi $aj, $m0, $hi0
1050 ldx [$ap+8], $aj ! ap[1]
1051 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
1052 mulx $aj, $m0, $alo ! ap[1]*bp[i]
1053 addxc %g0, $hi0, $hi0
1054 mulx $lo0, $n0, $m1 ! tp[0]*n0
1055 umulxhi $aj, $m0, $aj ! ahi=aj
1056 mulx $nj, $m1, $lo1 ! np[0]*m1
1058 umulxhi $nj, $m1, $hi1
1059 ldx [$np+8], $nj ! np[1]
1061 addcc $lo1, $lo0, $lo1
1062 mulx $nj, $m1, $nlo ! np[1]*m1
1063 addxc %g0, $hi1, $hi1
1064 umulxhi $nj, $m1, $nj ! nhi=nj
1067 sub $num, 24, $cnt ! cnt=num-3
1070 addcc $alo, $hi0, $lo0
1071 ldx [$tp+8], $tj ! tp[j]
1072 addxc $aj, %g0, $hi0 ! ahi=aj
1073 ldx [$ap+0], $aj ! ap[j]
1075 addcc $nlo, $hi1, $lo1
1076 mulx $aj, $m0, $alo ! ap[j]*bp[i]
1077 addxc $nj, %g0, $hi1 ! nhi=nj
1078 ldx [$np+0], $nj ! np[j]
1080 umulxhi $aj, $m0, $aj ! ahi=aj
1081 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
1082 mulx $nj, $m1, $nlo ! np[j]*m1
1083 addxc %g0, $hi0, $hi0
1084 umulxhi $nj, $m1, $nj ! nhi=nj
1085 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
1086 addxc %g0, $hi1, $hi1
1087 stx $lo1, [$tp] ! tp[j-1]
1089 brnz,pt $cnt, .Linner_g5
1092 ldx [$tp+8], $tj ! tp[j]
1093 addcc $alo, $hi0, $lo0
1094 addxc $aj, %g0, $hi0 ! ahi=aj
1095 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
1096 addxc %g0, $hi0, $hi0
1098 addcc $nlo, $hi1, $lo1
1099 addxc $nj, %g0, $hi1 ! nhi=nj
1100 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
1101 addxc %g0, $hi1, $hi1
1102 stx $lo1, [$tp] ! tp[j-1]
1104 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
1105 addxccc $hi1, $hi0, $hi1
1106 addxc %g0, %g0, $ovf
1110 brnz,pt $i, .Louter_g5
1113 sub $ap, $num, $ap ! rewind
# Final reduction and conditional-copy tail, mirroring bn_mul_mont_t4;
# scratch tp[] entries are zeroed ("zap") as they are consumed.
1117 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
1125 subccc $tj, $nj, $t2 ! tp[j]-np[j]
1128 subccc $tj, $nj, $t3
1130 st $t2, [$rp-4] ! reverse order
1132 brnz,pt $cnt, .Lsub_g5
1135 sub $np, $num, $np ! rewind
1139 subccc $ovf, %g0, $ovf ! handle upmost overflow bit
1144 .Lcopy_g5: ! conditional copy
1147 stx %g0, [$tp] ! zap
1149 movcs %icc, $tj, $t2
1152 brnz $cnt, .Lcopy_g5
1158 .type bn_mul_mont_gather5_t4, #function
1159 .size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
# Tail of bn_flip_t4 (its entry is not visible in this excerpt) plus
# the scatter/gather companions used by the gather5 code path.
1174 brnz %o2, .Loop_flip
1178 .type bn_flip_t4, #function
1179 .size bn_flip_t4, .-bn_flip_t4
# bn_flip_n_scatter5_t4: writes inp[] into the power table slot for
# the given power; addressing combines %o3 (pwrtbl) and %o2 (pwr).
1181 .globl bn_flip_n_scatter5_t4
1183 bn_flip_n_scatter5_t4:
1186 add %o3, %o2, %o2 ! &pwrtbl[pwr]
1188 .Loop_flip_n_scatter5:
1189 ld [%o0+0], %o4 ! inp[i]
1196 brnz %o1, .Loop_flip_n_scatter5
1200 .type bn_flip_n_scatter5_t4, #function
1201 .size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
# bn_gather5_t4: retrieves a value from the scattered power table via
# the constant-access load_ccr/load_b helpers defined earlier.
1203 .globl bn_gather5_t4
1207 &load_ccr("%o2","%o3","%g1");
1212 &load_b("%o2","%g1");
1216 brnz %o1, .Loop_gather5
1221 .type bn_gather5_t4, #function
1222 .size bn_gather5_t4, .-bn_gather5_t4
1224 .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
# Flush and verify the output stream; a failed close would otherwise
# silently truncate the generated assembly.
1230 close STDOUT or die "error closing STDOUT: $!";