2 # Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20 # Because unlike integer multiplier, which simply stalls whole CPU,
21 # FPU is fully pipelined and can effectively emit 48 bit partial
22 # product every cycle. Why not blended SPARC v9? One can argue that
23 # making this module dependent on UltraSPARC VIS extension limits its
24 # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25 # implementations from compatibility matrix. But the rest, whole Sun
26 # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27 # VIS extension instructions used in this module. This is considered
28 # good enough to not care about HAL SPARC64 users [if any] who have
29 # the integer-only pure SPARCv9 module to "fall down" to.
31 # USI&II cores currently exhibit uniform 2x improvement [over pre-
32 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
33 # performance improves by a few percent for shorter keys and worsens by a
34 # few percent for longer keys. This is because the USIII integer multiplier
35 # is >3x faster than USI&II one, which is harder to match [but see
36 # TODO list below]. It should also be noted that SPARC64 V features
37 # out-of-order execution, which *might* mean that integer multiplier
38 # is pipelined, which in turn *might* be impossible to match... On
39 # additional note, SPARC64 V implements FP Multiply-Add instruction,
40 # which is perfectly usable in this context... In other words, as far
41 # as Fujitsu SPARC64 V goes, talk to the author:-)
43 # The implementation imposes the following "non-natural" limitations on
45 # - num may not be less than 4;
46 # - num has to be even;
47 # Failure to meet either condition has no fatal effects, simply
48 # doesn't give any performance gain.
51 # - modulo-schedule inner loop for better performance (on in-order
52 # execution core such as UltraSPARC this shall result in further
53 # noticeable(!) improvement);
54 # - dedicated squaring procedure[?];
56 ######################################################################
59 # Modulo-scheduled inner loops make it possible to interleave floating point and
60 # integer instructions and minimize Read-After-Write penalties. This
61 # results in *further* 20-50% performance improvement [depending on
62 # key length, more for longer keys] on USI&II cores and 30-80% - on
65 # $output is the last argument if it looks like a file (it has an extension)
# Treat a trailing command-line argument with a file extension as the
# destination for the generated assembly; otherwise write to stdout.
66 $output = @ARGV && $ARGV[-1] =~ m|\.\w+$| ? pop(@ARGV) : undef;
# Redirect STDOUT so every subsequent print lands in the output file.
68 open STDOUT,">$output" if $output;
# Symbol name of the generated routine; used by .type/.size directives.
70 $fname = "bn_mul_mont_fpu";
76 # In order to provide for 32-/64-bit ABI duality, I keep integers wider
77 # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
78 # exclusively for pointers, indexes and other small values...
# Incoming arguments: after `save`, %i0-%i5 hold the six parameters of
# bn_mul_mont_fpu(rp, ap, bp, np, n0, num).
80 $rp="%i0"; # BN_ULONG *rp,
81 $ap="%i1"; # const BN_ULONG *ap,
82 $bp="%i2"; # const BN_ULONG *bp,
83 $np="%i3"; # const BN_ULONG *np,
84 $n0="%i4"; # const BN_ULONG *n0,
85 $num="%i5"; # int num);
# Local registers pointing at the stack-allocated vectors of a[] and n[]
# limbs pre-converted to double precision (low/high 32-bit halves).
88 $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
89 $ap_h="%l2"; # to these four vectors as double-precision FP values.
90 $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
91 $np_h="%l4"; # loop and L1-cache aliasing is minimized...
94 $mask="%l7"; # 16-bit mask, 0xffff
# $n0 is deliberately rebound: the prologue loads the 64-bit n0 value
# from the pointer in %i4 into %g4, after which %i4 is free to be
# recycled as the carry bit ("ld [%i4+0],$n0" below relies on this).
96 $n0="%g4"; # reassigned(!) to "64-bit" register
97 $carry="%i4"; # %i4 reused(!) for a carry bit
99 # FP register naming chart
# $ba-$bd receive b[i] split into 4x16-bit digits via the 16-bit FP
# loads below; $na-$nd receive the (ap[0]*b[i]+t[0])*n0 value likewise.
114 $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
115 $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
# a[j]/n[j] low and high halves in double format; the odd-numbered
# registers ($alo_ etc.) are the second 32-bit word of the even-numbered
# double pair, targeted by the 32-bit "ld" instructions in the loops.
116 $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
117 $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
# Top partial products carried into the following loop iteration
# (see "faddd $dota,$nloa,$nloa" in the loop bodies).
119 $dota="%f24"; $dotb="%f26";
# Partial-product accumulators; the upper bank %f32-%f62 is addressable
# only as double-precision registers on SPARC V9, which suits the
# all-double arithmetic here.
121 $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
122 $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
123 $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
124 $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
# Written to %asi in the prologue so ldda fetches 16-bit chunks.
126 $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
129 #ifndef __ASSEMBLER__
130 # define __ASSEMBLER__ 1
132 #include "crypto/sparc_arch.h"
134 .section ".text",#alloc,#execinstr
139 save %sp,-$frame-$locals,%sp
144 andcc $num,1,%g0 ! $num has to be even...
146 clr %i0 ! signal "unsupported input value"
149 sethi %hi(0xffff),$mask
150 ld [%i4+0],$n0 ! $n0 reassigned, remember?
151 or $mask,%lo(0xffff),$mask
154 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
156 sll $num,3,$num ! num*=8
158 add %sp,$bias,%o0 ! real top of stack
160 add %o1,$num,%o1 ! %o1=num*5
162 and %o0,-2048,%o0 ! optimize TLB utilization
163 sub %o0,$bias,%sp ! alloca(5*num*8)
165 rd %asi,%o7 ! save %asi
166 add %sp,$bias+$frame+$locals,$tp
168 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
173 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
175 add $rp,$num,$rp ! readjust input pointers to point
176 add $ap,$num,$ap ! at the ends too...
180 stx %o7,[%sp+$bias+$frame+48] ! save %asi
182 sub %g0,$num,$i ! i=-num
183 sub %g0,$num,$j ! j=-num
188 ld [%o3+4],%g1 ! bp[0]
190 ld [%o4+4],%g5 ! ap[0]
199 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
200 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
201 stx %o0,[%sp+$bias+$frame+0]
203 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
207 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
212 ! transfer b[i] to FPU as 4x16-bit values
222 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
223 ldda [%sp+$bias+$frame+6]%asi,$na
225 ldda [%sp+$bias+$frame+4]%asi,$nb
227 ldda [%sp+$bias+$frame+2]%asi,$nc
229 ldda [%sp+$bias+$frame+0]%asi,$nd
232 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
236 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
246 faddd $aloa,$nloa,$nloa
249 faddd $alob,$nlob,$nlob
252 faddd $aloc,$nloc,$nloc
255 faddd $alod,$nlod,$nlod
258 faddd $ahia,$nhia,$nhia
261 faddd $ahib,$nhib,$nhib
264 faddd $ahic,$nhic,$dota ! $nhic
265 faddd $ahid,$nhid,$dotb ! $nhid
267 faddd $nloc,$nhia,$nloc
268 faddd $nlod,$nhib,$nlod
275 std $nloa,[%sp+$bias+$frame+0]
277 std $nlob,[%sp+$bias+$frame+8]
279 std $nloc,[%sp+$bias+$frame+16]
281 std $nlod,[%sp+$bias+$frame+24]
283 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
287 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
297 ldx [%sp+$bias+$frame+0],%o0
299 ldx [%sp+$bias+$frame+8],%o1
301 ldx [%sp+$bias+$frame+16],%o2
303 ldx [%sp+$bias+$frame+24],%o3
307 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
311 faddd $aloa,$nloa,$nloa
314 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
318 faddd $alob,$nlob,$nlob
322 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
323 faddd $aloc,$nloc,$nloc
333 !or %o7,%o0,%o0 ! 64-bit result
334 srlx %o3,16,%g1 ! 34-bit carry
337 faddd $alod,$nlod,$nlod
340 faddd $ahia,$nhia,$nhia
343 faddd $ahib,$nhib,$nhib
346 faddd $dota,$nloa,$nloa
347 faddd $dotb,$nlob,$nlob
348 faddd $ahic,$nhic,$dota ! $nhic
349 faddd $ahid,$nhid,$dotb ! $nhid
351 faddd $nloc,$nhia,$nloc
352 faddd $nlod,$nhib,$nlod
359 std $nloa,[%sp+$bias+$frame+0]
360 std $nlob,[%sp+$bias+$frame+8]
362 std $nloc,[%sp+$bias+$frame+16]
364 std $nlod,[%sp+$bias+$frame+24]
366 .align 32 ! incidentally already aligned !
370 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
374 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
384 ldx [%sp+$bias+$frame+0],%o0
386 ldx [%sp+$bias+$frame+8],%o1
388 ldx [%sp+$bias+$frame+16],%o2
390 ldx [%sp+$bias+$frame+24],%o3
394 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
398 faddd $aloa,$nloa,$nloa
401 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
405 faddd $alob,$nlob,$nlob
409 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
411 faddd $aloc,$nloc,$nloc
417 faddd $alod,$nlod,$nlod
423 faddd $ahia,$nhia,$nhia
427 or %o7,%o0,%o0 ! 64-bit result
428 faddd $ahib,$nhib,$nhib
431 faddd $dota,$nloa,$nloa
432 srlx %o3,16,%g1 ! 34-bit carry
433 faddd $dotb,$nlob,$nlob
437 stx %o0,[$tp] ! tp[j-1]=
439 faddd $ahic,$nhic,$dota ! $nhic
440 faddd $ahid,$nhid,$dotb ! $nhid
442 faddd $nloc,$nhia,$nloc
443 faddd $nlod,$nhib,$nlod
450 std $nloa,[%sp+$bias+$frame+0]
451 std $nlob,[%sp+$bias+$frame+8]
452 std $nloc,[%sp+$bias+$frame+16]
453 std $nlod,[%sp+$bias+$frame+24]
463 ldx [%sp+$bias+$frame+0],%o0
464 ldx [%sp+$bias+$frame+8],%o1
465 ldx [%sp+$bias+$frame+16],%o2
466 ldx [%sp+$bias+$frame+24],%o3
469 std $dota,[%sp+$bias+$frame+32]
471 std $dotb,[%sp+$bias+$frame+40]
475 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
484 or %o7,%o0,%o0 ! 64-bit result
485 ldx [%sp+$bias+$frame+32],%o4
487 ldx [%sp+$bias+$frame+40],%o5
488 srlx %o3,16,%g1 ! 34-bit carry
492 stx %o0,[$tp] ! tp[j-1]=
506 stx %o4,[$tp] ! tp[num-1]=
512 sub %g0,$num,$j ! j=-num
513 add %sp,$bias+$frame+$locals,$tp
518 ld [%o3+4],%g1 ! bp[i]
520 ld [%o4+4],%g5 ! ap[0]
527 ldx [$tp],%o2 ! tp[0]
530 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
531 stx %o0,[%sp+$bias+$frame+0]
533 ! transfer b[i] to FPU as 4x16-bit values
539 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
540 ldda [%sp+$bias+$frame+6]%asi,$na
542 ldda [%sp+$bias+$frame+4]%asi,$nb
544 ldda [%sp+$bias+$frame+2]%asi,$nc
546 ldda [%sp+$bias+$frame+0]%asi,$nd
548 ldd [$ap_l+$j],$alo ! load a[j] in double format
552 ldd [$np_l+$j],$nlo ! load n[j] in double format
562 faddd $aloa,$nloa,$nloa
565 faddd $alob,$nlob,$nlob
568 faddd $aloc,$nloc,$nloc
571 faddd $alod,$nlod,$nlod
574 faddd $ahia,$nhia,$nhia
577 faddd $ahib,$nhib,$nhib
580 faddd $ahic,$nhic,$dota ! $nhic
581 faddd $ahid,$nhid,$dotb ! $nhid
583 faddd $nloc,$nhia,$nloc
584 faddd $nlod,$nhib,$nlod
591 std $nloa,[%sp+$bias+$frame+0]
592 std $nlob,[%sp+$bias+$frame+8]
593 std $nloc,[%sp+$bias+$frame+16]
595 std $nlod,[%sp+$bias+$frame+24]
597 ldd [$ap_l+$j],$alo ! load a[j] in double format
599 ldd [$np_l+$j],$nlo ! load n[j] in double format
607 ldx [%sp+$bias+$frame+0],%o0
608 faddd $aloa,$nloa,$nloa
610 ldx [%sp+$bias+$frame+8],%o1
612 ldx [%sp+$bias+$frame+16],%o2
613 faddd $alob,$nlob,$nlob
615 ldx [%sp+$bias+$frame+24],%o3
619 faddd $aloc,$nloc,$nloc
624 faddd $alod,$nlod,$nlod
629 faddd $ahia,$nhia,$nhia
631 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
637 faddd $ahib,$nhib,$nhib
640 faddd $dota,$nloa,$nloa
642 faddd $dotb,$nlob,$nlob
645 faddd $ahic,$nhic,$dota ! $nhic
647 faddd $ahid,$nhid,$dotb ! $nhid
648 or %o7,%o0,%o0 ! 64-bit result
650 faddd $nloc,$nhia,$nloc
653 faddd $nlod,$nhib,$nlod
654 srlx %o3,16,%g1 ! 34-bit carry
663 std $nloa,[%sp+$bias+$frame+0]
664 std $nlob,[%sp+$bias+$frame+8]
666 std $nloc,[%sp+$bias+$frame+16]
667 bz,pn %icc,.Linnerskip
668 std $nlod,[%sp+$bias+$frame+24]
674 ldd [$ap_l+$j],$alo ! load a[j] in double format
676 ldd [$np_l+$j],$nlo ! load n[j] in double format
684 ldx [%sp+$bias+$frame+0],%o0
685 faddd $aloa,$nloa,$nloa
687 ldx [%sp+$bias+$frame+8],%o1
689 ldx [%sp+$bias+$frame+16],%o2
690 faddd $alob,$nlob,$nlob
692 ldx [%sp+$bias+$frame+24],%o3
696 faddd $aloc,$nloc,$nloc
701 faddd $alod,$nlod,$nlod
706 faddd $ahia,$nhia,$nhia
708 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
713 faddd $ahib,$nhib,$nhib
716 faddd $dota,$nloa,$nloa
718 faddd $dotb,$nlob,$nlob
721 faddd $ahic,$nhic,$dota ! $nhic
723 faddd $ahid,$nhid,$dotb ! $nhid
724 or %o7,%o0,%o0 ! 64-bit result
725 faddd $nloc,$nhia,$nloc
727 ldx [$tp+8],%o7 ! tp[j]
728 faddd $nlod,$nhib,$nlod
729 srlx %o3,16,%g1 ! 34-bit carry
739 stx %o0,[$tp] ! tp[j-1]
742 std $nloa,[%sp+$bias+$frame+0]
743 std $nlob,[%sp+$bias+$frame+8]
744 std $nloc,[%sp+$bias+$frame+16]
746 std $nlod,[%sp+$bias+$frame+24]
754 ldx [%sp+$bias+$frame+0],%o0
755 ldx [%sp+$bias+$frame+8],%o1
756 ldx [%sp+$bias+$frame+16],%o2
757 ldx [%sp+$bias+$frame+24],%o3
760 std $dota,[%sp+$bias+$frame+32]
762 std $dotb,[%sp+$bias+$frame+40]
766 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
775 ldx [%sp+$bias+$frame+32],%o4
776 or %o7,%o0,%o0 ! 64-bit result
777 ldx [%sp+$bias+$frame+40],%o5
779 ldx [$tp+8],%o7 ! tp[j]
780 srlx %o3,16,%g1 ! 34-bit carry
788 stx %o0,[$tp] ! tp[j-1]
802 stx %o4,[$tp] ! tp[num-1]
811 add $tp,8,$tp ! adjust tp to point at the end
813 sub %g0,$num,%o7 ! n=-num
815 subcc %g0,%g0,%g0 ! clear %icc.c
832 sub %g0,$num,%o7 ! n=-num
853 sub %g0,$num,%o7 ! n=-num
864 ldx [%sp+$bias+$frame+48],%o7
865 wr %g0,%o7,%asi ! restore %asi
871 .type $fname,#function
872 .size $fname,(.-$fname)
873 .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the template: evaluate every backtick-quoted Perl
# expression embedded in the assembly text (/e) and splice the result in
# place, for all occurrences (/g).
877 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
879 # Below substitution makes it possible to compile without demanding
880 # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
881 # dare to do this, because VIS capability is detected at run-time now
882 # and this routine is not called on a CPU that cannot execute it. Do
883 # note that fzeros is not the only VIS dependency! Another dependency
884 # is implicit and is just _a_ numerical value loaded to %asi register,
885 # which assembler can't recognize as VIS specific...
# Replace each fzeros mnemonic with its raw encoding: base opcode
# 0x81b00c20 with the destination register number shifted into the rd
# field (<<25); the original mnemonic is preserved in a trailing "!"
# assembler comment for readability of the generated .s file.
# NOTE(review): the substitution's closing delimiter/flags and the final
# print of $code are outside this excerpt — confirm against the full file.
886 $code =~ s/fzeros\s+%f([0-9]+)/
887 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
# Surface buffered-write errors on the (possibly redirected) output.
892 close STDOUT or die "error closing STDOUT: $!";