2 # Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20 # for undertaking this effort are multiple. First of all, UltraSPARC is not
21 # the whole SPARCv9 universe and other VIS-free implementations deserve
22 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
23 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25 # several integrated RSA/DSA accelerator circuits accessible through
26 # kernel driver [only(*)], but having decent user-land software
27 # implementation is important too. Finally, there is the desire to
28 # experiment with a dedicated squaring procedure. Yes, this module
29 # implements one, because it was easiest to draft it in SPARCv9
32 # (*) Engine accessing the driver in question is on my TODO list.
33 # For reference, accelerator is estimated to give 6 to 10 times
34 # improvement on single-threaded RSA sign. It should be noted
35 # that 6-10x improvement coefficient does not actually mean
36 # something extraordinary in terms of absolute [single-threaded]
37 # performance, as the SPARCv9 instruction set is by all means the least
38 # suitable for high performance crypto among other 64 bit
39 # platforms. 6-10x factor simply places T1 in same performance
40 # domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
41 # appear impressive at all, but it's the sign operation which is
42 # far more critical/interesting.
44 # You might notice that inner loops are modulo-scheduled:-) This has
45 # essentially negligible impact on UltraSPARC performance, it's
46 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
47 # the advantage... Currently this module surpasses sparcv9a-mont.pl
48 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49 # module still has hidden potential [see TODO list there], which is
50 # estimated to be larger than 20%...
52 $output = pop and open STDOUT,">$output"; # last command-line argument, if any, names the output file; STDOUT is redirected to it
55 $rp="%i0"; # BN_ULONG *rp,
56 $ap="%i1"; # const BN_ULONG *ap,
57 $bp="%i2"; # const BN_ULONG *bp,
58 $np="%i3"; # const BN_ULONG *np,
59 $n0="%i4"; # const BN_ULONG *n0,
60 $num="%i5"; # int num);
70 $mask="%g1"; # 0xffffffff word mask: 32 bits, what a waste...
83 $fname="bn_mul_mont_int"; # symbol name of the generated routine (used by .type/.size)
87 # define __ASSEMBLER__ 1
89 #include "crypto/sparc_arch.h"
91 .section ".text",#alloc,#execinstr
96 cmp %o5,4 ! 128 bits minimum, i.e. four 32-bit words
98 sethi %hi(0xffffffff),$mask ! upper bits of the 0xffffffff word mask
104 sll $num,2,$num ! num*=4
105 or $mask,%lo(0xffffffff),$mask ! mask = 0xffffffff
109 ld [$bp],$mul0 ! bp[0]
112 add %sp,$bias,%o7 ! real top of stack
113 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
115 ld [$ap+4],$apj ! ap[1]
117 ld [$np],$car1 ! np[0]
118 sub %o7,$bias,%sp ! alloca
119 ld [$np+4],$npj ! np[1]
120 be,pt SIZE_T_CC,.Lbn_sqr_mont ! equal: take the dedicated squaring procedure
123 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
124 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
125 and $car0,$mask,$acc0 ! acc0 = low 32 bits of car0
126 add %sp,$bias+$frame,$tp ! tp pointer = address of tp[0]
127 ld [$ap+8],$apj !prologue! ap[2]
129 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
130 and $mul1,$mask,$mul1 ! reduce mul1 to 32 bits
132 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
133 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
135 add $acc0,$car1,$car1
136 ld [$np+8],$npj !prologue! np[2]
138 mov $tmp0,$acc0 !prologue!
141 mulx $apj,$mul0,$tmp0
142 mulx $npj,$mul1,$tmp1
143 add $acc0,$car0,$car0
144 ld [$ap+$j],$apj ! ap[j]
145 and $car0,$mask,$acc0 ! low 32 bits of car0
146 add $acc1,$car1,$car1
147 ld [$np+$j],$npj ! np[j]
149 add $acc0,$car1,$car1
160 mulx $apj,$mul0,$tmp0 !epilogue!
161 mulx $npj,$mul1,$tmp1
162 add $acc0,$car0,$car0
163 and $car0,$mask,$acc0
164 add $acc1,$car1,$car1
166 add $acc0,$car1,$car1
170 add $tmp0,$car0,$car0
171 and $car0,$mask,$acc0
172 add $tmp1,$car1,$car1
174 add $acc0,$car1,$car1
178 add $car0,$car1,$car1
183 ld [$bp+4],$mul0 ! bp[1]
185 add %sp,$bias+$frame,$tp ! tp pointer = address of tp[0]
186 ld [$ap],$car0 ! ap[0]
187 ld [$ap+4],$apj ! ap[1]
188 ld [$np],$car1 ! np[0]
189 ld [$np+4],$npj ! np[1]
190 ld [$tp],$tmp1 ! tp[0]
191 ld [$tp+4],$tpj ! tp[1]
194 mulx $car0,$mul0,$car0 ! ap[0] times the current bp word
195 mulx $apj,$mul0,$tmp0 !prologue!
196 add $tmp1,$car0,$car0 ! + tp[0]
197 ld [$ap+8],$apj !prologue! ap[2]
198 and $car0,$mask,$acc0 ! low 32 bits of car0
201 and $mul1,$mask,$mul1 ! reduce mul1 to 32 bits
203 mulx $car1,$mul1,$car1 ! np[0]*mul1
204 mulx $npj,$mul1,$acc1 !prologue! np[1]*mul1
206 add $acc0,$car1,$car1
207 ld [$np+8],$npj !prologue! np[2]
209 mov $tmp0,$acc0 !prologue!
212 mulx $apj,$mul0,$tmp0
213 mulx $npj,$mul1,$tmp1
215 ld [$ap+$j],$apj ! ap[j]
216 add $acc0,$car0,$car0
217 add $acc1,$car1,$car1
218 ld [$np+$j],$npj ! np[j]
219 and $car0,$mask,$acc0
220 ld [$tp+8],$tpj ! tp[j]
222 add $acc0,$car1,$car1
225 st $car1,[$tp] ! tp[j-1]
233 mulx $apj,$mul0,$tmp0 !epilogue!
234 mulx $npj,$mul1,$tmp1
236 add $acc0,$car0,$car0
237 ld [$tp+8],$tpj ! tp[j]
238 and $car0,$mask,$acc0
239 add $acc1,$car1,$car1
241 add $acc0,$car1,$car1
242 st $car1,[$tp] ! tp[j-1]
246 add $tmp0,$car0,$car0
247 and $car0,$mask,$acc0
248 add $tmp1,$car1,$car1
249 add $acc0,$car1,$car1
250 st $car1,[$tp+4] ! tp[j-1]
255 add $car0,$car1,$car1
257 add $car2,$car1,$car1
262 ld [$bp+$i],$mul0 ! bp[i]
270 sub %g0,$num,%o7 ! k=-num
272 subcc %g0,%g0,%g0 ! clear %icc.c
277 subccc %o0,%o1,%o1 ! tp[j]-np[j]
282 subccc $car2,0,$car2 ! handle upmost overflow bit
286 ld [$tp+%o7],%o1 ! conditional copy
288 st %g0,[$tp+%o7] ! zap tp
300 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
301 ######## code without the following dedicated squaring procedure.
308 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
309 mulx $apj,$mul0,$tmp0 !prologue!
310 and $car0,$mask,$acc0 ! acc0 = low 32 bits of car0
311 add %sp,$bias+$frame,$tp ! tp pointer = address of tp[0]
312 ld [$ap+8],$apj !prologue! ap[2]
314 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
316 and $mul1,$mask,$mul1 ! reduce mul1 to 32 bits
318 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
319 mulx $npj,$mul1,$acc1 !prologue!
321 ld [$np+8],$npj !prologue! np[2]
323 add $acc0,$car1,$car1
325 mov $tmp0,$acc0 !prologue!
328 mulx $apj,$mul0,$tmp0
329 mulx $npj,$mul1,$tmp1
330 add $acc0,$car0,$car0 ! ap[j]*a0+c0
331 add $acc1,$car1,$car1
332 ld [$ap+$j],$apj ! ap[j]
333 and $car0,$mask,$acc0
334 ld [$np+$j],$npj ! np[j]
336 add $acc0,$acc0,$acc0 ! acc0 *= 2 (presumably: a cross product occurs twice in a square)
341 and $acc0,$mask,$acc0 ! trim the doubled word back to 32 bits
343 add $acc0,$car1,$car1
351 mulx $apj,$mul0,$tmp0 ! epilogue
352 mulx $npj,$mul1,$tmp1
353 add $acc0,$car0,$car0 ! ap[j]*a0+c0
354 add $acc1,$car1,$car1
355 and $car0,$mask,$acc0
357 add $acc0,$acc0,$acc0 ! acc0 *= 2
360 and $acc0,$mask,$acc0
361 add $acc0,$car1,$car1
365 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
366 add $tmp1,$car1,$car1
367 and $car0,$mask,$acc0
369 add $acc0,$acc0,$acc0 ! acc0 *= 2
372 and $acc0,$mask,$acc0
373 add $acc0,$car1,$car1
377 add $car0,$car0,$car0 ! car0 *= 2
379 add $car0,$car1,$car1
383 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
384 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
385 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
386 ld [$ap+4],$mul0 ! ap[1]
387 ld [$ap+8],$apj ! ap[2]
388 ld [$np],$car1 ! np[0]
389 ld [$np+4],$npj ! np[1]
392 mulx $mul0,$mul0,$car0 ! ap[1]*ap[1]
393 and $mul1,$mask,$mul1 ! reduce mul1 to 32 bits
395 mulx $car1,$mul1,$car1 ! np[0]*mul1
396 mulx $npj,$mul1,$acc1 ! np[1]*mul1
397 add $tmp0,$car1,$car1 ! + tp[0]
398 and $car0,$mask,$acc0
399 ld [$np+8],$npj ! np[2]
401 add $tmp1,$car1,$car1 ! + tp[1]
403 add $acc0,$car1,$car1
405 add $acc1,$car1,$car1
408 st $car1,[%sp+$bias+$frame] ! tp[0]=
410 add %sp,$bias+$frame+4,$tp ! tp pointer = address of tp[1]
413 mulx $apj,$mul0,$acc0
414 mulx $npj,$mul1,$acc1
415 add $acc0,$car0,$car0
417 ld [$ap+$j],$apj ! ap[j]
418 and $car0,$mask,$acc0
419 ld [$np+$j],$npj ! np[j]
421 add $acc1,$car1,$car1
422 ld [$tp+8],$tpj ! tp[j]
423 add $acc0,$acc0,$acc0 ! acc0 *= 2
425 add $sbit,$acc0,$acc0
427 and $acc0,$mask,$acc0
429 add $acc0,$car1,$car1
430 st $car1,[$tp] ! tp[j-1]
436 mulx $apj,$mul0,$acc0
437 mulx $npj,$mul1,$acc1
438 add $acc0,$car0,$car0
440 and $car0,$mask,$acc0
442 add $acc1,$car1,$car1
443 add $acc0,$acc0,$acc0
444 add $sbit,$acc0,$acc0
446 and $acc0,$mask,$acc0
447 add $acc0,$car1,$car1
448 st $car1,[$tp] ! tp[j-1]
451 add $car0,$car0,$car0
452 add $sbit,$car0,$car0
453 add $car0,$car1,$car1
454 add $car2,$car1,$car1
458 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
459 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
460 ld [$ap+8],$mul0 ! ap[2]
461 ld [$np],$car1 ! np[0]
462 ld [$np+4],$npj ! np[1]
464 and $mul1,$mask,$mul1
467 mulx $mul0,$mul0,$car0 ! ap[2]*ap[2]
468 mulx $car1,$mul1,$car1 ! np[0]*mul1
469 and $car0,$mask,$acc0
470 add $tmp1,$car1,$car1 ! + tp[0]
472 add %sp,$bias+$frame,$tp ! tp pointer = address of tp[0]
480 mulx $npj,$mul1,$acc1
485 add $acc1,$car1,$car1
494 ld [$ap+$j],$apj ! ap[j]
495 mulx $npj,$mul1,$acc1
497 ld [$np+$j],$npj ! np[j]
499 and $car1,$mask,$car1
500 add $tmp0,$sbit,$sbit
501 add $acc0,$car1,$car1
502 ld [$tp+8],$tpj ! tp[j]
503 add $acc1,$car1,$car1
509 be,pn %icc,.Lsqr_no_inner2
513 mulx $apj,$mul0,$acc0
514 mulx $npj,$mul1,$acc1
516 add $acc0,$car0,$car0
517 ld [$ap+$j],$apj ! ap[j]
518 and $car0,$mask,$acc0
519 ld [$np+$j],$npj ! np[j]
521 add $acc0,$acc0,$acc0
522 ld [$tp+8],$tpj ! tp[j]
523 add $sbit,$acc0,$acc0
526 and $acc0,$mask,$acc0
528 add $acc0,$car1,$car1
529 add $acc1,$car1,$car1
530 st $car1,[$tp] ! tp[j-1]
536 mulx $apj,$mul0,$acc0
537 mulx $npj,$mul1,$acc1
539 add $acc0,$car0,$car0
540 and $car0,$mask,$acc0
542 add $acc0,$acc0,$acc0
543 add $sbit,$acc0,$acc0
545 and $acc0,$mask,$acc0
546 add $acc0,$car1,$car1
547 add $acc1,$car1,$car1
548 st $car1,[$tp] ! tp[j-1]
551 add $car0,$car0,$car0
552 add $sbit,$car0,$car0
553 add $car0,$car1,$car1
554 add $car2,$car1,$car1
559 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
560 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
561 ld [$ap+$i],$mul0 ! ap[i]
562 ld [$np],$car1 ! np[0]
563 ld [$np+4],$npj ! np[1]
565 and $mul1,$mask,$mul1
568 mulx $mul0,$mul0,$car0 ! square of the ap word loaded above
569 mulx $car1,$mul1,$car1 ! np[0]*mul1
570 and $car0,$mask,$acc0
571 add $tmp1,$car1,$car1 ! + tp[0]
573 add %sp,$bias+$frame,$tp ! tp pointer = address of tp[0]
578 cmp $tmp0,$num ! i<num-1
583 mulx $npj,$mul1,$acc1
588 add $acc1,$car1,$car1
596 mulx $npj,$mul1,$acc1
599 and $acc0,$mask,$acc0
600 add $tmp0,$sbit,$sbit
601 add $acc0,$car1,$car1
602 add $acc1,$car1,$car1
606 add $car0,$car0,$car0 ! recover $car0
607 add $sbit,$car0,$car0
608 add $car0,$car1,$car1
609 add $car2,$car1,$car1
615 .type $fname,#function
616 .size $fname,(.-$fname)
617 .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
620 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backtick-quoted expression in $code with the result of evaluating it as Perl
622 close STDOUT or die "error closing STDOUT: $!"; # abort with the system error text if STDOUT cannot be closed