3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13 # for undertaken effort are multiple. First of all, UltraSPARC is not
14 # the whole SPARCv9 universe and other VIS-free implementations deserve
15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
16 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18 # several integrated RSA/DSA accelerator circuits accessible through
19 # kernel driver [only(*)], but having decent user-land software
20 # implementation is important too. Finally, reasons like desire to
21 # experiment with dedicated squaring procedure. Yes, this module
22 # implements one, because it was easiest to draft it in SPARCv9
25 # (*) Engine accessing the driver in question is on my TODO list.
26 # For reference, accelerator is estimated to give 6 to 10 times
27 # improvement on single-threaded RSA sign. It should be noted
28 # that 6-10x improvement coefficient does not actually mean
29 # something extraordinary in terms of absolute [single-threaded]
30 # performance, as SPARCv9 instruction set is by all means least
31 # suitable for high performance crypto among other 64 bit
32 # platforms. 6-10x factor simply places T1 in same performance
33 # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34 # appear impressive at all, but it's the sign operation which is
35 # far more critical/interesting.
37 # You might notice that inner loops are modulo-scheduled:-) This has
38 # essentially negligible impact on UltraSPARC performance, it's
39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42 # module still has hidden potential [see TODO list there], which is
43 # estimated to be larger than 20%...
46 $rp="%i0"; # BN_ULONG *rp,
47 $ap="%i1"; # const BN_ULONG *ap,
48 $bp="%i2"; # const BN_ULONG *bp,
49 $np="%i3"; # const BN_ULONG *np,
50 $n0="%i4"; # const BN_ULONG *n0,
51 $num="%i5"; # int num);
54 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55 if ($bits==64) { $bias=2047; $frame=192; }
56 else { $bias=0; $frame=128; }
63 $mask="%g1"; # 32 bits, what a waste...
76 $fname="bn_mul_mont_int";
79 .section ".text",#alloc,#execinstr
84 cmp %o5,4 ! 128 bits minimum
86 sethi %hi(0xffffffff),$mask
92 sll $num,2,$num ! num*=4
93 or $mask,%lo(0xffffffff),$mask
97 ld [$bp],$mul0 ! bp[0]
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
103 ld [$ap+4],$apj ! ap[1]
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
108 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
111 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113 and $car0,$mask,$acc0
114 add %sp,$bias+$frame,$tp
115 ld [$ap+8],$apj !prologue!
117 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118 and $mul1,$mask,$mul1
120 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
123 add $acc0,$car1,$car1
124 ld [$np+8],$npj !prologue!
126 mov $tmp0,$acc0 !prologue!
129 mulx $apj,$mul0,$tmp0
130 mulx $npj,$mul1,$tmp1
131 add $acc0,$car0,$car0
132 ld [$ap+$j],$apj ! ap[j]
133 and $car0,$mask,$acc0
134 add $acc1,$car1,$car1
135 ld [$np+$j],$npj ! np[j]
137 add $acc0,$car1,$car1
148 mulx $apj,$mul0,$tmp0 !epilogue!
149 mulx $npj,$mul1,$tmp1
150 add $acc0,$car0,$car0
151 and $car0,$mask,$acc0
152 add $acc1,$car1,$car1
154 add $acc0,$car1,$car1
158 add $tmp0,$car0,$car0
159 and $car0,$mask,$acc0
160 add $tmp1,$car1,$car1
162 add $acc0,$car1,$car1
166 add $car0,$car1,$car1
171 ld [$bp+4],$mul0 ! bp[1]
173 add %sp,$bias+$frame,$tp
174 ld [$ap],$car0 ! ap[0]
175 ld [$ap+4],$apj ! ap[1]
176 ld [$np],$car1 ! np[0]
177 ld [$np+4],$npj ! np[1]
178 ld [$tp],$tmp1 ! tp[0]
179 ld [$tp+4],$tpj ! tp[1]
182 mulx $car0,$mul0,$car0
183 mulx $apj,$mul0,$tmp0 !prologue!
184 add $tmp1,$car0,$car0
185 ld [$ap+8],$apj !prologue!
186 and $car0,$mask,$acc0
189 and $mul1,$mask,$mul1
191 mulx $car1,$mul1,$car1
192 mulx $npj,$mul1,$acc1 !prologue!
194 add $acc0,$car1,$car1
195 ld [$np+8],$npj !prologue!
197 mov $tmp0,$acc0 !prologue!
200 mulx $apj,$mul0,$tmp0
201 mulx $npj,$mul1,$tmp1
203 ld [$ap+$j],$apj ! ap[j]
204 add $acc0,$car0,$car0
205 add $acc1,$car1,$car1
206 ld [$np+$j],$npj ! np[j]
207 and $car0,$mask,$acc0
208 ld [$tp+8],$tpj ! tp[j]
210 add $acc0,$car1,$car1
213 st $car1,[$tp] ! tp[j-1]
221 mulx $apj,$mul0,$tmp0 !epilogue!
222 mulx $npj,$mul1,$tmp1
224 add $acc0,$car0,$car0
225 ld [$tp+8],$tpj ! tp[j]
226 and $car0,$mask,$acc0
227 add $acc1,$car1,$car1
229 add $acc0,$car1,$car1
230 st $car1,[$tp] ! tp[j-1]
234 add $tmp0,$car0,$car0
235 and $car0,$mask,$acc0
236 add $tmp1,$car1,$car1
237 add $acc0,$car1,$car1
238 st $car1,[$tp+4] ! tp[j-1]
243 add $car0,$car1,$car1
245 add $car2,$car1,$car1
250 ld [$bp+$i],$mul0 ! bp[i]
258 sub %g0,$num,%o7 ! k=-num
260 subcc %g0,%g0,%g0 ! clear %icc.c
265 subccc %o0,%o1,%o1 ! tp[j]-np[j]
270 subccc $car2,0,$car2 ! handle upmost overflow bit
274 ld [$tp+%o7],%o1 ! conditional copy
276 st %g0,[$tp+%o7] ! zap tp
288 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
289 ######## code without following dedicated squaring procedure.
296 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
297 mulx $apj,$mul0,$tmp0 !prologue!
298 and $car0,$mask,$acc0
299 add %sp,$bias+$frame,$tp
300 ld [$ap+8],$apj !prologue!
302 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
304 and $mul1,$mask,$mul1
306 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
307 mulx $npj,$mul1,$acc1 !prologue!
309 ld [$np+8],$npj !prologue!
311 add $acc0,$car1,$car1
313 mov $tmp0,$acc0 !prologue!
316 mulx $apj,$mul0,$tmp0
317 mulx $npj,$mul1,$tmp1
318 add $acc0,$car0,$car0 ! ap[j]*a0+c0
319 add $acc1,$car1,$car1
320 ld [$ap+$j],$apj ! ap[j]
321 and $car0,$mask,$acc0
322 ld [$np+$j],$npj ! np[j]
324 add $acc0,$acc0,$acc0
329 and $acc0,$mask,$acc0
331 add $acc0,$car1,$car1
339 mulx $apj,$mul0,$tmp0 ! epilogue
340 mulx $npj,$mul1,$tmp1
341 add $acc0,$car0,$car0 ! ap[j]*a0+c0
342 add $acc1,$car1,$car1
343 and $car0,$mask,$acc0
345 add $acc0,$acc0,$acc0
348 and $acc0,$mask,$acc0
349 add $acc0,$car1,$car1
353 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
354 add $tmp1,$car1,$car1
355 and $car0,$mask,$acc0
357 add $acc0,$acc0,$acc0
360 and $acc0,$mask,$acc0
361 add $acc0,$car1,$car1
365 add $car0,$car0,$car0
367 add $car0,$car1,$car1
371 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
372 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
373 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
374 ld [$ap+4],$mul0 ! ap[1]
375 ld [$ap+8],$apj ! ap[2]
376 ld [$np],$car1 ! np[0]
377 ld [$np+4],$npj ! np[1]
380 mulx $mul0,$mul0,$car0
381 and $mul1,$mask,$mul1
383 mulx $car1,$mul1,$car1
384 mulx $npj,$mul1,$acc1
385 add $tmp0,$car1,$car1
386 and $car0,$mask,$acc0
387 ld [$np+8],$npj ! np[2]
389 add $tmp1,$car1,$car1
391 add $acc0,$car1,$car1
393 add $acc1,$car1,$car1
396 st $car1,[%sp+$bias+$frame] ! tp[0]=
398 add %sp,$bias+$frame+4,$tp
401 mulx $apj,$mul0,$acc0
402 mulx $npj,$mul1,$acc1
403 add $acc0,$car0,$car0
405 ld [$ap+$j],$apj ! ap[j]
406 and $car0,$mask,$acc0
407 ld [$np+$j],$npj ! np[j]
409 add $acc1,$car1,$car1
410 ld [$tp+8],$tpj ! tp[j]
411 add $acc0,$acc0,$acc0
413 add $sbit,$acc0,$acc0
415 and $acc0,$mask,$acc0
417 add $acc0,$car1,$car1
418 st $car1,[$tp] ! tp[j-1]
424 mulx $apj,$mul0,$acc0
425 mulx $npj,$mul1,$acc1
426 add $acc0,$car0,$car0
428 and $car0,$mask,$acc0
430 add $acc1,$car1,$car1
431 add $acc0,$acc0,$acc0
432 add $sbit,$acc0,$acc0
434 and $acc0,$mask,$acc0
435 add $acc0,$car1,$car1
436 st $car1,[$tp] ! tp[j-1]
439 add $car0,$car0,$car0
440 add $sbit,$car0,$car0
441 add $car0,$car1,$car1
442 add $car2,$car1,$car1
446 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
447 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
448 ld [$ap+8],$mul0 ! ap[2]
449 ld [$np],$car1 ! np[0]
450 ld [$np+4],$npj ! np[1]
452 and $mul1,$mask,$mul1
455 mulx $mul0,$mul0,$car0
456 mulx $car1,$mul1,$car1
457 and $car0,$mask,$acc0
458 add $tmp1,$car1,$car1
460 add %sp,$bias+$frame,$tp
468 mulx $npj,$mul1,$acc1
473 add $acc1,$car1,$car1
482 ld [$ap+$j],$apj ! ap[j]
483 mulx $npj,$mul1,$acc1
485 ld [$np+$j],$npj ! np[j]
487 and $car1,$mask,$car1
488 add $tmp0,$sbit,$sbit
489 add $acc0,$car1,$car1
490 ld [$tp+8],$tpj ! tp[j]
491 add $acc1,$car1,$car1
497 be,pn %icc,.Lsqr_no_inner2
501 mulx $apj,$mul0,$acc0
502 mulx $npj,$mul1,$acc1
504 add $acc0,$car0,$car0
505 ld [$ap+$j],$apj ! ap[j]
506 and $car0,$mask,$acc0
507 ld [$np+$j],$npj ! np[j]
509 add $acc0,$acc0,$acc0
510 ld [$tp+8],$tpj ! tp[j]
511 add $sbit,$acc0,$acc0
514 and $acc0,$mask,$acc0
516 add $acc0,$car1,$car1
517 add $acc1,$car1,$car1
518 st $car1,[$tp] ! tp[j-1]
524 mulx $apj,$mul0,$acc0
525 mulx $npj,$mul1,$acc1
527 add $acc0,$car0,$car0
528 and $car0,$mask,$acc0
530 add $acc0,$acc0,$acc0
531 add $sbit,$acc0,$acc0
533 and $acc0,$mask,$acc0
534 add $acc0,$car1,$car1
535 add $acc1,$car1,$car1
536 st $car1,[$tp] ! tp[j-1]
539 add $car0,$car0,$car0
540 add $sbit,$car0,$car0
541 add $car0,$car1,$car1
542 add $car2,$car1,$car1
547 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
548 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
549 ld [$ap+$i],$mul0 ! ap[j]
550 ld [$np],$car1 ! np[0]
551 ld [$np+4],$npj ! np[1]
553 and $mul1,$mask,$mul1
556 mulx $mul0,$mul0,$car0
557 mulx $car1,$mul1,$car1
558 and $car0,$mask,$acc0
559 add $tmp1,$car1,$car1
561 add %sp,$bias+$frame,$tp
566 cmp $tmp0,$num ! i<num-1
571 mulx $npj,$mul1,$acc1
576 add $acc1,$car1,$car1
584 mulx $npj,$mul1,$acc1
587 and $acc0,$mask,$acc0
588 add $tmp0,$sbit,$sbit
589 add $acc0,$car1,$car1
590 add $acc1,$car1,$car1
594 add $car0,$car0,$car0 ! recover $car0
595 add $sbit,$car0,$car0
596 add $car0,$car1,$car1
597 add $car2,$car1,$car1
603 .type $fname,#function
604 .size $fname,(.-$fname)
605 .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
608 $code =~ s/\`([^\`]*)\`/eval($1)/gem;