3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
11 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
12 # for undertaken effort are multiple. First of all, UltraSPARC is not
13 # the whole SPARCv9 universe and other VIS-free implementations deserve
14 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
15 # a.k.a. Niagara, has a shared FPU, and concurrent FPU-intensive paths,
16 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
17 # several integrated RSA/DSA accelerator circuits accessible through
18 # kernel driver [only(*)], but having decent user-land software
19 # implementation is important too. Finally, reasons like desire to
20 # experiment with dedicated squaring procedure. Yes, this module
21 # implements one, because it was easiest to draft it in SPARCv9
24 # (*) Engine accessing the driver in question is on my TODO list.
25 # For reference, the accelerator is estimated to give 6 to 10 times
26 # improvement on single-threaded RSA sign. It should be noted
27 # that 6-10x improvement coefficient does not actually mean
28 # something extraordinary in terms of absolute [single-threaded]
29 # performance, as SPARCv9 instruction set is by all means least
30 # suitable for high performance crypto among other 64 bit
31 # platforms. 6-10x factor simply places T1 in same performance
32 # domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
33 # appear impressive at all, but it's the sign operation which is
34 # far more critical/interesting.
36 # You might notice that inner loops are modulo-scheduled:-) This has
37 # essentially negligible impact on UltraSPARC performance, it's
38 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
39 # the advantage... Currently this module surpasses sparcv9a-mont.pl
40 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
41 # module still has hidden potential [see TODO list there], which is
42 # estimated to be larger than 20%...
# Argument-to-register map: the six C arguments of bn_mul_mont arrive in
# the SPARC input registers %i0-%i5 (per the SPARC calling convention);
# these Perl variables are interpolated into the assembly templates below.
45 $rp="%i0";	# BN_ULONG *rp,
46 $ap="%i1";	# const BN_ULONG *ap,
47 $bp="%i2";	# const BN_ULONG *bp,
48 $np="%i3";	# const BN_ULONG *np,
49 $n0="%i4";	# const BN_ULONG *n0,
50 $num="%i5";	# int num);
# Select the target ABI from the compiler flags passed on the command
# line: -m64 (gcc) or -xarch=v9 (Sun cc) request the 64-bit ABI.
# Default explicitly to the 32-bit (v8+) ABI so that $bits is never
# compared while undefined when neither flag is present.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

# 64-bit ABI: %sp carries the 2047-byte stack bias and the minimal
# register-save frame is 192 bytes; 32-bit ABI: no bias, 128-byte frame.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=128; }
62 $mask="%g1";	# 32 bits, what a waste... (holds 0xffffffff, used to split 64-bit mulx results into 32-bit limbs)
75 $fname="bn_mul_mont_int";	# symbol name of the generated routine (used by the .type/.size directives below)
78 .section ".text",#alloc,#execinstr
83 cmp %o5,4 ! 128 bits minimum
85 sethi %hi(0xffffffff),$mask
91 sll $num,2,$num ! num*=4
92 or $mask,%lo(0xffffffff),$mask
96 ld [$bp],$mul0 ! bp[0]
99 add %sp,$bias,%o7 ! real top of stack
100 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
102 ld [$ap+4],$apj ! ap[1]
104 ld [$np],$car1 ! np[0]
105 sub %o7,$bias,%sp ! alloca
106 ld [$np+4],$npj ! np[1]
107 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
110 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
111 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
112 and $car0,$mask,$acc0
113 add %sp,$bias+$frame,$tp
114 ld [$ap+8],$apj !prologue!
116 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
117 and $mul1,$mask,$mul1
119 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
120 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
122 add $acc0,$car1,$car1
123 ld [$np+8],$npj !prologue!
125 mov $tmp0,$acc0 !prologue!
128 mulx $apj,$mul0,$tmp0
129 mulx $npj,$mul1,$tmp1
130 add $acc0,$car0,$car0
131 ld [$ap+$j],$apj ! ap[j]
132 and $car0,$mask,$acc0
133 add $acc1,$car1,$car1
134 ld [$np+$j],$npj ! np[j]
136 add $acc0,$car1,$car1
147 mulx $apj,$mul0,$tmp0 !epilogue!
148 mulx $npj,$mul1,$tmp1
149 add $acc0,$car0,$car0
150 and $car0,$mask,$acc0
151 add $acc1,$car1,$car1
153 add $acc0,$car1,$car1
157 add $tmp0,$car0,$car0
158 and $car0,$mask,$acc0
159 add $tmp1,$car1,$car1
161 add $acc0,$car1,$car1
165 add $car0,$car1,$car1
170 ld [$bp+4],$mul0 ! bp[1]
172 add %sp,$bias+$frame,$tp
173 ld [$ap],$car0 ! ap[0]
174 ld [$ap+4],$apj ! ap[1]
175 ld [$np],$car1 ! np[0]
176 ld [$np+4],$npj ! np[1]
177 ld [$tp],$tmp1 ! tp[0]
178 ld [$tp+4],$tpj ! tp[1]
181 mulx $car0,$mul0,$car0
182 mulx $apj,$mul0,$tmp0 !prologue!
183 add $tmp1,$car0,$car0
184 ld [$ap+8],$apj !prologue!
185 and $car0,$mask,$acc0
188 and $mul1,$mask,$mul1
190 mulx $car1,$mul1,$car1
191 mulx $npj,$mul1,$acc1 !prologue!
193 add $acc0,$car1,$car1
194 ld [$np+8],$npj !prologue!
196 mov $tmp0,$acc0 !prologue!
199 mulx $apj,$mul0,$tmp0
200 mulx $npj,$mul1,$tmp1
202 ld [$ap+$j],$apj ! ap[j]
203 add $acc0,$car0,$car0
204 add $acc1,$car1,$car1
205 ld [$np+$j],$npj ! np[j]
206 and $car0,$mask,$acc0
207 ld [$tp+8],$tpj ! tp[j]
209 add $acc0,$car1,$car1
212 st $car1,[$tp] ! tp[j-1]
220 mulx $apj,$mul0,$tmp0 !epilogue!
221 mulx $npj,$mul1,$tmp1
223 add $acc0,$car0,$car0
224 ld [$tp+8],$tpj ! tp[j]
225 and $car0,$mask,$acc0
226 add $acc1,$car1,$car1
228 add $acc0,$car1,$car1
229 st $car1,[$tp] ! tp[j-1]
233 add $tmp0,$car0,$car0
234 and $car0,$mask,$acc0
235 add $tmp1,$car1,$car1
236 add $acc0,$car1,$car1
237 st $car1,[$tp+4] ! tp[j-1]
242 add $car0,$car1,$car1
244 add $car2,$car1,$car1
249 ld [$bp+$i],$mul0 ! bp[i]
258 cmp $car2,0 ! clears %icc.c
260 sub %g0,$num,%o7 ! k=-num
262 cmp $car1,$npj ! compare top-most $tp and $np words
263 bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
301 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
302 ######## code without following dedicated squaring procedure.
304 $sbit="%i2"; # re-use $bp!
309 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
310 mulx $apj,$mul0,$tmp0 !prologue!
311 and $car0,$mask,$acc0
312 add %sp,$bias+$frame,$tp
313 ld [$ap+8],$apj !prologue!
315 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
317 and $mul1,$mask,$mul1
319 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
320 mulx $npj,$mul1,$acc1 !prologue!
322 ld [$np+8],$npj !prologue!
324 add $acc0,$car1,$car1
326 mov $tmp0,$acc0 !prologue!
329 mulx $apj,$mul0,$tmp0
330 mulx $npj,$mul1,$tmp1
331 add $acc0,$car0,$car0 ! ap[j]*a0+c0
332 add $acc1,$car1,$car1
333 ld [$ap+$j],$apj ! ap[j]
334 and $car0,$mask,$acc0
335 ld [$np+$j],$npj ! np[j]
337 add $acc0,$acc0,$acc0
342 and $acc0,$mask,$acc0
344 add $acc0,$car1,$car1
352 mulx $apj,$mul0,$tmp0 ! epilogue
353 mulx $npj,$mul1,$tmp1
354 add $acc0,$car0,$car0 ! ap[j]*a0+c0
355 add $acc1,$car1,$car1
356 and $car0,$mask,$acc0
358 add $acc0,$acc0,$acc0
361 and $acc0,$mask,$acc0
362 add $acc0,$car1,$car1
366 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
367 add $tmp1,$car1,$car1
368 and $car0,$mask,$acc0
370 add $acc0,$acc0,$acc0
373 and $acc0,$mask,$acc0
374 add $acc0,$car1,$car1
378 add $car0,$car0,$car0
380 add $car0,$car1,$car1
384 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
385 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
386 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
387 ld [$ap+4],$mul0 ! ap[1]
388 ld [$ap+8],$apj ! ap[2]
389 ld [$np],$car1 ! np[0]
390 ld [$np+4],$npj ! np[1]
393 mulx $mul0,$mul0,$car0
394 and $mul1,$mask,$mul1
396 mulx $car1,$mul1,$car1
397 mulx $npj,$mul1,$acc1
398 add $tmp0,$car1,$car1
399 and $car0,$mask,$acc0
400 ld [$np+8],$npj ! np[2]
402 add $tmp1,$car1,$car1
404 add $acc0,$car1,$car1
406 add $acc1,$car1,$car1
409 st $car1,[%sp+$bias+$frame] ! tp[0]=
411 add %sp,$bias+$frame+4,$tp
414 mulx $apj,$mul0,$acc0
415 mulx $npj,$mul1,$acc1
416 add $acc0,$car0,$car0
418 ld [$ap+$j],$apj ! ap[j]
419 and $car0,$mask,$acc0
420 ld [$np+$j],$npj ! np[j]
422 add $acc1,$car1,$car1
423 ld [$tp+8],$tpj ! tp[j]
424 add $acc0,$acc0,$acc0
428 and $acc0,$mask,$acc0
430 add $acc0,$car1,$car1
431 st $car1,[$tp] ! tp[j-1]
437 mulx $apj,$mul0,$acc0
438 mulx $npj,$mul1,$acc1
439 add $acc0,$car0,$car0
441 and $car0,$mask,$acc0
443 add $acc1,$car1,$car1
444 add $acc0,$acc0,$acc0
447 and $acc0,$mask,$acc0
448 add $acc0,$car1,$car1
449 st $car1,[$tp] ! tp[j-1]
452 add $car0,$car0,$car0
454 add $car0,$car1,$car1
455 add $car2,$car1,$car1
459 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
460 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
461 ld [$ap+8],$mul0 ! ap[2]
462 ld [$np],$car1 ! np[0]
463 ld [$np+4],$npj ! np[1]
465 and $mul1,$mask,$mul1
468 mulx $mul0,$mul0,$car0
469 mulx $car1,$mul1,$car1
470 and $car0,$mask,$acc0
471 add $tmp1,$car1,$car1
473 add %sp,$bias+$frame,$tp
481 mulx $npj,$mul1,$acc1
486 add $acc1,$car1,$car1
495 ld [$ap+$j],$apj ! ap[j]
496 mulx $npj,$mul1,$acc1
498 ld [$np+$j],$npj ! np[j]
499 add $acc0,$car1,$car1
500 ld [$tp+8],$tpj ! tp[j]
501 add $acc1,$car1,$car1
507 be,pn %icc,.Lsqr_no_inner2
511 mulx $apj,$mul0,$acc0
512 mulx $npj,$mul1,$acc1
514 add $acc0,$car0,$car0
515 ld [$ap+$j],$apj ! ap[j]
516 and $car0,$mask,$acc0
517 ld [$np+$j],$npj ! np[j]
519 add $acc0,$acc0,$acc0
520 ld [$tp+8],$tpj ! tp[j]
524 and $acc0,$mask,$acc0
526 add $acc0,$car1,$car1
527 add $acc1,$car1,$car1
528 st $car1,[$tp] ! tp[j-1]
534 mulx $apj,$mul0,$acc0
535 mulx $npj,$mul1,$acc1
537 add $acc0,$car0,$car0
538 and $car0,$mask,$acc0
540 add $acc0,$acc0,$acc0
543 and $acc0,$mask,$acc0
544 add $acc0,$car1,$car1
545 add $acc1,$car1,$car1
546 st $car1,[$tp] ! tp[j-1]
549 add $car0,$car0,$car0
551 add $car0,$car1,$car1
552 add $car2,$car1,$car1
557 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
558 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
559 ld [$ap+$i],$mul0 ! ap[j]
560 ld [$np],$car1 ! np[0]
561 ld [$np+4],$npj ! np[1]
563 and $mul1,$mask,$mul1
566 mulx $mul0,$mul0,$car0
567 mulx $car1,$mul1,$car1
568 and $car0,$mask,$acc0
569 add $tmp1,$car1,$car1
571 add %sp,$bias+$frame,$tp
576 cmp $tmp0,$num ! i<num-1
581 mulx $npj,$mul1,$acc1
586 add $acc1,$car1,$car1
594 mulx $npj,$mul1,$acc1
596 add $acc0,$car1,$car1
597 add $acc1,$car1,$car1
601 add $car0,$car0,$car0 ! recover $car0
603 add $car0,$car1,$car1
604 add $car2,$car1,$car1
610 .type $fname,#function
611 .size $fname,(.-$fname)
# Post-process the generated assembly: every `...`-quoted span in $code is
# a Perl expression (e.g. the $bits-dependent %icc/%xcc condition-code
# selection) which is evaluated here (/e) and spliced into the output in
# place; /g applies it to every occurrence throughout $code.
613 $code =~ s/\`([^\`]*)\`/eval($1)/gem;