3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
11 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
12 # for undertaking this effort are multiple. First of all, UltraSPARC is not
13 # the whole SPARCv9 universe and other VIS-free implementations deserve
14 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
15 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
16 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
17 # several integrated RSA/DSA accelerator circuits accessible through
18 # kernel driver [only(*)], but having decent user-land software
19 # implementation is important too. Finally, there was the desire to
20 # experiment with a dedicated squaring procedure. Yes, this module
21 # implements one, because it was easiest to draft it in SPARCv9
24 # (*) Engine accessing the driver in question is on my TODO list.
25 # For reference, accelerator is estimated to give 6 to 10 times
26 # improvement on single-threaded RSA sign. It should be noted
27 # that 6-10x improvement coefficient does not actually mean
28 # something extraordinary in terms of absolute [single-threaded]
29 # performance, as SPARCv9 instruction set is by all means least
30 # suitable for high performance crypto among other 64 bit
31 # platforms. 6-10x factor simply places T1 in same performance
32 # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
33 # appear impressive at all, but it's the sign operation which is
34 # far more critical/interesting.
36 # You might notice that inner loops are modulo-scheduled:-) This has
37 # essentially negligible impact on UltraSPARC performance, it's
38 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
39 # the advantage... Currently this module surpasses sparcv9a-mont.pl
40 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
41 # module still has hidden potential [see TODO list there], which is
42 # estimated to be larger than 20%...
# Registers holding the six arguments of the generated routine; each
# trailing comment names the matching C parameter.
45 $rp="%i0"; # BN_ULONG *rp,
46 $ap="%i1"; # const BN_ULONG *ap,
47 $bp="%i2"; # const BN_ULONG *bp,
48 $np="%i3"; # const BN_ULONG *np,
49 $n0="%i4"; # const BN_ULONG *n0,
50 $num="%i5"; # int num);
# Switch to the 64-bit settings when any command-line argument contains
# -m64 or -xarch=v9.
# NOTE(review): no other assignment to $bits is visible in this excerpt;
# presumably it is given a default before this loop - confirm.
53 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias and $frame are added to %sp in the generated code to form $tp:
# 2047 and 192 for 64-bit builds, 0 and 128 otherwise.
54 if ($bits==64) { $bias=2047; $frame=192; }
55 else { $bias=0; $frame=128; }
# $mask is filled with 0xffffffff by the sethi/or pair in the entry code and
# is and-ed with 64-bit intermediates to extract their low 32 bits.
62 $mask="%g1"; # 32 bits, what a waste...
# Symbol name used by the .type/.size directives of the generated routine.
75 $fname="bn_mul_mont_int";
78 .section ".text",#alloc,#execinstr
! Entry: size check, construction of the 32-bit mask, scaling of num to a
! byte count, and fetch of the first multiplier word.
! NOTE(review): the function label, whatever consumes the size check and the
! register-window setup are not among the lines visible here.
83 cmp %o5,4 ! 128 bits minimum
85 sethi %hi(0xffffffff),$mask
91 sll $num,2,$num ! num*=4
92 or $mask,%lo(0xffffffff),$mask
96 ld [$bp],$mul0 ! bp[0]
! Branch to the dedicated squaring code; the embedded Perl expression picks
! %icc for 32-bit builds and %xcc otherwise.
! NOTE(review): the compare feeding this branch is not visible here -
! presumably it tests ap against bp; confirm.
97 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
! Stack adjustment: %o7 = %sp + bias here, %sp = %o7 - bias a few lines below.
! NOTE(review): the instructions that change %o7 in between are not visible.
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0]
103 ld [$ap+4],$apj ! ap[1]
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
! First pass, multiplier word bp[0]. Two chains run side by side:
!   $car0 collects ap[j]*bp[0], $car1 collects np[j]*$mul1,
! where $mul1 is the low 32 bits of t[0]*n0. Lines tagged !prologue! and
! !epilogue! fill and drain the modulo-scheduled loop.
110 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
111 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
! $acc0 = low 32 bits of the ap chain, i.e. t[0]
112 and $car0,$mask,$acc0
! $tp = %sp + bias + frame
113 add %sp,$bias+$frame,$tp
114 ld [$ap+8],$apj !prologue!
116 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
! keep only the low 32 bits of t[0]*n0
117 and $mul1,$mask,$mul1
119 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
120 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
! add t[0] into the np chain
122 add $acc0,$car1,$car1
123 ld [$np+8],$npj !prologue!
125 mov $tmp0,$acc0 !prologue!
! Steady state: start the multiplies for the next word while the products
! already held in $acc0 and $acc1 are added into the two chains; the low
! 32 bits of $car0 are then fed into $car1.
! NOTE(review): the loop label, counter update, shifts and stores that
! belong to this loop are not visible here.
128 mulx $apj,$mul0,$tmp0
129 mulx $npj,$mul1,$tmp1
130 add $acc0,$car0,$car0
131 ld [$ap+$j],$apj ! ap[j]
132 and $car0,$mask,$acc0
133 add $acc1,$car1,$car1
134 ld [$np+$j],$npj ! np[j]
136 add $acc0,$car1,$car1
! Drain: the last multiplies are issued without further loads, then the
! remaining products in $tmp0 and $tmp1 are accumulated.
147 mulx $apj,$mul0,$tmp0 !epilogue!
148 mulx $npj,$mul1,$tmp1
149 add $acc0,$car0,$car0
150 and $car0,$mask,$acc0
151 add $acc1,$car1,$car1
153 add $acc0,$car1,$car1
157 add $tmp0,$car0,$car0
158 and $car0,$mask,$acc0
159 add $tmp1,$car1,$car1
161 add $acc0,$car1,$car1
! merge the two carry chains
165 add $car0,$car1,$car1
! Pass for bp[1] (later passes reload $mul0 from bp[i] at the end of this
! block). Same two chains as in the first pass, but words previously written
! to tp are loaded again, tp[0] is added to the ap[0] product, and results
! are stored back as tp[j-1].
! NOTE(review): labels, loop counters, shifts and branches of this pass are
! not among the lines visible here.
170 ld [$bp+4],$mul0 ! bp[1]
! $tp = %sp + bias + frame, i.e. back to the first tp word
172 add %sp,$bias+$frame,$tp
173 ld [$ap],$car0 ! ap[0]
174 ld [$ap+4],$apj ! ap[1]
175 ld [$np],$car1 ! np[0]
176 ld [$np+4],$npj ! np[1]
177 ld [$tp],$tmp1 ! tp[0]
178 ld [$tp+4],$tpj ! tp[1]
181 mulx $car0,$mul0,$car0
182 mulx $apj,$mul0,$tmp0 !prologue!
! add tp[0] to the first product
183 add $tmp1,$car0,$car0
184 ld [$ap+8],$apj !prologue!
185 and $car0,$mask,$acc0
! keep only the low 32 bits of $mul1
! NOTE(review): the multiply that produces $mul1 for this pass is not visible.
188 and $mul1,$mask,$mul1
190 mulx $car1,$mul1,$car1
191 mulx $npj,$mul1,$acc1 !prologue!
193 add $acc0,$car1,$car1
194 ld [$np+8],$npj !prologue!
196 mov $tmp0,$acc0 !prologue!
! Steady state: multiply the next words, accumulate the previous products,
! load the next tp word and store the finished one.
199 mulx $apj,$mul0,$tmp0
200 mulx $npj,$mul1,$tmp1
202 ld [$ap+$j],$apj ! ap[j]
203 add $acc0,$car0,$car0
204 add $acc1,$car1,$car1
205 ld [$np+$j],$npj ! np[j]
206 and $car0,$mask,$acc0
207 ld [$tp+8],$tpj ! tp[j]
209 add $acc0,$car1,$car1
212 st $car1,[$tp] ! tp[j-1]
! Drain: last multiplies without further ap/np loads.
220 mulx $apj,$mul0,$tmp0 !epilogue!
221 mulx $npj,$mul1,$tmp1
223 add $acc0,$car0,$car0
224 ld [$tp+8],$tpj ! tp[j]
225 and $car0,$mask,$acc0
226 add $acc1,$car1,$car1
228 add $acc0,$car1,$car1
229 st $car1,[$tp] ! tp[j-1]
233 add $tmp0,$car0,$car0
234 and $car0,$mask,$acc0
235 add $tmp1,$car1,$car1
236 add $acc0,$car1,$car1
237 st $car1,[$tp+4] ! tp[j-1]
! Merge the two chains and add $car2.
! NOTE(review): $car2 is presumably the top carry left by the previous pass;
! the instruction that writes it is not visible here.
242 add $car0,$car1,$car1
244 add $car2,$car1,$car1
! fetch the multiplier word for the next pass
249 ld [$bp+$i],$mul0 ! bp[i]
! Final decision: %o7 is set to -num (the k of the trailing comment), then the
! top-most tp and np words are compared and .Lcopy is taken when the tp word
! is the lower one (bcs tests the carry produced by the unsigned compare).
! NOTE(review): presumably reached after the last pass; the code executed
! when the branch is not taken, and .Lcopy itself, are not visible here.
258 cmp $car2,0 ! clears %icc.c
260 sub %g0,$num,%o7 ! k=-num
262 cmp $car1,$npj ! compare top-most $tp and $np words
263 bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
301 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
302 ######## code without following dedicated squaring procedure.
# $sbit names the same register as $bp ("%i2").
# NOTE(review): presumably safe because the squaring path no longer reads
# bp; the uses of $sbit are not visible in this excerpt - confirm.
304 $sbit="%i2"; # re-use $bp!
309 add %sp,$bias,%o7 ! real top of stack
! Squaring path, first pass. The stack is adjusted as in the multiplication
! path (%o7 = %sp + bias above, %sp = %o7 - bias below).
! $car0 collects ap[j]*ap[0], starting from the square ap[0]*ap[0];
! $car1 collects np[j]*$mul1 with $mul1 = low 32 bits of t[0]*n0.
! From the steady state on, each masked word of the ap chain is doubled
! (add $acc0,$acc0,$acc0) and masked again before it is added to $car1.
! NOTE(review): labels, shifts, stores and the handling of the bit that the
! doubling pushes out are not among the lines visible here.
310 ld [$ap+4],$apj ! ap[1]
312 ld [$np],$car1 ! np[0]
314 ld [$np+4],$npj ! np[1]
315 sub %o7,$bias,%sp ! alloca
318 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
319 mulx $apj,$mul0,$tmp0 !prologue!
! $acc0 = low 32 bits of the square, i.e. t[0]
320 and $car0,$mask,$acc0
! $tp = %sp + bias + frame
321 add %sp,$bias+$frame,$tp
322 ld [$ap+8],$apj !prologue!
324 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
! keep only the low 32 bits of t[0]*n0
326 and $mul1,$mask,$mul1
328 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
329 mulx $npj,$mul1,$acc1 !prologue!
331 ld [$np+8],$npj !prologue!
! the square term is added undoubled
333 add $acc0,$car1,$car1
335 mov $tmp0,$acc0 !prologue!
! Steady state.
338 mulx $apj,$mul0,$tmp0
339 mulx $npj,$mul1,$tmp1
340 add $acc0,$car0,$car0 ! ap[j]*a0+c0
341 add $acc1,$car1,$car1
342 ld [$ap+$j],$apj ! ap[j]
343 and $car0,$mask,$acc0
344 ld [$np+$j],$npj ! np[j]
! double the masked word, then mask again
346 add $acc0,$acc0,$acc0
351 and $acc0,$mask,$acc0
353 add $acc0,$car1,$car1
! Drain: last multiplies without further loads.
361 mulx $apj,$mul0,$tmp0 ! epilogue
362 mulx $npj,$mul1,$tmp1
363 add $acc0,$car0,$car0 ! ap[j]*a0+c0
364 add $acc1,$car1,$car1
365 and $car0,$mask,$acc0
367 add $acc0,$acc0,$acc0
370 and $acc0,$mask,$acc0
371 add $acc0,$car1,$car1
375 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
376 add $tmp1,$car1,$car1
377 and $car0,$mask,$acc0
379 add $acc0,$acc0,$acc0
382 and $acc0,$mask,$acc0
383 add $acc0,$car1,$car1
! double what is left in $car0 and merge it into $car1
387 add $car0,$car0,$car0
389 add $car0,$car1,$car1
! Squaring path, pass for ap[1]. tp[0] and tp[1] are added to the np chain,
! the square ap[1]*ap[1] starts the ap chain, and the following words ap[j]
! (from ap[2] on) are multiplied by ap[1], doubled, masked and accumulated.
! NOTE(review): labels, shifts, counters and branches are not visible here.
393 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
394 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
395 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
396 ld [$ap+4],$mul0 ! ap[1]
397 ld [$ap+8],$apj ! ap[2]
398 ld [$np],$car1 ! np[0]
399 ld [$np+4],$npj ! np[1]
! ap[1]*ap[1]
402 mulx $mul0,$mul0,$car0
! keep only the low 32 bits of $mul1
! NOTE(review): the multiply that produces $mul1 for this pass is not visible.
403 and $mul1,$mask,$mul1
405 mulx $car1,$mul1,$car1
406 mulx $npj,$mul1,$acc1
407 add $tmp0,$car1,$car1
408 and $car0,$mask,$acc0
409 ld [$np+8],$npj ! np[2]
411 add $tmp1,$car1,$car1
413 add $acc0,$car1,$car1
415 add $acc1,$car1,$car1
418 st $car1,[%sp+$bias+$frame] ! tp[0]=
! $tp = address of tp[1]
420 add %sp,$bias+$frame+4,$tp
! Steady state: cross products ap[j]*ap[1], doubled before use.
423 mulx $apj,$mul0,$acc0
424 mulx $npj,$mul1,$acc1
425 add $acc0,$car0,$car0
427 ld [$ap+$j],$apj ! ap[j]
428 and $car0,$mask,$acc0
429 ld [$np+$j],$npj ! np[j]
431 add $acc1,$car1,$car1
432 ld [$tp+8],$tpj ! tp[j]
433 add $acc0,$acc0,$acc0
437 and $acc0,$mask,$acc0
439 add $acc0,$car1,$car1
440 st $car1,[$tp] ! tp[j-1]
! Last step of the pass, without further loads.
446 mulx $apj,$mul0,$acc0
447 mulx $npj,$mul1,$acc1
448 add $acc0,$car0,$car0
450 and $car0,$mask,$acc0
452 add $acc1,$car1,$car1
453 add $acc0,$acc0,$acc0
456 and $acc0,$mask,$acc0
457 add $acc0,$car1,$car1
458 st $car1,[$tp] ! tp[j-1]
! Double what is left in $car0, merge it into $car1 and add $car2.
! NOTE(review): where $car2 is written is not visible here.
461 add $car0,$car0,$car0
463 add $car0,$car1,$car1
464 add $car2,$car1,$car1
! Squaring path, pass for ap[2]. tp[0] is added to the np chain and the
! square ap[2]*ap[2] starts the ap chain. The first steps visible below only
! advance the np chain (mulx with $npj and $mul1); after the branch to
! .Lsqr_no_inner2 the doubled cross products ap[j]*ap[2] are accumulated too.
! NOTE(review): labels, shifts, counters, the compare feeding the branch and
! the branch target are not among the lines visible here.
468 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
469 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
470 ld [$ap+8],$mul0 ! ap[2]
471 ld [$np],$car1 ! np[0]
472 ld [$np+4],$npj ! np[1]
! keep only the low 32 bits of $mul1
474 and $mul1,$mask,$mul1
! ap[2]*ap[2]
477 mulx $mul0,$mul0,$car0
478 mulx $car1,$mul1,$car1
479 and $car0,$mask,$acc0
480 add $tmp1,$car1,$car1
! $tp = %sp + bias + frame
482 add %sp,$bias+$frame,$tp
490 mulx $npj,$mul1,$acc1
495 add $acc1,$car1,$car1
504 ld [$ap+$j],$apj ! ap[j]
505 mulx $npj,$mul1,$acc1
507 ld [$np+$j],$npj ! np[j]
508 add $acc0,$car1,$car1
509 ld [$tp+8],$tpj ! tp[j]
510 add $acc1,$car1,$car1
516 be,pn %icc,.Lsqr_no_inner2
! Steady state: cross products ap[j]*ap[2], doubled before use.
520 mulx $apj,$mul0,$acc0
521 mulx $npj,$mul1,$acc1
523 add $acc0,$car0,$car0
524 ld [$ap+$j],$apj ! ap[j]
525 and $car0,$mask,$acc0
526 ld [$np+$j],$npj ! np[j]
528 add $acc0,$acc0,$acc0
529 ld [$tp+8],$tpj ! tp[j]
533 and $acc0,$mask,$acc0
535 add $acc0,$car1,$car1
536 add $acc1,$car1,$car1
537 st $car1,[$tp] ! tp[j-1]
! Last step of the pass, without further loads.
543 mulx $apj,$mul0,$acc0
544 mulx $npj,$mul1,$acc1
546 add $acc0,$car0,$car0
547 and $car0,$mask,$acc0
549 add $acc0,$acc0,$acc0
552 and $acc0,$mask,$acc0
553 add $acc0,$car1,$car1
554 add $acc1,$car1,$car1
555 st $car1,[$tp] ! tp[j-1]
! Double what is left in $car0, merge it into $car1 and add $car2.
558 add $car0,$car0,$car0
560 add $car0,$car1,$car1
561 add $car2,$car1,$car1
! Squaring path, pass for the word at offset $i of ap. tp[0] is added to the
! np chain and the square of the loaded word starts the ap chain; the steps
! visible below advance the np chain with mulx on $npj and $mul1.
! NOTE(review): labels, shifts, counters, the branch that consumes the
! compare against $num and all stores are not among the lines visible here.
566 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
567 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
568 ld [$ap+$i],$mul0 ! ap[i]
569 ld [$np],$car1 ! np[0]
570 ld [$np+4],$npj ! np[1]
! keep only the low 32 bits of $mul1
572 and $mul1,$mask,$mul1
! square of the word just loaded
575 mulx $mul0,$mul0,$car0
576 mulx $car1,$mul1,$car1
577 and $car0,$mask,$acc0
578 add $tmp1,$car1,$car1
! $tp = %sp + bias + frame
580 add %sp,$bias+$frame,$tp
585 cmp $tmp0,$num ! i<num-1
590 mulx $npj,$mul1,$acc1
595 add $acc1,$car1,$car1
603 mulx $npj,$mul1,$acc1
605 add $acc0,$car1,$car1
606 add $acc1,$car1,$car1
! Double $car0, merge it into $car1 and add $car2.
610 add $car0,$car0,$car0 ! recover $car0
612 add $car0,$car1,$car1
613 add $car2,$car1,$car1
! Symbol bookkeeping: mark the routine as a function and set its size to the
! distance from its start symbol to this point.
619 .type $fname,#function
620 .size $fname,(.-$fname)
622 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each back-quoted expression in $code with the result of eval-ing it