3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
11 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
12 # for undertaking this effort are multiple. First of all, UltraSPARC is not
13 # the whole SPARCv9 universe and other VIS-free implementations deserve
14 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
15 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
16 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
17 # several integrated RSA/DSA accelerator circuits accessible through
18 # kernel driver [only(*)], but having decent user-land software
19 # implementation is important too. Finally, there was a desire to
20 # experiment with a dedicated squaring procedure. Yes, this module
21 # implements one, because it was easiest to draft it in SPARCv9
24 # (*) Engine accessing the driver in question is on my TODO list.
25 # For reference, accelerator is estimated to give 6 to 10 times
26 # improvement on single-threaded RSA sign. It should be noted
27 # that 6-10x improvement coefficient does not actually mean
28 # something extraordinary in terms of absolute [single-threaded]
29 # performance, as SPARCv9 instruction set is by all means least
30 # suitable for high performance crypto among other 64 bit
31 # platforms. 6-10x factor simply places T1 in same performance
32 # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
33 # appear impressive at all, but it's the sign operation which is
34 # far more critical/interesting.
36 # You might notice that inner loops are modulo-scheduled:-) This has
37 # essentially negligible impact on UltraSPARC performance, it's
38 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
39 # the advantage... Currently this module surpasses sparcv9a-mont.pl
40 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
41 # module still has hidden potential [see TODO list there], which is
42 # estimated to be larger than 20%...
# Register names for the routine's six arguments. Each variable holds a
# SPARC register name that is interpolated into the assembly template
# (e.g. "sll $num,2,$num"); the trailing comments spell out the matching
# C parameter list, in order.
# NOTE(review): the entry check in the template reads %o5 rather than $num,
# presumably because it executes before a register window is saved --
# confirm against the complete file.
45 $rp="%i0"; # BN_ULONG *rp,
46 $ap="%i1"; # const BN_ULONG *ap,
47 $bp="%i2"; # const BN_ULONG *bp,
48 $np="%i3"; # const BN_ULONG *np,
49 $n0="%i4"; # const BN_ULONG *n0,
50 $num="%i5"; # int num);
# Select the target ABI from the command line: any argument that contains
# "-m64" or "-xarch=v9" switches the generator to 64 bits (both patterns
# are unanchored, so they match anywhere inside an argument).
# NOTE(review): no default for $bits is set in the lines visible here, yet
# the template later tests $bits==32 -- confirm that $bits is initialised
# (presumably to 32) before this loop runs.
53 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias is added to %sp wherever the template addresses the stack, and
# $frame is the byte offset above %sp+$bias at which the temporary vector
# tp starts (the template forms its address as %sp+$bias+$frame).
54 if ($bits==64) { $bias=2047; $frame=192; }
55 else { $bias=0; $frame=128; }
# Register that holds the constant 0xffffffff (the template builds it with
# a sethi/or pair) and is and-ed with 64-bit sums and products to isolate
# their low 32-bit word.
62 $mask="%g1"; # 32 bits, what a waste...
78 .section ".text",#alloc,#execinstr
83 cmp %o5,4 ! 128 bits minimum
85 sethi %hi(0xffffffff),$mask
91 sll $num,2,$num ! num*=4
92 or $mask,%lo(0xffffffff),$mask
96 ld [$bp],$mul0 ! bp[0]
97 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0]
103 ld [$ap+4],$apj ! ap[1]
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
110 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
111 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
112 and $car0,$mask,$acc0
113 add %sp,$bias+$frame,$tp
114 ld [$ap+8],$apj !prologue!
116 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
117 and $mul1,$mask,$mul1
119 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
120 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
122 add $acc0,$car1,$car1
123 ld [$np+8],$npj !prologue!
125 mov $tmp0,$acc0 !prologue!
128 mulx $apj,$mul0,$tmp0
129 mulx $npj,$mul1,$tmp1
130 add $acc0,$car0,$car0
131 ld [$ap+$j],$apj ! ap[j]
132 and $car0,$mask,$acc0
133 add $acc1,$car1,$car1
134 ld [$np+$j],$npj ! np[j]
136 add $acc0,$car1,$car1
147 mulx $apj,$mul0,$tmp0 !epilogue!
148 mulx $npj,$mul1,$tmp1
149 add $acc0,$car0,$car0
150 and $car0,$mask,$acc0
151 add $acc1,$car1,$car1
153 add $acc0,$car1,$car1
157 add $tmp0,$car0,$car0
158 and $car0,$mask,$acc0
159 add $tmp1,$car1,$car1
161 add $acc0,$car1,$car1
165 add $car0,$car1,$car1
170 ld [$bp+4],$mul0 ! bp[1]
172 add %sp,$bias+$frame,$tp
173 ld [$ap],$car0 ! ap[0]
174 ld [$ap+4],$apj ! ap[1]
175 ld [$np],$car1 ! np[0]
176 ld [$np+4],$npj ! np[1]
177 ld [$tp],$tmp1 ! tp[0]
178 ld [$tp+4],$tpj ! tp[1]
181 mulx $car0,$mul0,$car0
182 mulx $apj,$mul0,$tmp0 !prologue!
183 add $tmp1,$car0,$car0
184 ld [$ap+8],$apj !prologue!
185 and $car0,$mask,$acc0
188 and $mul1,$mask,$mul1
190 mulx $car1,$mul1,$car1
191 mulx $npj,$mul1,$acc1 !prologue!
193 add $acc0,$car1,$car1
194 ld [$np+8],$npj !prologue!
196 mov $tmp0,$acc0 !prologue!
199 mulx $apj,$mul0,$tmp0
200 mulx $npj,$mul1,$tmp1
202 ld [$ap+$j],$apj ! ap[j]
203 add $acc0,$car0,$car0
204 add $acc1,$car1,$car1
205 ld [$np+$j],$npj ! np[j]
206 and $car0,$mask,$acc0
207 ld [$tp+8],$tpj ! tp[j]
209 add $acc0,$car1,$car1
212 st $car1,[$tp] ! tp[j-1]
220 mulx $apj,$mul0,$tmp0 !epilogue!
221 mulx $npj,$mul1,$tmp1
223 add $acc0,$car0,$car0
224 ld [$tp+8],$tpj ! tp[j]
225 and $car0,$mask,$acc0
226 add $acc1,$car1,$car1
228 add $acc0,$car1,$car1
229 st $car1,[$tp] ! tp[j-1]
233 add $tmp0,$car0,$car0
234 and $car0,$mask,$acc0
235 add $tmp1,$car1,$car1
236 add $acc0,$car1,$car1
237 st $car1,[$tp+4] ! tp[j-1]
242 add $car0,$car1,$car1
244 add $car2,$car1,$car1
249 ld [$bp+$i],$mul0 ! bp[i]
258 cmp $car2,0 ! clears %icc.c
260 sub %g0,$num,%o7 ! k=-num
262 cmp $car1,$npj ! compare top-most $tp and $np words
263 bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
301 ######## bn_sqr_mont gives up to 20% improvement over above code
# $sbit is given the same register name, %i2, that $bp is assigned, so the
# two share one register.
# NOTE(review): the uses of $sbit are not visible in this excerpt;
# presumably the squaring code no longer needs bp once it is entered --
# confirm before touching either variable.
303 $sbit="%i2"; # re-use $bp!
308 add %sp,$bias,%o7 ! real top of stack
309 ld [$ap+4],$apj ! ap[1]
311 ld [$np],$car1 ! np[0]
313 ld [$np+4],$npj ! np[1]
314 sub %o7,$bias,%sp ! alloca
317 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
318 mulx $apj,$mul0,$tmp0 !prologue!
319 and $car0,$mask,$acc0
320 add %sp,$bias+$frame,$tp
321 ld [$ap+8],$apj !prologue!
323 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
325 and $mul1,$mask,$mul1
327 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
328 mulx $npj,$mul1,$acc1 !prologue!
330 ld [$np+8],$npj !prologue!
332 add $acc0,$car1,$car1
334 mov $tmp0,$acc0 !prologue!
337 mulx $apj,$mul0,$tmp0
338 mulx $npj,$mul1,$tmp1
339 add $acc0,$car0,$car0 ! ap[j]*a0+c0
340 add $acc1,$car1,$car1
341 ld [$ap+$j],$apj ! ap[j]
342 and $car0,$mask,$acc0
343 ld [$np+$j],$npj ! np[j]
345 add $acc0,$acc0,$acc0
350 and $acc0,$mask,$acc0
352 add $acc0,$car1,$car1
360 mulx $apj,$mul0,$tmp0 ! epilogue
361 mulx $npj,$mul1,$tmp1
362 add $acc0,$car0,$car0 ! ap[j]*a0+c0
363 add $acc1,$car1,$car1
364 and $car0,$mask,$acc0
366 add $acc0,$acc0,$acc0
369 and $acc0,$mask,$acc0
370 add $acc0,$car1,$car1
374 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
375 add $tmp1,$car1,$car1
376 and $car0,$mask,$acc0
378 add $acc0,$acc0,$acc0
381 and $acc0,$mask,$acc0
382 add $acc0,$car1,$car1
386 add $car0,$car0,$car0
388 add $car0,$car1,$car1
392 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
393 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
394 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
395 ld [$ap+4],$mul0 ! ap[1]
396 ld [$ap+8],$apj ! ap[2]
397 ld [$np],$car1 ! np[0]
398 ld [$np+4],$npj ! np[1]
401 mulx $mul0,$mul0,$car0
402 and $mul1,$mask,$mul1
404 mulx $car1,$mul1,$car1
405 mulx $npj,$mul1,$acc1
406 add $tmp0,$car1,$car1
407 and $car0,$mask,$acc0
408 ld [$np+8],$npj ! np[2]
410 add $tmp1,$car1,$car1
412 add $acc0,$car1,$car1
414 add $acc1,$car1,$car1
417 st $car1,[%sp+$bias+$frame] ! tp[0]=
419 add %sp,$bias+$frame+4,$tp
422 mulx $apj,$mul0,$acc0
423 mulx $npj,$mul1,$acc1
424 add $acc0,$car0,$car0
426 ld [$ap+$j],$apj ! ap[j]
427 and $car0,$mask,$acc0
428 ld [$np+$j],$npj ! np[j]
430 add $acc1,$car1,$car1
431 ld [$tp+8],$tpj ! tp[j]
432 add $acc0,$acc0,$acc0
436 and $acc0,$mask,$acc0
438 add $acc0,$car1,$car1
439 st $car1,[$tp] ! tp[j-1]
445 mulx $apj,$mul0,$acc0
446 mulx $npj,$mul1,$acc1
447 add $acc0,$car0,$car0
449 and $car0,$mask,$acc0
451 add $acc1,$car1,$car1
452 add $acc0,$acc0,$acc0
455 and $acc0,$mask,$acc0
456 add $acc0,$car1,$car1
457 st $car1,[$tp] ! tp[j-1]
460 add $car0,$car0,$car0
462 add $car0,$car1,$car1
463 add $car2,$car1,$car1
467 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
468 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
469 ld [$ap+8],$mul0 ! ap[2]
470 ld [$np],$car1 ! np[0]
471 ld [$np+4],$npj ! np[1]
473 and $mul1,$mask,$mul1
476 mulx $mul0,$mul0,$car0
477 mulx $car1,$mul1,$car1
478 and $car0,$mask,$acc0
479 add $tmp1,$car1,$car1
481 add %sp,$bias+$frame,$tp
489 mulx $npj,$mul1,$acc1
494 add $acc1,$car1,$car1
503 ld [$ap+$j],$apj ! ap[j]
504 mulx $npj,$mul1,$acc1
506 ld [$np+$j],$npj ! np[j]
507 add $acc0,$car1,$car1
508 ld [$tp+8],$tpj ! tp[j]
509 add $acc1,$car1,$car1
515 be,pn %icc,.Lsqr_no_inner2
519 mulx $apj,$mul0,$acc0
520 mulx $npj,$mul1,$acc1
522 add $acc0,$car0,$car0
523 ld [$ap+$j],$apj ! ap[j]
524 and $car0,$mask,$acc0
525 ld [$np+$j],$npj ! np[j]
527 add $acc0,$acc0,$acc0
528 ld [$tp+8],$tpj ! tp[j]
532 and $acc0,$mask,$acc0
534 add $acc0,$car1,$car1
535 add $acc1,$car1,$car1
536 st $car1,[$tp] ! tp[j-1]
542 mulx $apj,$mul0,$acc0
543 mulx $npj,$mul1,$acc1
545 add $acc0,$car0,$car0
546 and $car0,$mask,$acc0
548 add $acc0,$acc0,$acc0
551 and $acc0,$mask,$acc0
552 add $acc0,$car1,$car1
553 add $acc1,$car1,$car1
554 st $car1,[$tp] ! tp[j-1]
557 add $car0,$car0,$car0
559 add $car0,$car1,$car1
560 add $car2,$car1,$car1
565 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
566 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
567 ld [$ap+$i],$mul0 ! ap[j]
568 ld [$np],$car1 ! np[0]
569 ld [$np+4],$npj ! np[1]
571 and $mul1,$mask,$mul1
574 mulx $mul0,$mul0,$car0
575 mulx $car1,$mul1,$car1
576 and $car0,$mask,$acc0
577 add $tmp1,$car1,$car1
579 add %sp,$bias+$frame,$tp
584 cmp $tmp0,$num ! i<num-1
589 mulx $npj,$mul1,$acc1
594 add $acc1,$car1,$car1
602 mulx $npj,$mul1,$acc1
604 add $acc0,$car1,$car1
605 add $acc1,$car1,$car1
609 add $car0,$car0,$car0 ! recover $car0
611 add $car0,$car1,$car1
612 add $car2,$car1,$car1
618 .type $fname,#function
619 .size $fname,(.-$fname)
# Post-process the text in $code: every span enclosed in back-quotes is
# replaced by the result of evaluating its contents as Perl (/e), for all
# occurrences (/g).  This is how operands such as
# `$bits==32?"%icc":"%xcc"` in the template are resolved.  A span cannot
# itself contain a back-quote.  The /m modifier has no effect on this
# pattern because it uses neither ^ nor $.
# NOTE(review): $code presumably holds the assembly template assembled
# earlier in the file; its construction and final output are not visible
# in this excerpt.
621 $code =~ s/\`([^\`]*)\`/eval($1)/gem;