3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13 # for undertaken effort are multiple. First of all, UltraSPARC is not
14 # the whole SPARCv9 universe and other VIS-free implementations deserve
15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
16 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18 # several integrated RSA/DSA accelerator circuits accessible through
19 # kernel driver [only(*)], but having decent user-land software
20 # implementation is important too. Finally, reasons like desire to
21 # experiment with dedicated squaring procedure. Yes, this module
22 # implements one, because it was easiest to draft it in SPARCv9
25 # (*) Engine accessing the driver in question is on my TODO list.
26 # For reference, accelerator is estimated to give 6 to 10 times
27 # improvement on single-threaded RSA sign. It should be noted
28 # that 6-10x improvement coefficient does not actually mean
29 # something extraordinary in terms of absolute [single-threaded]
30 # performance, as SPARCv9 instruction set is by all means least
31 # suitable for high performance crypto among other 64 bit
32 # platforms. 6-10x factor simply places T1 in same performance
33 # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34 # appear impressive at all, but it's the sign operation which is
35 # far more critical/interesting.
37 # You might notice that inner loops are modulo-scheduled:-) This has
38 # essentially negligible impact on UltraSPARC performance, it's
39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42 # module still has hidden potential [see TODO list there], which is
43 # estimated to be larger than 20%...
# Map the six C arguments of bn_mul_mont onto the SPARC input registers
# %i0-%i5; the trailing comments spell out the C prototype argument each
# register carries.
46 $rp="%i0"; # BN_ULONG *rp,
47 $ap="%i1"; # const BN_ULONG *ap,
48 $bp="%i2"; # const BN_ULONG *bp,
49 $np="%i3"; # const BN_ULONG *np,
50 $n0="%i4"; # const BN_ULONG *n0,
51 $num="%i5"; # int num);
# Select ABI-dependent stack constants from the compiler flags passed on
# the command line.  Default to the 32-bit (v8+) ABI; -m64 or -xarch=v9
# selects the 64-bit v9 ABI, which uses the 2047-byte stack bias and a
# larger minimal register-save frame.
#
# NOTE: $bits is initialized explicitly so the comparison below never
# tests an undefined value (safe even if an earlier, unseen line also
# sets it - re-assigning the default before the flag scan is idempotent).
$bits=32;                                       # default: 32-bit SPARC ABI
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }      # v9: biased stack pointer
else           { $bias=0;    $frame=128; }      # v8+: no bias, smaller frame
# $mask holds the 0xffffffff word mask used to split 64-bit mulx results
# into 32-bit limbs; $fname is the emitted symbol name of the routine.
# (Other scratch-register assignments from this group are not visible in
# this excerpt.)
63 $mask="%g1"; # 32 bits, what a waste...
76 $fname="bn_mul_mont_int";
! Entry/setup for bn_mul_mont_int: num is checked against the 4-word
! (128-bit) minimum, scaled to a byte count, the 32-bit limb mask is
! built, and bp[0], ap[0..1], np[0..1] are preloaded.  %sp is rebuilt
! through %o7 so the stack bias ($bias, 2047 on v9) is applied
! consistently; the alloca-style frame-size computation itself is not
! visible in this excerpt.
79 .section ".text",#alloc,#execinstr
84 cmp %o5,4 ! 128 bits minimum
86 sethi %hi(0xffffffff),$mask
92 sll $num,2,$num ! num*=4
93 or $mask,%lo(0xffffffff),$mask
97 ld [$bp],$mul0 ! bp[0]
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
103 ld [$ap+4],$apj ! ap[1]
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
! Dispatch to the dedicated squaring path; the condition codes tested
! here are set before this excerpt (presumably an ap==bp comparison -
! TODO confirm against the full source).
108 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
! First outer-loop pass (i=0) of the multiplication path: accumulate
! ap[j]*bp[0] ($car0 chain) and m*np[j] ($car1 chain) in parallel,
! where m = $mul1 = low32(ap[0]*bp[0]) * n0 mod 2^32, the standard
! Montgomery reduction multiplier.  The inner loop is modulo-scheduled:
! the !prologue!/!epilogue! markers bracket the software-pipelined body.
! NOTE(review): loop-control branches, carry shifts and the tp[] stores
! between the marked sections are not visible in this excerpt.
111 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113 and $car0,$mask,$acc0
114 add %sp,$bias+$frame,$tp
115 ld [$ap+8],$apj !prologue!
117 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118 and $mul1,$mask,$mul1
120 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
123 add $acc0,$car1,$car1
124 ld [$np+8],$npj !prologue!
126 mov $tmp0,$acc0 !prologue!
! Pipelined loop body: one ap[j]/np[j] limb pair per iteration.
129 mulx $apj,$mul0,$tmp0
130 mulx $npj,$mul1,$tmp1
131 add $acc0,$car0,$car0
132 ld [$ap+$j],$apj ! ap[j]
133 and $car0,$mask,$acc0
134 add $acc1,$car1,$car1
135 ld [$np+$j],$npj ! np[j]
137 add $acc0,$car1,$car1
! Epilogue: drain the last two pipelined limbs, then fold the final
! carries together.
148 mulx $apj,$mul0,$tmp0 !epilogue!
149 mulx $npj,$mul1,$tmp1
150 add $acc0,$car0,$car0
151 and $car0,$mask,$acc0
152 add $acc1,$car1,$car1
154 add $acc0,$car1,$car1
158 add $tmp0,$car0,$car0
159 and $car0,$mask,$acc0
160 add $tmp1,$car1,$car1
162 add $acc0,$car1,$car1
166 add $car0,$car1,$car1
! Subsequent outer-loop iterations (i>=1): same structure as the first
! pass, but the previously accumulated tp[] words are loaded and folded
! into the running sums, and each reduced word is stored back over tp[].
! Again modulo-scheduled (!prologue!/!epilogue!); loop branches and the
! per-iteration carry handling fall in lines missing from this excerpt.
171 ld [$bp+4],$mul0 ! bp[1]
173 add %sp,$bias+$frame,$tp
174 ld [$ap],$car0 ! ap[0]
175 ld [$ap+4],$apj ! ap[1]
176 ld [$np],$car1 ! np[0]
177 ld [$np+4],$npj ! np[1]
178 ld [$tp],$tmp1 ! tp[0]
179 ld [$tp+4],$tpj ! tp[1]
! Recompute the reduction multiplier $mul1 for this bp[i] pass.
182 mulx $car0,$mul0,$car0
183 mulx $apj,$mul0,$tmp0 !prologue!
184 add $tmp1,$car0,$car0
185 ld [$ap+8],$apj !prologue!
186 and $car0,$mask,$acc0
189 and $mul1,$mask,$mul1
191 mulx $car1,$mul1,$car1
192 mulx $npj,$mul1,$acc1 !prologue!
194 add $acc0,$car1,$car1
195 ld [$np+8],$npj !prologue!
197 mov $tmp0,$acc0 !prologue!
! Pipelined body: ap[j]*bp[i] + m*np[j] + tp[j], store over tp[j-1].
200 mulx $apj,$mul0,$tmp0
201 mulx $npj,$mul1,$tmp1
203 ld [$ap+$j],$apj ! ap[j]
204 add $acc0,$car0,$car0
205 add $acc1,$car1,$car1
206 ld [$np+$j],$npj ! np[j]
207 and $car0,$mask,$acc0
208 ld [$tp+8],$tpj ! tp[j]
210 add $acc0,$car1,$car1
213 st $car1,[$tp] ! tp[j-1]
! Epilogue: drain the last two limbs and write the top words.
221 mulx $apj,$mul0,$tmp0 !epilogue!
222 mulx $npj,$mul1,$tmp1
224 add $acc0,$car0,$car0
225 ld [$tp+8],$tpj ! tp[j]
226 and $car0,$mask,$acc0
227 add $acc1,$car1,$car1
229 add $acc0,$car1,$car1
230 st $car1,[$tp] ! tp[j-1]
234 add $tmp0,$car0,$car0
235 and $car0,$mask,$acc0
236 add $tmp1,$car1,$car1
237 add $acc0,$car1,$car1
238 st $car1,[$tp+4] ! tp[j-1]
! $car2 accumulates the top overflow word across outer iterations;
! finally the next bp[i] is fetched for the following pass.
243 add $car0,$car1,$car1
245 add $car2,$car1,$car1
250 ld [$bp+$i],$mul0 ! bp[i]
! Final Montgomery step: conditionally subtract the modulus, then copy
! the result to rp[] while zeroing the temporary vector tp[].  %o7 runs
! as a negative byte index (k=-num) up toward zero.  Per the original
! comments, the srl/brz pair skips the subtraction when the top bits of
! np's most-significant word show the boundary condition is met; the
! subcc/subccc chain performs the borrow-propagating tp-np, and subc
! folds in the upmost overflow bit held in $car2.
259 sub %g0,$num,%o7 ! k=-num
261 srl $npj,30,%o0 ! boundary condition...
262 brz,pn %o0,.Lcopy ! ... is met
263 subcc %g0,%g0,%g0 ! clear %icc.c
269 subccc %o0,%o1,%o1 ! tp[j]-np[j]
274 subc $car2,0,$car2 ! handle upmost overflow bit
! .Lcopy loop body: write result out (or refresh in place) and zap tp
! so no intermediate values remain on the stack.
282 ld [$ap+%o7],%o0 ! copy or in-place refresh
283 st %g0,[$tp+%o7] ! zap tp
294 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
295 ######## code without following dedicated squaring procedure.
# In the squaring path bp==ap, so the %i2 argument register is free;
# it is recycled as $sbit, which (from its use around the doubled cross
# products below) presumably holds the bit shifted out when doubling -
# TODO confirm against the full source.
297 $sbit="%i2"; # re-use $bp!
! .Lbn_sqr_mont first pass: square ap[0] and, for j>=1, compute the
! cross products ap[j]*ap[0] once and double them (the add
! $acc0,$acc0,$acc0 steps) instead of computing them twice - the core
! saving of the dedicated squaring procedure.  The Montgomery reduction
! chain m*np[j] (m = $mul1 = low32(t[0])*n0) runs in parallel, exactly
! as in the multiplication path.  Modulo-scheduled; carry extraction
! from the doubling falls in lines missing from this excerpt.
302 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
303 mulx $apj,$mul0,$tmp0 !prologue!
304 and $car0,$mask,$acc0
305 add %sp,$bias+$frame,$tp
306 ld [$ap+8],$apj !prologue!
308 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
310 and $mul1,$mask,$mul1
312 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
313 mulx $npj,$mul1,$acc1 !prologue!
315 ld [$np+8],$npj !prologue!
317 add $acc0,$car1,$car1
319 mov $tmp0,$acc0 !prologue!
! Pipelined body: double ap[j]*ap[0], fold in m*np[j].
322 mulx $apj,$mul0,$tmp0
323 mulx $npj,$mul1,$tmp1
324 add $acc0,$car0,$car0 ! ap[j]*a0+c0
325 add $acc1,$car1,$car1
326 ld [$ap+$j],$apj ! ap[j]
327 and $car0,$mask,$acc0
328 ld [$np+$j],$npj ! np[j]
330 add $acc0,$acc0,$acc0
335 and $acc0,$mask,$acc0
337 add $acc0,$car1,$car1
! Epilogue: drain the last two pipelined limbs.
345 mulx $apj,$mul0,$tmp0 ! epilogue
346 mulx $npj,$mul1,$tmp1
347 add $acc0,$car0,$car0 ! ap[j]*a0+c0
348 add $acc1,$car1,$car1
349 and $car0,$mask,$acc0
351 add $acc0,$acc0,$acc0
354 and $acc0,$mask,$acc0
355 add $acc0,$car1,$car1
359 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
360 add $tmp1,$car1,$car1
361 and $car0,$mask,$acc0
363 add $acc0,$acc0,$acc0
366 and $acc0,$mask,$acc0
367 add $acc0,$car1,$car1
! Top word: the final carry itself is doubled before being folded in.
371 add $car0,$car0,$car0
373 add $car0,$car1,$car1
! Squaring, second outer iteration (multiplier = ap[1]): reload the
! first few tp[] words from the bottom of the scratch area, square
! ap[1], fold tp[0..1] into the reduction chain, store the new tp[0],
! then run the pipelined doubled-cross-product loop with $tp advanced
! one word.  $car2 again collects the top overflow word.
377 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
378 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
379 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
380 ld [$ap+4],$mul0 ! ap[1]
381 ld [$ap+8],$apj ! ap[2]
382 ld [$np],$car1 ! np[0]
383 ld [$np+4],$npj ! np[1]
386 mulx $mul0,$mul0,$car0
387 and $mul1,$mask,$mul1
389 mulx $car1,$mul1,$car1
390 mulx $npj,$mul1,$acc1
391 add $tmp0,$car1,$car1
392 and $car0,$mask,$acc0
393 ld [$np+8],$npj ! np[2]
395 add $tmp1,$car1,$car1
397 add $acc0,$car1,$car1
399 add $acc1,$car1,$car1
402 st $car1,[%sp+$bias+$frame] ! tp[0]=
404 add %sp,$bias+$frame+4,$tp
! Pipelined body for this iteration.
407 mulx $apj,$mul0,$acc0
408 mulx $npj,$mul1,$acc1
409 add $acc0,$car0,$car0
411 ld [$ap+$j],$apj ! ap[j]
412 and $car0,$mask,$acc0
413 ld [$np+$j],$npj ! np[j]
415 add $acc1,$car1,$car1
416 ld [$tp+8],$tpj ! tp[j]
417 add $acc0,$acc0,$acc0
421 and $acc0,$mask,$acc0
423 add $acc0,$car1,$car1
424 st $car1,[$tp] ! tp[j-1]
! Epilogue: drain, double the top carry, fold in $car2.
430 mulx $apj,$mul0,$acc0
431 mulx $npj,$mul1,$acc1
432 add $acc0,$car0,$car0
434 and $car0,$mask,$acc0
436 add $acc1,$car1,$car1
437 add $acc0,$acc0,$acc0
440 and $acc0,$mask,$acc0
441 add $acc0,$car1,$car1
442 st $car1,[$tp] ! tp[j-1]
445 add $car0,$car0,$car0
447 add $car0,$car1,$car1
448 add $car2,$car1,$car1
! Squaring, third outer iteration (multiplier = ap[2]).  For small num
! the inner loop may be empty, hence the .Lsqr_no_inner2 shortcut; the
! comparison setting %icc for that branch is not visible in this
! excerpt.  Structure otherwise mirrors the previous iteration.
452 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
453 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
454 ld [$ap+8],$mul0 ! ap[2]
455 ld [$np],$car1 ! np[0]
456 ld [$np+4],$npj ! np[1]
458 and $mul1,$mask,$mul1
461 mulx $mul0,$mul0,$car0
462 mulx $car1,$mul1,$car1
463 and $car0,$mask,$acc0
464 add $tmp1,$car1,$car1
466 add %sp,$bias+$frame,$tp
474 mulx $npj,$mul1,$acc1
479 add $acc1,$car1,$car1
488 ld [$ap+$j],$apj ! ap[j]
489 mulx $npj,$mul1,$acc1
491 ld [$np+$j],$npj ! np[j]
492 add $acc0,$car1,$car1
493 ld [$tp+8],$tpj ! tp[j]
494 add $acc1,$car1,$car1
500 be,pn %icc,.Lsqr_no_inner2
! Pipelined inner loop: doubled cross products plus reduction.
504 mulx $apj,$mul0,$acc0
505 mulx $npj,$mul1,$acc1
507 add $acc0,$car0,$car0
508 ld [$ap+$j],$apj ! ap[j]
509 and $car0,$mask,$acc0
510 ld [$np+$j],$npj ! np[j]
512 add $acc0,$acc0,$acc0
513 ld [$tp+8],$tpj ! tp[j]
517 and $acc0,$mask,$acc0
519 add $acc0,$car1,$car1
520 add $acc1,$car1,$car1
521 st $car1,[$tp] ! tp[j-1]
! Epilogue: drain the final limb, double the top carry, add $car2.
527 mulx $apj,$mul0,$acc0
528 mulx $npj,$mul1,$acc1
530 add $acc0,$car0,$car0
531 and $car0,$mask,$acc0
533 add $acc0,$acc0,$acc0
536 and $acc0,$mask,$acc0
537 add $acc0,$car1,$car1
538 add $acc1,$car1,$car1
539 st $car1,[$tp] ! tp[j-1]
542 add $car0,$car0,$car0
544 add $car0,$car1,$car1
545 add $car2,$car1,$car1
! Generic squaring outer loop: multiplier = ap[i] for the remaining
! indices, with `cmp $tmp0,$num` as the i<num-1 loop test.  Per its
! comment, the final `add $car0,$car0` recovers (doubles) the saved
! top-word carry before the closing accumulation with $car2.  The loop
! branches and several body instructions fall in lines missing from
! this excerpt.
550 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
551 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
552 ld [$ap+$i],$mul0 ! ap[j]
553 ld [$np],$car1 ! np[0]
554 ld [$np+4],$npj ! np[1]
556 and $mul1,$mask,$mul1
559 mulx $mul0,$mul0,$car0
560 mulx $car1,$mul1,$car1
561 and $car0,$mask,$acc0
562 add $tmp1,$car1,$car1
564 add %sp,$bias+$frame,$tp
569 cmp $tmp0,$num ! i<num-1
574 mulx $npj,$mul1,$acc1
579 add $acc1,$car1,$car1
587 mulx $npj,$mul1,$acc1
589 add $acc0,$car1,$car1
590 add $acc1,$car1,$car1
594 add $car0,$car0,$car0 ! recover $car0
596 add $car0,$car1,$car1
597 add $car2,$car1,$car1
! ELF symbol metadata plus the CRYPTOGAMS identification string.
! NOTE(review): "Multipltication" is a typo, but it is part of the
! string embedded in the built object - changing it would alter the
! binary's text, so it is deliberately left untouched here.
603 .type $fname,#function
604 .size $fname,(.-$fname)
605 .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
# perlasm post-processing: evaluate every `...` (backtick) span inside
# the generated text as Perl - e.g. the `$bits==32?"%icc":"%xcc"`
# selector above - substituting the result into the final assembly.
607 $code =~ s/\`([^\`]*)\`/eval($1)/gem;