3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
# Choose the code flavour from the requested output file name.  Each branch
# sets $FRAME (16 $SIZE_T-sized slots plus 12 eight-byte slots: the assembly
# keeps r14-r27 and f14-f25 below that offset and uses $FRAME($sp) upwards as
# scratch), $fname, and $STUX, the "store indexed with update" mnemonic used
# to allocate the stack frame.  $SIZE_T itself is set elsewhere (not visible
# here).
14 if ($output =~ /32\-mont\.s/) {
17 $FRAME= $SIZE_T*16+8*12;
18 $fname= "bn_mul_mont_ppc64";
20 $STUX= "stwux"; # store indexed and update
# The 32-bit flavour stops here, so the settings above never reach the
# assembly.
23 die "not implemented yet";
24 } elsif ($output =~ /64\-mont\.s/) {
27 $FRAME= $SIZE_T*16+8*12;
28 $fname= "bn_mul_mont";
30 # same as above, but 64-bit mnemonics...
31 $STUX= "stdux"; # store indexed and update
# Any other output name is rejected outright.
34 } else { die "nonsense $output"; }
# Unless shift yields a defined value, re-open STDOUT as a pipe into
# ../perlasm/ppc-xlate.pl (run with the current interpreter, $^X, and given
# $output), so everything printed afterwards goes through that script.
# Abort with the OS error if the pipe cannot be opened.
36 ( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
37 die "can't call ../perlasm/ppc-xlate.pl: $!";
# $rp lives in r9: on entry the assembly copies r3 into it (mr $rp,r3) and
# then loads r3 with the return code.
49 $rp="r9"; # $rp is reassigned
53 # non-volatile registers
69 # PPC offers enough register bank capacity to unroll inner loops twice
120 # sp----------->+-------------------------------+
122 # +-------------------------------+
124 # +-------------------------------+
125 # | 14 saved gpr, r14-r27 |
128 # +16*size_t +-------------------------------+
129 # | 12 saved fpr, f14-f25 |
132 # +12*8 +-------------------------------+
133 # | 8 gpr<->fpr transfer zone |
136 # +8*8 +-------------------------------+
137 # | __int64 tmp[-1] |
138 # +-------------------------------+
139 # | __int64 tmp[num] |
143 # +(num+1)*8 +-------------------------------+
144 # | double a_lo[num] |
148 # +num*8 +-------------------------------+
149 # | double a_hi[num] |
153 # +num*8 +-------------------------------+
154 # | double n_lo[num] |
158 # +num*8 +-------------------------------+
159 # | double n_hi[num] |
163 # +-------------------------------+
; Entry and frame set-up: move incoming r3 to $rp (r3 doubles as the return
; code), size and allocate the stack frame, save r14-r27 and f14-f25, and
; zero the $dota/$dotb accumulators.
173 mr $rp,r3 ; $rp is reassigned
174 li r3,0 ; possible "not handled" return code
176 andi. r0,$num,1 ; $num has to be even
179 slwi $num,$num,3 ; num*=8
; from here on $num is a byte count (8 bytes per element)
181 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
182 add $tp,$tp,$num ; place for tp[num+1]
183 addi $tp,$tp,`$FRAME+$TRANSFER+8+$RZONE`
184 subf $tp,$tp,$sp ; $sp-$tp
; NOTE(review): $i presumably holds an alignment mask loaded earlier; that
; instruction is not visible here - confirm
185 and $tp,$tp,$i ; minimize TLB usage
186 subf $tp,$sp,$tp ; $tp-$sp
; store-with-update: writes the old $sp at the new frame base and moves $sp
187 $STUX $sp,$sp,$tp ; alloca
; save r14-r27 in slots 2..15 of the new frame
189 $PUSH r14,`2*$SIZE_T`($sp)
190 $PUSH r15,`3*$SIZE_T`($sp)
191 $PUSH r16,`4*$SIZE_T`($sp)
192 $PUSH r17,`5*$SIZE_T`($sp)
193 $PUSH r18,`6*$SIZE_T`($sp)
194 $PUSH r19,`7*$SIZE_T`($sp)
195 $PUSH r20,`8*$SIZE_T`($sp)
196 $PUSH r21,`9*$SIZE_T`($sp)
197 $PUSH r22,`10*$SIZE_T`($sp)
198 $PUSH r23,`11*$SIZE_T`($sp)
199 $PUSH r24,`12*$SIZE_T`($sp)
200 $PUSH r25,`13*$SIZE_T`($sp)
201 $PUSH r26,`14*$SIZE_T`($sp)
202 $PUSH r27,`15*$SIZE_T`($sp)
; save f14-f25 in the 12*8 bytes that follow the GPR slots
203 stfd f14,`16*$SIZE_T+0`($sp)
204 stfd f15,`16*$SIZE_T+8`($sp)
205 stfd f16,`16*$SIZE_T+16`($sp)
206 stfd f17,`16*$SIZE_T+24`($sp)
207 stfd f18,`16*$SIZE_T+32`($sp)
208 stfd f19,`16*$SIZE_T+40`($sp)
209 stfd f20,`16*$SIZE_T+48`($sp)
210 stfd f21,`16*$SIZE_T+56`($sp)
211 stfd f22,`16*$SIZE_T+64`($sp)
212 stfd f23,`16*$SIZE_T+72`($sp)
213 stfd f24,`16*$SIZE_T+80`($sp)
214 stfd f25,`16*$SIZE_T+88`($sp)
; zero $dota/$dotb by bouncing the zero in r0 through memory at $FRAME($sp)
215 std r0,$FRAME($sp) ; r0 is still 0
216 lfd $dota,$FRAME($sp)
217 lfd $dotb,$FRAME($sp)
; Set-up for the pass over bp[0]: fetch ap[0], n0 and bp[0], form
; (ap[0]*bp[0])*n0 in $t7, and hand both multipliers to the FPU.  Values move
; from integer to floating-point registers through memory: std into the
; scratch slots at $FRAME+0..56, then lfd back.
219 addi $tp,$sp,`$FRAME+$TRANSFER`
220 ; note that {an}p_{lh} are off by 1, this is because they
221 ; are used with stfdu/lfdu instruction...
227 ld $a0,0($ap) ; pull ap[0] value
; the register that held the pointer to n0 now holds the value itself
228 ld $n0,0($n0) ; pull n0[0] value
; $num is already a byte count (num*8), hence the shift by 3+1 to get num/2
229 srwi $j,$num,`3+1` ; counter register, num/2
231 ld $t3,0($bp) ; bp[0]
232 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
233 mulld $t7,$t7,$n0 ; tp[0]*n0
235 ; transfer bp[0] to FPU as 4x16-bit values
240 std $t0,`$FRAME+0`($sp)
241 std $t1,`$FRAME+8`($sp)
242 std $t2,`$FRAME+16`($sp)
243 std $t3,`$FRAME+24`($sp)
244 lfd $ba,`$FRAME+0`($sp)
245 lfd $bb,`$FRAME+8`($sp)
246 lfd $bc,`$FRAME+16`($sp)
247 lfd $bd,`$FRAME+24`($sp)
253 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
258 std $t4,`$FRAME+32`($sp)
259 std $t5,`$FRAME+40`($sp)
260 std $t6,`$FRAME+48`($sp)
261 std $t7,`$FRAME+56`($sp)
262 lfd $na,`$FRAME+32`($sp)
263 lfd $nb,`$FRAME+40`($sp)
264 lfd $nc,`$FRAME+48`($sp)
265 lfd $nd,`$FRAME+56`($sp)
; $tp restarts 8 bytes below the $FRAME+$TRANSFER base set at the top of this
; section
271 addi $tp,$sp,`$FRAME+$TRANSFER-8`
; First pass: load a[j], a[j+1], n[j], n[j+1], move them to the FPU through
; the scratch slots at $FRAME, keep them in double format behind
; $ap_l/$np_l, and multiply-accumulate them against $ba..$bd and $na..$nd.
; NOTE(review): loop labels and branches are not visible in this excerpt.
276 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
278 lwz $t2,4($np) ; load n[j] as 32-bit word pair
280 std $t0,`$FRAME+0`($sp)
281 std $t1,`$FRAME+8`($sp)
282 std $t2,`$FRAME+16`($sp)
283 std $t3,`$FRAME+24`($sp)
284 lfd $A0,`$FRAME+0`($sp)
285 lfd $A1,`$FRAME+8`($sp)
286 lfd $N0,`$FRAME+16`($sp)
287 lfd $N1,`$FRAME+24`($sp)
; stfdu advances the pointer by 8 before storing
292 stfdu $A0,8($ap_l) ; save a[j] in double format
294 stfdu $N0,8($np_l) ; save n[j] in double format
297 lwz $t4,12($ap) ; load a[j+1] as 32-bit word pair
299 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
301 std $t4,`$FRAME+32`($sp)
302 std $t5,`$FRAME+40`($sp)
303 std $t6,`$FRAME+48`($sp)
304 std $t7,`$FRAME+56`($sp)
305 lfd $A2,`$FRAME+32`($sp)
306 lfd $A3,`$FRAME+40`($sp)
307 lfd $N2,`$FRAME+48`($sp)
308 lfd $N3,`$FRAME+56`($sp)
313 stfdu $A2,8($ap_l) ; save a[j+1] in double format
315 stfdu $N2,8($np_l) ; save n[j+1] in double format
; multiply-accumulate: $dota/$dotb enter as the addends of $T0a/$T0b ...
320 fmadd $T0a,$A0,$ba,$dota
321 fmadd $T0b,$A0,$bb,$dotb
329 fmadd $T1a,$A0,$bc,$T1a
330 fmadd $T1b,$A0,$bd,$T1b
331 fmadd $T2a,$A1,$bc,$T2a
332 fmadd $T2b,$A1,$bd,$T2b
333 fmadd $T3a,$A2,$bc,$T3a
334 fmadd $T3b,$A2,$bd,$T3b
338 fmadd $T0a,$N0,$na,$T0a
339 fmadd $T0b,$N0,$nb,$T0b
340 fmadd $T1a,$N1,$na,$T1a
341 fmadd $T1b,$N1,$nb,$T1b
342 fmadd $T2a,$N2,$na,$T2a
343 fmadd $T2b,$N2,$nb,$T2b
344 fmadd $T3a,$N3,$na,$T3a
345 fmadd $T3b,$N3,$nb,$T3b
347 fmadd $T1a,$N0,$nc,$T1a
348 fmadd $T1b,$N0,$nd,$T1b
349 fmadd $T2a,$N1,$nc,$T2a
350 fmadd $T2b,$N1,$nd,$T2b
351 fmadd $T3a,$N2,$nc,$T3a
352 fmadd $T3b,$N2,$nd,$T3b
; ... and are updated by the last two multiply-adds
353 fmadd $dota,$N3,$nc,$dota
354 fmadd $dotb,$N3,$nd,$dotb
; move $T0a..$T3b from the FPRs to $t0..$t7 through the scratch slots
365 stfd $T0a,`$FRAME+0`($sp)
366 stfd $T0b,`$FRAME+8`($sp)
367 stfd $T1a,`$FRAME+16`($sp)
368 stfd $T1b,`$FRAME+24`($sp)
369 stfd $T2a,`$FRAME+32`($sp)
370 stfd $T2b,`$FRAME+40`($sp)
371 stfd $T3a,`$FRAME+48`($sp)
372 stfd $T3b,`$FRAME+56`($sp)
373 ld $t0,`$FRAME+0`($sp)
374 ld $t1,`$FRAME+8`($sp)
375 ld $t2,`$FRAME+16`($sp)
376 ld $t3,`$FRAME+24`($sp)
377 ld $t4,`$FRAME+32`($sp)
378 ld $t5,`$FRAME+40`($sp)
379 ld $t6,`$FRAME+48`($sp)
380 ld $t7,`$FRAME+56`($sp)
382 add $t0,$t0,$carry ; can not overflow
400 insrdi $t0,$t3,16,0 ; 0..63 bits
403 insrdi $t4,$t7,16,0 ; 64..127 bits
404 srdi $carry,$t7,16 ; upper 33 bits
; write the two assembled words; stdu also advances $tp by 16
406 std $t0,8($tp) ; tp[j-1]
407 stdu $t4,16($tp) ; tp[j]
; tail: bring $dota/$dotb over to $t0/$t1 and add the carry into $t0
412 stfd $dota,`$FRAME+0`($sp)
413 stfd $dotb,`$FRAME+8`($sp)
414 ld $t0,`$FRAME+0`($sp)
415 ld $t1,`$FRAME+8`($sp)
416 add $t0,$t0,$carry ; can not overflow
421 std $t0,8($tp) ; tp[num-1]
; step the four double-format array pointers back by $num bytes
423 subf $ap_l,$num,$ap_l ; rewind pointers
424 subf $ap_h,$num,$ap_h
425 subf $np_l,$num,$np_l
426 subf $np_h,$num,$np_h
; Pass for bp[i]: same multiply-accumulate scheme as the first pass, except
; that tp[0] is folded into the n0 multiplier, a[]/n[] are reloaded in double
; format from $ap_l/$np_l, and tp[] is read back while being rewritten.
; NOTE(review): loop labels and branches are not visible in this excerpt.
431 ldx $t3,$bp,$i ; bp[i]
432 ld $t0,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
433 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
434 add $t7,$t7,$t0 ; ap[0]*bp[i]+tp[0]
435 mulld $t7,$t7,$n0 ; tp[0]*n0
437 ; transfer b[i] to FPU as 4x16-bit values
442 std $t0,`$FRAME+0`($sp)
443 std $t1,`$FRAME+8`($sp)
444 std $t2,`$FRAME+16`($sp)
445 std $t3,`$FRAME+24`($sp)
446 lfd $ba,`$FRAME+0`($sp)
447 lfd $bb,`$FRAME+8`($sp)
448 lfd $bc,`$FRAME+16`($sp)
449 lfd $bd,`$FRAME+24`($sp)
455 ; transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
460 std $t4,`$FRAME+32`($sp)
461 std $t5,`$FRAME+40`($sp)
462 std $t6,`$FRAME+48`($sp)
463 std $t7,`$FRAME+56`($sp)
464 lfd $na,`$FRAME+32`($sp)
465 lfd $nb,`$FRAME+40`($sp)
466 lfd $nc,`$FRAME+48`($sp)
467 lfd $nd,`$FRAME+56`($sp)
473 addi $tp,$sp,`$FRAME+$TRANSFER`
; reset both accumulators (x-x) before the new pass
474 fsub $dota,$dota,$dota
475 fsub $dotb,$dotb,$dotb
; lfdu advances the pointer by 8 before loading
480 lfdu $A0,8($ap_l) ; load a[j] in double format
482 lfdu $N0,8($np_l) ; load n[j] in double format
484 lfdu $A2,8($ap_l) ; load a[j+1] in double format
486 lfdu $N2,8($np_l) ; load n[j+1] in double format
; multiply-accumulate: $dota/$dotb enter as the addends of $T0a/$T0b ...
489 fmadd $T0a,$A0,$ba,$dota
490 fmadd $T0b,$A0,$bb,$dotb
498 fmadd $T1a,$A0,$bc,$T1a
499 fmadd $T1b,$A0,$bd,$T1b
500 fmadd $T2a,$A1,$bc,$T2a
501 fmadd $T2b,$A1,$bd,$T2b
502 fmadd $T3a,$A2,$bc,$T3a
503 fmadd $T3b,$A2,$bd,$T3b
507 fmadd $T0a,$N0,$na,$T0a
508 fmadd $T0b,$N0,$nb,$T0b
509 fmadd $T1a,$N1,$na,$T1a
510 fmadd $T1b,$N1,$nb,$T1b
511 fmadd $T2a,$N2,$na,$T2a
512 fmadd $T2b,$N2,$nb,$T2b
513 fmadd $T3a,$N3,$na,$T3a
514 fmadd $T3b,$N3,$nb,$T3b
516 fmadd $T1a,$N0,$nc,$T1a
517 fmadd $T1b,$N0,$nd,$T1b
518 fmadd $T2a,$N1,$nc,$T2a
519 fmadd $T2b,$N1,$nd,$T2b
520 fmadd $T3a,$N2,$nc,$T3a
521 fmadd $T3b,$N2,$nd,$T3b
; ... and are updated by the last two multiply-adds
522 fmadd $dota,$N3,$nc,$dota
523 fmadd $dotb,$N3,$nd,$dotb
; move $T0a..$T3b from the FPRs to $t0..$t7 through the scratch slots
534 stfd $T0a,`$FRAME+0`($sp)
535 stfd $T0b,`$FRAME+8`($sp)
536 stfd $T1a,`$FRAME+16`($sp)
537 stfd $T1b,`$FRAME+24`($sp)
538 stfd $T2a,`$FRAME+32`($sp)
539 stfd $T2b,`$FRAME+40`($sp)
540 stfd $T3a,`$FRAME+48`($sp)
541 stfd $T3b,`$FRAME+56`($sp)
542 ld $t0,`$FRAME+0`($sp)
543 ld $t1,`$FRAME+8`($sp)
544 ld $t2,`$FRAME+16`($sp)
545 ld $t3,`$FRAME+24`($sp)
546 ld $t4,`$FRAME+32`($sp)
547 ld $t5,`$FRAME+40`($sp)
548 ld $t6,`$FRAME+48`($sp)
549 ld $t7,`$FRAME+56`($sp)
551 add $t0,$t0,$carry ; can not overflow
569 insrdi $t0,$t3,16,0 ; 0..63 bits
572 insrdi $t4,$t7,16,0 ; 64..127 bits
573 srdi $carry,$t7,16 ; upper 33 bits
; fetch the existing tp words; ldu also advances $tp by 16, which is why the
; stores below use negative offsets
575 ld $t1,8($tp) ; tp[j]
576 ldu $t2,16($tp) ; tp[j+1]
582 std $t3,-16($tp) ; tp[j-1]
583 std $t5,-8($tp) ; tp[j]
; tail: bring $dota/$dotb over to $t0/$t1, then add carry (plus $ovf) to $t0
588 stfd $dota,`$FRAME+0`($sp)
589 stfd $dotb,`$FRAME+8`($sp)
590 ld $t0,`$FRAME+0`($sp)
591 ld $t1,`$FRAME+8`($sp)
592 add $carry,$carry,$ovf ; consume upmost overflow
593 add $t0,$t0,$carry ; can not overflow
598 std $t0,0($tp) ; tp[num-1]
; step the four double-format array pointers back by $num bytes
600 subf $ap_l,$num,$ap_l ; rewind pointers
601 subf $ap_h,$num,$ap_h
602 subf $np_l,$num,$np_l
603 subf $np_h,$num,$np_h
; Finish: subtract np[] from tp[] with borrow propagation, pick the source
; for the copy-out, overwrite the temporaries, restore the saved registers
; and return 1 in r3.
; NOTE(review): the loads, loop branches and final stack release/return are
; not visible in this excerpt.
608 subf $np,$num,$np ; rewind np
609 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
; $tp and $t4 address two consecutive 8-byte words of tp[]
610 addi $tp,$sp,`$FRAME+$TRANSFER+8`
611 addi $t4,$sp,`$FRAME+$TRANSFER+16`
; subfe chains the borrow through XER[CA] from one word to the next
621 subfe $t0,$t1,$t0 ; tp[j]-np[j]
622 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
629 subfe $ovf,$i,$ovf ; handle upmost overflow bit
632 or $ap,$ap,$np ; ap=borrow?tp:rp
637 Lcopy: ; copy or in-place refresh
; the stores below overwrite the temporaries with whatever $i holds
640 stdu $i,8($ap_l) ; zap {an}p_{lh}
650 stdx $i,$tp,$i ; zap tp at once
; reload r14-r27 from slots 2..15, the same offsets used to save them
655 $POP r14,`2*$SIZE_T`($sp)
656 $POP r15,`3*$SIZE_T`($sp)
657 $POP r16,`4*$SIZE_T`($sp)
658 $POP r17,`5*$SIZE_T`($sp)
659 $POP r18,`6*$SIZE_T`($sp)
660 $POP r19,`7*$SIZE_T`($sp)
661 $POP r20,`8*$SIZE_T`($sp)
662 $POP r21,`9*$SIZE_T`($sp)
663 $POP r22,`10*$SIZE_T`($sp)
664 $POP r23,`11*$SIZE_T`($sp)
665 $POP r24,`12*$SIZE_T`($sp)
666 $POP r25,`13*$SIZE_T`($sp)
667 $POP r26,`14*$SIZE_T`($sp)
668 $POP r27,`15*$SIZE_T`($sp)
; reload f14-f25 from the 12*8 bytes that follow the GPR slots
669 lfd f14,`16*$SIZE_T+0`($sp)
670 lfd f15,`16*$SIZE_T+8`($sp)
671 lfd f16,`16*$SIZE_T+16`($sp)
672 lfd f17,`16*$SIZE_T+24`($sp)
673 lfd f18,`16*$SIZE_T+32`($sp)
674 lfd f19,`16*$SIZE_T+40`($sp)
675 lfd f20,`16*$SIZE_T+48`($sp)
676 lfd f21,`16*$SIZE_T+56`($sp)
677 lfd f22,`16*$SIZE_T+64`($sp)
678 lfd f23,`16*$SIZE_T+72`($sp)
679 lfd f24,`16*$SIZE_T+80`($sp)
680 lfd f25,`16*$SIZE_T+88`($sp)
682 li r3,1 ; signal "handled"
; identification string placed in the generated output
685 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
# Replace every `...` span in $code with the result of evaluating its
# contents as Perl (/e), across the whole multi-line string (/g, /m).  This
# is presumably what turns offsets written like `2*$SIZE_T` in the assembly
# lines into plain numbers - confirm that those lines are part of $code.
688 $code =~ s/\`([^\`]*)\`/eval $1/gem;