This is also informational commit exposing loop modulo scheduling "factor."
[openssl.git] / crypto / bn / asm / ppc64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # December 2007
11
12 $output = shift;
13
14 if ($output =~ /32\-mont\.s/) {
15         $SIZE_T=4;
16         $RZONE= 224;
17         $FRAME= $SIZE_T*16+8*12;
18         $fname= "bn_mul_mont_ppc64";
19
20         $STUX=  "stwux";        # store indexed and update
21         $PUSH=  "stw";
22         $POP=   "lwz";
23         die "not implemented yet";
24 } elsif ($output =~ /64\-mont\.s/) {
25         $SIZE_T=8;
26         $RZONE= 288;
27         $FRAME= $SIZE_T*16+8*12;
28         $fname= "bn_mul_mont";
29
30         # same as above, but 64-bit mnemonics...
31         $STUX=  "stdux";        # store indexed and update
32         $PUSH=  "std";
33         $POP=   "ld";
34 } else { die "nonsense $output"; }
35
36 ( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
37         die "can't call ../perlasm/ppc-xlate.pl: $!";
38
39 $TRANSFER=8*8;
40
41 $sp="r1";
42 $toc="r2";
43 $rp="r3";       $ovf="r3";
44 $ap="r4";
45 $bp="r5";
46 $np="r6";
47 $n0="r7";
48 $num="r8";
49 $rp="r9";       # $rp is reassigned
50 $tp="r10";
51 $j="r11";
52 $i="r12";
53 # non-volatile registers
54 $ap_l="r14";
55 $ap_h="r15";
56 $np_l="r16";
57 $np_h="r17";
58 $carry="r18";
59 $a0="r19";      # ap[0]
60 $t0="r20";
61 $t1="r21";
62 $t2="r22";
63 $t3="r23";
64 $t4="r24";
65 $t5="r25";
66 $t6="r26";
67 $t7="r27";
68
69 # PPC offers enough register bank capacity to unroll inner loops twice
70 #
71 #     ..A3A2A1A0
72 #           dcba
73 #    -----------
74 #            A0a
75 #           A0b
76 #          A0c
77 #         A0d
78 #          A1a
79 #         A1b
80 #        A1c
81 #       A1d
82 #        A2a
83 #       A2b
84 #      A2c
85 #     A2d
86 #      A3a
87 #     A3b
88 #    A3c
89 #   A3d
90 #    ..a
91 #   ..b
92 #
93 $ba="f0";
94 $bb="f1";
95 $bc="f2";
96 $bd="f3";
97 $na="f4";
98 $nb="f5";
99 $nc="f6";
100 $nd="f7";
101 $dota="f8";
102 $dotb="f9";
103 $A0="f10";
104 $A1="f11";
105 $A2="f12";
106 $A3="f13";
107 $N0="f14";
108 $N1="f15";
109 $N2="f16";
110 $N3="f17";
111 $T0a="f18";
112 $T0b="f19";
113 $T1a="f20";
114 $T1b="f21";
115 $T2a="f22";
116 $T2b="f23";
117 $T3a="f24";
118 $T3b="f25";
119
120 # sp----------->+-------------------------------+
121 #               | saved sp                      |
122 #               +-------------------------------+
123 #               |                               |
124 #               +-------------------------------+
125 #               | 14 saved gpr, r14-r27         |
126 #               .                               .
127 #               .                               .
128 #   +16*size_t  +-------------------------------+
129 #               | 12 saved fpr, f14-f25         |
130 #               .                               .
131 #               .                               .
132 #   +12*8       +-------------------------------+
133 #               | 8 gpr<->fpr transfer zone     |
134 #               .                               .
135 #               .                               .
136 #   +8*8        +-------------------------------+
137 #               | __int64 tmp[-1]               |
138 #               +-------------------------------+
139 #               | __int64 tmp[num]              |
140 #               .                               .
141 #               .                               .
142 #               .                               .
143 #   +(num+1)*8  +-------------------------------+
144 #               | double a_lo[num]              |
145 #               .                               .
146 #               .                               .
147 #               .                               .
148 #   +num*8      +-------------------------------+
149 #               | double a_hi[num]              |
150 #               .                               .
151 #               .                               .
152 #               .                               .
153 #   +num*8      +-------------------------------+
154 #               | double n_lo[num]              |
155 #               .                               .
156 #               .                               .
157 #               .                               .
158 #   +num*8      +-------------------------------+
159 #               | double n_hi[num]              |
160 #               .                               .
161 #               .                               .
162 #               .                               .
163 #               +-------------------------------+
164
165 $code=<<___;
166 .machine "any"
167 .text
168
169 .globl  .$fname
170 .align  4
171 .$fname:
172         cmpwi   $num,4
173         mr      $rp,r3          ; $rp is reassigned
174         li      r3,0            ; possible "not handled" return code
175         bltlr-
176         andi.   r0,$num,1       ; $num has to be even
177         bnelr-
178
179         slwi    $num,$num,3     ; num*=8
180         li      $i,-4096
181         slwi    $tp,$num,2      ; place for {an}p_{lh}[num], i.e. 4*num
182         add     $tp,$tp,$num    ; place for tp[num+1]
183         addi    $tp,$tp,`$FRAME+$TRANSFER+8+$RZONE`
184         subf    $tp,$tp,$sp     ; $sp-$tp
185         and     $tp,$tp,$i      ; minimize TLB usage
186         subf    $tp,$sp,$tp     ; $tp-$sp
187         $STUX   $sp,$sp,$tp     ; alloca
188
189         $PUSH   r14,`2*$SIZE_T`($sp)
190         $PUSH   r15,`3*$SIZE_T`($sp)
191         $PUSH   r16,`4*$SIZE_T`($sp)
192         $PUSH   r17,`5*$SIZE_T`($sp)
193         $PUSH   r18,`6*$SIZE_T`($sp)
194         $PUSH   r19,`7*$SIZE_T`($sp)
195         $PUSH   r20,`8*$SIZE_T`($sp)
196         $PUSH   r21,`9*$SIZE_T`($sp)
197         $PUSH   r22,`10*$SIZE_T`($sp)
198         $PUSH   r23,`11*$SIZE_T`($sp)
199         $PUSH   r24,`12*$SIZE_T`($sp)
200         $PUSH   r25,`13*$SIZE_T`($sp)
201         $PUSH   r26,`14*$SIZE_T`($sp)
202         $PUSH   r27,`15*$SIZE_T`($sp)
203         stfd    f14,`16*$SIZE_T+0`($sp)
204         stfd    f15,`16*$SIZE_T+8`($sp)
205         stfd    f16,`16*$SIZE_T+16`($sp)
206         stfd    f17,`16*$SIZE_T+24`($sp)
207         stfd    f18,`16*$SIZE_T+32`($sp)
208         stfd    f19,`16*$SIZE_T+40`($sp)
209         stfd    f20,`16*$SIZE_T+48`($sp)
210         stfd    f21,`16*$SIZE_T+56`($sp)
211         stfd    f22,`16*$SIZE_T+64`($sp)
212         stfd    f23,`16*$SIZE_T+72`($sp)
213         stfd    f24,`16*$SIZE_T+80`($sp)
214         stfd    f25,`16*$SIZE_T+88`($sp)
215         std     r0,$FRAME($sp)  ; r0 is still 0
216         lfd     $dota,$FRAME($sp)
217         lfd     $dotb,$FRAME($sp)
218
219         addi    $tp,$sp,`$FRAME+$TRANSFER`
220         ; note that {an}p_{lh} are off by 1, this is because they
221         ; are used with stfdu/lfdu instruction...
222         add     $ap_l,$tp,$num
223         add     $ap_h,$ap_l,$num
224         add     $np_l,$ap_h,$num
225         add     $np_h,$np_l,$num
226
227         ld      $a0,0($ap)      ; pull ap[0] value
228         ld      $n0,0($n0)      ; pull n0[0] value
229         srwi    $j,$num,`3+1`   ; counter register, num/2
230 \f
231         ld      $t3,0($bp)      ; bp[0]
232         mulld   $t7,$a0,$t3     ; ap[0]*bp[0]
233         mulld   $t7,$t7,$n0     ; tp[0]*n0
234
235         ; transfer bp[0] to FPU as 4x16-bit values
236         extrdi  $t0,$t3,16,48
237         extrdi  $t1,$t3,16,32
238         extrdi  $t2,$t3,16,16
239         extrdi  $t3,$t3,16,0
240         std     $t0,`$FRAME+0`($sp)
241         std     $t1,`$FRAME+8`($sp)
242         std     $t2,`$FRAME+16`($sp)
243         std     $t3,`$FRAME+24`($sp)
244         lfd     $ba,`$FRAME+0`($sp)
245         lfd     $bb,`$FRAME+8`($sp)
246         lfd     $bc,`$FRAME+16`($sp)
247         lfd     $bd,`$FRAME+24`($sp)
248         fcfid   $ba,$ba
249         fcfid   $bb,$bb
250         fcfid   $bc,$bc
251         fcfid   $bd,$bd
252
253         ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
254         extrdi  $t4,$t7,16,48
255         extrdi  $t5,$t7,16,32
256         extrdi  $t6,$t7,16,16
257         extrdi  $t7,$t7,16,0
258         std     $t4,`$FRAME+32`($sp)
259         std     $t5,`$FRAME+40`($sp)
260         std     $t6,`$FRAME+48`($sp)
261         std     $t7,`$FRAME+56`($sp)
262         lfd     $na,`$FRAME+32`($sp)
263         lfd     $nb,`$FRAME+40`($sp)
264         lfd     $nc,`$FRAME+48`($sp)
265         lfd     $nd,`$FRAME+56`($sp)
266         fcfid   $na,$na
267         fcfid   $nb,$nb
268         fcfid   $nc,$nc
269         fcfid   $nd,$nd
270         addi    $j,$j,-1
271 \f
272         addi    $tp,$sp,`$FRAME+$TRANSFER-8`
273         li      $carry,0
274         mtctr   $j
275
276         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
277         lwz     $t1,0($ap)
278         lwz     $t2,4($np)              ; load n[j] as 32-bit word pair
279         lwz     $t3,0($np)
280         std     $t0,`$FRAME+0`($sp)
281         std     $t1,`$FRAME+8`($sp)
282         std     $t2,`$FRAME+16`($sp)
283         std     $t3,`$FRAME+24`($sp)
284         lfd     $A0,`$FRAME+0`($sp)
285         lfd     $A1,`$FRAME+8`($sp)
286         lfd     $N0,`$FRAME+16`($sp)
287         lfd     $N1,`$FRAME+24`($sp)
288         fcfid   $A0,$A0
289         fcfid   $A1,$A1
290         fcfid   $N0,$N0
291         fcfid   $N1,$N1
292         stfdu   $A0,8($ap_l)            ; save a[j] in double format
293         stfdu   $A1,8($ap_h)
294         stfdu   $N0,8($np_l)            ; save n[j] in double format
295         stfdu   $N1,8($np_h)
296
297         lwz     $t4,12($ap)             ; load a[j+1] as 32-bit word pair
298         lwz     $t5,8($ap)
299         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
300         lwz     $t7,8($np)
301         std     $t4,`$FRAME+32`($sp)
302         std     $t5,`$FRAME+40`($sp)
303         std     $t6,`$FRAME+48`($sp)
304         std     $t7,`$FRAME+56`($sp)
305         lfd     $A2,`$FRAME+32`($sp)
306         lfd     $A3,`$FRAME+40`($sp)
307         lfd     $N2,`$FRAME+48`($sp)
308         lfd     $N3,`$FRAME+56`($sp)
309         fcfid   $A2,$A2
310         fcfid   $A3,$A3
311         fcfid   $N2,$N2
312         fcfid   $N3,$N3
313         stfdu   $A2,8($ap_l)            ; save a[j+1] in double format
314         stfdu   $A3,8($ap_h)
315         stfdu   $N2,8($np_l)            ; save n[j+1] in double format
316         stfdu   $N3,8($np_h)
317         addi    $ap,$ap,16
318         addi    $np,$np,16
319
320         fmadd   $T0a,$A0,$ba,$dota
321         fmadd   $T0b,$A0,$bb,$dotb
322         fmul    $T1a,$A1,$ba
323         fmul    $T1b,$A1,$bb
324         fmul    $T2a,$A2,$ba
325         fmul    $T2b,$A2,$bb
326         fmul    $T3a,$A3,$ba
327         fmul    $T3b,$A3,$bb
328
329         fmadd   $T1a,$A0,$bc,$T1a
330         fmadd   $T1b,$A0,$bd,$T1b
331         fmadd   $T2a,$A1,$bc,$T2a
332         fmadd   $T2b,$A1,$bd,$T2b
333         fmadd   $T3a,$A2,$bc,$T3a
334         fmadd   $T3b,$A2,$bd,$T3b
335         fmul    $dota,$A3,$bc
336         fmul    $dotb,$A3,$bd
337
338         fmadd   $T0a,$N0,$na,$T0a
339         fmadd   $T0b,$N0,$nb,$T0b
340         fmadd   $T1a,$N1,$na,$T1a
341         fmadd   $T1b,$N1,$nb,$T1b
342         fmadd   $T2a,$N2,$na,$T2a
343         fmadd   $T2b,$N2,$nb,$T2b
344         fmadd   $T3a,$N3,$na,$T3a
345         fmadd   $T3b,$N3,$nb,$T3b
346
347         fmadd   $T1a,$N0,$nc,$T1a
348         fmadd   $T1b,$N0,$nd,$T1b
349         fmadd   $T2a,$N1,$nc,$T2a
350         fmadd   $T2b,$N1,$nd,$T2b
351         fmadd   $T3a,$N2,$nc,$T3a
352         fmadd   $T3b,$N2,$nd,$T3b
353         fmadd   $dota,$N3,$nc,$dota
354         fmadd   $dotb,$N3,$nd,$dotb
355
356         fctid   $T0a,$T0a
357         fctid   $T0b,$T0b
358         fctid   $T1a,$T1a
359         fctid   $T1b,$T1b
360         fctid   $T2a,$T2a
361         fctid   $T2b,$T2b
362         fctid   $T3a,$T3a
363         fctid   $T3b,$T3b
364
365         stfd    $T0a,`$FRAME+0`($sp)
366         stfd    $T0b,`$FRAME+8`($sp)
367         stfd    $T1a,`$FRAME+16`($sp)
368         stfd    $T1b,`$FRAME+24`($sp)
369         stfd    $T2a,`$FRAME+32`($sp)
370         stfd    $T2b,`$FRAME+40`($sp)
371         stfd    $T3a,`$FRAME+48`($sp)
372         stfd    $T3b,`$FRAME+56`($sp)
373 .align  4
374 L1st:
375         ld      $t0,`$FRAME+0`($sp)
376         ld      $t1,`$FRAME+8`($sp)
377         ld      $t2,`$FRAME+16`($sp)
378         ld      $t3,`$FRAME+24`($sp)
379         ld      $t4,`$FRAME+32`($sp)
380         ld      $t5,`$FRAME+40`($sp)
381         ld      $t6,`$FRAME+48`($sp)
382         ld      $t7,`$FRAME+56`($sp)
383
384         add     $t0,$t0,$carry          ; can not overflow
385         srdi    $carry,$t0,16
386         add     $t1,$t1,$carry
387         srdi    $carry,$t1,16
388         add     $t2,$t2,$carry
389         srdi    $carry,$t2,16
390         add     $t3,$t3,$carry
391         srdi    $carry,$t3,16
392         add     $t4,$t4,$carry
393         srdi    $carry,$t4,16
394         add     $t5,$t5,$carry
395         srdi    $carry,$t5,16
396         add     $t6,$t6,$carry
397         srdi    $carry,$t6,16
398         add     $t7,$t7,$carry
399
400         insrdi  $t0,$t1,16,32
401         insrdi  $t0,$t2,16,16
402         insrdi  $t0,$t3,16,0            ; 0..63 bits
403         insrdi  $t4,$t5,16,32
404         insrdi  $t4,$t6,16,16
405         insrdi  $t4,$t7,16,0            ; 64..127 bits
406         srdi    $carry,$t7,16           ; upper 33 bits
407
408         std     $t0,8($tp)              ; tp[j-1]
409         stdu    $t4,16($tp)             ; tp[j]
410
411         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
412         lwz     $t1,0($ap)
413         lwz     $t2,4($np)              ; load n[j] as 32-bit word pair
414         lwz     $t3,0($np)
415         std     $t0,`$FRAME+0`($sp)
416         std     $t1,`$FRAME+8`($sp)
417         std     $t2,`$FRAME+16`($sp)
418         std     $t3,`$FRAME+24`($sp)
419         lfd     $A0,`$FRAME+0`($sp)
420         lfd     $A1,`$FRAME+8`($sp)
421         lfd     $N0,`$FRAME+16`($sp)
422         lfd     $N1,`$FRAME+24`($sp)
423         fcfid   $A0,$A0
424         fcfid   $A1,$A1
425         fcfid   $N0,$N0
426         fcfid   $N1,$N1
427         stfdu   $A0,8($ap_l)            ; save a[j] in double format
428         stfdu   $A1,8($ap_h)
429         stfdu   $N0,8($np_l)            ; save n[j] in double format
430         stfdu   $N1,8($np_h)
431
432         lwz     $t4,12($ap)             ; load a[j+1] as 32-bit word pair
433         lwz     $t5,8($ap)
434         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
435         lwz     $t7,8($np)
436         std     $t4,`$FRAME+32`($sp)
437         std     $t5,`$FRAME+40`($sp)
438         std     $t6,`$FRAME+48`($sp)
439         std     $t7,`$FRAME+56`($sp)
440         lfd     $A2,`$FRAME+32`($sp)
441         lfd     $A3,`$FRAME+40`($sp)
442         lfd     $N2,`$FRAME+48`($sp)
443         lfd     $N3,`$FRAME+56`($sp)
444         fcfid   $A2,$A2
445         fcfid   $A3,$A3
446         fcfid   $N2,$N2
447         fcfid   $N3,$N3
448         stfdu   $A2,8($ap_l)            ; save a[j+1] in double format
449         stfdu   $A3,8($ap_h)
450         stfdu   $N2,8($np_l)            ; save n[j+1] in double format
451         stfdu   $N3,8($np_h)
452         addi    $ap,$ap,16
453         addi    $np,$np,16
454
455         fmadd   $T0a,$A0,$ba,$dota
456         fmadd   $T0b,$A0,$bb,$dotb
457         fmul    $T1a,$A1,$ba
458         fmul    $T1b,$A1,$bb
459         fmul    $T2a,$A2,$ba
460         fmul    $T2b,$A2,$bb
461         fmul    $T3a,$A3,$ba
462         fmul    $T3b,$A3,$bb
463
464         fmadd   $T1a,$A0,$bc,$T1a
465         fmadd   $T1b,$A0,$bd,$T1b
466         fmadd   $T2a,$A1,$bc,$T2a
467         fmadd   $T2b,$A1,$bd,$T2b
468         fmadd   $T3a,$A2,$bc,$T3a
469         fmadd   $T3b,$A2,$bd,$T3b
470         fmul    $dota,$A3,$bc
471         fmul    $dotb,$A3,$bd
472
473         fmadd   $T0a,$N0,$na,$T0a
474         fmadd   $T0b,$N0,$nb,$T0b
475         fmadd   $T1a,$N1,$na,$T1a
476         fmadd   $T1b,$N1,$nb,$T1b
477         fmadd   $T2a,$N2,$na,$T2a
478         fmadd   $T2b,$N2,$nb,$T2b
479         fmadd   $T3a,$N3,$na,$T3a
480         fmadd   $T3b,$N3,$nb,$T3b
481
482         fmadd   $T1a,$N0,$nc,$T1a
483         fmadd   $T1b,$N0,$nd,$T1b
484         fmadd   $T2a,$N1,$nc,$T2a
485         fmadd   $T2b,$N1,$nd,$T2b
486         fmadd   $T3a,$N2,$nc,$T3a
487         fmadd   $T3b,$N2,$nd,$T3b
488         fmadd   $dota,$N3,$nc,$dota
489         fmadd   $dotb,$N3,$nd,$dotb
490
491         fctid   $T0a,$T0a
492         fctid   $T0b,$T0b
493         fctid   $T1a,$T1a
494         fctid   $T1b,$T1b
495         fctid   $T2a,$T2a
496         fctid   $T2b,$T2b
497         fctid   $T3a,$T3a
498         fctid   $T3b,$T3b
499
500         stfd    $T0a,`$FRAME+0`($sp)
501         stfd    $T0b,`$FRAME+8`($sp)
502         stfd    $T1a,`$FRAME+16`($sp)
503         stfd    $T1b,`$FRAME+24`($sp)
504         stfd    $T2a,`$FRAME+32`($sp)
505         stfd    $T2b,`$FRAME+40`($sp)
506         stfd    $T3a,`$FRAME+48`($sp)
507         stfd    $T3b,`$FRAME+56`($sp)
508         bdnz-   L1st
509
510         ld      $t0,`$FRAME+0`($sp)
511         ld      $t1,`$FRAME+8`($sp)
512         ld      $t2,`$FRAME+16`($sp)
513         ld      $t3,`$FRAME+24`($sp)
514         ld      $t4,`$FRAME+32`($sp)
515         ld      $t5,`$FRAME+40`($sp)
516         ld      $t6,`$FRAME+48`($sp)
517         ld      $t7,`$FRAME+56`($sp)
518
519         add     $t0,$t0,$carry          ; can not overflow
520         srdi    $carry,$t0,16
521         add     $t1,$t1,$carry
522         srdi    $carry,$t1,16
523         add     $t2,$t2,$carry
524         srdi    $carry,$t2,16
525         add     $t3,$t3,$carry
526         srdi    $carry,$t3,16
527         add     $t4,$t4,$carry
528         srdi    $carry,$t4,16
529         add     $t5,$t5,$carry
530         srdi    $carry,$t5,16
531         add     $t6,$t6,$carry
532         srdi    $carry,$t6,16
533         add     $t7,$t7,$carry
534
535         insrdi  $t0,$t1,16,32
536         insrdi  $t0,$t2,16,16
537         insrdi  $t0,$t3,16,0            ; 0..63 bits
538         insrdi  $t4,$t5,16,32
539         insrdi  $t4,$t6,16,16
540         insrdi  $t4,$t7,16,0            ; 64..127 bits
541         srdi    $carry,$t7,16           ; upper 33 bits
542
543         std     $t0,8($tp)              ; tp[j-1]
544         stdu    $t4,16($tp)             ; tp[j]
545 \f
546         fctid   $dota,$dota
547         fctid   $dotb,$dotb
548         stfd    $dota,`$FRAME+0`($sp)
549         stfd    $dotb,`$FRAME+8`($sp)
550         ld      $t0,`$FRAME+0`($sp)
551         ld      $t1,`$FRAME+8`($sp)
552         add     $t0,$t0,$carry          ; can not overflow
553         srdi    $carry,$t0,16
554         add     $t1,$t1,$carry
555         insrdi  $t0,$t1,48,0
556         srdi    $ovf,$t1,48
557         std     $t0,8($tp)              ; tp[num-1]
558
559         subf    $ap_l,$num,$ap_l        ; rewind pointers
560         subf    $ap_h,$num,$ap_h
561         subf    $np_l,$num,$np_l
562         subf    $np_h,$num,$np_h
563 \f
564         li      $i,8                    ; i=1
565 .align  4
566 Louter:
567         ldx     $t3,$bp,$i      ; bp[i]
568         ld      $t0,`$FRAME+$TRANSFER+8`($sp)   ; tp[0]
569         mulld   $t7,$a0,$t3     ; ap[0]*bp[i]
570         add     $t7,$t7,$t0     ; ap[0]*bp[i]+tp[0]
571         mulld   $t7,$t7,$n0     ; tp[0]*n0
572
573         ; transfer b[i] to FPU as 4x16-bit values
574         extrdi  $t0,$t3,16,48
575         extrdi  $t1,$t3,16,32
576         extrdi  $t2,$t3,16,16
577         extrdi  $t3,$t3,16,0
578         std     $t0,`$FRAME+0`($sp)
579         std     $t1,`$FRAME+8`($sp)
580         std     $t2,`$FRAME+16`($sp)
581         std     $t3,`$FRAME+24`($sp)
582         lfd     $ba,`$FRAME+0`($sp)
583         lfd     $bb,`$FRAME+8`($sp)
584         lfd     $bc,`$FRAME+16`($sp)
585         lfd     $bd,`$FRAME+24`($sp)
586         fcfid   $ba,$ba
587         fcfid   $bb,$bb
588         fcfid   $bc,$bc
589         fcfid   $bd,$bd
590
591         ; transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
592         extrdi  $t4,$t7,16,48
593         extrdi  $t5,$t7,16,32
594         extrdi  $t6,$t7,16,16
595         extrdi  $t7,$t7,16,0
596         std     $t4,`$FRAME+32`($sp)
597         std     $t5,`$FRAME+40`($sp)
598         std     $t6,`$FRAME+48`($sp)
599         std     $t7,`$FRAME+56`($sp)
600         lfd     $na,`$FRAME+32`($sp)
601         lfd     $nb,`$FRAME+40`($sp)
602         lfd     $nc,`$FRAME+48`($sp)
603         lfd     $nd,`$FRAME+56`($sp)
604         fcfid   $na,$na
605         fcfid   $nb,$nb
606         fcfid   $nc,$nc
607         fcfid   $nd,$nd
608 \f
609         addi    $tp,$sp,`$FRAME+$TRANSFER`
610         fsub    $dota,$dota,$dota
611         fsub    $dotb,$dotb,$dotb
612         li      $carry,0
613         mtctr   $j
614
615         lfdu    $A0,8($ap_l)            ; load a[j] in double format
616         lfdu    $A1,8($ap_h)
617         lfdu    $N0,8($np_l)            ; load n[j] in double format
618         lfdu    $N1,8($np_h)
619         lfdu    $A2,8($ap_l)            ; load a[j+1] in double format
620         lfdu    $A3,8($ap_h)
621         lfdu    $N2,8($np_l)            ; load n[j+1] in double format
622         lfdu    $N3,8($np_h)
623
624         fmadd   $T0a,$A0,$ba,$dota
625         fmadd   $T0b,$A0,$bb,$dotb
626         fmul    $T1a,$A1,$ba
627         fmul    $T1b,$A1,$bb
628         fmul    $T2a,$A2,$ba
629         fmul    $T2b,$A2,$bb
630         fmul    $T3a,$A3,$ba
631         fmul    $T3b,$A3,$bb
632
633         fmadd   $T1a,$A0,$bc,$T1a
634         fmadd   $T1b,$A0,$bd,$T1b
635         fmadd   $T2a,$A1,$bc,$T2a
636         fmadd   $T2b,$A1,$bd,$T2b
637         fmadd   $T3a,$A2,$bc,$T3a
638         fmadd   $T3b,$A2,$bd,$T3b
639         fmul    $dota,$A3,$bc
640         fmul    $dotb,$A3,$bd
641
642         fmadd   $T0a,$N0,$na,$T0a
643         fmadd   $T0b,$N0,$nb,$T0b
644         fmadd   $T1a,$N1,$na,$T1a
645         fmadd   $T1b,$N1,$nb,$T1b
646         fmadd   $T2a,$N2,$na,$T2a
647         fmadd   $T2b,$N2,$nb,$T2b
648         fmadd   $T3a,$N3,$na,$T3a
649         fmadd   $T3b,$N3,$nb,$T3b
650
651         fmadd   $T1a,$N0,$nc,$T1a
652         fmadd   $T1b,$N0,$nd,$T1b
653         fmadd   $T2a,$N1,$nc,$T2a
654         fmadd   $T2b,$N1,$nd,$T2b
655         fmadd   $T3a,$N2,$nc,$T3a
656         fmadd   $T3b,$N2,$nd,$T3b
657         fmadd   $dota,$N3,$nc,$dota
658         fmadd   $dotb,$N3,$nd,$dotb
659
660         fctid   $T0a,$T0a
661         fctid   $T0b,$T0b
662         fctid   $T1a,$T1a
663         fctid   $T1b,$T1b
664         fctid   $T2a,$T2a
665         fctid   $T2b,$T2b
666         fctid   $T3a,$T3a
667         fctid   $T3b,$T3b
668
669         stfd    $T0a,`$FRAME+0`($sp)
670         stfd    $T0b,`$FRAME+8`($sp)
671         stfd    $T1a,`$FRAME+16`($sp)
672         stfd    $T1b,`$FRAME+24`($sp)
673         stfd    $T2a,`$FRAME+32`($sp)
674         stfd    $T2b,`$FRAME+40`($sp)
675         stfd    $T3a,`$FRAME+48`($sp)
676         stfd    $T3b,`$FRAME+56`($sp)
677 .align  4
678 Linner:
679         ld      $t0,`$FRAME+0`($sp)
680         ld      $t1,`$FRAME+8`($sp)
681         ld      $t2,`$FRAME+16`($sp)
682         ld      $t3,`$FRAME+24`($sp)
683         ld      $t4,`$FRAME+32`($sp)
684         ld      $t5,`$FRAME+40`($sp)
685         ld      $t6,`$FRAME+48`($sp)
686         ld      $t7,`$FRAME+56`($sp)
687
688         add     $t0,$t0,$carry          ; can not overflow
689         srdi    $carry,$t0,16
690         add     $t1,$t1,$carry
691         srdi    $carry,$t1,16
692         add     $t2,$t2,$carry
693         srdi    $carry,$t2,16
694         add     $t3,$t3,$carry
695         srdi    $carry,$t3,16
696         add     $t4,$t4,$carry
697         srdi    $carry,$t4,16
698         add     $t5,$t5,$carry
699         srdi    $carry,$t5,16
700         add     $t6,$t6,$carry
701         srdi    $carry,$t6,16
702         add     $t7,$t7,$carry
703
704         insrdi  $t0,$t1,16,32
705         insrdi  $t0,$t2,16,16
706         insrdi  $t0,$t3,16,0            ; 0..63 bits
707         insrdi  $t4,$t5,16,32
708         insrdi  $t4,$t6,16,16
709         insrdi  $t4,$t7,16,0            ; 64..127 bits
710         srdi    $carry,$t7,16           ; upper 33 bits
711
712         ld      $t1,8($tp)              ; tp[j]
713         ldu     $t2,16($tp)             ; tp[j+1]
714
715         addc    $t3,$t0,$t1
716         adde    $t5,$t4,$t2
717         addze   $carry,$carry
718
719         std     $t3,-16($tp)            ; tp[j-1]
720         std     $t5,-8($tp)             ; tp[j]
721
722         lfdu    $A0,8($ap_l)            ; load a[j] in double format
723         lfdu    $A1,8($ap_h)
724         lfdu    $N0,8($np_l)            ; load n[j] in double format
725         lfdu    $N1,8($np_h)
726         lfdu    $A2,8($ap_l)            ; load a[j+1] in double format
727         lfdu    $A3,8($ap_h)
728         lfdu    $N2,8($np_l)            ; load n[j+1] in double format
729         lfdu    $N3,8($np_h)
730
731         fmadd   $T0a,$A0,$ba,$dota
732         fmadd   $T0b,$A0,$bb,$dotb
733         fmul    $T1a,$A1,$ba
734         fmul    $T1b,$A1,$bb
735         fmul    $T2a,$A2,$ba
736         fmul    $T2b,$A2,$bb
737         fmul    $T3a,$A3,$ba
738         fmul    $T3b,$A3,$bb
739
740         fmadd   $T1a,$A0,$bc,$T1a
741         fmadd   $T1b,$A0,$bd,$T1b
742         fmadd   $T2a,$A1,$bc,$T2a
743         fmadd   $T2b,$A1,$bd,$T2b
744         fmadd   $T3a,$A2,$bc,$T3a
745         fmadd   $T3b,$A2,$bd,$T3b
746         fmul    $dota,$A3,$bc
747         fmul    $dotb,$A3,$bd
748
749         fmadd   $T0a,$N0,$na,$T0a
750         fmadd   $T0b,$N0,$nb,$T0b
751         fmadd   $T1a,$N1,$na,$T1a
752         fmadd   $T1b,$N1,$nb,$T1b
753         fmadd   $T2a,$N2,$na,$T2a
754         fmadd   $T2b,$N2,$nb,$T2b
755         fmadd   $T3a,$N3,$na,$T3a
756         fmadd   $T3b,$N3,$nb,$T3b
757
758         fmadd   $T1a,$N0,$nc,$T1a
759         fmadd   $T1b,$N0,$nd,$T1b
760         fmadd   $T2a,$N1,$nc,$T2a
761         fmadd   $T2b,$N1,$nd,$T2b
762         fmadd   $T3a,$N2,$nc,$T3a
763         fmadd   $T3b,$N2,$nd,$T3b
764         fmadd   $dota,$N3,$nc,$dota
765         fmadd   $dotb,$N3,$nd,$dotb
766
767         fctid   $T0a,$T0a
768         fctid   $T0b,$T0b
769         fctid   $T1a,$T1a
770         fctid   $T1b,$T1b
771         fctid   $T2a,$T2a
772         fctid   $T2b,$T2b
773         fctid   $T3a,$T3a
774         fctid   $T3b,$T3b
775
776         stfd    $T0a,`$FRAME+0`($sp)
777         stfd    $T0b,`$FRAME+8`($sp)
778         stfd    $T1a,`$FRAME+16`($sp)
779         stfd    $T1b,`$FRAME+24`($sp)
780         stfd    $T2a,`$FRAME+32`($sp)
781         stfd    $T2b,`$FRAME+40`($sp)
782         stfd    $T3a,`$FRAME+48`($sp)
783         stfd    $T3b,`$FRAME+56`($sp)
784         bdnz-   Linner
785
786         ld      $t0,`$FRAME+0`($sp)
787         ld      $t1,`$FRAME+8`($sp)
788         ld      $t2,`$FRAME+16`($sp)
789         ld      $t3,`$FRAME+24`($sp)
790         ld      $t4,`$FRAME+32`($sp)
791         ld      $t5,`$FRAME+40`($sp)
792         ld      $t6,`$FRAME+48`($sp)
793         ld      $t7,`$FRAME+56`($sp)
794
795         add     $t0,$t0,$carry          ; can not overflow
796         srdi    $carry,$t0,16
797         add     $t1,$t1,$carry
798         srdi    $carry,$t1,16
799         add     $t2,$t2,$carry
800         srdi    $carry,$t2,16
801         add     $t3,$t3,$carry
802         srdi    $carry,$t3,16
803         add     $t4,$t4,$carry
804         srdi    $carry,$t4,16
805         add     $t5,$t5,$carry
806         srdi    $carry,$t5,16
807         add     $t6,$t6,$carry
808         srdi    $carry,$t6,16
809         add     $t7,$t7,$carry
810
811         insrdi  $t0,$t1,16,32
812         insrdi  $t0,$t2,16,16
813         insrdi  $t0,$t3,16,0            ; 0..63 bits
814         insrdi  $t4,$t5,16,32
815         insrdi  $t4,$t6,16,16
816         insrdi  $t4,$t7,16,0            ; 64..127 bits
817         srdi    $carry,$t7,16           ; upper 33 bits
818
819         ld      $t1,8($tp)              ; tp[j]
820         ldu     $t2,16($tp)             ; tp[j+1]
821
822         addc    $t3,$t0,$t1
823         adde    $t5,$t4,$t2
824         addze   $carry,$carry
825
826         std     $t3,-16($tp)            ; tp[j-1]
827         std     $t5,-8($tp)             ; tp[j]
828 \f
829         fctid   $dota,$dota
830         fctid   $dotb,$dotb
831         stfd    $dota,`$FRAME+0`($sp)
832         stfd    $dotb,`$FRAME+8`($sp)
833         ld      $t0,`$FRAME+0`($sp)
834         ld      $t1,`$FRAME+8`($sp)
835         add     $carry,$carry,$ovf      ; comsume upmost overflow
836         add     $t0,$t0,$carry          ; can not overflow
837         srdi    $carry,$t0,16
838         add     $t1,$t1,$carry
839         insrdi  $t0,$t1,48,0
840         srdi    $ovf,$t1,48
841         std     $t0,0($tp)              ; tp[num-1]
842
843         subf    $ap_l,$num,$ap_l        ; rewind pointers
844         subf    $ap_h,$num,$ap_h
845         subf    $np_l,$num,$np_l
846         subf    $np_h,$num,$np_h
847         addi    $i,$i,8
848         cmpw    $i,$num
849         blt-    Louter
850 \f
851         subf    $np,$num,$np    ; rewind np
852         subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
853         addi    $tp,$sp,`$FRAME+$TRANSFER+8`
854         addi    $t4,$sp,`$FRAME+$TRANSFER+16`
855         addi    $t5,$np,8
856         addi    $t6,$rp,8
857         addi    $j,$j,1
858         mtctr   $j
859
860 .align  4
861 Lsub:   ldx     $t0,$tp,$i
862         ldx     $t1,$np,$i
863         ldx     $t2,$t4,$i
864         ldx     $t3,$t5,$i
865         subfe   $t0,$t1,$t0     ; tp[j]-np[j]
866         subfe   $t2,$t3,$t2     ; tp[j+1]-np[j+1]
867         stdx    $t0,$rp,$i
868         stdx    $t2,$t6,$i
869         addi    $i,$i,16
870         bdnz-   Lsub
871
872         li      $i,0
873         subfe   $ovf,$i,$ovf    ; handle upmost overflow bit
874         and     $ap,$tp,$ovf
875         andc    $np,$rp,$ovf
876         or      $ap,$ap,$np     ; ap=borrow?tp:rp
877         addi    $t7,$ap,8
878         mtctr   $j
879
880 .align  4
881 Lcopy:                          ; copy or in-place refresh
882         ldx     $t0,$ap,$i
883         ldx     $t1,$t7,$i
884         stdu    $i,8($ap_l)     ; zap {an}p_{lh}
885         stdu    $i,8($ap_h)
886         stdu    $i,8($np_l)
887         stdu    $i,8($np_h)
888         stdu    $i,8($ap_l)
889         stdu    $i,8($ap_h)
890         stdu    $i,8($np_l)
891         stdu    $i,8($np_h)
892         stdx    $t0,$rp,$i
893         stdx    $t1,$t6,$i
894         stdx    $i,$tp,$i       ; zap tp at once
895         stdx    $i,$t4,$i
896         addi    $i,$i,16
897         bdnz-   Lcopy
898
899         $POP    r14,`2*$SIZE_T`($sp)
900         $POP    r15,`3*$SIZE_T`($sp)
901         $POP    r16,`4*$SIZE_T`($sp)
902         $POP    r17,`5*$SIZE_T`($sp)
903         $POP    r18,`6*$SIZE_T`($sp)
904         $POP    r19,`7*$SIZE_T`($sp)
905         $POP    r20,`8*$SIZE_T`($sp)
906         $POP    r21,`9*$SIZE_T`($sp)
907         $POP    r22,`10*$SIZE_T`($sp)
908         $POP    r23,`11*$SIZE_T`($sp)
909         $POP    r24,`12*$SIZE_T`($sp)
910         $POP    r25,`13*$SIZE_T`($sp)
911         $POP    r26,`14*$SIZE_T`($sp)
912         $POP    r27,`15*$SIZE_T`($sp)
913         lfd     f14,`16*$SIZE_T+0`($sp)
914         lfd     f15,`16*$SIZE_T+8`($sp)
915         lfd     f16,`16*$SIZE_T+16`($sp)
916         lfd     f17,`16*$SIZE_T+24`($sp)
917         lfd     f18,`16*$SIZE_T+32`($sp)
918         lfd     f19,`16*$SIZE_T+40`($sp)
919         lfd     f20,`16*$SIZE_T+48`($sp)
920         lfd     f21,`16*$SIZE_T+56`($sp)
921         lfd     f22,`16*$SIZE_T+64`($sp)
922         lfd     f23,`16*$SIZE_T+72`($sp)
923         lfd     f24,`16*$SIZE_T+80`($sp)
924         lfd     f25,`16*$SIZE_T+88`($sp)
925         $POP    $sp,0($sp)
926         li      r3,1    ; signal "handled"
927         blr
928         .long   0
929 .asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
930 ___
931
932 $code =~ s/\`([^\`]*)\`/eval $1/gem;
933 print $code;
934 close STDOUT;