New Montgomery multiplication module, ppc64-mont.pl. Reference, non-optimized
[openssl.git] / crypto / bn / asm / ppc64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # December 2007
11
# Command line handling.  The first argument names the assembler file to
# produce; its name also selects the flavour, so it must contain either
# "32-mont.s" or "64-mont.s".  If a second argument is supplied the
# ppc-xlate.pl pipe is skipped and the generated text goes to whatever
# STDOUT already is.
12 $output = shift;
13
# Per-flavour parameters used by the assembly template:
#   $SIZE_T             - size in bytes of one saved general purpose register
#   $RZONE              - extra bytes added to the alloca size
#   $FRAME              - fixed frame part: 16 size_t slots + 12 doubles
#   $fname              - name of the emitted entry point
#   $STUX/$PUSH/$POP    - store-with-update/store/load mnemonics
14 if ($output =~ /32\-mont\.s/) {
15         $SIZE_T=4;
16         $RZONE= 224;
17         $FRAME= $SIZE_T*16+8*12;
18         $fname= "bn_mul_mont_ppc64";
19
20         $STUX=  "stwux";        # store indexed and update
21         $PUSH=  "stw";
22         $POP=   "lwz";
23         die "not implemented yet";
24 } elsif ($output =~ /64\-mont\.s/) {
25         $SIZE_T=8;
26         $RZONE= 288;
27         $FRAME= $SIZE_T*16+8*12;
28         $fname= "bn_mul_mont";
29
30         # same as above, but 64-bit mnemonics...
31         $STUX=  "stdux";        # store indexed and update
32         $PUSH=  "std";
33         $POP=   "ld";
34 } else { die "nonsense $output"; }
35
# Redirect STDOUT through the perlasm translator.  The interpreter path
# and the output name are wrapped in shell double quotes so that paths
# containing whitespace reach ppc-xlate.pl as single arguments.
36 ( defined shift || open STDOUT,"| \"$^X\" ../perlasm/ppc-xlate.pl \"$output\"" ) ||
37         die "can't call ../perlasm/ppc-xlate.pl: $!";
38
# Size in bytes of the gpr<->fpr transfer zone that follows the register
# save area; the template addresses it as eight 8-byte slots at
# $FRAME+0 .. $FRAME+56 and places tp[] right after it.
39 $TRANSFER=8*8;
40
# Symbolic names for the general purpose registers used by the template.
# r3 holds the result pointer on entry; the prologue copies it to r9
# (the final value of $rp), after which r3 is reused for the top-most
# overflow word ($ovf) and for the return code.
# $toc is not referenced by the template below.
41 $sp="r1";
42 $toc="r2";
43 $rp="r3";       $ovf="r3";
44 $ap="r4";
45 $bp="r5";
46 $np="r6";
47 $n0="r7";
48 $num="r8";
49 $rp="r9";       # $rp is reassigned
# $tp walks the tp[] vector, $j keeps num/2 (the CTR reload value) and
# $i is the byte offset used to index bp[] and, later, tp/np/rp.
50 $tp="r10";
51 $j="r11";
52 $i="r12";
53 # non-volatile registers
# r14-r27 are stored to the frame in the prologue and reloaded in the
# epilogue.  $ap_l/$ap_h/$np_l/$np_h point into the double-format copies
# of a[] and n[] (the words loaded from offsets 4 and 0 respectively) and
# are advanced with stfdu/lfdu.
54 $ap_l="r14";
55 $ap_h="r15";
56 $np_l="r16";
57 $np_h="r17";
58 $carry="r18";
59 $a0="r19";      # ap[0]
# Scratch registers; they carry the 16-bit wide columns to and from the
# transfer zone.
60 $t0="r20";
61 $t1="r21";
62 $t2="r22";
63 $t3="r23";
64 $t4="r24";
65 $t5="r25";
66 $t6="r26";
67 $t7="r27";
68
69 # PPC offers enough register bank capacity to unroll inner loops twice
70 #
71 #     ..A3A2A1A0
72 #           dcba
73 #    -----------
74 #            A0a
75 #           A0b
76 #          A0c
77 #         A0d
78 #          A1a
79 #         A1b
80 #        A1c
81 #       A1d
82 #        A2a
83 #       A2b
84 #      A2c
85 #     A2d
86 #      A3a
87 #     A3b
88 #    A3c
89 #   A3d
90 #    ..a
91 #   ..b
92 #
# Floating point register names.
#   $ba..$bd     - bp[i] split into four 16-bit values, converted by fcfid
#   $na..$nd     - the n0-scaled multiplier split the same way
#   $dota/$dotb  - partial products carried over into the next iteration
#   $A0..$A3     - 32-bit halves of a[j] and a[j+1] in double format
#   $N0..$N3     - 32-bit halves of n[j] and n[j+1] in double format
#   $T0a..$T3b   - column accumulators, converted back with fctid
93 $ba="f0";
94 $bb="f1";
95 $bc="f2";
96 $bd="f3";
97 $na="f4";
98 $nb="f5";
99 $nc="f6";
100 $nd="f7";
101 $dota="f8";
102 $dotb="f9";
103 $A0="f10";
104 $A1="f11";
105 $A2="f12";
106 $A3="f13";
107 $N0="f14";
108 $N1="f15";
109 $N2="f16";
110 $N3="f17";
111 $T0a="f18";
112 $T0b="f19";
113 $T1a="f20";
114 $T1b="f21";
115 $T2a="f22";
116 $T2b="f23";
117 $T3a="f24";
118 $T3b="f25";
119
120 # sp----------->+-------------------------------+
121 #               | saved sp                      |
122 #               +-------------------------------+
123 #               |                               |
124 #               +-------------------------------+
125 #               | 14 saved gpr, r14-r27         |
126 #               .                               .
127 #               .                               .
128 #   +16*size_t  +-------------------------------+
129 #               | 12 saved fpr, f14-f25         |
130 #               .                               .
131 #               .                               .
132 #   +12*8       +-------------------------------+
133 #               | 8 gpr<->fpr transfer zone     |
134 #               .                               .
135 #               .                               .
136 #   +8*8        +-------------------------------+
137 #               | __int64 tmp[-1]               |
138 #               +-------------------------------+
139 #               | __int64 tmp[num]              |
140 #               .                               .
141 #               .                               .
142 #               .                               .
143 #   +(num+1)*8  +-------------------------------+
144 #               | double a_lo[num]              |
145 #               .                               .
146 #               .                               .
147 #               .                               .
148 #   +num*8      +-------------------------------+
149 #               | double a_hi[num]              |
150 #               .                               .
151 #               .                               .
152 #               .                               .
153 #   +num*8      +-------------------------------+
154 #               | double n_lo[num]              |
155 #               .                               .
156 #               .                               .
157 #               .                               .
158 #   +num*8      +-------------------------------+
159 #               | double n_hi[num]              |
160 #               .                               .
161 #               .                               .
162 #               .                               .
163 #               +-------------------------------+
164
# Assembly template for the Montgomery multiplication routine.  The text
# is collected into $code: Perl variables are interpolated here, while
# expressions enclosed in `backticks` are left as text and evaluated by
# the substitution that follows the heredoc.  The emitted entry point is
# .$fname.  It returns 0 in r3 without doing any work when num is less
# than 4 or odd, and 1 after the result has been stored through the
# result pointer.
165 $code=<<___;
166 .machine "any"
167 .text
168
169 .globl  .$fname
170 .align  4
171 .$fname:
172         cmpwi   $num,4
173         mr      $rp,r3          ; $rp is reassigned
174         li      r3,0            ; possible "not handled" return code
175         bltlr-
176         andi.   r0,$num,1       ; $num has to be even
177         bnelr-
178
179         slwi    $num,$num,3     ; num*=8
180         li      $i,-4096
181         slwi    $tp,$num,2      ; place for {an}p_{lh}[num], i.e. 4*num
182         add     $tp,$tp,$num    ; place for tp[num+1]
183         addi    $tp,$tp,`$FRAME+$TRANSFER+8+$RZONE`
184         subf    $tp,$tp,$sp     ; $sp-$tp
185         and     $tp,$tp,$i      ; minimize TLB usage
186         subf    $tp,$sp,$tp     ; $tp-$sp
187         $STUX   $sp,$sp,$tp     ; alloca
188
189         $PUSH   r14,`2*$SIZE_T`($sp)
190         $PUSH   r15,`3*$SIZE_T`($sp)
191         $PUSH   r16,`4*$SIZE_T`($sp)
192         $PUSH   r17,`5*$SIZE_T`($sp)
193         $PUSH   r18,`6*$SIZE_T`($sp)
194         $PUSH   r19,`7*$SIZE_T`($sp)
195         $PUSH   r20,`8*$SIZE_T`($sp)
196         $PUSH   r21,`9*$SIZE_T`($sp)
197         $PUSH   r22,`10*$SIZE_T`($sp)
198         $PUSH   r23,`11*$SIZE_T`($sp)
199         $PUSH   r24,`12*$SIZE_T`($sp)
200         $PUSH   r25,`13*$SIZE_T`($sp)
201         $PUSH   r26,`14*$SIZE_T`($sp)
202         $PUSH   r27,`15*$SIZE_T`($sp)
203         stfd    f14,`16*$SIZE_T+0`($sp)
204         stfd    f15,`16*$SIZE_T+8`($sp)
205         stfd    f16,`16*$SIZE_T+16`($sp)
206         stfd    f17,`16*$SIZE_T+24`($sp)
207         stfd    f18,`16*$SIZE_T+32`($sp)
208         stfd    f19,`16*$SIZE_T+40`($sp)
209         stfd    f20,`16*$SIZE_T+48`($sp)
210         stfd    f21,`16*$SIZE_T+56`($sp)
211         stfd    f22,`16*$SIZE_T+64`($sp)
212         stfd    f23,`16*$SIZE_T+72`($sp)
213         stfd    f24,`16*$SIZE_T+80`($sp)
214         stfd    f25,`16*$SIZE_T+88`($sp)
215         std     r0,$FRAME($sp)  ; r0 is still 0
216         lfd     $dota,$FRAME($sp)
217         lfd     $dotb,$FRAME($sp)
218
219         addi    $tp,$sp,`$FRAME+$TRANSFER`
220         ; note that {an}p_{lh} are off by 1, this is because they
221         ; are used with stfdu/lfdu instruction...
222         add     $ap_l,$tp,$num
223         add     $ap_h,$ap_l,$num
224         add     $np_l,$ap_h,$num
225         add     $np_h,$np_l,$num
226
227         ld      $a0,0($ap)      ; pull ap[0] value
228         ld      $n0,0($n0)      ; pull n0[0] value
229         srwi    $j,$num,`3+1`   ; counter register, num/2
230 \f
231         ld      $t3,0($bp)      ; bp[0]
232         mulld   $t7,$a0,$t3     ; ap[0]*bp[0]
233         mulld   $t7,$t7,$n0     ; tp[0]*n0
234
235         ; transfer bp[0] to FPU as 4x16-bit values
236         extrdi  $t0,$t3,16,48
237         extrdi  $t1,$t3,16,32
238         extrdi  $t2,$t3,16,16
239         extrdi  $t3,$t3,16,0
240         std     $t0,`$FRAME+0`($sp)
241         std     $t1,`$FRAME+8`($sp)
242         std     $t2,`$FRAME+16`($sp)
243         std     $t3,`$FRAME+24`($sp)
244         lfd     $ba,`$FRAME+0`($sp)
245         lfd     $bb,`$FRAME+8`($sp)
246         lfd     $bc,`$FRAME+16`($sp)
247         lfd     $bd,`$FRAME+24`($sp)
248         fcfid   $ba,$ba
249         fcfid   $bb,$bb
250         fcfid   $bc,$bc
251         fcfid   $bd,$bd
252
253         ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
254         extrdi  $t4,$t7,16,48
255         extrdi  $t5,$t7,16,32
256         extrdi  $t6,$t7,16,16
257         extrdi  $t7,$t7,16,0
258         std     $t4,`$FRAME+32`($sp)
259         std     $t5,`$FRAME+40`($sp)
260         std     $t6,`$FRAME+48`($sp)
261         std     $t7,`$FRAME+56`($sp)
262         lfd     $na,`$FRAME+32`($sp)
263         lfd     $nb,`$FRAME+40`($sp)
264         lfd     $nc,`$FRAME+48`($sp)
265         lfd     $nd,`$FRAME+56`($sp)
266         fcfid   $na,$na
267         fcfid   $nb,$nb
268         fcfid   $nc,$nc
269         fcfid   $nd,$nd
270 \f
271         addi    $tp,$sp,`$FRAME+$TRANSFER-8`
272         li      $carry,0
273         mtctr   $j
274 .align  4
275 L1st:
276         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
277         lwz     $t1,0($ap)
278         lwz     $t2,4($np)              ; load n[j] as 32-bit word pair
279         lwz     $t3,0($np)
280         std     $t0,`$FRAME+0`($sp)
281         std     $t1,`$FRAME+8`($sp)
282         std     $t2,`$FRAME+16`($sp)
283         std     $t3,`$FRAME+24`($sp)
284         lfd     $A0,`$FRAME+0`($sp)
285         lfd     $A1,`$FRAME+8`($sp)
286         lfd     $N0,`$FRAME+16`($sp)
287         lfd     $N1,`$FRAME+24`($sp)
288         fcfid   $A0,$A0
289         fcfid   $A1,$A1
290         fcfid   $N0,$N0
291         fcfid   $N1,$N1
292         stfdu   $A0,8($ap_l)            ; save a[j] in double format
293         stfdu   $A1,8($ap_h)
294         stfdu   $N0,8($np_l)            ; save n[j] in double format
295         stfdu   $N1,8($np_h)
296
297         lwz     $t4,12($ap)             ; load a[j+1] as 32-bit word pair
298         lwz     $t5,8($ap)
299         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
300         lwz     $t7,8($np)
301         std     $t4,`$FRAME+32`($sp)
302         std     $t5,`$FRAME+40`($sp)
303         std     $t6,`$FRAME+48`($sp)
304         std     $t7,`$FRAME+56`($sp)
305         lfd     $A2,`$FRAME+32`($sp)
306         lfd     $A3,`$FRAME+40`($sp)
307         lfd     $N2,`$FRAME+48`($sp)
308         lfd     $N3,`$FRAME+56`($sp)
309         fcfid   $A2,$A2
310         fcfid   $A3,$A3
311         fcfid   $N2,$N2
312         fcfid   $N3,$N3
313         stfdu   $A2,8($ap_l)            ; save a[j+1] in double format
314         stfdu   $A3,8($ap_h)
315         stfdu   $N2,8($np_l)            ; save n[j+1] in double format
316         stfdu   $N3,8($np_h)
317         addi    $ap,$ap,16
318         addi    $np,$np,16
319
320         fmadd   $T0a,$A0,$ba,$dota
321         fmadd   $T0b,$A0,$bb,$dotb
322         fmul    $T1a,$A1,$ba
323         fmul    $T1b,$A1,$bb
324         fmul    $T2a,$A2,$ba
325         fmul    $T2b,$A2,$bb
326         fmul    $T3a,$A3,$ba
327         fmul    $T3b,$A3,$bb
328
329         fmadd   $T1a,$A0,$bc,$T1a
330         fmadd   $T1b,$A0,$bd,$T1b
331         fmadd   $T2a,$A1,$bc,$T2a
332         fmadd   $T2b,$A1,$bd,$T2b
333         fmadd   $T3a,$A2,$bc,$T3a
334         fmadd   $T3b,$A2,$bd,$T3b
335         fmul    $dota,$A3,$bc
336         fmul    $dotb,$A3,$bd
337
338         fmadd   $T0a,$N0,$na,$T0a
339         fmadd   $T0b,$N0,$nb,$T0b
340         fmadd   $T1a,$N1,$na,$T1a
341         fmadd   $T1b,$N1,$nb,$T1b
342         fmadd   $T2a,$N2,$na,$T2a
343         fmadd   $T2b,$N2,$nb,$T2b
344         fmadd   $T3a,$N3,$na,$T3a
345         fmadd   $T3b,$N3,$nb,$T3b
346
347         fmadd   $T1a,$N0,$nc,$T1a
348         fmadd   $T1b,$N0,$nd,$T1b
349         fmadd   $T2a,$N1,$nc,$T2a
350         fmadd   $T2b,$N1,$nd,$T2b
351         fmadd   $T3a,$N2,$nc,$T3a
352         fmadd   $T3b,$N2,$nd,$T3b
353         fmadd   $dota,$N3,$nc,$dota
354         fmadd   $dotb,$N3,$nd,$dotb
355
356         fctid   $T0a,$T0a
357         fctid   $T0b,$T0b
358         fctid   $T1a,$T1a
359         fctid   $T1b,$T1b
360         fctid   $T2a,$T2a
361         fctid   $T2b,$T2b
362         fctid   $T3a,$T3a
363         fctid   $T3b,$T3b
364
365         stfd    $T0a,`$FRAME+0`($sp)
366         stfd    $T0b,`$FRAME+8`($sp)
367         stfd    $T1a,`$FRAME+16`($sp)
368         stfd    $T1b,`$FRAME+24`($sp)
369         stfd    $T2a,`$FRAME+32`($sp)
370         stfd    $T2b,`$FRAME+40`($sp)
371         stfd    $T3a,`$FRAME+48`($sp)
372         stfd    $T3b,`$FRAME+56`($sp)
373         ld      $t0,`$FRAME+0`($sp)
374         ld      $t1,`$FRAME+8`($sp)
375         ld      $t2,`$FRAME+16`($sp)
376         ld      $t3,`$FRAME+24`($sp)
377         ld      $t4,`$FRAME+32`($sp)
378         ld      $t5,`$FRAME+40`($sp)
379         ld      $t6,`$FRAME+48`($sp)
380         ld      $t7,`$FRAME+56`($sp)
381
382         add     $t0,$t0,$carry          ; can not overflow
383         srdi    $carry,$t0,16
384         add     $t1,$t1,$carry
385         srdi    $carry,$t1,16
386         add     $t2,$t2,$carry
387         srdi    $carry,$t2,16
388         add     $t3,$t3,$carry
389         srdi    $carry,$t3,16
390         add     $t4,$t4,$carry
391         srdi    $carry,$t4,16
392         add     $t5,$t5,$carry
393         srdi    $carry,$t5,16
394         add     $t6,$t6,$carry
395         srdi    $carry,$t6,16
396         add     $t7,$t7,$carry
397
398         insrdi  $t0,$t1,16,32
399         insrdi  $t0,$t2,16,16
400         insrdi  $t0,$t3,16,0            ; 0..63 bits
401         insrdi  $t4,$t5,16,32
402         insrdi  $t4,$t6,16,16
403         insrdi  $t4,$t7,16,0            ; 64..127 bits
404         srdi    $carry,$t7,16           ; upper 33 bits
405
406         std     $t0,8($tp)              ; tp[j-1]
407         stdu    $t4,16($tp)             ; tp[j]
408         bdnz-   L1st
409 \f
410         fctid   $dota,$dota
411         fctid   $dotb,$dotb
412         stfd    $dota,`$FRAME+0`($sp)
413         stfd    $dotb,`$FRAME+8`($sp)
414         ld      $t0,`$FRAME+0`($sp)
415         ld      $t1,`$FRAME+8`($sp)
416         add     $t0,$t0,$carry          ; can not overflow
417         srdi    $carry,$t0,16
418         add     $t1,$t1,$carry
419         insrdi  $t0,$t1,48,0
420         srdi    $ovf,$t1,48
421         std     $t0,8($tp)              ; tp[num-1]
422
423         subf    $ap_l,$num,$ap_l        ; rewind pointers
424         subf    $ap_h,$num,$ap_h
425         subf    $np_l,$num,$np_l
426         subf    $np_h,$num,$np_h
427 \f
428         li      $i,8                    ; i=1
429 .align  4
430 Louter:
431         ldx     $t3,$bp,$i      ; bp[i]
432         ld      $t0,`$FRAME+$TRANSFER+8`($sp)   ; tp[0]
433         mulld   $t7,$a0,$t3     ; ap[0]*bp[i]
434         add     $t7,$t7,$t0     ; ap[0]*bp[i]+tp[0]
435         mulld   $t7,$t7,$n0     ; tp[0]*n0
436
437         ; transfer b[i] to FPU as 4x16-bit values
438         extrdi  $t0,$t3,16,48
439         extrdi  $t1,$t3,16,32
440         extrdi  $t2,$t3,16,16
441         extrdi  $t3,$t3,16,0
442         std     $t0,`$FRAME+0`($sp)
443         std     $t1,`$FRAME+8`($sp)
444         std     $t2,`$FRAME+16`($sp)
445         std     $t3,`$FRAME+24`($sp)
446         lfd     $ba,`$FRAME+0`($sp)
447         lfd     $bb,`$FRAME+8`($sp)
448         lfd     $bc,`$FRAME+16`($sp)
449         lfd     $bd,`$FRAME+24`($sp)
450         fcfid   $ba,$ba
451         fcfid   $bb,$bb
452         fcfid   $bc,$bc
453         fcfid   $bd,$bd
454
455         ; transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
456         extrdi  $t4,$t7,16,48
457         extrdi  $t5,$t7,16,32
458         extrdi  $t6,$t7,16,16
459         extrdi  $t7,$t7,16,0
460         std     $t4,`$FRAME+32`($sp)
461         std     $t5,`$FRAME+40`($sp)
462         std     $t6,`$FRAME+48`($sp)
463         std     $t7,`$FRAME+56`($sp)
464         lfd     $na,`$FRAME+32`($sp)
465         lfd     $nb,`$FRAME+40`($sp)
466         lfd     $nc,`$FRAME+48`($sp)
467         lfd     $nd,`$FRAME+56`($sp)
468         fcfid   $na,$na
469         fcfid   $nb,$nb
470         fcfid   $nc,$nc
471         fcfid   $nd,$nd
472 \f
473         addi    $tp,$sp,`$FRAME+$TRANSFER`
474         fsub    $dota,$dota,$dota
475         fsub    $dotb,$dotb,$dotb
476         li      $carry,0
477         mtctr   $j
478 .align  4
479 Linner:
480         lfdu    $A0,8($ap_l)            ; load a[j] in double format
481         lfdu    $A1,8($ap_h)
482         lfdu    $N0,8($np_l)            ; load n[j] in double format
483         lfdu    $N1,8($np_h)
484         lfdu    $A2,8($ap_l)            ; load a[j+1] in double format
485         lfdu    $A3,8($ap_h)
486         lfdu    $N2,8($np_l)            ; load n[j+1] in double format
487         lfdu    $N3,8($np_h)
488
489         fmadd   $T0a,$A0,$ba,$dota
490         fmadd   $T0b,$A0,$bb,$dotb
491         fmul    $T1a,$A1,$ba
492         fmul    $T1b,$A1,$bb
493         fmul    $T2a,$A2,$ba
494         fmul    $T2b,$A2,$bb
495         fmul    $T3a,$A3,$ba
496         fmul    $T3b,$A3,$bb
497
498         fmadd   $T1a,$A0,$bc,$T1a
499         fmadd   $T1b,$A0,$bd,$T1b
500         fmadd   $T2a,$A1,$bc,$T2a
501         fmadd   $T2b,$A1,$bd,$T2b
502         fmadd   $T3a,$A2,$bc,$T3a
503         fmadd   $T3b,$A2,$bd,$T3b
504         fmul    $dota,$A3,$bc
505         fmul    $dotb,$A3,$bd
506
507         fmadd   $T0a,$N0,$na,$T0a
508         fmadd   $T0b,$N0,$nb,$T0b
509         fmadd   $T1a,$N1,$na,$T1a
510         fmadd   $T1b,$N1,$nb,$T1b
511         fmadd   $T2a,$N2,$na,$T2a
512         fmadd   $T2b,$N2,$nb,$T2b
513         fmadd   $T3a,$N3,$na,$T3a
514         fmadd   $T3b,$N3,$nb,$T3b
515
516         fmadd   $T1a,$N0,$nc,$T1a
517         fmadd   $T1b,$N0,$nd,$T1b
518         fmadd   $T2a,$N1,$nc,$T2a
519         fmadd   $T2b,$N1,$nd,$T2b
520         fmadd   $T3a,$N2,$nc,$T3a
521         fmadd   $T3b,$N2,$nd,$T3b
522         fmadd   $dota,$N3,$nc,$dota
523         fmadd   $dotb,$N3,$nd,$dotb
524
525         fctid   $T0a,$T0a
526         fctid   $T0b,$T0b
527         fctid   $T1a,$T1a
528         fctid   $T1b,$T1b
529         fctid   $T2a,$T2a
530         fctid   $T2b,$T2b
531         fctid   $T3a,$T3a
532         fctid   $T3b,$T3b
533
534         stfd    $T0a,`$FRAME+0`($sp)
535         stfd    $T0b,`$FRAME+8`($sp)
536         stfd    $T1a,`$FRAME+16`($sp)
537         stfd    $T1b,`$FRAME+24`($sp)
538         stfd    $T2a,`$FRAME+32`($sp)
539         stfd    $T2b,`$FRAME+40`($sp)
540         stfd    $T3a,`$FRAME+48`($sp)
541         stfd    $T3b,`$FRAME+56`($sp)
542         ld      $t0,`$FRAME+0`($sp)
543         ld      $t1,`$FRAME+8`($sp)
544         ld      $t2,`$FRAME+16`($sp)
545         ld      $t3,`$FRAME+24`($sp)
546         ld      $t4,`$FRAME+32`($sp)
547         ld      $t5,`$FRAME+40`($sp)
548         ld      $t6,`$FRAME+48`($sp)
549         ld      $t7,`$FRAME+56`($sp)
550
551         add     $t0,$t0,$carry          ; can not overflow
552         srdi    $carry,$t0,16
553         add     $t1,$t1,$carry
554         srdi    $carry,$t1,16
555         add     $t2,$t2,$carry
556         srdi    $carry,$t2,16
557         add     $t3,$t3,$carry
558         srdi    $carry,$t3,16
559         add     $t4,$t4,$carry
560         srdi    $carry,$t4,16
561         add     $t5,$t5,$carry
562         srdi    $carry,$t5,16
563         add     $t6,$t6,$carry
564         srdi    $carry,$t6,16
565         add     $t7,$t7,$carry
566
567         insrdi  $t0,$t1,16,32
568         insrdi  $t0,$t2,16,16
569         insrdi  $t0,$t3,16,0            ; 0..63 bits
570         insrdi  $t4,$t5,16,32
571         insrdi  $t4,$t6,16,16
572         insrdi  $t4,$t7,16,0            ; 64..127 bits
573         srdi    $carry,$t7,16           ; upper 33 bits
574
575         ld      $t1,8($tp)              ; tp[j]
576         ldu     $t2,16($tp)             ; tp[j+1]
577
578         addc    $t3,$t0,$t1
579         adde    $t5,$t4,$t2
580         addze   $carry,$carry
581
582         std     $t3,-16($tp)            ; tp[j-1]
583         std     $t5,-8($tp)             ; tp[j]
584         bdnz-   Linner
585 \f
586         fctid   $dota,$dota
587         fctid   $dotb,$dotb
588         stfd    $dota,`$FRAME+0`($sp)
589         stfd    $dotb,`$FRAME+8`($sp)
590         ld      $t0,`$FRAME+0`($sp)
591         ld      $t1,`$FRAME+8`($sp)
592         add     $carry,$carry,$ovf      ; comsume upmost overflow
593         add     $t0,$t0,$carry          ; can not overflow
594         srdi    $carry,$t0,16
595         add     $t1,$t1,$carry
596         insrdi  $t0,$t1,48,0
597         srdi    $ovf,$t1,48
598         std     $t0,0($tp)              ; tp[num-1]
599
600         subf    $ap_l,$num,$ap_l        ; rewind pointers
601         subf    $ap_h,$num,$ap_h
602         subf    $np_l,$num,$np_l
603         subf    $np_h,$num,$np_h
604         addi    $i,$i,8
605         cmpw    $i,$num
606         blt-    Louter
607 \f
608         subf    $np,$num,$np    ; rewind np
609         subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
610         addi    $tp,$sp,`$FRAME+$TRANSFER+8`
611         addi    $t4,$sp,`$FRAME+$TRANSFER+16`
612         addi    $t5,$np,8
613         addi    $t6,$rp,8
614         mtctr   $j
615
616 .align  4
617 Lsub:   ldx     $t0,$tp,$i
618         ldx     $t1,$np,$i
619         ldx     $t2,$t4,$i
620         ldx     $t3,$t5,$i
621         subfe   $t0,$t1,$t0     ; tp[j]-np[j]
622         subfe   $t2,$t3,$t2     ; tp[j+1]-np[j+1]
623         stdx    $t0,$rp,$i
624         stdx    $t2,$t6,$i
625         addi    $i,$i,16
626         bdnz-   Lsub
627
628         li      $i,0
629         subfe   $ovf,$i,$ovf    ; handle upmost overflow bit
630         and     $ap,$tp,$ovf
631         andc    $np,$rp,$ovf
632         or      $ap,$ap,$np     ; ap=borrow?tp:rp
633         addi    $t7,$ap,8
634         mtctr   $j
635
636 .align  4
637 Lcopy:                          ; copy or in-place refresh
638         ldx     $t0,$ap,$i
639         ldx     $t1,$t7,$i
640         stdu    $i,8($ap_l)     ; zap {an}p_{lh}
641         stdu    $i,8($ap_h)
642         stdu    $i,8($np_l)
643         stdu    $i,8($np_h)
644         stdu    $i,8($ap_l)
645         stdu    $i,8($ap_h)
646         stdu    $i,8($np_l)
647         stdu    $i,8($np_h)
648         stdx    $t0,$rp,$i
649         stdx    $t1,$t6,$i
650         stdx    $i,$tp,$i       ; zap tp at once
651         stdx    $i,$t4,$i
652         addi    $i,$i,16
653         bdnz-   Lcopy
654
655         $POP    r14,`2*$SIZE_T`($sp)
656         $POP    r15,`3*$SIZE_T`($sp)
657         $POP    r16,`4*$SIZE_T`($sp)
658         $POP    r17,`5*$SIZE_T`($sp)
659         $POP    r18,`6*$SIZE_T`($sp)
660         $POP    r19,`7*$SIZE_T`($sp)
661         $POP    r20,`8*$SIZE_T`($sp)
662         $POP    r21,`9*$SIZE_T`($sp)
663         $POP    r22,`10*$SIZE_T`($sp)
664         $POP    r23,`11*$SIZE_T`($sp)
665         $POP    r24,`12*$SIZE_T`($sp)
666         $POP    r25,`13*$SIZE_T`($sp)
667         $POP    r26,`14*$SIZE_T`($sp)
668         $POP    r27,`15*$SIZE_T`($sp)
669         lfd     f14,`16*$SIZE_T+0`($sp)
670         lfd     f15,`16*$SIZE_T+8`($sp)
671         lfd     f16,`16*$SIZE_T+16`($sp)
672         lfd     f17,`16*$SIZE_T+24`($sp)
673         lfd     f18,`16*$SIZE_T+32`($sp)
674         lfd     f19,`16*$SIZE_T+40`($sp)
675         lfd     f20,`16*$SIZE_T+48`($sp)
676         lfd     f21,`16*$SIZE_T+56`($sp)
677         lfd     f22,`16*$SIZE_T+64`($sp)
678         lfd     f23,`16*$SIZE_T+72`($sp)
679         lfd     f24,`16*$SIZE_T+80`($sp)
680         lfd     f25,`16*$SIZE_T+88`($sp)
681         $POP    $sp,0($sp)
682         li      r3,1    ; signal "handled"
683         blr
684         .long   0
685 .asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
686 ___
687
# Post-process and emit the template: every `expression` enclosed in
# backticks is replaced by the result of evaluating it as Perl (this is
# how the frame offsets are computed), then the text is written to STDOUT.
688 $code =~ s/\`([^\`]*)\`/eval $1/gem;
689 print $code;
# STDOUT may be a pipe into ppc-xlate.pl; for a piped handle close() is
# where buffered write errors and a non-zero exit status of the child
# are reported, so a failure here must not be ignored.
690 close STDOUT or die "error closing STDOUT: $! (child status $?)";