x86_64-xlate.pl update, engage x86_64 assembler in mingw64.
[openssl.git] / crypto / bn / asm / ppc64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # December 2007
11
12 # The reason for undertaken effort is basically following. Even though
13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
14 # performance was observed to be less than impressive, essentially as
15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16 # Well, it's not surprising that IBM had to make some sacrifices to
17 # boost the clock frequency that much, but no overall improvement?
18 # Having observed how much difference did switching to FPU make on
19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate...
20 # Unfortunately the resulting performance improvement is not as
21 # impressive, ~30%, and in absolute terms is still very far from what
22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
23 # something wrong, but in the lack of assembler level micro-profiling
24 # data or at least decent platform guide I can't tell... Or better
25 # results might be achieved with VMX... Anyway, this module provides
26 # *worse* performance on other PowerPC implementations, ~40-15% slower
27 # on PPC970 depending on key length and ~40% slower on Power 5 for all
28 # key lengths. As it's obviously inappropriate as "best all-round"
29 # alternative, it has to be complemented with run-time CPU family
30 # detection. Oh! It should also be noted that unlike other PowerPC
31 # implementation IALU ppc-mont.pl module performs *suboptimaly* on
32 # >=1024-bit key lengths on Power 6. It should also be noted that
33 # *everything* said so far applies to 64-bit builds! As far as 32-bit
34 # application executed on 64-bit CPU goes, this module is likely to
35 # become preferred choice, because it's easy to adapt it for such
36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
37
38 # February 2008
39
40 # Micro-profiling assisted optimization results in ~15% improvement
41 # over original ppc64-mont.pl version, or overall ~50% improvement
42 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
43 # Power 6 CPU, this module is 5-150% faster depending on key length,
44 # [hereafter] more for longer keys. But if compared to ppc-mont.pl
45 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
46 # in absolute terms, but it's apparently the way Power 6 is...
47
48 $flavour = shift;
49
50 if ($flavour =~ /32/) {
51         $SIZE_T=4;
52         $RZONE= 224;
53         $FRAME= $SIZE_T*12+8*12;
54         $fname= "bn_mul_mont_ppc64";
55
56         $STUX=  "stwux";        # store indexed and update
57         $PUSH=  "stw";
58         $POP=   "lwz";
59         die "not implemented yet";
60 } elsif ($flavour =~ /64/) {
61         $SIZE_T=8;
62         $RZONE= 288;
63         $FRAME= $SIZE_T*12+8*12;
64         $fname= "bn_mul_mont";
65
66         # same as above, but 64-bit mnemonics...
67         $STUX=  "stdux";        # store indexed and update
68         $PUSH=  "std";
69         $POP=   "ld";
70 } else { die "nonsense $flavour"; }
71
72 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
73 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
74 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
75 die "can't locate ppc-xlate.pl";
76
77 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
78
79 $FRAME=($FRAME+63)&~63;
80 $TRANSFER=16*8;
81
82 $carry="r0";
83 $sp="r1";
84 $toc="r2";
85 $rp="r3";       $ovf="r3";
86 $ap="r4";
87 $bp="r5";
88 $np="r6";
89 $n0="r7";
90 $num="r8";
91 $rp="r9";       # $rp is reassigned
92 $tp="r10";
93 $j="r11";
94 $i="r12";
95 # non-volatile registers
96 $nap_d="r14";   # interleaved ap and np in double format
97 $a0="r15";      # ap[0]
98 $t0="r16";      # temporary registers
99 $t1="r17";
100 $t2="r18";
101 $t3="r19";
102 $t4="r20";
103 $t5="r21";
104 $t6="r22";
105 $t7="r23";
106
107 # PPC offers enough register bank capacity to unroll inner loops twice
108 #
109 #     ..A3A2A1A0
110 #           dcba
111 #    -----------
112 #            A0a
113 #           A0b
114 #          A0c
115 #         A0d
116 #          A1a
117 #         A1b
118 #        A1c
119 #       A1d
120 #        A2a
121 #       A2b
122 #      A2c
123 #     A2d
124 #      A3a
125 #     A3b
126 #    A3c
127 #   A3d
128 #    ..a
129 #   ..b
130 #
131 $ba="f0";       $bb="f1";       $bc="f2";       $bd="f3";
132 $na="f4";       $nb="f5";       $nc="f6";       $nd="f7";
133 $dota="f8";     $dotb="f9";
134 $A0="f10";      $A1="f11";      $A2="f12";      $A3="f13";
135 $N0="f14";      $N1="f15";      $N2="f16";      $N3="f17";
136 $T0a="f18";     $T0b="f19";
137 $T1a="f20";     $T1b="f21";
138 $T2a="f22";     $T2b="f23";
139 $T3a="f24";     $T3b="f25";
140 \f
141 # sp----------->+-------------------------------+
142 #               | saved sp                      |
143 #               +-------------------------------+
144 #               |                               |
145 #               +-------------------------------+
146 #               | 10 saved gpr, r14-r23         |
147 #               .                               .
148 #               .                               .
149 #   +12*size_t  +-------------------------------+
150 #               | 12 saved fpr, f14-f25         |
151 #               .                               .
152 #               .                               .
153 #   +12*8       +-------------------------------+
154 #               | padding to 64 byte boundary   |
155 #               .                               .
156 #   +X          +-------------------------------+
157 #               | 16 gpr<->fpr transfer zone    |
158 #               .                               .
159 #               .                               .
160 #   +16*8       +-------------------------------+
161 #               | __int64 tmp[-1]               |
162 #               +-------------------------------+
163 #               | __int64 tmp[num]              |
164 #               .                               .
165 #               .                               .
166 #               .                               .
167 #   +(num+1)*8  +-------------------------------+
168 #               | padding to 64 byte boundary   |
169 #               .                               .
170 #   +X          +-------------------------------+
171 #               | double nap_d[4*num]           |
172 #               .                               .
173 #               .                               .
174 #               .                               .
175 #               +-------------------------------+
176 \f
177 $code=<<___;
178 .machine "any"
179 .text
180
181 .globl  .$fname
182 .align  5
183 .$fname:
184         cmpwi   $num,4
185         mr      $rp,r3          ; $rp is reassigned
186         li      r3,0            ; possible "not handled" return code
187         bltlr-
188         andi.   r0,$num,1       ; $num has to be even
189         bnelr-
190
191         slwi    $num,$num,3     ; num*=8
192         li      $i,-4096
193         slwi    $tp,$num,2      ; place for {an}p_{lh}[num], i.e. 4*num
194         add     $tp,$tp,$num    ; place for tp[num+1]
195         addi    $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
196         subf    $tp,$tp,$sp     ; $sp-$tp
197         and     $tp,$tp,$i      ; minimize TLB usage
198         subf    $tp,$sp,$tp     ; $tp-$sp
199         $STUX   $sp,$sp,$tp     ; alloca
200
201         $PUSH   r14,`2*$SIZE_T`($sp)
202         $PUSH   r15,`3*$SIZE_T`($sp)
203         $PUSH   r16,`4*$SIZE_T`($sp)
204         $PUSH   r17,`5*$SIZE_T`($sp)
205         $PUSH   r18,`6*$SIZE_T`($sp)
206         $PUSH   r19,`7*$SIZE_T`($sp)
207         $PUSH   r20,`8*$SIZE_T`($sp)
208         $PUSH   r21,`9*$SIZE_T`($sp)
209         $PUSH   r22,`10*$SIZE_T`($sp)
210         $PUSH   r23,`11*$SIZE_T`($sp)
211         stfd    f14,`12*$SIZE_T+0`($sp)
212         stfd    f15,`12*$SIZE_T+8`($sp)
213         stfd    f16,`12*$SIZE_T+16`($sp)
214         stfd    f17,`12*$SIZE_T+24`($sp)
215         stfd    f18,`12*$SIZE_T+32`($sp)
216         stfd    f19,`12*$SIZE_T+40`($sp)
217         stfd    f20,`12*$SIZE_T+48`($sp)
218         stfd    f21,`12*$SIZE_T+56`($sp)
219         stfd    f22,`12*$SIZE_T+64`($sp)
220         stfd    f23,`12*$SIZE_T+72`($sp)
221         stfd    f24,`12*$SIZE_T+80`($sp)
222         stfd    f25,`12*$SIZE_T+88`($sp)
223
224         ld      $a0,0($ap)      ; pull ap[0] value
225         ld      $n0,0($n0)      ; pull n0[0] value
226         ld      $t3,0($bp)      ; bp[0]
227
228         addi    $tp,$sp,`$FRAME+$TRANSFER+8+64`
229         li      $i,-64
230         add     $nap_d,$tp,$num
231         and     $nap_d,$nap_d,$i        ; align to 64 bytes
232 \f
233         mulld   $t7,$a0,$t3     ; ap[0]*bp[0]
234         ; nap_d is off by 1, because it's used with stfdu/lfdu
235         addi    $nap_d,$nap_d,-8
236         srwi    $j,$num,`3+1`   ; counter register, num/2
237         mulld   $t7,$t7,$n0     ; tp[0]*n0
238         addi    $j,$j,-1
239         addi    $tp,$sp,`$FRAME+$TRANSFER-8`
240         li      $carry,0
241         mtctr   $j
242
243         ; transfer bp[0] to FPU as 4x16-bit values
244         extrdi  $t0,$t3,16,48
245         extrdi  $t1,$t3,16,32
246         extrdi  $t2,$t3,16,16
247         extrdi  $t3,$t3,16,0
248         std     $t0,`$FRAME+0`($sp)
249         std     $t1,`$FRAME+8`($sp)
250         std     $t2,`$FRAME+16`($sp)
251         std     $t3,`$FRAME+24`($sp)
252         ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
253         extrdi  $t4,$t7,16,48
254         extrdi  $t5,$t7,16,32
255         extrdi  $t6,$t7,16,16
256         extrdi  $t7,$t7,16,0
257         std     $t4,`$FRAME+32`($sp)
258         std     $t5,`$FRAME+40`($sp)
259         std     $t6,`$FRAME+48`($sp)
260         std     $t7,`$FRAME+56`($sp)
261         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
262         lwz     $t1,0($ap)
263         lwz     $t2,12($ap)             ; load a[j+1] as 32-bit word pair
264         lwz     $t3,8($ap)
265         lwz     $t4,4($np)              ; load n[j] as 32-bit word pair
266         lwz     $t5,0($np)
267         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
268         lwz     $t7,8($np)
269         lfd     $ba,`$FRAME+0`($sp)
270         lfd     $bb,`$FRAME+8`($sp)
271         lfd     $bc,`$FRAME+16`($sp)
272         lfd     $bd,`$FRAME+24`($sp)
273         lfd     $na,`$FRAME+32`($sp)
274         lfd     $nb,`$FRAME+40`($sp)
275         lfd     $nc,`$FRAME+48`($sp)
276         lfd     $nd,`$FRAME+56`($sp)
277         std     $t0,`$FRAME+64`($sp)
278         std     $t1,`$FRAME+72`($sp)
279         std     $t2,`$FRAME+80`($sp)
280         std     $t3,`$FRAME+88`($sp)
281         std     $t4,`$FRAME+96`($sp)
282         std     $t5,`$FRAME+104`($sp)
283         std     $t6,`$FRAME+112`($sp)
284         std     $t7,`$FRAME+120`($sp)
285         fcfid   $ba,$ba
286         fcfid   $bb,$bb
287         fcfid   $bc,$bc
288         fcfid   $bd,$bd
289         fcfid   $na,$na
290         fcfid   $nb,$nb
291         fcfid   $nc,$nc
292         fcfid   $nd,$nd
293
294         lfd     $A0,`$FRAME+64`($sp)
295         lfd     $A1,`$FRAME+72`($sp)
296         lfd     $A2,`$FRAME+80`($sp)
297         lfd     $A3,`$FRAME+88`($sp)
298         lfd     $N0,`$FRAME+96`($sp)
299         lfd     $N1,`$FRAME+104`($sp)
300         lfd     $N2,`$FRAME+112`($sp)
301         lfd     $N3,`$FRAME+120`($sp)
302         fcfid   $A0,$A0
303         fcfid   $A1,$A1
304         fcfid   $A2,$A2
305         fcfid   $A3,$A3
306         fcfid   $N0,$N0
307         fcfid   $N1,$N1
308         fcfid   $N2,$N2
309         fcfid   $N3,$N3
310         addi    $ap,$ap,16
311         addi    $np,$np,16
312
313         fmul    $T1a,$A1,$ba
314         fmul    $T1b,$A1,$bb
315         stfd    $A0,8($nap_d)           ; save a[j] in double format
316         stfd    $A1,16($nap_d)
317         fmul    $T2a,$A2,$ba
318         fmul    $T2b,$A2,$bb
319         stfd    $A2,24($nap_d)          ; save a[j+1] in double format
320         stfd    $A3,32($nap_d)
321         fmul    $T3a,$A3,$ba
322         fmul    $T3b,$A3,$bb
323         stfd    $N0,40($nap_d)          ; save n[j] in double format
324         stfd    $N1,48($nap_d)
325         fmul    $T0a,$A0,$ba
326         fmul    $T0b,$A0,$bb
327         stfd    $N2,56($nap_d)          ; save n[j+1] in double format
328         stfdu   $N3,64($nap_d)
329
330         fmadd   $T1a,$A0,$bc,$T1a
331         fmadd   $T1b,$A0,$bd,$T1b
332         fmadd   $T2a,$A1,$bc,$T2a
333         fmadd   $T2b,$A1,$bd,$T2b
334         fmadd   $T3a,$A2,$bc,$T3a
335         fmadd   $T3b,$A2,$bd,$T3b
336         fmul    $dota,$A3,$bc
337         fmul    $dotb,$A3,$bd
338
339         fmadd   $T1a,$N1,$na,$T1a
340         fmadd   $T1b,$N1,$nb,$T1b
341         fmadd   $T2a,$N2,$na,$T2a
342         fmadd   $T2b,$N2,$nb,$T2b
343         fmadd   $T3a,$N3,$na,$T3a
344         fmadd   $T3b,$N3,$nb,$T3b
345         fmadd   $T0a,$N0,$na,$T0a
346         fmadd   $T0b,$N0,$nb,$T0b
347
348         fmadd   $T1a,$N0,$nc,$T1a
349         fmadd   $T1b,$N0,$nd,$T1b
350         fmadd   $T2a,$N1,$nc,$T2a
351         fmadd   $T2b,$N1,$nd,$T2b
352         fmadd   $T3a,$N2,$nc,$T3a
353         fmadd   $T3b,$N2,$nd,$T3b
354         fmadd   $dota,$N3,$nc,$dota
355         fmadd   $dotb,$N3,$nd,$dotb
356
357         fctid   $T0a,$T0a
358         fctid   $T0b,$T0b
359         fctid   $T1a,$T1a
360         fctid   $T1b,$T1b
361         fctid   $T2a,$T2a
362         fctid   $T2b,$T2b
363         fctid   $T3a,$T3a
364         fctid   $T3b,$T3b
365
366         stfd    $T0a,`$FRAME+0`($sp)
367         stfd    $T0b,`$FRAME+8`($sp)
368         stfd    $T1a,`$FRAME+16`($sp)
369         stfd    $T1b,`$FRAME+24`($sp)
370         stfd    $T2a,`$FRAME+32`($sp)
371         stfd    $T2b,`$FRAME+40`($sp)
372         stfd    $T3a,`$FRAME+48`($sp)
373         stfd    $T3b,`$FRAME+56`($sp)
374 \f
375 .align  5
376 L1st:
377         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
378         lwz     $t1,0($ap)
379         lwz     $t2,12($ap)             ; load a[j+1] as 32-bit word pair
380         lwz     $t3,8($ap)
381         lwz     $t4,4($np)              ; load n[j] as 32-bit word pair
382         lwz     $t5,0($np)
383         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
384         lwz     $t7,8($np)
385         std     $t0,`$FRAME+64`($sp)
386         std     $t1,`$FRAME+72`($sp)
387         std     $t2,`$FRAME+80`($sp)
388         std     $t3,`$FRAME+88`($sp)
389         std     $t4,`$FRAME+96`($sp)
390         std     $t5,`$FRAME+104`($sp)
391         std     $t6,`$FRAME+112`($sp)
392         std     $t7,`$FRAME+120`($sp)
393         ld      $t0,`$FRAME+0`($sp)
394         ld      $t1,`$FRAME+8`($sp)
395         ld      $t2,`$FRAME+16`($sp)
396         ld      $t3,`$FRAME+24`($sp)
397         ld      $t4,`$FRAME+32`($sp)
398         ld      $t5,`$FRAME+40`($sp)
399         ld      $t6,`$FRAME+48`($sp)
400         ld      $t7,`$FRAME+56`($sp)
401         lfd     $A0,`$FRAME+64`($sp)
402         lfd     $A1,`$FRAME+72`($sp)
403         lfd     $A2,`$FRAME+80`($sp)
404         lfd     $A3,`$FRAME+88`($sp)
405         lfd     $N0,`$FRAME+96`($sp)
406         lfd     $N1,`$FRAME+104`($sp)
407         lfd     $N2,`$FRAME+112`($sp)
408         lfd     $N3,`$FRAME+120`($sp)
409         fcfid   $A0,$A0
410         fcfid   $A1,$A1
411         fcfid   $A2,$A2
412         fcfid   $A3,$A3
413         fcfid   $N0,$N0
414         fcfid   $N1,$N1
415         fcfid   $N2,$N2
416         fcfid   $N3,$N3
417         addi    $ap,$ap,16
418         addi    $np,$np,16
419
420         fmul    $T1a,$A1,$ba
421         fmul    $T1b,$A1,$bb
422         fmul    $T2a,$A2,$ba
423         fmul    $T2b,$A2,$bb
424         stfd    $A0,8($nap_d)           ; save a[j] in double format
425         stfd    $A1,16($nap_d)
426         fmul    $T3a,$A3,$ba
427         fmul    $T3b,$A3,$bb
428         fmadd   $T0a,$A0,$ba,$dota
429         fmadd   $T0b,$A0,$bb,$dotb
430         stfd    $A2,24($nap_d)          ; save a[j+1] in double format
431         stfd    $A3,32($nap_d)
432
433         fmadd   $T1a,$A0,$bc,$T1a
434         fmadd   $T1b,$A0,$bd,$T1b
435         fmadd   $T2a,$A1,$bc,$T2a
436         fmadd   $T2b,$A1,$bd,$T2b
437         stfd    $N0,40($nap_d)          ; save n[j] in double format
438         stfd    $N1,48($nap_d)
439         fmadd   $T3a,$A2,$bc,$T3a
440         fmadd   $T3b,$A2,$bd,$T3b
441          add    $t0,$t0,$carry          ; can not overflow
442         fmul    $dota,$A3,$bc
443         fmul    $dotb,$A3,$bd
444         stfd    $N2,56($nap_d)          ; save n[j+1] in double format
445         stfdu   $N3,64($nap_d)
446          srdi   $carry,$t0,16
447          add    $t1,$t1,$carry
448          srdi   $carry,$t1,16
449
450         fmadd   $T1a,$N1,$na,$T1a
451         fmadd   $T1b,$N1,$nb,$T1b
452          insrdi $t0,$t1,16,32
453         fmadd   $T2a,$N2,$na,$T2a
454         fmadd   $T2b,$N2,$nb,$T2b
455          add    $t2,$t2,$carry
456         fmadd   $T3a,$N3,$na,$T3a
457         fmadd   $T3b,$N3,$nb,$T3b
458          srdi   $carry,$t2,16
459         fmadd   $T0a,$N0,$na,$T0a
460         fmadd   $T0b,$N0,$nb,$T0b
461          insrdi $t0,$t2,16,16
462          add    $t3,$t3,$carry
463          srdi   $carry,$t3,16
464
465         fmadd   $T1a,$N0,$nc,$T1a
466         fmadd   $T1b,$N0,$nd,$T1b
467          insrdi $t0,$t3,16,0            ; 0..63 bits
468         fmadd   $T2a,$N1,$nc,$T2a
469         fmadd   $T2b,$N1,$nd,$T2b
470          add    $t4,$t4,$carry
471         fmadd   $T3a,$N2,$nc,$T3a
472         fmadd   $T3b,$N2,$nd,$T3b
473          srdi   $carry,$t4,16
474         fmadd   $dota,$N3,$nc,$dota
475         fmadd   $dotb,$N3,$nd,$dotb
476          add    $t5,$t5,$carry
477          srdi   $carry,$t5,16
478          insrdi $t4,$t5,16,32
479
480         fctid   $T0a,$T0a
481         fctid   $T0b,$T0b
482          add    $t6,$t6,$carry
483         fctid   $T1a,$T1a
484         fctid   $T1b,$T1b
485          srdi   $carry,$t6,16
486         fctid   $T2a,$T2a
487         fctid   $T2b,$T2b
488          insrdi $t4,$t6,16,16
489         fctid   $T3a,$T3a
490         fctid   $T3b,$T3b
491          add    $t7,$t7,$carry
492          insrdi $t4,$t7,16,0            ; 64..127 bits
493          srdi   $carry,$t7,16           ; upper 33 bits
494
495         stfd    $T0a,`$FRAME+0`($sp)
496         stfd    $T0b,`$FRAME+8`($sp)
497         stfd    $T1a,`$FRAME+16`($sp)
498         stfd    $T1b,`$FRAME+24`($sp)
499         stfd    $T2a,`$FRAME+32`($sp)
500         stfd    $T2b,`$FRAME+40`($sp)
501         stfd    $T3a,`$FRAME+48`($sp)
502         stfd    $T3b,`$FRAME+56`($sp)
503          std    $t0,8($tp)              ; tp[j-1]
504          stdu   $t4,16($tp)             ; tp[j]
505         bdnz-   L1st
506 \f
507         fctid   $dota,$dota
508         fctid   $dotb,$dotb
509
510         ld      $t0,`$FRAME+0`($sp)
511         ld      $t1,`$FRAME+8`($sp)
512         ld      $t2,`$FRAME+16`($sp)
513         ld      $t3,`$FRAME+24`($sp)
514         ld      $t4,`$FRAME+32`($sp)
515         ld      $t5,`$FRAME+40`($sp)
516         ld      $t6,`$FRAME+48`($sp)
517         ld      $t7,`$FRAME+56`($sp)
518         stfd    $dota,`$FRAME+64`($sp)
519         stfd    $dotb,`$FRAME+72`($sp)
520
521         add     $t0,$t0,$carry          ; can not overflow
522         srdi    $carry,$t0,16
523         add     $t1,$t1,$carry
524         srdi    $carry,$t1,16
525         insrdi  $t0,$t1,16,32
526         add     $t2,$t2,$carry
527         srdi    $carry,$t2,16
528         insrdi  $t0,$t2,16,16
529         add     $t3,$t3,$carry
530         srdi    $carry,$t3,16
531         insrdi  $t0,$t3,16,0            ; 0..63 bits
532         add     $t4,$t4,$carry
533         srdi    $carry,$t4,16
534         add     $t5,$t5,$carry
535         srdi    $carry,$t5,16
536         insrdi  $t4,$t5,16,32
537         add     $t6,$t6,$carry
538         srdi    $carry,$t6,16
539         insrdi  $t4,$t6,16,16
540         add     $t7,$t7,$carry
541         insrdi  $t4,$t7,16,0            ; 64..127 bits
542         srdi    $carry,$t7,16           ; upper 33 bits
543         ld      $t6,`$FRAME+64`($sp)
544         ld      $t7,`$FRAME+72`($sp)
545
546         std     $t0,8($tp)              ; tp[j-1]
547         stdu    $t4,16($tp)             ; tp[j]
548
549         add     $t6,$t6,$carry          ; can not overflow
550         srdi    $carry,$t6,16
551         add     $t7,$t7,$carry
552         insrdi  $t6,$t7,48,0
553         srdi    $ovf,$t7,48
554         std     $t6,8($tp)              ; tp[num-1]
555
556         slwi    $t7,$num,2
557         subf    $nap_d,$t7,$nap_d       ; rewind pointer
558 \f
559         li      $i,8                    ; i=1
560 .align  5
561 Louter:
562         ldx     $t3,$bp,$i      ; bp[i]
563         ld      $t6,`$FRAME+$TRANSFER+8`($sp)   ; tp[0]
564         mulld   $t7,$a0,$t3     ; ap[0]*bp[i]
565
566         addi    $tp,$sp,`$FRAME+$TRANSFER`
567         add     $t7,$t7,$t6     ; ap[0]*bp[i]+tp[0]
568         li      $carry,0
569         mulld   $t7,$t7,$n0     ; tp[0]*n0
570         mtctr   $j
571
572         ; transfer bp[i] to FPU as 4x16-bit values
573         extrdi  $t0,$t3,16,48
574         extrdi  $t1,$t3,16,32
575         extrdi  $t2,$t3,16,16
576         extrdi  $t3,$t3,16,0
577         std     $t0,`$FRAME+0`($sp)
578         std     $t1,`$FRAME+8`($sp)
579         std     $t2,`$FRAME+16`($sp)
580         std     $t3,`$FRAME+24`($sp)
581         ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
582         extrdi  $t4,$t7,16,48
583         extrdi  $t5,$t7,16,32
584         extrdi  $t6,$t7,16,16
585         extrdi  $t7,$t7,16,0
586         std     $t4,`$FRAME+32`($sp)
587         std     $t5,`$FRAME+40`($sp)
588         std     $t6,`$FRAME+48`($sp)
589         std     $t7,`$FRAME+56`($sp)
590
591         lfd     $A0,8($nap_d)           ; load a[j] in double format
592         lfd     $A1,16($nap_d)
593         lfd     $A2,24($nap_d)          ; load a[j+1] in double format
594         lfd     $A3,32($nap_d)
595         lfd     $N0,40($nap_d)          ; load n[j] in double format
596         lfd     $N1,48($nap_d)
597         lfd     $N2,56($nap_d)          ; load n[j+1] in double format
598         lfdu    $N3,64($nap_d)
599
600         lfd     $ba,`$FRAME+0`($sp)
601         lfd     $bb,`$FRAME+8`($sp)
602         lfd     $bc,`$FRAME+16`($sp)
603         lfd     $bd,`$FRAME+24`($sp)
604         lfd     $na,`$FRAME+32`($sp)
605         lfd     $nb,`$FRAME+40`($sp)
606         lfd     $nc,`$FRAME+48`($sp)
607         lfd     $nd,`$FRAME+56`($sp)
608
609         fcfid   $ba,$ba
610         fcfid   $bb,$bb
611         fcfid   $bc,$bc
612         fcfid   $bd,$bd
613         fcfid   $na,$na
614         fcfid   $nb,$nb
615         fcfid   $nc,$nc
616         fcfid   $nd,$nd
617
618         fmul    $T1a,$A1,$ba
619         fmul    $T1b,$A1,$bb
620         fmul    $T2a,$A2,$ba
621         fmul    $T2b,$A2,$bb
622         fmul    $T3a,$A3,$ba
623         fmul    $T3b,$A3,$bb
624         fmul    $T0a,$A0,$ba
625         fmul    $T0b,$A0,$bb
626
627         fmadd   $T1a,$A0,$bc,$T1a
628         fmadd   $T1b,$A0,$bd,$T1b
629         fmadd   $T2a,$A1,$bc,$T2a
630         fmadd   $T2b,$A1,$bd,$T2b
631         fmadd   $T3a,$A2,$bc,$T3a
632         fmadd   $T3b,$A2,$bd,$T3b
633         fmul    $dota,$A3,$bc
634         fmul    $dotb,$A3,$bd
635
636         fmadd   $T1a,$N1,$na,$T1a
637         fmadd   $T1b,$N1,$nb,$T1b
638          lfd    $A0,8($nap_d)           ; load a[j] in double format
639          lfd    $A1,16($nap_d)
640         fmadd   $T2a,$N2,$na,$T2a
641         fmadd   $T2b,$N2,$nb,$T2b
642          lfd    $A2,24($nap_d)          ; load a[j+1] in double format
643          lfd    $A3,32($nap_d)
644         fmadd   $T3a,$N3,$na,$T3a
645         fmadd   $T3b,$N3,$nb,$T3b
646         fmadd   $T0a,$N0,$na,$T0a
647         fmadd   $T0b,$N0,$nb,$T0b
648
649         fmadd   $T1a,$N0,$nc,$T1a
650         fmadd   $T1b,$N0,$nd,$T1b
651         fmadd   $T2a,$N1,$nc,$T2a
652         fmadd   $T2b,$N1,$nd,$T2b
653         fmadd   $T3a,$N2,$nc,$T3a
654         fmadd   $T3b,$N2,$nd,$T3b
655         fmadd   $dota,$N3,$nc,$dota
656         fmadd   $dotb,$N3,$nd,$dotb
657
658         fctid   $T0a,$T0a
659         fctid   $T0b,$T0b
660         fctid   $T1a,$T1a
661         fctid   $T1b,$T1b
662         fctid   $T2a,$T2a
663         fctid   $T2b,$T2b
664         fctid   $T3a,$T3a
665         fctid   $T3b,$T3b
666
667         stfd    $T0a,`$FRAME+0`($sp)
668         stfd    $T0b,`$FRAME+8`($sp)
669         stfd    $T1a,`$FRAME+16`($sp)
670         stfd    $T1b,`$FRAME+24`($sp)
671         stfd    $T2a,`$FRAME+32`($sp)
672         stfd    $T2b,`$FRAME+40`($sp)
673         stfd    $T3a,`$FRAME+48`($sp)
674         stfd    $T3b,`$FRAME+56`($sp)
675 \f
676 .align  5
677 Linner:
678         fmul    $T1a,$A1,$ba
679         fmul    $T1b,$A1,$bb
680         fmul    $T2a,$A2,$ba
681         fmul    $T2b,$A2,$bb
682         lfd     $N0,40($nap_d)          ; load n[j] in double format
683         lfd     $N1,48($nap_d)
684         fmul    $T3a,$A3,$ba
685         fmul    $T3b,$A3,$bb
686         fmadd   $T0a,$A0,$ba,$dota
687         fmadd   $T0b,$A0,$bb,$dotb
688         lfd     $N2,56($nap_d)          ; load n[j+1] in double format
689         lfdu    $N3,64($nap_d)
690
691         fmadd   $T1a,$A0,$bc,$T1a
692         fmadd   $T1b,$A0,$bd,$T1b
693         fmadd   $T2a,$A1,$bc,$T2a
694         fmadd   $T2b,$A1,$bd,$T2b
695          lfd    $A0,8($nap_d)           ; load a[j] in double format
696          lfd    $A1,16($nap_d)
697         fmadd   $T3a,$A2,$bc,$T3a
698         fmadd   $T3b,$A2,$bd,$T3b
699         fmul    $dota,$A3,$bc
700         fmul    $dotb,$A3,$bd
701          lfd    $A2,24($nap_d)          ; load a[j+1] in double format
702          lfd    $A3,32($nap_d)
703
704         fmadd   $T1a,$N1,$na,$T1a
705         fmadd   $T1b,$N1,$nb,$T1b
706          ld     $t0,`$FRAME+0`($sp)
707          ld     $t1,`$FRAME+8`($sp)
708         fmadd   $T2a,$N2,$na,$T2a
709         fmadd   $T2b,$N2,$nb,$T2b
710          ld     $t2,`$FRAME+16`($sp)
711          ld     $t3,`$FRAME+24`($sp)
712         fmadd   $T3a,$N3,$na,$T3a
713         fmadd   $T3b,$N3,$nb,$T3b
714          add    $t0,$t0,$carry          ; can not overflow
715          ld     $t4,`$FRAME+32`($sp)
716          ld     $t5,`$FRAME+40`($sp)
717         fmadd   $T0a,$N0,$na,$T0a
718         fmadd   $T0b,$N0,$nb,$T0b
719          srdi   $carry,$t0,16
720          add    $t1,$t1,$carry
721          srdi   $carry,$t1,16
722          ld     $t6,`$FRAME+48`($sp)
723          ld     $t7,`$FRAME+56`($sp)
724
725         fmadd   $T1a,$N0,$nc,$T1a
726         fmadd   $T1b,$N0,$nd,$T1b
727          insrdi $t0,$t1,16,32
728          ld     $t1,8($tp)              ; tp[j]
729         fmadd   $T2a,$N1,$nc,$T2a
730         fmadd   $T2b,$N1,$nd,$T2b
731          add    $t2,$t2,$carry
732         fmadd   $T3a,$N2,$nc,$T3a
733         fmadd   $T3b,$N2,$nd,$T3b
734          srdi   $carry,$t2,16
735          insrdi $t0,$t2,16,16
736         fmadd   $dota,$N3,$nc,$dota
737         fmadd   $dotb,$N3,$nd,$dotb
738          add    $t3,$t3,$carry
739          ldu    $t2,16($tp)             ; tp[j+1]
740          srdi   $carry,$t3,16
741          insrdi $t0,$t3,16,0            ; 0..63 bits
742          add    $t4,$t4,$carry
743
744         fctid   $T0a,$T0a
745         fctid   $T0b,$T0b
746          srdi   $carry,$t4,16
747         fctid   $T1a,$T1a
748         fctid   $T1b,$T1b
749          add    $t5,$t5,$carry
750         fctid   $T2a,$T2a
751         fctid   $T2b,$T2b
752          srdi   $carry,$t5,16
753          insrdi $t4,$t5,16,32
754         fctid   $T3a,$T3a
755         fctid   $T3b,$T3b
756          add    $t6,$t6,$carry
757          srdi   $carry,$t6,16
758          insrdi $t4,$t6,16,16
759
760         stfd    $T0a,`$FRAME+0`($sp)
761         stfd    $T0b,`$FRAME+8`($sp)
762          add    $t7,$t7,$carry
763          addc   $t3,$t0,$t1
764         stfd    $T1a,`$FRAME+16`($sp)
765         stfd    $T1b,`$FRAME+24`($sp)
766          insrdi $t4,$t7,16,0            ; 64..127 bits
767          srdi   $carry,$t7,16           ; upper 33 bits
768         stfd    $T2a,`$FRAME+32`($sp)
769         stfd    $T2b,`$FRAME+40`($sp)
770          adde   $t5,$t4,$t2
771         stfd    $T3a,`$FRAME+48`($sp)
772         stfd    $T3b,`$FRAME+56`($sp)
773          addze  $carry,$carry
774          std    $t3,-16($tp)            ; tp[j-1]
775          std    $t5,-8($tp)             ; tp[j]
776         bdnz-   Linner
777 \f
778         fctid   $dota,$dota
779         fctid   $dotb,$dotb
780         ld      $t0,`$FRAME+0`($sp)
781         ld      $t1,`$FRAME+8`($sp)
782         ld      $t2,`$FRAME+16`($sp)
783         ld      $t3,`$FRAME+24`($sp)
784         ld      $t4,`$FRAME+32`($sp)
785         ld      $t5,`$FRAME+40`($sp)
786         ld      $t6,`$FRAME+48`($sp)
787         ld      $t7,`$FRAME+56`($sp)
788         stfd    $dota,`$FRAME+64`($sp)
789         stfd    $dotb,`$FRAME+72`($sp)
790
791         add     $t0,$t0,$carry          ; can not overflow
792         srdi    $carry,$t0,16
793         add     $t1,$t1,$carry
794         srdi    $carry,$t1,16
795         insrdi  $t0,$t1,16,32
796         add     $t2,$t2,$carry
797         ld      $t1,8($tp)              ; tp[j]
798         srdi    $carry,$t2,16
799         insrdi  $t0,$t2,16,16
800         add     $t3,$t3,$carry
801         ldu     $t2,16($tp)             ; tp[j+1]
802         srdi    $carry,$t3,16
803         insrdi  $t0,$t3,16,0            ; 0..63 bits
804         add     $t4,$t4,$carry
805         srdi    $carry,$t4,16
806         add     $t5,$t5,$carry
807         srdi    $carry,$t5,16
808         insrdi  $t4,$t5,16,32
809         add     $t6,$t6,$carry
810         srdi    $carry,$t6,16
811         insrdi  $t4,$t6,16,16
812         add     $t7,$t7,$carry
813         insrdi  $t4,$t7,16,0            ; 64..127 bits
814         srdi    $carry,$t7,16           ; upper 33 bits
815         ld      $t6,`$FRAME+64`($sp)
816         ld      $t7,`$FRAME+72`($sp)
817
818         addc    $t3,$t0,$t1
819         adde    $t5,$t4,$t2
820         addze   $carry,$carry
821
822         std     $t3,-16($tp)            ; tp[j-1]
823         std     $t5,-8($tp)             ; tp[j]
824
825         add     $carry,$carry,$ovf      ; comsume upmost overflow
826         add     $t6,$t6,$carry          ; can not overflow
827         srdi    $carry,$t6,16
828         add     $t7,$t7,$carry
829         insrdi  $t6,$t7,48,0
830         srdi    $ovf,$t7,48
831         std     $t6,0($tp)              ; tp[num-1]
832
833         slwi    $t7,$num,2
834         addi    $i,$i,8
835         subf    $nap_d,$t7,$nap_d       ; rewind pointer
836         cmpw    $i,$num
837         blt-    Louter
838 \f
839         subf    $np,$num,$np    ; rewind np
840         addi    $j,$j,1         ; restore counter
841         subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
842         addi    $tp,$sp,`$FRAME+$TRANSFER+8`
843         addi    $t4,$sp,`$FRAME+$TRANSFER+16`
844         addi    $t5,$np,8
845         addi    $t6,$rp,8
846         mtctr   $j
847
848 .align  4
849 Lsub:   ldx     $t0,$tp,$i
850         ldx     $t1,$np,$i
851         ldx     $t2,$t4,$i
852         ldx     $t3,$t5,$i
853         subfe   $t0,$t1,$t0     ; tp[j]-np[j]
854         subfe   $t2,$t3,$t2     ; tp[j+1]-np[j+1]
855         stdx    $t0,$rp,$i
856         stdx    $t2,$t6,$i
857         addi    $i,$i,16
858         bdnz-   Lsub
859
860         li      $i,0
861         subfe   $ovf,$i,$ovf    ; handle upmost overflow bit
862         and     $ap,$tp,$ovf
863         andc    $np,$rp,$ovf
864         or      $ap,$ap,$np     ; ap=borrow?tp:rp
865         addi    $t7,$ap,8
866         mtctr   $j
867
868 .align  4
869 Lcopy:                          ; copy or in-place refresh
870         ldx     $t0,$ap,$i
871         ldx     $t1,$t7,$i
872         std     $i,8($nap_d)    ; zap nap_d
873         std     $i,16($nap_d)
874         std     $i,24($nap_d)
875         std     $i,32($nap_d)
876         std     $i,40($nap_d)
877         std     $i,48($nap_d)
878         std     $i,56($nap_d)
879         stdu    $i,64($nap_d)
880         stdx    $t0,$rp,$i
881         stdx    $t1,$t6,$i
882         stdx    $i,$tp,$i       ; zap tp at once
883         stdx    $i,$t4,$i
884         addi    $i,$i,16
885         bdnz-   Lcopy
886 \f
887         $POP    r14,`2*$SIZE_T`($sp)
888         $POP    r15,`3*$SIZE_T`($sp)
889         $POP    r16,`4*$SIZE_T`($sp)
890         $POP    r17,`5*$SIZE_T`($sp)
891         $POP    r18,`6*$SIZE_T`($sp)
892         $POP    r19,`7*$SIZE_T`($sp)
893         $POP    r20,`8*$SIZE_T`($sp)
894         $POP    r21,`9*$SIZE_T`($sp)
895         $POP    r22,`10*$SIZE_T`($sp)
896         $POP    r23,`11*$SIZE_T`($sp)
897         lfd     f14,`12*$SIZE_T+0`($sp)
898         lfd     f15,`12*$SIZE_T+8`($sp)
899         lfd     f16,`12*$SIZE_T+16`($sp)
900         lfd     f17,`12*$SIZE_T+24`($sp)
901         lfd     f18,`12*$SIZE_T+32`($sp)
902         lfd     f19,`12*$SIZE_T+40`($sp)
903         lfd     f20,`12*$SIZE_T+48`($sp)
904         lfd     f21,`12*$SIZE_T+56`($sp)
905         lfd     f22,`12*$SIZE_T+64`($sp)
906         lfd     f23,`12*$SIZE_T+72`($sp)
907         lfd     f24,`12*$SIZE_T+80`($sp)
908         lfd     f25,`12*$SIZE_T+88`($sp)
909         $POP    $sp,0($sp)
910         li      r3,1    ; signal "handled"
911         blr
912         .long   0
913 .asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
914 ___
915
916 $code =~ s/\`([^\`]*)\`/eval $1/gem;
917 print $code;
918 close STDOUT;