Final (for this commit series) optimized version and with commentary section.
[openssl.git] / crypto / bn / asm / ppc64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # December 2007
11
12 # The reason for undertaken effort is basically following. Even though
13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
14 # performance was observed to be less than impressive, essentially as
15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16 # Well, it's not surprising that IBM had to make some sacrifices to
17 # boost the clock frequency that much, but no overall improvement?
18 # Having observed how much difference did switching to FPU make on
19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate...
20 # Unfortunately the resulting performance improvement is not as
21 # impressive, ~30%, and in absolute terms is still very far from what
22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
23 # something wrong, but in the lack of assembler level micro-profiling
24 # data or at least decent platform guide I can't tell... Or better
25 # results might be achieved with VMX... Anyway, this module provides
26 # *worse* performance on other PowerPC implementations, ~40-15% slower
27 # on PPC970 depending on key length and ~40% slower on Power 5 for all
28 # key lengths. As it's obviously inappropriate as "best all-round"
29 # alternative, it has to be complemented with run-time CPU family
30 # detection. Oh! It should also be noted that unlike other PowerPC
31 # implementation IALU ppc-mont.pl module performs *suboptimaly* on
32 # >=1024-bit key lengths on Power 6. It should also be noted that
33 # *everything* said so far applies to 64-bit builds! As far as 32-bit
34 # application executed on 64-bit CPU goes, this module is likely to
35 # become preferred choice, because it's easy to adapt it for such
36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
37
38 $output = shift;
39
40 if ($output =~ /32\-mont\.s/) {
41         $SIZE_T=4;
42         $RZONE= 224;
43         $FRAME= $SIZE_T*12+8*12;
44         $fname= "bn_mul_mont_ppc64";
45
46         $STUX=  "stwux";        # store indexed and update
47         $PUSH=  "stw";
48         $POP=   "lwz";
49         die "not implemented yet";
50 } elsif ($output =~ /64\-mont\.s/) {
51         $SIZE_T=8;
52         $RZONE= 288;
53         $FRAME= $SIZE_T*12+8*12;
54         $fname= "bn_mul_mont";
55
56         # same as above, but 64-bit mnemonics...
57         $STUX=  "stdux";        # store indexed and update
58         $PUSH=  "std";
59         $POP=   "ld";
60 } else { die "nonsense $output"; }
61
62 ( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
63         die "can't call ../perlasm/ppc-xlate.pl: $!";
64
65 $FRAME=($FRAME+63)&~63;
66 $TRANSFER=16*8;
67
68 $carry="r0";
69 $sp="r1";
70 $toc="r2";
71 $rp="r3";       $ovf="r3";
72 $ap="r4";
73 $bp="r5";
74 $np="r6";
75 $n0="r7";
76 $num="r8";
77 $rp="r9";       # $rp is reassigned
78 $tp="r10";
79 $j="r11";
80 $i="r12";
81 # non-volatile registers
82 $nap_d="r14";   # interleaved ap and np in double format
83 $a0="r15";      # ap[0]
84 $t0="r16";      # temporary registers
85 $t1="r17";
86 $t2="r18";
87 $t3="r19";
88 $t4="r20";
89 $t5="r21";
90 $t6="r22";
91 $t7="r23";
92
93 # PPC offers enough register bank capacity to unroll inner loops twice
94 #
95 #     ..A3A2A1A0
96 #           dcba
97 #    -----------
98 #            A0a
99 #           A0b
100 #          A0c
101 #         A0d
102 #          A1a
103 #         A1b
104 #        A1c
105 #       A1d
106 #        A2a
107 #       A2b
108 #      A2c
109 #     A2d
110 #      A3a
111 #     A3b
112 #    A3c
113 #   A3d
114 #    ..a
115 #   ..b
116 #
117 $ba="f0";
118 $bb="f1";
119 $bc="f2";
120 $bd="f3";
121 $na="f4";
122 $nb="f5";
123 $nc="f6";
124 $nd="f7";
125 $dota="f8";
126 $dotb="f9";
127 $A0="f10";
128 $A1="f11";
129 $A2="f12";
130 $A3="f13";
131 $N0="f14";
132 $N1="f15";
133 $N2="f16";
134 $N3="f17";
135 $T0a="f18";
136 $T0b="f19";
137 $T1a="f20";
138 $T1b="f21";
139 $T2a="f22";
140 $T2b="f23";
141 $T3a="f24";
142 $T3b="f25";
143
144 # sp----------->+-------------------------------+
145 #               | saved sp                      |
146 #               +-------------------------------+
147 #               |                               |
148 #               +-------------------------------+
149 #               | 10 saved gpr, r14-r23         |
150 #               .                               .
151 #               .                               .
152 #   +12*size_t  +-------------------------------+
153 #               | 12 saved fpr, f14-f25         |
154 #               .                               .
155 #               .                               .
156 #   +12*8       +-------------------------------+
157 #               | padding to 64 byte boundary   |
158 #               .                               .
159 #               +-------------------------------+
160 #               | 16 gpr<->fpr transfer zone    |
161 #               .                               .
162 #               .                               .
163 #   +8*8        +-------------------------------+
164 #               | __int64 tmp[-1]               |
165 #               +-------------------------------+
166 #               | __int64 tmp[num]              |
167 #               .                               .
168 #               .                               .
169 #               .                               .
170 #   +(num+1)*8  +-------------------------------+
171 #               | padding to 64 byte boundary   |
172 #               .                               .
173 #               +-------------------------------+
174 #               | double nap_d[4*num]           |
175 #               .                               .
176 #               .                               .
177 #               .                               .
178 #               +-------------------------------+
179
180 $code=<<___;
181 .machine "any"
182 .text
183
184 .globl  .$fname
185 .align  5
186 .$fname:
187         cmpwi   $num,4
188         mr      $rp,r3          ; $rp is reassigned
189         li      r3,0            ; possible "not handled" return code
190         bltlr-
191         andi.   r0,$num,1       ; $num has to be even
192         bnelr-
193
194         slwi    $num,$num,3     ; num*=8
195         li      $i,-4096
196         slwi    $tp,$num,2      ; place for {an}p_{lh}[num], i.e. 4*num
197         add     $tp,$tp,$num    ; place for tp[num+1]
198         addi    $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
199         subf    $tp,$tp,$sp     ; $sp-$tp
200         and     $tp,$tp,$i      ; minimize TLB usage
201         subf    $tp,$sp,$tp     ; $tp-$sp
202         $STUX   $sp,$sp,$tp     ; alloca
203
204         $PUSH   r14,`2*$SIZE_T`($sp)
205         $PUSH   r15,`3*$SIZE_T`($sp)
206         $PUSH   r16,`4*$SIZE_T`($sp)
207         $PUSH   r17,`5*$SIZE_T`($sp)
208         $PUSH   r18,`6*$SIZE_T`($sp)
209         $PUSH   r19,`7*$SIZE_T`($sp)
210         $PUSH   r20,`8*$SIZE_T`($sp)
211         $PUSH   r21,`9*$SIZE_T`($sp)
212         $PUSH   r22,`10*$SIZE_T`($sp)
213         $PUSH   r23,`11*$SIZE_T`($sp)
214         stfd    f14,`12*$SIZE_T+0`($sp)
215         stfd    f15,`12*$SIZE_T+8`($sp)
216         stfd    f16,`12*$SIZE_T+16`($sp)
217         stfd    f17,`12*$SIZE_T+24`($sp)
218         stfd    f18,`12*$SIZE_T+32`($sp)
219         stfd    f19,`12*$SIZE_T+40`($sp)
220         stfd    f20,`12*$SIZE_T+48`($sp)
221         stfd    f21,`12*$SIZE_T+56`($sp)
222         stfd    f22,`12*$SIZE_T+64`($sp)
223         stfd    f23,`12*$SIZE_T+72`($sp)
224         stfd    f24,`12*$SIZE_T+80`($sp)
225         stfd    f25,`12*$SIZE_T+88`($sp)
226
227         ld      $a0,0($ap)      ; pull ap[0] value
228         ld      $n0,0($n0)      ; pull n0[0] value
229         ld      $t3,0($bp)      ; bp[0]
230
231         addi    $tp,$sp,`$FRAME+$TRANSFER+8+64`
232         li      $i,-64
233         add     $nap_d,$tp,$num
234         and     $nap_d,$nap_d,$i        ; align to 64 bytes
235 \f
236         mulld   $t7,$a0,$t3     ; ap[0]*bp[0]
237         ; nap_d is off by 1, because it's used with stfdu/lfdu
238         addi    $nap_d,$nap_d,-8
239         srwi    $j,$num,`3+1`   ; counter register, num/2
240         mulld   $t7,$t7,$n0     ; tp[0]*n0
241         addi    $j,$j,-1
242         addi    $tp,$sp,`$FRAME+$TRANSFER-8`
243         li      $carry,0
244         mtctr   $j
245
246         ; transfer bp[0] to FPU as 4x16-bit values
247         extrdi  $t0,$t3,16,48
248         extrdi  $t1,$t3,16,32
249         extrdi  $t2,$t3,16,16
250         extrdi  $t3,$t3,16,0
251         std     $t0,`$FRAME+0`($sp)
252         std     $t1,`$FRAME+8`($sp)
253         std     $t2,`$FRAME+16`($sp)
254         std     $t3,`$FRAME+24`($sp)
255         ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
256         extrdi  $t4,$t7,16,48
257         extrdi  $t5,$t7,16,32
258         extrdi  $t6,$t7,16,16
259         extrdi  $t7,$t7,16,0
260         std     $t4,`$FRAME+32`($sp)
261         std     $t5,`$FRAME+40`($sp)
262         std     $t6,`$FRAME+48`($sp)
263         std     $t7,`$FRAME+56`($sp)
264         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
265         lwz     $t1,0($ap)
266         lwz     $t2,4($np)              ; load n[j] as 32-bit word pair
267         lwz     $t3,0($np)
268         lwz     $t4,12($ap)             ; load a[j+1] as 32-bit word pair
269         lwz     $t5,8($ap)
270         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
271         lwz     $t7,8($np)
272         lfd     $ba,`$FRAME+0`($sp)
273         lfd     $bb,`$FRAME+8`($sp)
274         lfd     $bc,`$FRAME+16`($sp)
275         lfd     $bd,`$FRAME+24`($sp)
276         lfd     $na,`$FRAME+32`($sp)
277         lfd     $nb,`$FRAME+40`($sp)
278         lfd     $nc,`$FRAME+48`($sp)
279         lfd     $nd,`$FRAME+56`($sp)
280         std     $t0,`$FRAME+64`($sp)
281         std     $t1,`$FRAME+72`($sp)
282         std     $t2,`$FRAME+80`($sp)
283         std     $t3,`$FRAME+88`($sp)
284         std     $t4,`$FRAME+96`($sp)
285         std     $t5,`$FRAME+104`($sp)
286         std     $t6,`$FRAME+112`($sp)
287         std     $t7,`$FRAME+120`($sp)
288         fcfid   $ba,$ba
289         fcfid   $bb,$bb
290         fcfid   $bc,$bc
291         fcfid   $bd,$bd
292         fcfid   $na,$na
293         fcfid   $nb,$nb
294         fcfid   $nc,$nc
295         fcfid   $nd,$nd
296
297         lfd     $A0,`$FRAME+64`($sp)
298         lfd     $A1,`$FRAME+72`($sp)
299         lfd     $N0,`$FRAME+80`($sp)
300         lfd     $N1,`$FRAME+88`($sp)
301         lfd     $A2,`$FRAME+96`($sp)
302         lfd     $A3,`$FRAME+104`($sp)
303         lfd     $N2,`$FRAME+112`($sp)
304         lfd     $N3,`$FRAME+120`($sp)
305         fcfid   $A0,$A0
306         fcfid   $A1,$A1
307         fcfid   $N0,$N0
308         fcfid   $N1,$N1
309         fcfid   $A2,$A2
310         fcfid   $A3,$A3
311         fcfid   $N2,$N2
312         fcfid   $N3,$N3
313         addi    $ap,$ap,16
314         addi    $np,$np,16
315
316         fmul    $T1a,$A1,$ba
317         fmul    $T1b,$A1,$bb
318         stfd    $A0,8($nap_d)           ; save a[j] in double format
319         stfd    $A1,16($nap_d)
320         fmul    $T2a,$A2,$ba
321         fmul    $T2b,$A2,$bb
322         stfd    $N0,24($nap_d)          ; save n[j] in double format
323         stfd    $N1,32($nap_d)
324         fmul    $T3a,$A3,$ba
325         fmul    $T3b,$A3,$bb
326         stfd    $A2,40($nap_d)          ; save a[j+1] in double format
327         stfd    $A3,48($nap_d)
328         fmul    $T0a,$A0,$ba
329         fmul    $T0b,$A0,$bb
330         stfd    $N2,56($nap_d)          ; save n[j+1] in double format
331         stfdu   $N3,64($nap_d)
332
333         fmadd   $T1a,$A0,$bc,$T1a
334         fmadd   $T1b,$A0,$bd,$T1b
335         fmadd   $T2a,$A1,$bc,$T2a
336         fmadd   $T2b,$A1,$bd,$T2b
337         fmadd   $T3a,$A2,$bc,$T3a
338         fmadd   $T3b,$A2,$bd,$T3b
339         fmul    $dota,$A3,$bc
340         fmul    $dotb,$A3,$bd
341
342         fmadd   $T1a,$N1,$na,$T1a
343         fmadd   $T1b,$N1,$nb,$T1b
344         fmadd   $T2a,$N2,$na,$T2a
345         fmadd   $T2b,$N2,$nb,$T2b
346         fmadd   $T3a,$N3,$na,$T3a
347         fmadd   $T3b,$N3,$nb,$T3b
348         fmadd   $T0a,$N0,$na,$T0a
349         fmadd   $T0b,$N0,$nb,$T0b
350
351         fmadd   $T1a,$N0,$nc,$T1a
352         fmadd   $T1b,$N0,$nd,$T1b
353         fmadd   $T2a,$N1,$nc,$T2a
354         fmadd   $T2b,$N1,$nd,$T2b
355         fmadd   $T3a,$N2,$nc,$T3a
356         fmadd   $T3b,$N2,$nd,$T3b
357         fmadd   $dota,$N3,$nc,$dota
358         fmadd   $dotb,$N3,$nd,$dotb
359
360         fctid   $T0a,$T0a
361         fctid   $T0b,$T0b
362         fctid   $T1a,$T1a
363         fctid   $T1b,$T1b
364         fctid   $T2a,$T2a
365         fctid   $T2b,$T2b
366         fctid   $T3a,$T3a
367         fctid   $T3b,$T3b
368
369         stfd    $T0a,`$FRAME+0`($sp)
370         stfd    $T0b,`$FRAME+8`($sp)
371         stfd    $T1a,`$FRAME+16`($sp)
372         stfd    $T1b,`$FRAME+24`($sp)
373         stfd    $T2a,`$FRAME+32`($sp)
374         stfd    $T2b,`$FRAME+40`($sp)
375         stfd    $T3a,`$FRAME+48`($sp)
376         stfd    $T3b,`$FRAME+56`($sp)
377 \f
378 .align  5
379 L1st:
380         lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
381         lwz     $t1,0($ap)
382         lwz     $t2,4($np)              ; load n[j] as 32-bit word pair
383         lwz     $t3,0($np)
384         lwz     $t4,12($ap)             ; load a[j+1] as 32-bit word pair
385         lwz     $t5,8($ap)
386         lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
387         lwz     $t7,8($np)
388         std     $t0,`$FRAME+64`($sp)
389         std     $t1,`$FRAME+72`($sp)
390         std     $t2,`$FRAME+80`($sp)
391         std     $t3,`$FRAME+88`($sp)
392         std     $t4,`$FRAME+96`($sp)
393         std     $t5,`$FRAME+104`($sp)
394         std     $t6,`$FRAME+112`($sp)
395         std     $t7,`$FRAME+120`($sp)
396         ld      $t0,`$FRAME+0`($sp)
397         ld      $t1,`$FRAME+8`($sp)
398         ld      $t2,`$FRAME+16`($sp)
399         ld      $t3,`$FRAME+24`($sp)
400         ld      $t4,`$FRAME+32`($sp)
401         ld      $t5,`$FRAME+40`($sp)
402         ld      $t6,`$FRAME+48`($sp)
403         ld      $t7,`$FRAME+56`($sp)
404         lfd     $A0,`$FRAME+64`($sp)
405         lfd     $A1,`$FRAME+72`($sp)
406         lfd     $N0,`$FRAME+80`($sp)
407         lfd     $N1,`$FRAME+88`($sp)
408         lfd     $A2,`$FRAME+96`($sp)
409         lfd     $A3,`$FRAME+104`($sp)
410         lfd     $N2,`$FRAME+112`($sp)
411         lfd     $N3,`$FRAME+120`($sp)
412         fcfid   $A0,$A0
413         fcfid   $A1,$A1
414         fcfid   $N0,$N0
415         fcfid   $N1,$N1
416         fcfid   $A2,$A2
417         fcfid   $A3,$A3
418         fcfid   $N2,$N2
419         fcfid   $N3,$N3
420         addi    $ap,$ap,16
421         addi    $np,$np,16
422
423         fmul    $T1a,$A1,$ba
424         fmul    $T1b,$A1,$bb
425         stfd    $A0,8($nap_d)           ; save a[j] in double format
426         stfd    $A1,16($nap_d)
427         fmul    $T2a,$A2,$ba
428         fmul    $T2b,$A2,$bb
429         stfd    $N0,24($nap_d)          ; save n[j] in double format
430         stfd    $N1,32($nap_d)
431          add    $t0,$t0,$carry          ; can not overflow
432         fmul    $T3a,$A3,$ba
433         fmul    $T3b,$A3,$bb
434         stfd    $A2,40($nap_d)          ; save a[j+1] in double format
435         stfd    $A3,48($nap_d)
436         fmadd   $T0a,$A0,$ba,$dota
437         fmadd   $T0b,$A0,$bb,$dotb
438         stfd    $N2,56($nap_d)          ; save n[j+1] in double format
439         stfdu   $N3,64($nap_d)
440          srdi   $carry,$t0,16
441
442          add    $t1,$t1,$carry
443         fmadd   $T1a,$A0,$bc,$T1a
444         fmadd   $T1b,$A0,$bd,$T1b
445         fmadd   $T2a,$A1,$bc,$T2a
446         fmadd   $T2b,$A1,$bd,$T2b
447          srdi   $carry,$t1,16
448          insrdi $t0,$t1,16,32
449         fmadd   $T3a,$A2,$bc,$T3a
450         fmadd   $T3b,$A2,$bd,$T3b
451         fmul    $dota,$A3,$bc
452         fmul    $dotb,$A3,$bd
453          add    $t2,$t2,$carry
454
455          srdi   $carry,$t2,16
456          insrdi $t0,$t2,16,16
457         fmadd   $T1a,$N1,$na,$T1a
458         fmadd   $T1b,$N1,$nb,$T1b
459         fmadd   $T2a,$N2,$na,$T2a
460         fmadd   $T2b,$N2,$nb,$T2b
461          add    $t3,$t3,$carry
462         fmadd   $T3a,$N3,$na,$T3a
463         fmadd   $T3b,$N3,$nb,$T3b
464         fmadd   $T0a,$N0,$na,$T0a
465         fmadd   $T0b,$N0,$nb,$T0b
466          srdi   $carry,$t3,16
467          insrdi $t0,$t3,16,0            ; 0..63 bits
468
469          add    $t4,$t4,$carry
470         fmadd   $T1a,$N0,$nc,$T1a
471         fmadd   $T1b,$N0,$nd,$T1b
472         fmadd   $T2a,$N1,$nc,$T2a
473         fmadd   $T2b,$N1,$nd,$T2b
474          srdi   $carry,$t4,16
475         fmadd   $T3a,$N2,$nc,$T3a
476         fmadd   $T3b,$N2,$nd,$T3b
477         fmadd   $dota,$N3,$nc,$dota
478         fmadd   $dotb,$N3,$nd,$dotb
479          add    $t5,$t5,$carry
480
481          srdi   $carry,$t5,16
482          insrdi $t4,$t5,16,32
483         fctid   $T0a,$T0a
484         fctid   $T0b,$T0b
485          add    $t6,$t6,$carry
486         fctid   $T1a,$T1a
487         fctid   $T1b,$T1b
488          srdi   $carry,$t6,16
489          insrdi $t4,$t6,16,16
490         fctid   $T2a,$T2a
491         fctid   $T2b,$T2b
492          add    $t7,$t7,$carry
493         fctid   $T3a,$T3a
494         fctid   $T3b,$T3b
495          insrdi $t4,$t7,16,0            ; 64..127 bits
496          srdi   $carry,$t7,16           ; upper 33 bits
497
498         stfd    $T0a,`$FRAME+0`($sp)
499         stfd    $T0b,`$FRAME+8`($sp)
500         stfd    $T1a,`$FRAME+16`($sp)
501         stfd    $T1b,`$FRAME+24`($sp)
502         stfd    $T2a,`$FRAME+32`($sp)
503         stfd    $T2b,`$FRAME+40`($sp)
504         stfd    $T3a,`$FRAME+48`($sp)
505         stfd    $T3b,`$FRAME+56`($sp)
506          std    $t0,8($tp)              ; tp[j-1]
507          stdu   $t4,16($tp)             ; tp[j]
508         bdnz-   L1st
509 \f
510         fctid   $dota,$dota
511         fctid   $dotb,$dotb
512
513         ld      $t0,`$FRAME+0`($sp)
514         ld      $t1,`$FRAME+8`($sp)
515         ld      $t2,`$FRAME+16`($sp)
516         ld      $t3,`$FRAME+24`($sp)
517         ld      $t4,`$FRAME+32`($sp)
518         ld      $t5,`$FRAME+40`($sp)
519         ld      $t6,`$FRAME+48`($sp)
520         ld      $t7,`$FRAME+56`($sp)
521         stfd    $dota,`$FRAME+64`($sp)
522         stfd    $dotb,`$FRAME+72`($sp)
523
524         add     $t0,$t0,$carry          ; can not overflow
525         srdi    $carry,$t0,16
526         add     $t1,$t1,$carry
527         srdi    $carry,$t1,16
528         insrdi  $t0,$t1,16,32
529         add     $t2,$t2,$carry
530         srdi    $carry,$t2,16
531         insrdi  $t0,$t2,16,16
532         add     $t3,$t3,$carry
533         srdi    $carry,$t3,16
534         insrdi  $t0,$t3,16,0            ; 0..63 bits
535         add     $t4,$t4,$carry
536         srdi    $carry,$t4,16
537         add     $t5,$t5,$carry
538         srdi    $carry,$t5,16
539         insrdi  $t4,$t5,16,32
540         add     $t6,$t6,$carry
541         srdi    $carry,$t6,16
542         insrdi  $t4,$t6,16,16
543         add     $t7,$t7,$carry
544         insrdi  $t4,$t7,16,0            ; 64..127 bits
545         srdi    $carry,$t7,16           ; upper 33 bits
546         ld      $t6,`$FRAME+64`($sp)
547         ld      $t7,`$FRAME+72`($sp)
548
549         std     $t0,8($tp)              ; tp[j-1]
550         stdu    $t4,16($tp)             ; tp[j]
551
552         add     $t6,$t6,$carry          ; can not overflow
553         srdi    $carry,$t6,16
554         add     $t7,$t7,$carry
555         insrdi  $t6,$t7,48,0
556         srdi    $ovf,$t7,48
557         std     $t6,8($tp)              ; tp[num-1]
558
559         slwi    $t7,$num,2
560         subf    $nap_d,$t7,$nap_d       ; rewind pointer
561 \f
562         li      $i,8                    ; i=1
563 .align  5
564 Louter:
565         ldx     $t3,$bp,$i      ; bp[i]
566         ld      $t6,`$FRAME+$TRANSFER+8`($sp)   ; tp[0]
567         mulld   $t7,$a0,$t3     ; ap[0]*bp[i]
568
569         addi    $tp,$sp,`$FRAME+$TRANSFER`
570         add     $t7,$t7,$t6     ; ap[0]*bp[i]+tp[0]
571         li      $carry,0
572         mulld   $t7,$t7,$n0     ; tp[0]*n0
573         mtctr   $j
574
575         ; transfer bp[i] to FPU as 4x16-bit values
576         extrdi  $t0,$t3,16,48
577         extrdi  $t1,$t3,16,32
578         extrdi  $t2,$t3,16,16
579         extrdi  $t3,$t3,16,0
580         std     $t0,`$FRAME+0`($sp)
581         std     $t1,`$FRAME+8`($sp)
582         std     $t2,`$FRAME+16`($sp)
583         std     $t3,`$FRAME+24`($sp)
584         ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
585         extrdi  $t4,$t7,16,48
586         extrdi  $t5,$t7,16,32
587         extrdi  $t6,$t7,16,16
588         extrdi  $t7,$t7,16,0
589         std     $t4,`$FRAME+32`($sp)
590         std     $t5,`$FRAME+40`($sp)
591         std     $t6,`$FRAME+48`($sp)
592         std     $t7,`$FRAME+56`($sp)
593
594         lfd     $A0,8($nap_d)           ; load a[j] in double format
595         lfd     $A1,16($nap_d)
596         lfd     $N0,24($nap_d)          ; load n[j] in double format
597         lfd     $N1,32($nap_d)
598         lfd     $A2,40($nap_d)          ; load a[j+1] in double format
599         lfd     $A3,48($nap_d)
600         lfd     $N2,56($nap_d)          ; load n[j+1] in double format
601         lfdu    $N3,64($nap_d)
602
603         lfd     $ba,`$FRAME+0`($sp)
604         lfd     $bb,`$FRAME+8`($sp)
605         lfd     $bc,`$FRAME+16`($sp)
606         lfd     $bd,`$FRAME+24`($sp)
607         lfd     $na,`$FRAME+32`($sp)
608         lfd     $nb,`$FRAME+40`($sp)
609         lfd     $nc,`$FRAME+48`($sp)
610         lfd     $nd,`$FRAME+56`($sp)
611
612         fcfid   $ba,$ba
613         fcfid   $bb,$bb
614         fcfid   $bc,$bc
615         fcfid   $bd,$bd
616         fcfid   $na,$na
617         fcfid   $nb,$nb
618         fcfid   $nc,$nc
619         fcfid   $nd,$nd
620
621         fmul    $T1a,$A1,$ba
622         fmul    $T1b,$A1,$bb
623         fmul    $T2a,$A2,$ba
624         fmul    $T2b,$A2,$bb
625         fmul    $T3a,$A3,$ba
626         fmul    $T3b,$A3,$bb
627         fmul    $T0a,$A0,$ba
628         fmul    $T0b,$A0,$bb
629
630         fmadd   $T1a,$A0,$bc,$T1a
631         fmadd   $T1b,$A0,$bd,$T1b
632         fmadd   $T2a,$A1,$bc,$T2a
633         fmadd   $T2b,$A1,$bd,$T2b
634         fmadd   $T3a,$A2,$bc,$T3a
635         fmadd   $T3b,$A2,$bd,$T3b
636         fmul    $dota,$A3,$bc
637         fmul    $dotb,$A3,$bd
638
639         fmadd   $T1a,$N1,$na,$T1a
640         fmadd   $T1b,$N1,$nb,$T1b
641         fmadd   $T2a,$N2,$na,$T2a
642         fmadd   $T2b,$N2,$nb,$T2b
643         fmadd   $T3a,$N3,$na,$T3a
644         fmadd   $T3b,$N3,$nb,$T3b
645         fmadd   $T0a,$N0,$na,$T0a
646         fmadd   $T0b,$N0,$nb,$T0b
647
648         fmadd   $T1a,$N0,$nc,$T1a
649         fmadd   $T1b,$N0,$nd,$T1b
650         fmadd   $T2a,$N1,$nc,$T2a
651         fmadd   $T2b,$N1,$nd,$T2b
652         fmadd   $T3a,$N2,$nc,$T3a
653         fmadd   $T3b,$N2,$nd,$T3b
654         fmadd   $dota,$N3,$nc,$dota
655         fmadd   $dotb,$N3,$nd,$dotb
656
657         fctid   $T0a,$T0a
658         fctid   $T0b,$T0b
659         fctid   $T1a,$T1a
660         fctid   $T1b,$T1b
661         fctid   $T2a,$T2a
662         fctid   $T2b,$T2b
663         fctid   $T3a,$T3a
664         fctid   $T3b,$T3b
665
666         stfd    $T0a,`$FRAME+0`($sp)
667         stfd    $T0b,`$FRAME+8`($sp)
668         stfd    $T1a,`$FRAME+16`($sp)
669         stfd    $T1b,`$FRAME+24`($sp)
670         stfd    $T2a,`$FRAME+32`($sp)
671         stfd    $T2b,`$FRAME+40`($sp)
672         stfd    $T3a,`$FRAME+48`($sp)
673         stfd    $T3b,`$FRAME+56`($sp)
674 \f
675 .align  5
676 Linner:
677         lfd     $A0,8($nap_d)           ; load a[j] in double format
678         lfd     $A1,16($nap_d)
679         lfd     $N0,24($nap_d)          ; load n[j] in double format
680         lfd     $N1,32($nap_d)
681         lfd     $A2,40($nap_d)          ; load a[j+1] in double format
682         lfd     $A3,48($nap_d)
683         lfd     $N2,56($nap_d)          ; load n[j+1] in double format
684         lfdu    $N3,64($nap_d)
685
686          ld     $t0,`$FRAME+0`($sp)
687          ld     $t1,`$FRAME+8`($sp)
688          ld     $t2,`$FRAME+16`($sp)
689          ld     $t3,`$FRAME+24`($sp)
690
691         fmul    $T1a,$A1,$ba
692         fmul    $T1b,$A1,$bb
693          ld     $t4,`$FRAME+32`($sp)
694          ld     $t5,`$FRAME+40`($sp)
695         fmul    $T2a,$A2,$ba
696         fmul    $T2b,$A2,$bb
697          add    $t0,$t0,$carry          ; can not overflow
698         fmul    $T3a,$A3,$ba
699         fmul    $T3b,$A3,$bb
700          ld     $t6,`$FRAME+48`($sp)
701          ld     $t7,`$FRAME+56`($sp)
702         fmadd   $T0a,$A0,$ba,$dota
703         fmadd   $T0b,$A0,$bb,$dotb
704          srdi   $carry,$t0,16
705          add    $t1,$t1,$carry
706         fmadd   $T1a,$A0,$bc,$T1a
707         fmadd   $T1b,$A0,$bd,$T1b
708         fmadd   $T2a,$A1,$bc,$T2a
709         fmadd   $T2b,$A1,$bd,$T2b
710          srdi   $carry,$t1,16
711          insrdi $t0,$t1,16,32
712         fmadd   $T3a,$A2,$bc,$T3a
713         fmadd   $T3b,$A2,$bd,$T3b
714         fmul    $dota,$A3,$bc
715         fmul    $dotb,$A3,$bd
716          add    $t2,$t2,$carry
717          ldu    $t1,8($tp)              ; tp[j]
718          srdi   $carry,$t2,16
719          insrdi $t0,$t2,16,16
720         fmadd   $T1a,$N1,$na,$T1a
721         fmadd   $T1b,$N1,$nb,$T1b
722         fmadd   $T2a,$N2,$na,$T2a
723         fmadd   $T2b,$N2,$nb,$T2b
724          add    $t3,$t3,$carry
725          ldu    $t2,8($tp)              ; tp[j+1]
726         fmadd   $T3a,$N3,$na,$T3a
727         fmadd   $T3b,$N3,$nb,$T3b
728         fmadd   $T0a,$N0,$na,$T0a
729         fmadd   $T0b,$N0,$nb,$T0b
730          srdi   $carry,$t3,16
731          insrdi $t0,$t3,16,0            ; 0..63 bits
732          add    $t4,$t4,$carry
733         fmadd   $T1a,$N0,$nc,$T1a
734         fmadd   $T1b,$N0,$nd,$T1b
735         fmadd   $T2a,$N1,$nc,$T2a
736         fmadd   $T2b,$N1,$nd,$T2b
737          srdi   $carry,$t4,16
738         fmadd   $T3a,$N2,$nc,$T3a
739         fmadd   $T3b,$N2,$nd,$T3b
740         fmadd   $dota,$N3,$nc,$dota
741         fmadd   $dotb,$N3,$nd,$dotb
742          add    $t5,$t5,$carry
743          srdi   $carry,$t5,16
744          insrdi $t4,$t5,16,32
745         fctid   $T0a,$T0a
746         fctid   $T0b,$T0b
747         fctid   $T1a,$T1a
748         fctid   $T1b,$T1b
749          add    $t6,$t6,$carry
750         fctid   $T2a,$T2a
751         fctid   $T2b,$T2b
752          srdi   $carry,$t6,16
753          insrdi $t4,$t6,16,16
754         fctid   $T3a,$T3a
755         fctid   $T3b,$T3b
756
757         stfd    $T0a,`$FRAME+0`($sp)
758         stfd    $T0b,`$FRAME+8`($sp)
759          add    $t7,$t7,$carry
760          addc   $t3,$t0,$t1
761         stfd    $T1a,`$FRAME+16`($sp)
762         stfd    $T1b,`$FRAME+24`($sp)
763          insrdi $t4,$t7,16,0            ; 64..127 bits
764          srdi   $carry,$t7,16           ; upper 33 bits
765         stfd    $T2a,`$FRAME+32`($sp)
766         stfd    $T2b,`$FRAME+40`($sp)
767          adde   $t5,$t4,$t2
768         stfd    $T3a,`$FRAME+48`($sp)
769         stfd    $T3b,`$FRAME+56`($sp)
770          addze  $carry,$carry
771          std    $t3,-16($tp)            ; tp[j-1]
772          std    $t5,-8($tp)             ; tp[j]
773         bdnz-   Linner
774 \f
775         fctid   $dota,$dota
776         fctid   $dotb,$dotb
777         ld      $t0,`$FRAME+0`($sp)
778         ld      $t1,`$FRAME+8`($sp)
779         ld      $t2,`$FRAME+16`($sp)
780         ld      $t3,`$FRAME+24`($sp)
781         ld      $t4,`$FRAME+32`($sp)
782         ld      $t5,`$FRAME+40`($sp)
783         ld      $t6,`$FRAME+48`($sp)
784         ld      $t7,`$FRAME+56`($sp)
785         stfd    $dota,`$FRAME+64`($sp)
786         stfd    $dotb,`$FRAME+72`($sp)
787
788         add     $t0,$t0,$carry          ; can not overflow
789         srdi    $carry,$t0,16
790         add     $t1,$t1,$carry
791         srdi    $carry,$t1,16
792         insrdi  $t0,$t1,16,32
793         add     $t2,$t2,$carry
794         ld      $t1,8($tp)              ; tp[j]
795         srdi    $carry,$t2,16
796         insrdi  $t0,$t2,16,16
797         add     $t3,$t3,$carry
798         ldu     $t2,16($tp)             ; tp[j+1]
799         srdi    $carry,$t3,16
800         insrdi  $t0,$t3,16,0            ; 0..63 bits
801         add     $t4,$t4,$carry
802         srdi    $carry,$t4,16
803         add     $t5,$t5,$carry
804         srdi    $carry,$t5,16
805         insrdi  $t4,$t5,16,32
806         add     $t6,$t6,$carry
807         srdi    $carry,$t6,16
808         insrdi  $t4,$t6,16,16
809         add     $t7,$t7,$carry
810         insrdi  $t4,$t7,16,0            ; 64..127 bits
811         srdi    $carry,$t7,16           ; upper 33 bits
812         ld      $t6,`$FRAME+64`($sp)
813         ld      $t7,`$FRAME+72`($sp)
814
815         addc    $t3,$t0,$t1
816         adde    $t5,$t4,$t2
817         addze   $carry,$carry
818
819         std     $t3,-16($tp)            ; tp[j-1]
820         std     $t5,-8($tp)             ; tp[j]
821
822         add     $carry,$carry,$ovf      ; comsume upmost overflow
823         add     $t6,$t6,$carry          ; can not overflow
824         srdi    $carry,$t6,16
825         add     $t7,$t7,$carry
826         insrdi  $t6,$t7,48,0
827         srdi    $ovf,$t7,48
828         std     $t6,0($tp)              ; tp[num-1]
829
830         slwi    $t7,$num,2
831         addi    $i,$i,8
832         subf    $nap_d,$t7,$nap_d       ; rewind pointer
833         cmpw    $i,$num
834         blt-    Louter
835 \f
836         subf    $np,$num,$np    ; rewind np
837         addi    $j,$j,1         ; restore counter
838         subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
839         addi    $tp,$sp,`$FRAME+$TRANSFER+8`
840         addi    $t4,$sp,`$FRAME+$TRANSFER+16`
841         addi    $t5,$np,8
842         addi    $t6,$rp,8
843         mtctr   $j
844
845 .align  4
846 Lsub:   ldx     $t0,$tp,$i
847         ldx     $t1,$np,$i
848         ldx     $t2,$t4,$i
849         ldx     $t3,$t5,$i
850         subfe   $t0,$t1,$t0     ; tp[j]-np[j]
851         subfe   $t2,$t3,$t2     ; tp[j+1]-np[j+1]
852         stdx    $t0,$rp,$i
853         stdx    $t2,$t6,$i
854         addi    $i,$i,16
855         bdnz-   Lsub
856
857         li      $i,0
858         subfe   $ovf,$i,$ovf    ; handle upmost overflow bit
859         and     $ap,$tp,$ovf
860         andc    $np,$rp,$ovf
861         or      $ap,$ap,$np     ; ap=borrow?tp:rp
862         addi    $t7,$ap,8
863         mtctr   $j
864
865 .align  4
866 Lcopy:                          ; copy or in-place refresh
867         ldx     $t0,$ap,$i
868         ldx     $t1,$t7,$i
869         std     $i,8($nap_d)    ; zap nap_d
870         std     $i,16($nap_d)
871         std     $i,24($nap_d)
872         std     $i,32($nap_d)
873         std     $i,40($nap_d)
874         std     $i,48($nap_d)
875         std     $i,56($nap_d)
876         stdu    $i,64($nap_d)
877         stdx    $t0,$rp,$i
878         stdx    $t1,$t6,$i
879         stdx    $i,$tp,$i       ; zap tp at once
880         stdx    $i,$t4,$i
881         addi    $i,$i,16
882         bdnz-   Lcopy
883
884         $POP    r14,`2*$SIZE_T`($sp)
885         $POP    r15,`3*$SIZE_T`($sp)
886         $POP    r16,`4*$SIZE_T`($sp)
887         $POP    r17,`5*$SIZE_T`($sp)
888         $POP    r18,`6*$SIZE_T`($sp)
889         $POP    r19,`7*$SIZE_T`($sp)
890         $POP    r20,`8*$SIZE_T`($sp)
891         $POP    r21,`9*$SIZE_T`($sp)
892         $POP    r22,`10*$SIZE_T`($sp)
893         $POP    r23,`11*$SIZE_T`($sp)
894         lfd     f14,`12*$SIZE_T+0`($sp)
895         lfd     f15,`12*$SIZE_T+8`($sp)
896         lfd     f16,`12*$SIZE_T+16`($sp)
897         lfd     f17,`12*$SIZE_T+24`($sp)
898         lfd     f18,`12*$SIZE_T+32`($sp)
899         lfd     f19,`12*$SIZE_T+40`($sp)
900         lfd     f20,`12*$SIZE_T+48`($sp)
901         lfd     f21,`12*$SIZE_T+56`($sp)
902         lfd     f22,`12*$SIZE_T+64`($sp)
903         lfd     f23,`12*$SIZE_T+72`($sp)
904         lfd     f24,`12*$SIZE_T+80`($sp)
905         lfd     f25,`12*$SIZE_T+88`($sp)
906         $POP    $sp,0($sp)
907         li      r3,1    ; signal "handled"
908         blr
909         .long   0
910 .asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
911 ___
912
913 $code =~ s/\`([^\`]*)\`/eval $1/gem;
914 print $code;
915 close STDOUT;