d9cc983b93206d67bf669a14f8a415dde9ff479b
[openssl.git] / crypto / bn / asm / ppc-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # April 2006
18
19 # "Teaser" Montgomery multiplication module for PowerPC. It's possible
20 # to gain a bit more by modulo-scheduling outer loop, then dedicated
21 # squaring procedure should give further 20% and code can be adapted
22 # for 32-bit application running on 64-bit CPU. As for the latter.
23 # It won't be able to achieve "native" 64-bit performance, because in
24 # 32-bit application context every addc instruction will have to be
25 # expanded as addc, twice right shift by 32 and finally adde, etc.
26 # So far RSA *sign* performance improvement over pre-bn_mul_mont asm
27 # for 64-bit application running on PPC970/G5 is:
28 #
29 # 512-bit       +65%
30 # 1024-bit      +35%
31 # 2048-bit      +18%
32 # 4096-bit      +4%
33
34 # September 2016
35 #
36 # Add multiplication procedure operating on lengths divisible by 4
37 # and squaring procedure operating on lengths divisible by 8. Length
38 # is expressed in number of limbs. RSA private key operations are
39 # ~35-50% faster (more for longer keys) on contemporary high-end POWER
40 # processors in 64-bit builds, [mysteriously enough] more in 32-bit
41 # builds. On low-end 32-bit processors performance improvement turned
42 # to be marginal...
43
44 $flavour = shift;
45
46 if ($flavour =~ /32/) {
47         $BITS=  32;
48         $BNSZ=  $BITS/8;
49         $SIZE_T=4;
50         $RZONE= 224;
51
52         $LD=    "lwz";          # load
53         $LDU=   "lwzu";         # load and update
54         $LDX=   "lwzx";         # load indexed
55         $ST=    "stw";          # store
56         $STU=   "stwu";         # store and update
57         $STX=   "stwx";         # store indexed
58         $STUX=  "stwux";        # store indexed and update
59         $UMULL= "mullw";        # unsigned multiply low
60         $UMULH= "mulhwu";       # unsigned multiply high
61         $UCMP=  "cmplw";        # unsigned compare
62         $SHRI=  "srwi";         # unsigned shift right by immediate
63         $SHLI=  "slwi";         # unsigned shift left by immediate
64         $PUSH=  $ST;
65         $POP=   $LD;
66 } elsif ($flavour =~ /64/) {
67         $BITS=  64;
68         $BNSZ=  $BITS/8;
69         $SIZE_T=8;
70         $RZONE= 288;
71
72         # same as above, but 64-bit mnemonics...
73         $LD=    "ld";           # load
74         $LDU=   "ldu";          # load and update
75         $LDX=   "ldx";          # load indexed
76         $ST=    "std";          # store
77         $STU=   "stdu";         # store and update
78         $STX=   "stdx";         # store indexed
79         $STUX=  "stdux";        # store indexed and update
80         $UMULL= "mulld";        # unsigned multiply low
81         $UMULH= "mulhdu";       # unsigned multiply high
82         $UCMP=  "cmpld";        # unsigned compare
83         $SHRI=  "srdi";         # unsigned shift right by immediate
84         $SHLI=  "sldi";         # unsigned shift left by immediate
85         $PUSH=  $ST;
86         $POP=   $LD;
87 } else { die "nonsense $flavour"; }
88
89 $FRAME=8*$SIZE_T+$RZONE;
90 $LOCALS=8*$SIZE_T;
91
92 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
93 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
94 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
95 die "can't locate ppc-xlate.pl";
96
97 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
98
99 $sp="r1";
100 $toc="r2";
101 $rp="r3";
102 $ap="r4";
103 $bp="r5";
104 $np="r6";
105 $n0="r7";
106 $num="r8";
107
108 {
109 my $ovf=$rp;
110 my $rp="r9";    # $rp is reassigned
111 my $aj="r10";
112 my $nj="r11";
113 my $tj="r12";
114 # non-volatile registers
115 my $i="r20";
116 my $j="r21";
117 my $tp="r22";
118 my $m0="r23";
119 my $m1="r24";
120 my $lo0="r25";
121 my $hi0="r26";
122 my $lo1="r27";
123 my $hi1="r28";
124 my $alo="r29";
125 my $ahi="r30";
126 my $nlo="r31";
127 #
128 my $nhi="r0";
129
130 $code=<<___;
131 .machine "any"
132 .text
133
134 .globl  .bn_mul_mont_int
135 .align  5
136 .bn_mul_mont_int:
137         mr      $rp,r3          ; $rp is reassigned
138         li      r3,0
139 ___
140 $code.=<<___ if ($BNSZ==4);
141         cmpwi   $num,32         ; longer key performance is not better
142         bgelr
143 ___
144 $code.=<<___;
145         slwi    $num,$num,`log($BNSZ)/log(2)`
146         li      $tj,-4096
147         addi    $ovf,$num,$FRAME
148         subf    $ovf,$ovf,$sp   ; $sp-$ovf
149         and     $ovf,$ovf,$tj   ; minimize TLB usage
150         subf    $ovf,$sp,$ovf   ; $ovf-$sp
151         mr      $tj,$sp
152         srwi    $num,$num,`log($BNSZ)/log(2)`
153         $STUX   $sp,$sp,$ovf
154
155         $PUSH   r20,`-12*$SIZE_T`($tj)
156         $PUSH   r21,`-11*$SIZE_T`($tj)
157         $PUSH   r22,`-10*$SIZE_T`($tj)
158         $PUSH   r23,`-9*$SIZE_T`($tj)
159         $PUSH   r24,`-8*$SIZE_T`($tj)
160         $PUSH   r25,`-7*$SIZE_T`($tj)
161         $PUSH   r26,`-6*$SIZE_T`($tj)
162         $PUSH   r27,`-5*$SIZE_T`($tj)
163         $PUSH   r28,`-4*$SIZE_T`($tj)
164         $PUSH   r29,`-3*$SIZE_T`($tj)
165         $PUSH   r30,`-2*$SIZE_T`($tj)
166         $PUSH   r31,`-1*$SIZE_T`($tj)
167
168         $LD     $n0,0($n0)      ; pull n0[0] value
169         addi    $num,$num,-2    ; adjust $num for counter register
170 \f
171         $LD     $m0,0($bp)      ; m0=bp[0]
172         $LD     $aj,0($ap)      ; ap[0]
173         addi    $tp,$sp,$LOCALS
174         $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
175         $UMULH  $hi0,$aj,$m0
176
177         $LD     $aj,$BNSZ($ap)  ; ap[1]
178         $LD     $nj,0($np)      ; np[0]
179
180         $UMULL  $m1,$lo0,$n0    ; "tp[0]"*n0
181
182         $UMULL  $alo,$aj,$m0    ; ap[1]*bp[0]
183         $UMULH  $ahi,$aj,$m0
184
185         $UMULL  $lo1,$nj,$m1    ; np[0]*m1
186         $UMULH  $hi1,$nj,$m1
187         $LD     $nj,$BNSZ($np)  ; np[1]
188         addc    $lo1,$lo1,$lo0
189         addze   $hi1,$hi1
190
191         $UMULL  $nlo,$nj,$m1    ; np[1]*m1
192         $UMULH  $nhi,$nj,$m1
193
194         mtctr   $num
195         li      $j,`2*$BNSZ`
196 .align  4
197 L1st:
198         $LDX    $aj,$ap,$j      ; ap[j]
199         addc    $lo0,$alo,$hi0
200         $LDX    $nj,$np,$j      ; np[j]
201         addze   $hi0,$ahi
202         $UMULL  $alo,$aj,$m0    ; ap[j]*bp[0]
203         addc    $lo1,$nlo,$hi1
204         $UMULH  $ahi,$aj,$m0
205         addze   $hi1,$nhi
206         $UMULL  $nlo,$nj,$m1    ; np[j]*m1
207         addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
208         $UMULH  $nhi,$nj,$m1
209         addze   $hi1,$hi1
210         $ST     $lo1,0($tp)     ; tp[j-1]
211
212         addi    $j,$j,$BNSZ     ; j++
213         addi    $tp,$tp,$BNSZ   ; tp++
214         bdnz    L1st
215 ;L1st
216         addc    $lo0,$alo,$hi0
217         addze   $hi0,$ahi
218
219         addc    $lo1,$nlo,$hi1
220         addze   $hi1,$nhi
221         addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
222         addze   $hi1,$hi1
223         $ST     $lo1,0($tp)     ; tp[j-1]
224
225         li      $ovf,0
226         addc    $hi1,$hi1,$hi0
227         addze   $ovf,$ovf       ; upmost overflow bit
228         $ST     $hi1,$BNSZ($tp)
229 \f
230         li      $i,$BNSZ
231 .align  4
232 Louter:
233         $LDX    $m0,$bp,$i      ; m0=bp[i]
234         $LD     $aj,0($ap)      ; ap[0]
235         addi    $tp,$sp,$LOCALS
236         $LD     $tj,$LOCALS($sp); tp[0]
237         $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
238         $UMULH  $hi0,$aj,$m0
239         $LD     $aj,$BNSZ($ap)  ; ap[1]
240         $LD     $nj,0($np)      ; np[0]
241         addc    $lo0,$lo0,$tj   ; ap[0]*bp[i]+tp[0]
242         $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
243         addze   $hi0,$hi0
244         $UMULL  $m1,$lo0,$n0    ; tp[0]*n0
245         $UMULH  $ahi,$aj,$m0
246         $UMULL  $lo1,$nj,$m1    ; np[0]*m1
247         $UMULH  $hi1,$nj,$m1
248         $LD     $nj,$BNSZ($np)  ; np[1]
249         addc    $lo1,$lo1,$lo0
250         $UMULL  $nlo,$nj,$m1    ; np[1]*m1
251         addze   $hi1,$hi1
252         $UMULH  $nhi,$nj,$m1
253 \f
254         mtctr   $num
255         li      $j,`2*$BNSZ`
256 .align  4
257 Linner:
258         $LDX    $aj,$ap,$j      ; ap[j]
259         addc    $lo0,$alo,$hi0
260         $LD     $tj,$BNSZ($tp)  ; tp[j]
261         addze   $hi0,$ahi
262         $LDX    $nj,$np,$j      ; np[j]
263         addc    $lo1,$nlo,$hi1
264         $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
265         addze   $hi1,$nhi
266         $UMULH  $ahi,$aj,$m0
267         addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
268         $UMULL  $nlo,$nj,$m1    ; np[j]*m1
269         addze   $hi0,$hi0
270         $UMULH  $nhi,$nj,$m1
271         addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
272         addi    $j,$j,$BNSZ     ; j++
273         addze   $hi1,$hi1
274         $ST     $lo1,0($tp)     ; tp[j-1]
275         addi    $tp,$tp,$BNSZ   ; tp++
276         bdnz    Linner
277 ;Linner
278         $LD     $tj,$BNSZ($tp)  ; tp[j]
279         addc    $lo0,$alo,$hi0
280         addze   $hi0,$ahi
281         addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
282         addze   $hi0,$hi0
283
284         addc    $lo1,$nlo,$hi1
285         addze   $hi1,$nhi
286         addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
287         addze   $hi1,$hi1
288         $ST     $lo1,0($tp)     ; tp[j-1]
289
290         addic   $ovf,$ovf,-1    ; move upmost overflow to XER[CA]
291         li      $ovf,0
292         adde    $hi1,$hi1,$hi0
293         addze   $ovf,$ovf
294         $ST     $hi1,$BNSZ($tp)
295 ;
296         slwi    $tj,$num,`log($BNSZ)/log(2)`
297         $UCMP   $i,$tj
298         addi    $i,$i,$BNSZ
299         ble     Louter
300 \f
301         addi    $num,$num,2     ; restore $num
302         subfc   $j,$j,$j        ; j=0 and "clear" XER[CA]
303         addi    $tp,$sp,$LOCALS
304         mtctr   $num
305
306 .align  4
307 Lsub:   $LDX    $tj,$tp,$j
308         $LDX    $nj,$np,$j
309         subfe   $aj,$nj,$tj     ; tp[j]-np[j]
310         $STX    $aj,$rp,$j
311         addi    $j,$j,$BNSZ
312         bdnz    Lsub
313
314         li      $j,0
315         mtctr   $num
316         subfe   $ovf,$j,$ovf    ; handle upmost overflow bit
317
318 .align  4
319 Lcopy:                          ; conditional copy
320         $LDX    $tj,$tp,$j
321         $LDX    $aj,$rp,$j
322         and     $tj,$tj,$ovf
323         andc    $aj,$aj,$ovf
324         $STX    $j,$tp,$j       ; zap at once
325         or      $aj,$aj,$tj
326         $STX    $aj,$rp,$j
327         addi    $j,$j,$BNSZ
328         bdnz    Lcopy
329
330         $POP    $tj,0($sp)
331         li      r3,1
332         $POP    r20,`-12*$SIZE_T`($tj)
333         $POP    r21,`-11*$SIZE_T`($tj)
334         $POP    r22,`-10*$SIZE_T`($tj)
335         $POP    r23,`-9*$SIZE_T`($tj)
336         $POP    r24,`-8*$SIZE_T`($tj)
337         $POP    r25,`-7*$SIZE_T`($tj)
338         $POP    r26,`-6*$SIZE_T`($tj)
339         $POP    r27,`-5*$SIZE_T`($tj)
340         $POP    r28,`-4*$SIZE_T`($tj)
341         $POP    r29,`-3*$SIZE_T`($tj)
342         $POP    r30,`-2*$SIZE_T`($tj)
343         $POP    r31,`-1*$SIZE_T`($tj)
344         mr      $sp,$tj
345         blr
346         .long   0
347         .byte   0,12,4,0,0x80,12,6,0
348         .long   0
349 .size   .bn_mul_mont_int,.-.bn_mul_mont_int
350 ___
351 }
352 if (1) {
353 my ($a0,$a1,$a2,$a3,
354     $t0,$t1,$t2,$t3,
355     $m0,$m1,$m2,$m3,
356     $acc0,$acc1,$acc2,$acc3,$acc4,
357     $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
358 my  ($carry,$zero) = ($rp,"r0");
359
360 # sp----------->+-------------------------------+
361 #               | saved sp                      |
362 #               +-------------------------------+
363 #               .                               .
364 # +8*size_t     +-------------------------------+
365 #               | 4 "n0*t0"                     |
366 #               .                               .
367 #               .                               .
368 # +12*size_t    +-------------------------------+
369 #               | size_t tmp[num]               |
370 #               .                               .
371 #               .                               .
372 #               .                               .
373 #               +-------------------------------+
374 #               | topmost carry                 |
375 #               .                               .
376 # -18*size_t    +-------------------------------+
377 #               | 18 saved gpr, r14-r31         |
378 #               .                               .
379 #               .                               .
380 #               +-------------------------------+
381 $code.=<<___;
382 .globl  .bn_mul4x_mont_int
383 .align  5
384 .bn_mul4x_mont_int:
385         andi.   r0,$num,7
386         bne     .Lmul4x_do
387         $UCMP   $ap,$bp
388         bne     .Lmul4x_do
389         b       .Lsqr8x_do
390 .Lmul4x_do:
391         slwi    $num,$num,`log($SIZE_T)/log(2)`
392         mr      $a0,$sp
393         li      $a1,-32*$SIZE_T
394         sub     $a1,$a1,$num
395         $STUX   $sp,$sp,$a1             # alloca
396
397         $PUSH   r14,-$SIZE_T*18($a0)
398         $PUSH   r15,-$SIZE_T*17($a0)
399         $PUSH   r16,-$SIZE_T*16($a0)
400         $PUSH   r17,-$SIZE_T*15($a0)
401         $PUSH   r18,-$SIZE_T*14($a0)
402         $PUSH   r19,-$SIZE_T*13($a0)
403         $PUSH   r20,-$SIZE_T*12($a0)
404         $PUSH   r21,-$SIZE_T*11($a0)
405         $PUSH   r22,-$SIZE_T*10($a0)
406         $PUSH   r23,-$SIZE_T*9($a0)
407         $PUSH   r24,-$SIZE_T*8($a0)
408         $PUSH   r25,-$SIZE_T*7($a0)
409         $PUSH   r26,-$SIZE_T*6($a0)
410         $PUSH   r27,-$SIZE_T*5($a0)
411         $PUSH   r28,-$SIZE_T*4($a0)
412         $PUSH   r29,-$SIZE_T*3($a0)
413         $PUSH   r30,-$SIZE_T*2($a0)
414         $PUSH   r31,-$SIZE_T*1($a0)
415
416         subi    $ap,$ap,$SIZE_T         # bias by -1
417         subi    $np,$np,$SIZE_T         # bias by -1
418         subi    $rp,$rp,$SIZE_T         # bias by -1
419         $LD     $n0,0($n0)              # *n0
420
421         add     $t0,$bp,$num
422         add     $ap_end,$ap,$num
423         subi    $t0,$t0,$SIZE_T*4       # &b[num-4]
424
425         $LD     $bi,$SIZE_T*0($bp)      # b[0]
426         li      $acc0,0
427         $LD     $a0,$SIZE_T*1($ap)      # a[0..3]
428         li      $acc1,0
429         $LD     $a1,$SIZE_T*2($ap)
430         li      $acc2,0
431         $LD     $a2,$SIZE_T*3($ap)
432         li      $acc3,0
433         $LDU    $a3,$SIZE_T*4($ap)
434         $LD     $m0,$SIZE_T*1($np)      # n[0..3]
435         $LD     $m1,$SIZE_T*2($np)
436         $LD     $m2,$SIZE_T*3($np)
437         $LDU    $m3,$SIZE_T*4($np)
438
439         $PUSH   $rp,$SIZE_T*6($sp)      # offload rp and &b[num-4]
440         $PUSH   $t0,$SIZE_T*7($sp)
441         li      $carry,0
442         addic   $tp,$sp,$SIZE_T*7       # &t[-1], clear carry bit
443         li      $cnt,0
444         li      $zero,0
445         b       .Loop_mul4x_1st_reduction
446
447 .align  5
448 .Loop_mul4x_1st_reduction:
449         $UMULL  $t0,$a0,$bi             # lo(a[0..3]*b[0])
450         addze   $carry,$carry           # modulo-scheduled
451         $UMULL  $t1,$a1,$bi
452         addi    $cnt,$cnt,$SIZE_T
453         $UMULL  $t2,$a2,$bi
454         andi.   $cnt,$cnt,$SIZE_T*4-1
455         $UMULL  $t3,$a3,$bi
456         addc    $acc0,$acc0,$t0
457         $UMULH  $t0,$a0,$bi             # hi(a[0..3]*b[0])
458         adde    $acc1,$acc1,$t1
459         $UMULH  $t1,$a1,$bi
460         adde    $acc2,$acc2,$t2
461         $UMULL  $mi,$acc0,$n0           # t[0]*n0
462         adde    $acc3,$acc3,$t3
463         $UMULH  $t2,$a2,$bi
464         addze   $acc4,$zero
465         $UMULH  $t3,$a3,$bi
466         $LDX    $bi,$bp,$cnt            # next b[i] (or b[0])
467         addc    $acc1,$acc1,$t0
468         # (*)   mul     $t0,$m0,$mi     # lo(n[0..3]*t[0]*n0)
469         $STU    $mi,$SIZE_T($tp)        # put aside t[0]*n0 for tail processing
470         adde    $acc2,$acc2,$t1
471         $UMULL  $t1,$m1,$mi
472         adde    $acc3,$acc3,$t2
473         $UMULL  $t2,$m2,$mi
474         adde    $acc4,$acc4,$t3         # can't overflow
475         $UMULL  $t3,$m3,$mi
476         # (*)   addc    $acc0,$acc0,$t0
477         # (*)   As for removal of first multiplication and addition
478         #       instructions. The outcome of first addition is
479         #       guaranteed to be zero, which leaves two computationally
480         #       significant outcomes: it either carries or not. Then
481         #       question is when does it carry? Is there alternative
482         #       way to deduce it? If you follow operations, you can
483         #       observe that condition for carry is quite simple:
484         #       $acc0 being non-zero. So that carry can be calculated
485         #       by adding -1 to $acc0. That's what next instruction does.
486         addic   $acc0,$acc0,-1          # (*), discarded
487         $UMULH  $t0,$m0,$mi             # hi(n[0..3]*t[0]*n0)
488         adde    $acc0,$acc1,$t1
489         $UMULH  $t1,$m1,$mi
490         adde    $acc1,$acc2,$t2
491         $UMULH  $t2,$m2,$mi
492         adde    $acc2,$acc3,$t3
493         $UMULH  $t3,$m3,$mi
494         adde    $acc3,$acc4,$carry
495         addze   $carry,$zero
496         addc    $acc0,$acc0,$t0
497         adde    $acc1,$acc1,$t1
498         adde    $acc2,$acc2,$t2
499         adde    $acc3,$acc3,$t3
500         #addze  $carry,$carry
501         bne     .Loop_mul4x_1st_reduction
502
503         $UCMP   $ap_end,$ap
504         beq     .Lmul4x4_post_condition
505
506         $LD     $a0,$SIZE_T*1($ap)      # a[4..7]
507         $LD     $a1,$SIZE_T*2($ap)
508         $LD     $a2,$SIZE_T*3($ap)
509         $LDU    $a3,$SIZE_T*4($ap)
510         $LD     $mi,$SIZE_T*8($sp)      # a[0]*n0
511         $LD     $m0,$SIZE_T*1($np)      # n[4..7]
512         $LD     $m1,$SIZE_T*2($np)
513         $LD     $m2,$SIZE_T*3($np)
514         $LDU    $m3,$SIZE_T*4($np)
515         b       .Loop_mul4x_1st_tail
516
517 .align  5
518 .Loop_mul4x_1st_tail:
519         $UMULL  $t0,$a0,$bi             # lo(a[4..7]*b[i])
520         addze   $carry,$carry           # modulo-scheduled
521         $UMULL  $t1,$a1,$bi
522         addi    $cnt,$cnt,$SIZE_T
523         $UMULL  $t2,$a2,$bi
524         andi.   $cnt,$cnt,$SIZE_T*4-1
525         $UMULL  $t3,$a3,$bi
526         addc    $acc0,$acc0,$t0
527         $UMULH  $t0,$a0,$bi             # hi(a[4..7]*b[i])
528         adde    $acc1,$acc1,$t1
529         $UMULH  $t1,$a1,$bi
530         adde    $acc2,$acc2,$t2
531         $UMULH  $t2,$a2,$bi
532         adde    $acc3,$acc3,$t3
533         $UMULH  $t3,$a3,$bi
534         addze   $acc4,$zero
535         $LDX    $bi,$bp,$cnt            # next b[i] (or b[0])
536         addc    $acc1,$acc1,$t0
537         $UMULL  $t0,$m0,$mi             # lo(n[4..7]*a[0]*n0)
538         adde    $acc2,$acc2,$t1
539         $UMULL  $t1,$m1,$mi
540         adde    $acc3,$acc3,$t2
541         $UMULL  $t2,$m2,$mi
542         adde    $acc4,$acc4,$t3         # can't overflow
543         $UMULL  $t3,$m3,$mi
544         addc    $acc0,$acc0,$t0
545         $UMULH  $t0,$m0,$mi             # hi(n[4..7]*a[0]*n0)
546         adde    $acc1,$acc1,$t1
547         $UMULH  $t1,$m1,$mi
548         adde    $acc2,$acc2,$t2
549         $UMULH  $t2,$m2,$mi
550         adde    $acc3,$acc3,$t3
551         adde    $acc4,$acc4,$carry
552         $UMULH  $t3,$m3,$mi
553         addze   $carry,$zero
554         addi    $mi,$sp,$SIZE_T*8
555         $LDX    $mi,$mi,$cnt            # next t[0]*n0
556         $STU    $acc0,$SIZE_T($tp)      # word of result
557         addc    $acc0,$acc1,$t0
558         adde    $acc1,$acc2,$t1
559         adde    $acc2,$acc3,$t2
560         adde    $acc3,$acc4,$t3
561         #addze  $carry,$carry
562         bne     .Loop_mul4x_1st_tail
563
564         sub     $t1,$ap_end,$num        # rewinded $ap
565         $UCMP   $ap_end,$ap             # done yet?
566         beq     .Lmul4x_proceed
567
568         $LD     $a0,$SIZE_T*1($ap)
569         $LD     $a1,$SIZE_T*2($ap)
570         $LD     $a2,$SIZE_T*3($ap)
571         $LDU    $a3,$SIZE_T*4($ap)
572         $LD     $m0,$SIZE_T*1($np)
573         $LD     $m1,$SIZE_T*2($np)
574         $LD     $m2,$SIZE_T*3($np)
575         $LDU    $m3,$SIZE_T*4($np)
576         b       .Loop_mul4x_1st_tail
577
578 .align  5
579 .Lmul4x_proceed:
580         $LDU    $bi,$SIZE_T*4($bp)      # *++b
581         addze   $carry,$carry           # topmost carry
582         $LD     $a0,$SIZE_T*1($t1)
583         $LD     $a1,$SIZE_T*2($t1)
584         $LD     $a2,$SIZE_T*3($t1)
585         $LD     $a3,$SIZE_T*4($t1)
586         addi    $ap,$t1,$SIZE_T*4
587         sub     $np,$np,$num            # rewind np
588
589         $ST     $acc0,$SIZE_T*1($tp)    # result
590         $ST     $acc1,$SIZE_T*2($tp)
591         $ST     $acc2,$SIZE_T*3($tp)
592         $ST     $acc3,$SIZE_T*4($tp)
593         $ST     $carry,$SIZE_T*5($tp)   # save topmost carry
594         $LD     $acc0,$SIZE_T*12($sp)   # t[0..3]
595         $LD     $acc1,$SIZE_T*13($sp)
596         $LD     $acc2,$SIZE_T*14($sp)
597         $LD     $acc3,$SIZE_T*15($sp)
598
599         $LD     $m0,$SIZE_T*1($np)      # n[0..3]
600         $LD     $m1,$SIZE_T*2($np)
601         $LD     $m2,$SIZE_T*3($np)
602         $LDU    $m3,$SIZE_T*4($np)
603         addic   $tp,$sp,$SIZE_T*7       # &t[-1], clear carry bit
604         li      $carry,0
605         b       .Loop_mul4x_reduction
606
607 .align  5
608 .Loop_mul4x_reduction:
609         $UMULL  $t0,$a0,$bi             # lo(a[0..3]*b[4])
610         addze   $carry,$carry           # modulo-scheduled
611         $UMULL  $t1,$a1,$bi
612         addi    $cnt,$cnt,$SIZE_T
613         $UMULL  $t2,$a2,$bi
614         andi.   $cnt,$cnt,$SIZE_T*4-1
615         $UMULL  $t3,$a3,$bi
616         addc    $acc0,$acc0,$t0
617         $UMULH  $t0,$a0,$bi             # hi(a[0..3]*b[4])
618         adde    $acc1,$acc1,$t1
619         $UMULH  $t1,$a1,$bi
620         adde    $acc2,$acc2,$t2
621         $UMULL  $mi,$acc0,$n0           # t[0]*n0
622         adde    $acc3,$acc3,$t3
623         $UMULH  $t2,$a2,$bi
624         addze   $acc4,$zero
625         $UMULH  $t3,$a3,$bi
626         $LDX    $bi,$bp,$cnt            # next b[i]
627         addc    $acc1,$acc1,$t0
628         # (*)   mul     $t0,$m0,$mi
629         $STU    $mi,$SIZE_T($tp)        # put aside t[0]*n0 for tail processing
630         adde    $acc2,$acc2,$t1
631         $UMULL  $t1,$m1,$mi             # lo(n[0..3]*t[0]*n0
632         adde    $acc3,$acc3,$t2
633         $UMULL  $t2,$m2,$mi
634         adde    $acc4,$acc4,$t3         # can't overflow
635         $UMULL  $t3,$m3,$mi
636         # (*)   addc    $acc0,$acc0,$t0
637         addic   $acc0,$acc0,-1          # (*), discarded
638         $UMULH  $t0,$m0,$mi             # hi(n[0..3]*t[0]*n0
639         adde    $acc0,$acc1,$t1
640         $UMULH  $t1,$m1,$mi
641         adde    $acc1,$acc2,$t2
642         $UMULH  $t2,$m2,$mi
643         adde    $acc2,$acc3,$t3
644         $UMULH  $t3,$m3,$mi
645         adde    $acc3,$acc4,$carry
646         addze   $carry,$zero
647         addc    $acc0,$acc0,$t0
648         adde    $acc1,$acc1,$t1
649         adde    $acc2,$acc2,$t2
650         adde    $acc3,$acc3,$t3
651         #addze  $carry,$carry
652         bne     .Loop_mul4x_reduction
653
654         $LD     $t0,$SIZE_T*5($tp)      # t[4..7]
655         addze   $carry,$carry
656         $LD     $t1,$SIZE_T*6($tp)
657         $LD     $t2,$SIZE_T*7($tp)
658         $LD     $t3,$SIZE_T*8($tp)
659         $LD     $a0,$SIZE_T*1($ap)      # a[4..7]
660         $LD     $a1,$SIZE_T*2($ap)
661         $LD     $a2,$SIZE_T*3($ap)
662         $LDU    $a3,$SIZE_T*4($ap)
663         addc    $acc0,$acc0,$t0
664         adde    $acc1,$acc1,$t1
665         adde    $acc2,$acc2,$t2
666         adde    $acc3,$acc3,$t3
667         #addze  $carry,$carry
668
669         $LD     $mi,$SIZE_T*8($sp)      # t[0]*n0
670         $LD     $m0,$SIZE_T*1($np)      # n[4..7]
671         $LD     $m1,$SIZE_T*2($np)
672         $LD     $m2,$SIZE_T*3($np)
673         $LDU    $m3,$SIZE_T*4($np)
674         b       .Loop_mul4x_tail
675
676 .align  5
677 .Loop_mul4x_tail:
678         $UMULL  $t0,$a0,$bi             # lo(a[4..7]*b[4])
679         addze   $carry,$carry           # modulo-scheduled
680         $UMULL  $t1,$a1,$bi
681         addi    $cnt,$cnt,$SIZE_T
682         $UMULL  $t2,$a2,$bi
683         andi.   $cnt,$cnt,$SIZE_T*4-1
684         $UMULL  $t3,$a3,$bi
685         addc    $acc0,$acc0,$t0
686         $UMULH  $t0,$a0,$bi             # hi(a[4..7]*b[4])
687         adde    $acc1,$acc1,$t1
688         $UMULH  $t1,$a1,$bi
689         adde    $acc2,$acc2,$t2
690         $UMULH  $t2,$a2,$bi
691         adde    $acc3,$acc3,$t3
692         $UMULH  $t3,$a3,$bi
693         addze   $acc4,$zero
694         $LDX    $bi,$bp,$cnt            # next b[i]
695         addc    $acc1,$acc1,$t0
696         $UMULL  $t0,$m0,$mi             # lo(n[4..7]*t[0]*n0)
697         adde    $acc2,$acc2,$t1
698         $UMULL  $t1,$m1,$mi
699         adde    $acc3,$acc3,$t2
700         $UMULL  $t2,$m2,$mi
701         adde    $acc4,$acc4,$t3         # can't overflow
702         $UMULL  $t3,$m3,$mi
703         addc    $acc0,$acc0,$t0
704         $UMULH  $t0,$m0,$mi             # hi(n[4..7]*t[0]*n0)
705         adde    $acc1,$acc1,$t1
706         $UMULH  $t1,$m1,$mi
707         adde    $acc2,$acc2,$t2
708         $UMULH  $t2,$m2,$mi
709         adde    $acc3,$acc3,$t3
710         $UMULH  $t3,$m3,$mi
711         adde    $acc4,$acc4,$carry
712         addi    $mi,$sp,$SIZE_T*8
713         $LDX    $mi,$mi,$cnt            # next a[0]*n0
714         addze   $carry,$zero
715         $STU    $acc0,$SIZE_T($tp)      # word of result
716         addc    $acc0,$acc1,$t0
717         adde    $acc1,$acc2,$t1
718         adde    $acc2,$acc3,$t2
719         adde    $acc3,$acc4,$t3
720         #addze  $carry,$carry
721         bne     .Loop_mul4x_tail
722
723         $LD     $t0,$SIZE_T*5($tp)      # next t[i] or topmost carry
724         sub     $t1,$np,$num            # rewinded np?
725         addze   $carry,$carry
726         $UCMP   $ap_end,$ap             # done yet?
727         beq     .Loop_mul4x_break
728
729         $LD     $t1,$SIZE_T*6($tp)
730         $LD     $t2,$SIZE_T*7($tp)
731         $LD     $t3,$SIZE_T*8($tp)
732         $LD     $a0,$SIZE_T*1($ap)
733         $LD     $a1,$SIZE_T*2($ap)
734         $LD     $a2,$SIZE_T*3($ap)
735         $LDU    $a3,$SIZE_T*4($ap)
736         addc    $acc0,$acc0,$t0
737         adde    $acc1,$acc1,$t1
738         adde    $acc2,$acc2,$t2
739         adde    $acc3,$acc3,$t3
740         #addze  $carry,$carry
741
742         $LD     $m0,$SIZE_T*1($np)      # n[4..7]
743         $LD     $m1,$SIZE_T*2($np)
744         $LD     $m2,$SIZE_T*3($np)
745         $LDU    $m3,$SIZE_T*4($np)
746         b       .Loop_mul4x_tail
747
748 .align  5
749 .Loop_mul4x_break:
750         $POP    $t2,$SIZE_T*6($sp)      # pull rp and &b[num-4]
751         $POP    $t3,$SIZE_T*7($sp)
752         addc    $a0,$acc0,$t0           # accumulate topmost carry
753         $LD     $acc0,$SIZE_T*12($sp)   # t[0..3]
754         addze   $a1,$acc1
755         $LD     $acc1,$SIZE_T*13($sp)
756         addze   $a2,$acc2
757         $LD     $acc2,$SIZE_T*14($sp)
758         addze   $a3,$acc3
759         $LD     $acc3,$SIZE_T*15($sp)
760         addze   $carry,$carry           # topmost carry
761         $ST     $a0,$SIZE_T*1($tp)      # result
762         sub     $ap,$ap_end,$num        # rewind ap
763         $ST     $a1,$SIZE_T*2($tp)
764         $ST     $a2,$SIZE_T*3($tp)
765         $ST     $a3,$SIZE_T*4($tp)
766         $ST     $carry,$SIZE_T*5($tp)   # store topmost carry
767
768         $LD     $m0,$SIZE_T*1($t1)      # n[0..3]
769         $LD     $m1,$SIZE_T*2($t1)
770         $LD     $m2,$SIZE_T*3($t1)
771         $LD     $m3,$SIZE_T*4($t1)
772         addi    $np,$t1,$SIZE_T*4
773         $UCMP   $bp,$t3                 # done yet?
774         beq     .Lmul4x_post
775
776         $LDU    $bi,$SIZE_T*4($bp)
777         $LD     $a0,$SIZE_T*1($ap)      # a[0..3]
778         $LD     $a1,$SIZE_T*2($ap)
779         $LD     $a2,$SIZE_T*3($ap)
780         $LDU    $a3,$SIZE_T*4($ap)
781         li      $carry,0
782         addic   $tp,$sp,$SIZE_T*7       # &t[-1], clear carry bit
783         b       .Loop_mul4x_reduction
784
785 .align  5
786 .Lmul4x_post:
787         # Final step. We see if result is larger than modulus, and
788         # if it is, subtract the modulus. But comparison implies
789         # subtraction. So we subtract modulus, see if it borrowed,
790         # and conditionally copy original value.
791         srwi    $cnt,$num,`log($SIZE_T)/log(2)+2`
792         mr      $bp,$t2                 # &rp[-1]
793         subi    $cnt,$cnt,1
794         mr      $ap_end,$t2             # &rp[-1] copy
795         subfc   $t0,$m0,$acc0
796         addi    $tp,$sp,$SIZE_T*15
797         subfe   $t1,$m1,$acc1
798
799         mtctr   $cnt
800 .Lmul4x_sub:
801         $LD     $m0,$SIZE_T*1($np)
802         $LD     $acc0,$SIZE_T*1($tp)
803         subfe   $t2,$m2,$acc2
804         $LD     $m1,$SIZE_T*2($np)
805         $LD     $acc1,$SIZE_T*2($tp)
806         subfe   $t3,$m3,$acc3
807         $LD     $m2,$SIZE_T*3($np)
808         $LD     $acc2,$SIZE_T*3($tp)
809         $LDU    $m3,$SIZE_T*4($np)
810         $LDU    $acc3,$SIZE_T*4($tp)
811         $ST     $t0,$SIZE_T*1($bp)
812         $ST     $t1,$SIZE_T*2($bp)
813         subfe   $t0,$m0,$acc0
814         $ST     $t2,$SIZE_T*3($bp)
815         $STU    $t3,$SIZE_T*4($bp)
816         subfe   $t1,$m1,$acc1
817         bdnz    .Lmul4x_sub
818
819          $LD    $a0,$SIZE_T*1($ap_end)
820         $ST     $t0,$SIZE_T*1($bp)
821          $LD    $t0,$SIZE_T*12($sp)
822         subfe   $t2,$m2,$acc2
823          $LD    $a1,$SIZE_T*2($ap_end)
824         $ST     $t1,$SIZE_T*2($bp)
825          $LD    $t1,$SIZE_T*13($sp)
826         subfe   $t3,$m3,$acc3
827         subfe   $carry,$zero,$carry     # did it borrow?
828          addi   $tp,$sp,$SIZE_T*12
829          $LD    $a2,$SIZE_T*3($ap_end)
830         $ST     $t2,$SIZE_T*3($bp)
831          $LD    $t2,$SIZE_T*14($sp)
832          $LD    $a3,$SIZE_T*4($ap_end)
833         $ST     $t3,$SIZE_T*4($bp)
834          $LD    $t3,$SIZE_T*15($sp)
835
836         mtctr   $cnt
837 .Lmul4x_cond_copy:
838         and     $t0,$t0,$carry
839         andc    $a0,$a0,$carry
840         $ST     $zero,$SIZE_T*0($tp)    # wipe stack clean
841         and     $t1,$t1,$carry
842         andc    $a1,$a1,$carry
843         $ST     $zero,$SIZE_T*1($tp)
844         and     $t2,$t2,$carry
845         andc    $a2,$a2,$carry
846         $ST     $zero,$SIZE_T*2($tp)
847         and     $t3,$t3,$carry
848         andc    $a3,$a3,$carry
849         $ST     $zero,$SIZE_T*3($tp)
850         or      $acc0,$t0,$a0
851         $LD     $a0,$SIZE_T*5($ap_end)
852         $LD     $t0,$SIZE_T*4($tp)
853         or      $acc1,$t1,$a1
854         $LD     $a1,$SIZE_T*6($ap_end)
855         $LD     $t1,$SIZE_T*5($tp)
856         or      $acc2,$t2,$a2
857         $LD     $a2,$SIZE_T*7($ap_end)
858         $LD     $t2,$SIZE_T*6($tp)
859         or      $acc3,$t3,$a3
860         $LD     $a3,$SIZE_T*8($ap_end)
861         $LD     $t3,$SIZE_T*7($tp)
862         addi    $tp,$tp,$SIZE_T*4
863         $ST     $acc0,$SIZE_T*1($ap_end)
864         $ST     $acc1,$SIZE_T*2($ap_end)
865         $ST     $acc2,$SIZE_T*3($ap_end)
866         $STU    $acc3,$SIZE_T*4($ap_end)
867         bdnz    .Lmul4x_cond_copy
868
869         $POP    $bp,0($sp)              # pull saved sp
870         and     $t0,$t0,$carry
871         andc    $a0,$a0,$carry
872         $ST     $zero,$SIZE_T*0($tp)
873         and     $t1,$t1,$carry
874         andc    $a1,$a1,$carry
875         $ST     $zero,$SIZE_T*1($tp)
876         and     $t2,$t2,$carry
877         andc    $a2,$a2,$carry
878         $ST     $zero,$SIZE_T*2($tp)
879         and     $t3,$t3,$carry
880         andc    $a3,$a3,$carry
881         $ST     $zero,$SIZE_T*3($tp)
882         or      $acc0,$t0,$a0
883         or      $acc1,$t1,$a1
884         $ST     $zero,$SIZE_T*4($tp)
885         or      $acc2,$t2,$a2
886         or      $acc3,$t3,$a3
887         $ST     $acc0,$SIZE_T*1($ap_end)
888         $ST     $acc1,$SIZE_T*2($ap_end)
889         $ST     $acc2,$SIZE_T*3($ap_end)
890         $ST     $acc3,$SIZE_T*4($ap_end)
891
892         b       .Lmul4x_done
893
894 .align  4
895 .Lmul4x4_post_condition:
896         $POP    $ap,$SIZE_T*6($sp)      # pull &rp[-1]
897         $POP    $bp,0($sp)              # pull saved sp
898         addze   $carry,$carry           # modulo-scheduled
899         # $acc0-3,$carry hold result, $m0-3 hold modulus
900         subfc   $a0,$m0,$acc0
901         subfe   $a1,$m1,$acc1
902         subfe   $a2,$m2,$acc2
903         subfe   $a3,$m3,$acc3
904         subfe   $carry,$zero,$carry     # did it borrow?
905
906         and     $m0,$m0,$carry
907         and     $m1,$m1,$carry
908         addc    $a0,$a0,$m0
909         and     $m2,$m2,$carry
910         adde    $a1,$a1,$m1
911         and     $m3,$m3,$carry
912         adde    $a2,$a2,$m2
913         adde    $a3,$a3,$m3
914
915         $ST     $a0,$SIZE_T*1($ap)      # write result
916         $ST     $a1,$SIZE_T*2($ap)
917         $ST     $a2,$SIZE_T*3($ap)
918         $ST     $a3,$SIZE_T*4($ap)
919
920 .Lmul4x_done:
921         $ST     $zero,$SIZE_T*8($sp)    # wipe stack clean
922         $ST     $zero,$SIZE_T*9($sp)
923         $ST     $zero,$SIZE_T*10($sp)
924         $ST     $zero,$SIZE_T*11($sp)
925         li      r3,1                    # signal "done"
926         $POP    r14,-$SIZE_T*18($bp)
927         $POP    r15,-$SIZE_T*17($bp)
928         $POP    r16,-$SIZE_T*16($bp)
929         $POP    r17,-$SIZE_T*15($bp)
930         $POP    r18,-$SIZE_T*14($bp)
931         $POP    r19,-$SIZE_T*13($bp)
932         $POP    r20,-$SIZE_T*12($bp)
933         $POP    r21,-$SIZE_T*11($bp)
934         $POP    r22,-$SIZE_T*10($bp)
935         $POP    r23,-$SIZE_T*9($bp)
936         $POP    r24,-$SIZE_T*8($bp)
937         $POP    r25,-$SIZE_T*7($bp)
938         $POP    r26,-$SIZE_T*6($bp)
939         $POP    r27,-$SIZE_T*5($bp)
940         $POP    r28,-$SIZE_T*4($bp)
941         $POP    r29,-$SIZE_T*3($bp)
942         $POP    r30,-$SIZE_T*2($bp)
943         $POP    r31,-$SIZE_T*1($bp)
944         mr      $sp,$bp
945         blr
946         .long   0
947         .byte   0,12,4,0x20,0x80,18,6,0
948         .long   0
949 .size   .bn_mul4x_mont_int,.-.bn_mul4x_mont_int
950 ___
951 }
952
953 if (1) {
954 ########################################################################
955 # Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module.
956
957 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
958 my ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
959 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
960 my ($cnt,$carry,$zero)=("r30","r31","r0");
961 my ($tp,$ap_end,$na0)=($bp,$np,$carry);
962
963 # sp----------->+-------------------------------+
964 #               | saved sp                      |
965 #               +-------------------------------+
966 #               .                               .
967 # +12*size_t    +-------------------------------+
968 #               | size_t tmp[2*num]             |
969 #               .                               .
970 #               .                               .
971 #               .                               .
972 #               +-------------------------------+
973 #               .                               .
974 # -18*size_t    +-------------------------------+
975 #               | 18 saved gpr, r14-r31         |
976 #               .                               .
977 #               .                               .
978 #               +-------------------------------+
979 $code.=<<___;
980 .align  5
981 __bn_sqr8x_mont:
982 .Lsqr8x_do:
983         mr      $a0,$sp
984         slwi    $a1,$num,`log($SIZE_T)/log(2)+1`
985         li      $a2,-32*$SIZE_T
986         sub     $a1,$a2,$a1
987         slwi    $num,$num,`log($SIZE_T)/log(2)`
988         $STUX   $sp,$sp,$a1             # alloca
989
990         $PUSH   r14,-$SIZE_T*18($a0)
991         $PUSH   r15,-$SIZE_T*17($a0)
992         $PUSH   r16,-$SIZE_T*16($a0)
993         $PUSH   r17,-$SIZE_T*15($a0)
994         $PUSH   r18,-$SIZE_T*14($a0)
995         $PUSH   r19,-$SIZE_T*13($a0)
996         $PUSH   r20,-$SIZE_T*12($a0)
997         $PUSH   r21,-$SIZE_T*11($a0)
998         $PUSH   r22,-$SIZE_T*10($a0)
999         $PUSH   r23,-$SIZE_T*9($a0)
1000         $PUSH   r24,-$SIZE_T*8($a0)
1001         $PUSH   r25,-$SIZE_T*7($a0)
1002         $PUSH   r26,-$SIZE_T*6($a0)
1003         $PUSH   r27,-$SIZE_T*5($a0)
1004         $PUSH   r28,-$SIZE_T*4($a0)
1005         $PUSH   r29,-$SIZE_T*3($a0)
1006         $PUSH   r30,-$SIZE_T*2($a0)
1007         $PUSH   r31,-$SIZE_T*1($a0)
1008
1009         subi    $ap,$ap,$SIZE_T         # bias by -1
1010         subi    $t0,$np,$SIZE_T         # bias by -1
1011         subi    $rp,$rp,$SIZE_T         # bias by -1
1012         $LD     $n0,0($n0)              # *n0
1013         li      $zero,0
1014
1015         add     $ap_end,$ap,$num
1016         $LD     $a0,$SIZE_T*1($ap)
1017         #li     $acc0,0
1018         $LD     $a1,$SIZE_T*2($ap)
1019         li      $acc1,0
1020         $LD     $a2,$SIZE_T*3($ap)
1021         li      $acc2,0
1022         $LD     $a3,$SIZE_T*4($ap)
1023         li      $acc3,0
1024         $LD     $a4,$SIZE_T*5($ap)
1025         li      $acc4,0
1026         $LD     $a5,$SIZE_T*6($ap)
1027         li      $acc5,0
1028         $LD     $a6,$SIZE_T*7($ap)
1029         li      $acc6,0
1030         $LDU    $a7,$SIZE_T*8($ap)
1031         li      $acc7,0
1032
1033         addi    $tp,$sp,$SIZE_T*11      # &tp[-1]
1034         subic.  $cnt,$num,$SIZE_T*8
1035         b       .Lsqr8x_zero_start
1036
1037 .align  5
1038 .Lsqr8x_zero:
1039         subic.  $cnt,$cnt,$SIZE_T*8
1040         $ST     $zero,$SIZE_T*1($tp)
1041         $ST     $zero,$SIZE_T*2($tp)
1042         $ST     $zero,$SIZE_T*3($tp)
1043         $ST     $zero,$SIZE_T*4($tp)
1044         $ST     $zero,$SIZE_T*5($tp)
1045         $ST     $zero,$SIZE_T*6($tp)
1046         $ST     $zero,$SIZE_T*7($tp)
1047         $ST     $zero,$SIZE_T*8($tp)
1048 .Lsqr8x_zero_start:
1049         $ST     $zero,$SIZE_T*9($tp)
1050         $ST     $zero,$SIZE_T*10($tp)
1051         $ST     $zero,$SIZE_T*11($tp)
1052         $ST     $zero,$SIZE_T*12($tp)
1053         $ST     $zero,$SIZE_T*13($tp)
1054         $ST     $zero,$SIZE_T*14($tp)
1055         $ST     $zero,$SIZE_T*15($tp)
1056         $STU    $zero,$SIZE_T*16($tp)
1057         bne     .Lsqr8x_zero
1058
1059         $PUSH   $rp,$SIZE_T*6($sp)      # offload &rp[-1]
1060         $PUSH   $t0,$SIZE_T*7($sp)      # offload &np[-1]
1061         $PUSH   $n0,$SIZE_T*8($sp)      # offload n0
1062         $PUSH   $tp,$SIZE_T*9($sp)      # &tp[2*num-1]
1063         $PUSH   $zero,$SIZE_T*10($sp)   # initial top-most carry
1064         addi    $tp,$sp,$SIZE_T*11      # &tp[-1]
1065
1066         # Multiply everything but a[i]*a[i]
1067 .align  5
1068 .Lsqr8x_outer_loop:
1069         #                                                 a[1]a[0]     (i)
1070         #                                             a[2]a[0]
1071         #                                         a[3]a[0]
1072         #                                     a[4]a[0]
1073         #                                 a[5]a[0]
1074         #                             a[6]a[0]
1075         #                         a[7]a[0]
1076         #                                         a[2]a[1]             (ii)
1077         #                                     a[3]a[1]
1078         #                                 a[4]a[1]
1079         #                             a[5]a[1]
1080         #                         a[6]a[1]
1081         #                     a[7]a[1]
1082         #                                 a[3]a[2]                     (iii)
1083         #                             a[4]a[2]
1084         #                         a[5]a[2]
1085         #                     a[6]a[2]
1086         #                 a[7]a[2]
1087         #                         a[4]a[3]                             (iv)
1088         #                     a[5]a[3]
1089         #                 a[6]a[3]
1090         #             a[7]a[3]
1091         #                 a[5]a[4]                                     (v)
1092         #             a[6]a[4]
1093         #         a[7]a[4]
1094         #         a[6]a[5]                                             (vi)
1095         #     a[7]a[5]
1096         # a[7]a[6]                                                     (vii)
1097
1098         $UMULL  $t0,$a1,$a0             # lo(a[1..7]*a[0])              (i)
1099         $UMULL  $t1,$a2,$a0
1100         $UMULL  $t2,$a3,$a0
1101         $UMULL  $t3,$a4,$a0
1102         addc    $acc1,$acc1,$t0         # t[1]+lo(a[1]*a[0])
1103         $UMULL  $t0,$a5,$a0
1104         adde    $acc2,$acc2,$t1
1105         $UMULL  $t1,$a6,$a0
1106         adde    $acc3,$acc3,$t2
1107         $UMULL  $t2,$a7,$a0
1108         adde    $acc4,$acc4,$t3
1109         $UMULH  $t3,$a1,$a0             # hi(a[1..7]*a[0])
1110         adde    $acc5,$acc5,$t0
1111         $UMULH  $t0,$a2,$a0
1112         adde    $acc6,$acc6,$t1
1113         $UMULH  $t1,$a3,$a0
1114         adde    $acc7,$acc7,$t2
1115         $UMULH  $t2,$a4,$a0
1116         $ST     $acc0,$SIZE_T*1($tp)    # t[0]
1117         addze   $acc0,$zero             # t[8]
1118         $ST     $acc1,$SIZE_T*2($tp)    # t[1]
1119         addc    $acc2,$acc2,$t3         # t[2]+lo(a[1]*a[0])
1120         $UMULH  $t3,$a5,$a0
1121         adde    $acc3,$acc3,$t0
1122         $UMULH  $t0,$a6,$a0
1123         adde    $acc4,$acc4,$t1
1124         $UMULH  $t1,$a7,$a0
1125         adde    $acc5,$acc5,$t2
1126          $UMULL $t2,$a2,$a1             # lo(a[2..7]*a[1])              (ii)
1127         adde    $acc6,$acc6,$t3
1128          $UMULL $t3,$a3,$a1
1129         adde    $acc7,$acc7,$t0
1130          $UMULL $t0,$a4,$a1
1131         adde    $acc0,$acc0,$t1
1132
1133         $UMULL  $t1,$a5,$a1
1134         addc    $acc3,$acc3,$t2
1135         $UMULL  $t2,$a6,$a1
1136         adde    $acc4,$acc4,$t3
1137         $UMULL  $t3,$a7,$a1
1138         adde    $acc5,$acc5,$t0
1139         $UMULH  $t0,$a2,$a1             # hi(a[2..7]*a[1])
1140         adde    $acc6,$acc6,$t1
1141         $UMULH  $t1,$a3,$a1
1142         adde    $acc7,$acc7,$t2
1143         $UMULH  $t2,$a4,$a1
1144         adde    $acc0,$acc0,$t3
1145         $UMULH  $t3,$a5,$a1
1146         $ST     $acc2,$SIZE_T*3($tp)    # t[2]
1147         addze   $acc1,$zero             # t[9]
1148         $ST     $acc3,$SIZE_T*4($tp)    # t[3]
1149         addc    $acc4,$acc4,$t0
1150         $UMULH  $t0,$a6,$a1
1151         adde    $acc5,$acc5,$t1
1152         $UMULH  $t1,$a7,$a1
1153         adde    $acc6,$acc6,$t2
1154          $UMULL $t2,$a3,$a2             # lo(a[3..7]*a[2])              (iii)
1155         adde    $acc7,$acc7,$t3
1156          $UMULL $t3,$a4,$a2
1157         adde    $acc0,$acc0,$t0
1158          $UMULL $t0,$a5,$a2
1159         adde    $acc1,$acc1,$t1
1160
1161         $UMULL  $t1,$a6,$a2
1162         addc    $acc5,$acc5,$t2
1163         $UMULL  $t2,$a7,$a2
1164         adde    $acc6,$acc6,$t3
1165         $UMULH  $t3,$a3,$a2             # hi(a[3..7]*a[2])
1166         adde    $acc7,$acc7,$t0
1167         $UMULH  $t0,$a4,$a2
1168         adde    $acc0,$acc0,$t1
1169         $UMULH  $t1,$a5,$a2
1170         adde    $acc1,$acc1,$t2
1171         $UMULH  $t2,$a6,$a2
1172         $ST     $acc4,$SIZE_T*5($tp)    # t[4]
1173         addze   $acc2,$zero             # t[10]
1174         $ST     $acc5,$SIZE_T*6($tp)    # t[5]
1175         addc    $acc6,$acc6,$t3
1176         $UMULH  $t3,$a7,$a2
1177         adde    $acc7,$acc7,$t0
1178          $UMULL $t0,$a4,$a3             # lo(a[4..7]*a[3])              (iv)
1179         adde    $acc0,$acc0,$t1
1180          $UMULL $t1,$a5,$a3
1181         adde    $acc1,$acc1,$t2
1182          $UMULL $t2,$a6,$a3
1183         adde    $acc2,$acc2,$t3
1184
1185         $UMULL  $t3,$a7,$a3
1186         addc    $acc7,$acc7,$t0
1187         $UMULH  $t0,$a4,$a3             # hi(a[4..7]*a[3])
1188         adde    $acc0,$acc0,$t1
1189         $UMULH  $t1,$a5,$a3
1190         adde    $acc1,$acc1,$t2
1191         $UMULH  $t2,$a6,$a3
1192         adde    $acc2,$acc2,$t3
1193         $UMULH  $t3,$a7,$a3
1194         $ST     $acc6,$SIZE_T*7($tp)    # t[6]
1195         addze   $acc3,$zero             # t[11]
1196         $STU    $acc7,$SIZE_T*8($tp)    # t[7]
1197         addc    $acc0,$acc0,$t0
1198          $UMULL $t0,$a5,$a4             # lo(a[5..7]*a[4])              (v)
1199         adde    $acc1,$acc1,$t1
1200          $UMULL $t1,$a6,$a4
1201         adde    $acc2,$acc2,$t2
1202          $UMULL $t2,$a7,$a4
1203         adde    $acc3,$acc3,$t3
1204
1205         $UMULH  $t3,$a5,$a4             # hi(a[5..7]*a[4])
1206         addc    $acc1,$acc1,$t0
1207         $UMULH  $t0,$a6,$a4
1208         adde    $acc2,$acc2,$t1
1209         $UMULH  $t1,$a7,$a4
1210         adde    $acc3,$acc3,$t2
1211          $UMULL $t2,$a6,$a5             # lo(a[6..7]*a[5])              (vi)
1212         addze   $acc4,$zero             # t[12]
1213         addc    $acc2,$acc2,$t3
1214          $UMULL $t3,$a7,$a5
1215         adde    $acc3,$acc3,$t0
1216          $UMULH $t0,$a6,$a5             # hi(a[6..7]*a[5])
1217         adde    $acc4,$acc4,$t1
1218
1219         $UMULH  $t1,$a7,$a5
1220         addc    $acc3,$acc3,$t2
1221          $UMULL $t2,$a7,$a6             # lo(a[7]*a[6])                 (vii)
1222         adde    $acc4,$acc4,$t3
1223          $UMULH $t3,$a7,$a6             # hi(a[7]*a[6])
1224         addze   $acc5,$zero             # t[13]
1225         addc    $acc4,$acc4,$t0
1226         $UCMP   $ap_end,$ap             # done yet?
1227         adde    $acc5,$acc5,$t1
1228
1229         addc    $acc5,$acc5,$t2
1230         sub     $t0,$ap_end,$num        # rewinded ap
1231         addze   $acc6,$zero             # t[14]
1232         add     $acc6,$acc6,$t3
1233
1234         beq     .Lsqr8x_outer_break
1235
1236         mr      $n0,$a0
1237         $LD     $a0,$SIZE_T*1($tp)
1238         $LD     $a1,$SIZE_T*2($tp)
1239         $LD     $a2,$SIZE_T*3($tp)
1240         $LD     $a3,$SIZE_T*4($tp)
1241         $LD     $a4,$SIZE_T*5($tp)
1242         $LD     $a5,$SIZE_T*6($tp)
1243         $LD     $a6,$SIZE_T*7($tp)
1244         $LD     $a7,$SIZE_T*8($tp)
1245         addc    $acc0,$acc0,$a0
1246         $LD     $a0,$SIZE_T*1($ap)
1247         adde    $acc1,$acc1,$a1
1248         $LD     $a1,$SIZE_T*2($ap)
1249         adde    $acc2,$acc2,$a2
1250         $LD     $a2,$SIZE_T*3($ap)
1251         adde    $acc3,$acc3,$a3
1252         $LD     $a3,$SIZE_T*4($ap)
1253         adde    $acc4,$acc4,$a4
1254         $LD     $a4,$SIZE_T*5($ap)
1255         adde    $acc5,$acc5,$a5
1256         $LD     $a5,$SIZE_T*6($ap)
1257         adde    $acc6,$acc6,$a6
1258         $LD     $a6,$SIZE_T*7($ap)
1259         subi    $rp,$ap,$SIZE_T*7
1260         addze   $acc7,$a7
1261         $LDU    $a7,$SIZE_T*8($ap)
1262         #addze  $carry,$zero            # moved below
1263         li      $cnt,0
1264         b       .Lsqr8x_mul
1265
1266         #                                                          a[8]a[0]
1267         #                                                      a[9]a[0]
1268         #                                                  a[a]a[0]
1269         #                                              a[b]a[0]
1270         #                                          a[c]a[0]
1271         #                                      a[d]a[0]
1272         #                                  a[e]a[0]
1273         #                              a[f]a[0]
1274         #                                                      a[8]a[1]
1275         #                          a[f]a[1]........................
1276         #                                                  a[8]a[2]
1277         #                      a[f]a[2]........................
1278         #                                              a[8]a[3]
1279         #                  a[f]a[3]........................
1280         #                                          a[8]a[4]
1281         #              a[f]a[4]........................
1282         #                                      a[8]a[5]
1283         #          a[f]a[5]........................
1284         #                                  a[8]a[6]
1285         #      a[f]a[6]........................
1286         #                              a[8]a[7]
1287         #  a[f]a[7]........................
1288 .align  5
1289 .Lsqr8x_mul:
1290         $UMULL  $t0,$a0,$n0
1291         addze   $carry,$zero            # carry bit, modulo-scheduled
1292         $UMULL  $t1,$a1,$n0
1293         addi    $cnt,$cnt,$SIZE_T
1294         $UMULL  $t2,$a2,$n0
1295         andi.   $cnt,$cnt,$SIZE_T*8-1
1296         $UMULL  $t3,$a3,$n0
1297         addc    $acc0,$acc0,$t0
1298         $UMULL  $t0,$a4,$n0
1299         adde    $acc1,$acc1,$t1
1300         $UMULL  $t1,$a5,$n0
1301         adde    $acc2,$acc2,$t2
1302         $UMULL  $t2,$a6,$n0
1303         adde    $acc3,$acc3,$t3
1304         $UMULL  $t3,$a7,$n0
1305         adde    $acc4,$acc4,$t0
1306         $UMULH  $t0,$a0,$n0
1307         adde    $acc5,$acc5,$t1
1308         $UMULH  $t1,$a1,$n0
1309         adde    $acc6,$acc6,$t2
1310         $UMULH  $t2,$a2,$n0
1311         adde    $acc7,$acc7,$t3
1312         $UMULH  $t3,$a3,$n0
1313         addze   $carry,$carry
1314         $STU    $acc0,$SIZE_T($tp)
1315         addc    $acc0,$acc1,$t0
1316         $UMULH  $t0,$a4,$n0
1317         adde    $acc1,$acc2,$t1
1318         $UMULH  $t1,$a5,$n0
1319         adde    $acc2,$acc3,$t2
1320         $UMULH  $t2,$a6,$n0
1321         adde    $acc3,$acc4,$t3
1322         $UMULH  $t3,$a7,$n0
1323         $LDX    $n0,$rp,$cnt
1324         adde    $acc4,$acc5,$t0
1325         adde    $acc5,$acc6,$t1
1326         adde    $acc6,$acc7,$t2
1327         adde    $acc7,$carry,$t3
1328         #addze  $carry,$zero            # moved above
1329         bne     .Lsqr8x_mul
1330                                         # note that carry flag is guaranteed
1331                                         # to be zero at this point
1332         $UCMP   $ap,$ap_end             # done yet?
1333         beq     .Lsqr8x_break
1334
1335         $LD     $a0,$SIZE_T*1($tp)
1336         $LD     $a1,$SIZE_T*2($tp)
1337         $LD     $a2,$SIZE_T*3($tp)
1338         $LD     $a3,$SIZE_T*4($tp)
1339         $LD     $a4,$SIZE_T*5($tp)
1340         $LD     $a5,$SIZE_T*6($tp)
1341         $LD     $a6,$SIZE_T*7($tp)
1342         $LD     $a7,$SIZE_T*8($tp)
1343         addc    $acc0,$acc0,$a0
1344         $LD     $a0,$SIZE_T*1($ap)
1345         adde    $acc1,$acc1,$a1
1346         $LD     $a1,$SIZE_T*2($ap)
1347         adde    $acc2,$acc2,$a2
1348         $LD     $a2,$SIZE_T*3($ap)
1349         adde    $acc3,$acc3,$a3
1350         $LD     $a3,$SIZE_T*4($ap)
1351         adde    $acc4,$acc4,$a4
1352         $LD     $a4,$SIZE_T*5($ap)
1353         adde    $acc5,$acc5,$a5
1354         $LD     $a5,$SIZE_T*6($ap)
1355         adde    $acc6,$acc6,$a6
1356         $LD     $a6,$SIZE_T*7($ap)
1357         adde    $acc7,$acc7,$a7
1358         $LDU    $a7,$SIZE_T*8($ap)
1359         #addze  $carry,$zero            # moved above
1360         b       .Lsqr8x_mul
1361
1362 .align  5
1363 .Lsqr8x_break:
1364         $LD     $a0,$SIZE_T*8($rp)
1365         addi    $ap,$rp,$SIZE_T*15
1366         $LD     $a1,$SIZE_T*9($rp)
1367         sub.    $t0,$ap_end,$ap         # is it last iteration?
1368         $LD     $a2,$SIZE_T*10($rp)
1369         sub     $t1,$tp,$t0
1370         $LD     $a3,$SIZE_T*11($rp)
1371         $LD     $a4,$SIZE_T*12($rp)
1372         $LD     $a5,$SIZE_T*13($rp)
1373         $LD     $a6,$SIZE_T*14($rp)
1374         $LD     $a7,$SIZE_T*15($rp)
1375         beq     .Lsqr8x_outer_loop
1376
1377         $ST     $acc0,$SIZE_T*1($tp)
1378         $LD     $acc0,$SIZE_T*1($t1)
1379         $ST     $acc1,$SIZE_T*2($tp)
1380         $LD     $acc1,$SIZE_T*2($t1)
1381         $ST     $acc2,$SIZE_T*3($tp)
1382         $LD     $acc2,$SIZE_T*3($t1)
1383         $ST     $acc3,$SIZE_T*4($tp)
1384         $LD     $acc3,$SIZE_T*4($t1)
1385         $ST     $acc4,$SIZE_T*5($tp)
1386         $LD     $acc4,$SIZE_T*5($t1)
1387         $ST     $acc5,$SIZE_T*6($tp)
1388         $LD     $acc5,$SIZE_T*6($t1)
1389         $ST     $acc6,$SIZE_T*7($tp)
1390         $LD     $acc6,$SIZE_T*7($t1)
1391         $ST     $acc7,$SIZE_T*8($tp)
1392         $LD     $acc7,$SIZE_T*8($t1)
1393         mr      $tp,$t1
1394         b       .Lsqr8x_outer_loop
1395
1396 .align  5
1397 .Lsqr8x_outer_break:
1398         ####################################################################
1399         # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1400         $LD     $a1,$SIZE_T*1($t0)      # recall that $t0 is &a[-1]
1401         $LD     $a3,$SIZE_T*2($t0)
1402         $LD     $a5,$SIZE_T*3($t0)
1403         $LD     $a7,$SIZE_T*4($t0)
1404         addi    $ap,$t0,$SIZE_T*4
1405                                         # "tp[x]" comments are for num==8 case
1406         $LD     $t1,$SIZE_T*13($sp)     # =tp[1], t[0] is not interesting
1407         $LD     $t2,$SIZE_T*14($sp)
1408         $LD     $t3,$SIZE_T*15($sp)
1409         $LD     $t0,$SIZE_T*16($sp)
1410
1411         $ST     $acc0,$SIZE_T*1($tp)    # tp[8]=
1412         srwi    $cnt,$num,`log($SIZE_T)/log(2)+2`
1413         $ST     $acc1,$SIZE_T*2($tp)
1414         subi    $cnt,$cnt,1
1415         $ST     $acc2,$SIZE_T*3($tp)
1416         $ST     $acc3,$SIZE_T*4($tp)
1417         $ST     $acc4,$SIZE_T*5($tp)
1418         $ST     $acc5,$SIZE_T*6($tp)
1419         $ST     $acc6,$SIZE_T*7($tp)
1420         #$ST    $acc7,$SIZE_T*8($tp)    # tp[15] is not interesting
1421         addi    $tp,$sp,$SIZE_T*11      # &tp[-1]
1422         $UMULL  $acc0,$a1,$a1
1423         $UMULH  $a1,$a1,$a1
1424         add     $acc1,$t1,$t1           # <<1
1425         $SHRI   $t1,$t1,$BITS-1
1426         $UMULL  $a2,$a3,$a3
1427         $UMULH  $a3,$a3,$a3
1428         addc    $acc1,$acc1,$a1
1429         add     $acc2,$t2,$t2
1430         $SHRI   $t2,$t2,$BITS-1
1431         add     $acc3,$t3,$t3
1432         $SHRI   $t3,$t3,$BITS-1
1433         or      $acc2,$acc2,$t1
1434
1435         mtctr   $cnt
1436 .Lsqr4x_shift_n_add:
1437         $UMULL  $a4,$a5,$a5
1438         $UMULH  $a5,$a5,$a5
1439         $LD     $t1,$SIZE_T*6($tp)      # =tp[5]
1440         $LD     $a1,$SIZE_T*1($ap)
1441         adde    $acc2,$acc2,$a2
1442         add     $acc4,$t0,$t0
1443         $SHRI   $t0,$t0,$BITS-1
1444         or      $acc3,$acc3,$t2
1445         $LD     $t2,$SIZE_T*7($tp)      # =tp[6]
1446         adde    $acc3,$acc3,$a3
1447         $LD     $a3,$SIZE_T*2($ap)
1448         add     $acc5,$t1,$t1
1449         $SHRI   $t1,$t1,$BITS-1
1450         or      $acc4,$acc4,$t3
1451         $LD     $t3,$SIZE_T*8($tp)      # =tp[7]
1452         $UMULL  $a6,$a7,$a7
1453         $UMULH  $a7,$a7,$a7
1454         adde    $acc4,$acc4,$a4
1455         add     $acc6,$t2,$t2
1456         $SHRI   $t2,$t2,$BITS-1
1457         or      $acc5,$acc5,$t0
1458         $LD     $t0,$SIZE_T*9($tp)      # =tp[8]
1459         adde    $acc5,$acc5,$a5
1460         $LD     $a5,$SIZE_T*3($ap)
1461         add     $acc7,$t3,$t3
1462         $SHRI   $t3,$t3,$BITS-1
1463         or      $acc6,$acc6,$t1
1464         $LD     $t1,$SIZE_T*10($tp)     # =tp[9]
1465         $UMULL  $a0,$a1,$a1
1466         $UMULH  $a1,$a1,$a1
1467         adde    $acc6,$acc6,$a6
1468         $ST     $acc0,$SIZE_T*1($tp)    # tp[0]=
1469         add     $acc0,$t0,$t0
1470         $SHRI   $t0,$t0,$BITS-1
1471         or      $acc7,$acc7,$t2
1472         $LD     $t2,$SIZE_T*11($tp)     # =tp[10]
1473         adde    $acc7,$acc7,$a7
1474         $LDU    $a7,$SIZE_T*4($ap)
1475         $ST     $acc1,$SIZE_T*2($tp)    # tp[1]=
1476         add     $acc1,$t1,$t1
1477         $SHRI   $t1,$t1,$BITS-1
1478         or      $acc0,$acc0,$t3
1479         $LD     $t3,$SIZE_T*12($tp)     # =tp[11]
1480         $UMULL  $a2,$a3,$a3
1481         $UMULH  $a3,$a3,$a3
1482         adde    $acc0,$acc0,$a0
1483         $ST     $acc2,$SIZE_T*3($tp)    # tp[2]=
1484         add     $acc2,$t2,$t2
1485         $SHRI   $t2,$t2,$BITS-1
1486         or      $acc1,$acc1,$t0
1487         $LD     $t0,$SIZE_T*13($tp)     # =tp[12]
1488         adde    $acc1,$acc1,$a1
1489         $ST     $acc3,$SIZE_T*4($tp)    # tp[3]=
1490         $ST     $acc4,$SIZE_T*5($tp)    # tp[4]=
1491         $ST     $acc5,$SIZE_T*6($tp)    # tp[5]=
1492         $ST     $acc6,$SIZE_T*7($tp)    # tp[6]=
1493         $STU    $acc7,$SIZE_T*8($tp)    # tp[7]=
1494         add     $acc3,$t3,$t3
1495         $SHRI   $t3,$t3,$BITS-1
1496         or      $acc2,$acc2,$t1
1497         bdnz    .Lsqr4x_shift_n_add
1498 ___
1499 my ($np,$np_end)=($ap,$ap_end);
1500 $code.=<<___;
1501          $POP   $np,$SIZE_T*7($sp)      # pull &np[-1] and n0
1502          $POP   $n0,$SIZE_T*8($sp)
1503
1504         $UMULL  $a4,$a5,$a5
1505         $UMULH  $a5,$a5,$a5
1506         $ST     $acc0,$SIZE_T*1($tp)    # tp[8]=
1507          $LD    $acc0,$SIZE_T*12($sp)   # =tp[0]
1508         $LD     $t1,$SIZE_T*6($tp)      # =tp[13]
1509         adde    $acc2,$acc2,$a2
1510         add     $acc4,$t0,$t0
1511         $SHRI   $t0,$t0,$BITS-1
1512         or      $acc3,$acc3,$t2
1513         $LD     $t2,$SIZE_T*7($tp)      # =tp[14]
1514         adde    $acc3,$acc3,$a3
1515         add     $acc5,$t1,$t1
1516         $SHRI   $t1,$t1,$BITS-1
1517         or      $acc4,$acc4,$t3
1518         $UMULL  $a6,$a7,$a7
1519         $UMULH  $a7,$a7,$a7
1520         adde    $acc4,$acc4,$a4
1521         add     $acc6,$t2,$t2
1522         $SHRI   $t2,$t2,$BITS-1
1523         or      $acc5,$acc5,$t0
1524         $ST     $acc1,$SIZE_T*2($tp)    # tp[9]=
1525          $LD    $acc1,$SIZE_T*13($sp)   # =tp[1]
1526         adde    $acc5,$acc5,$a5
1527         or      $acc6,$acc6,$t1
1528          $LD    $a0,$SIZE_T*1($np)
1529          $LD    $a1,$SIZE_T*2($np)
1530         adde    $acc6,$acc6,$a6
1531          $LD    $a2,$SIZE_T*3($np)
1532          $LD    $a3,$SIZE_T*4($np)
1533         adde    $acc7,$a7,$t2
1534          $LD    $a4,$SIZE_T*5($np)
1535          $LD    $a5,$SIZE_T*6($np)
1536
1537         ################################################################
1538         # Reduce by 8 limbs per iteration
1539         $UMULL  $na0,$n0,$acc0          # t[0]*n0
1540         li      $cnt,8
1541         $LD     $a6,$SIZE_T*7($np)
1542         add     $np_end,$np,$num
1543         $LDU    $a7,$SIZE_T*8($np)
1544         $ST     $acc2,$SIZE_T*3($tp)    # tp[10]=
1545         $LD     $acc2,$SIZE_T*14($sp)
1546         $ST     $acc3,$SIZE_T*4($tp)    # tp[11]=
1547         $LD     $acc3,$SIZE_T*15($sp)
1548         $ST     $acc4,$SIZE_T*5($tp)    # tp[12]=
1549         $LD     $acc4,$SIZE_T*16($sp)
1550         $ST     $acc5,$SIZE_T*6($tp)    # tp[13]=
1551         $LD     $acc5,$SIZE_T*17($sp)
1552         $ST     $acc6,$SIZE_T*7($tp)    # tp[14]=
1553         $LD     $acc6,$SIZE_T*18($sp)
1554         $ST     $acc7,$SIZE_T*8($tp)    # tp[15]=
1555         $LD     $acc7,$SIZE_T*19($sp)
1556         addi    $tp,$sp,$SIZE_T*11      # &tp[-1]
1557         mtctr   $cnt
1558         b       .Lsqr8x_reduction
1559
1560 .align  5
1561 .Lsqr8x_reduction:
1562         # (*)   $UMULL  $t0,$a0,$na0    # lo(n[0-7])*lo(t[0]*n0)
1563         $UMULL  $t1,$a1,$na0
1564         $UMULL  $t2,$a2,$na0
1565         $STU    $na0,$SIZE_T($tp)       # put aside t[0]*n0 for tail processing
1566         $UMULL  $t3,$a3,$na0
1567         # (*)   addc    $acc0,$acc0,$t0
1568         addic   $acc0,$acc0,-1          # (*)
1569         $UMULL  $t0,$a4,$na0
1570         adde    $acc0,$acc1,$t1
1571         $UMULL  $t1,$a5,$na0
1572         adde    $acc1,$acc2,$t2
1573         $UMULL  $t2,$a6,$na0
1574         adde    $acc2,$acc3,$t3
1575         $UMULL  $t3,$a7,$na0
1576         adde    $acc3,$acc4,$t0
1577         $UMULH  $t0,$a0,$na0            # hi(n[0-7])*lo(t[0]*n0)
1578         adde    $acc4,$acc5,$t1
1579         $UMULH  $t1,$a1,$na0
1580         adde    $acc5,$acc6,$t2
1581         $UMULH  $t2,$a2,$na0
1582         adde    $acc6,$acc7,$t3
1583         $UMULH  $t3,$a3,$na0
1584         addze   $acc7,$zero
1585         addc    $acc0,$acc0,$t0
1586         $UMULH  $t0,$a4,$na0
1587         adde    $acc1,$acc1,$t1
1588         $UMULH  $t1,$a5,$na0
1589         adde    $acc2,$acc2,$t2
1590         $UMULH  $t2,$a6,$na0
1591         adde    $acc3,$acc3,$t3
1592         $UMULH  $t3,$a7,$na0
1593         $UMULL  $na0,$n0,$acc0          # next t[0]*n0
1594         adde    $acc4,$acc4,$t0
1595         adde    $acc5,$acc5,$t1
1596         adde    $acc6,$acc6,$t2
1597         adde    $acc7,$acc7,$t3
1598         bdnz    .Lsqr8x_reduction
1599
1600         $LD     $t0,$SIZE_T*1($tp)
1601         $LD     $t1,$SIZE_T*2($tp)
1602         $LD     $t2,$SIZE_T*3($tp)
1603         $LD     $t3,$SIZE_T*4($tp)
1604         subi    $rp,$tp,$SIZE_T*7
1605         $UCMP   $np_end,$np             # done yet?
1606         addc    $acc0,$acc0,$t0
1607         $LD     $t0,$SIZE_T*5($tp)
1608         adde    $acc1,$acc1,$t1
1609         $LD     $t1,$SIZE_T*6($tp)
1610         adde    $acc2,$acc2,$t2
1611         $LD     $t2,$SIZE_T*7($tp)
1612         adde    $acc3,$acc3,$t3
1613         $LD     $t3,$SIZE_T*8($tp)
1614         adde    $acc4,$acc4,$t0
1615         adde    $acc5,$acc5,$t1
1616         adde    $acc6,$acc6,$t2
1617         adde    $acc7,$acc7,$t3
1618         #addze  $carry,$zero            # moved below
1619         beq     .Lsqr8x8_post_condition
1620
1621         $LD     $n0,$SIZE_T*0($rp)
1622         $LD     $a0,$SIZE_T*1($np)
1623         $LD     $a1,$SIZE_T*2($np)
1624         $LD     $a2,$SIZE_T*3($np)
1625         $LD     $a3,$SIZE_T*4($np)
1626         $LD     $a4,$SIZE_T*5($np)
1627         $LD     $a5,$SIZE_T*6($np)
1628         $LD     $a6,$SIZE_T*7($np)
1629         $LDU    $a7,$SIZE_T*8($np)
1630         li      $cnt,0
1631
1632 .align  5
1633 .Lsqr8x_tail:
1634         $UMULL  $t0,$a0,$n0
1635         addze   $carry,$zero            # carry bit, modulo-scheduled
1636         $UMULL  $t1,$a1,$n0
1637         addi    $cnt,$cnt,$SIZE_T
1638         $UMULL  $t2,$a2,$n0
1639         andi.   $cnt,$cnt,$SIZE_T*8-1
1640         $UMULL  $t3,$a3,$n0
1641         addc    $acc0,$acc0,$t0
1642         $UMULL  $t0,$a4,$n0
1643         adde    $acc1,$acc1,$t1
1644         $UMULL  $t1,$a5,$n0
1645         adde    $acc2,$acc2,$t2
1646         $UMULL  $t2,$a6,$n0
1647         adde    $acc3,$acc3,$t3
1648         $UMULL  $t3,$a7,$n0
1649         adde    $acc4,$acc4,$t0
1650         $UMULH  $t0,$a0,$n0
1651         adde    $acc5,$acc5,$t1
1652         $UMULH  $t1,$a1,$n0
1653         adde    $acc6,$acc6,$t2
1654         $UMULH  $t2,$a2,$n0
1655         adde    $acc7,$acc7,$t3
1656         $UMULH  $t3,$a3,$n0
1657         addze   $carry,$carry
1658         $STU    $acc0,$SIZE_T($tp)
1659         addc    $acc0,$acc1,$t0
1660         $UMULH  $t0,$a4,$n0
1661         adde    $acc1,$acc2,$t1
1662         $UMULH  $t1,$a5,$n0
1663         adde    $acc2,$acc3,$t2
1664         $UMULH  $t2,$a6,$n0
1665         adde    $acc3,$acc4,$t3
1666         $UMULH  $t3,$a7,$n0
1667         $LDX    $n0,$rp,$cnt
1668         adde    $acc4,$acc5,$t0
1669         adde    $acc5,$acc6,$t1
1670         adde    $acc6,$acc7,$t2
1671         adde    $acc7,$carry,$t3
1672         #addze  $carry,$zero            # moved above
1673         bne     .Lsqr8x_tail
1674                                         # note that carry flag is guaranteed
1675                                         # to be zero at this point
1676         $LD     $a0,$SIZE_T*1($tp)
1677         $POP    $carry,$SIZE_T*10($sp)  # pull top-most carry in case we break
1678         $UCMP   $np_end,$np             # done yet?
1679         $LD     $a1,$SIZE_T*2($tp)
1680         sub     $t2,$np_end,$num        # rewinded np
1681         $LD     $a2,$SIZE_T*3($tp)
1682         $LD     $a3,$SIZE_T*4($tp)
1683         $LD     $a4,$SIZE_T*5($tp)
1684         $LD     $a5,$SIZE_T*6($tp)
1685         $LD     $a6,$SIZE_T*7($tp)
1686         $LD     $a7,$SIZE_T*8($tp)
1687         beq     .Lsqr8x_tail_break
1688
1689         addc    $acc0,$acc0,$a0
1690         $LD     $a0,$SIZE_T*1($np)
1691         adde    $acc1,$acc1,$a1
1692         $LD     $a1,$SIZE_T*2($np)
1693         adde    $acc2,$acc2,$a2
1694         $LD     $a2,$SIZE_T*3($np)
1695         adde    $acc3,$acc3,$a3
1696         $LD     $a3,$SIZE_T*4($np)
1697         adde    $acc4,$acc4,$a4
1698         $LD     $a4,$SIZE_T*5($np)
1699         adde    $acc5,$acc5,$a5
1700         $LD     $a5,$SIZE_T*6($np)
1701         adde    $acc6,$acc6,$a6
1702         $LD     $a6,$SIZE_T*7($np)
1703         adde    $acc7,$acc7,$a7
1704         $LDU    $a7,$SIZE_T*8($np)
1705         #addze  $carry,$zero            # moved above
1706         b       .Lsqr8x_tail
1707
1708 .align  5
1709 .Lsqr8x_tail_break:
1710         $POP    $n0,$SIZE_T*8($sp)      # pull n0
1711         $POP    $t3,$SIZE_T*9($sp)      # &tp[2*num-1]
1712         addi    $cnt,$tp,$SIZE_T*8      # end of current t[num] window
1713
1714         addic   $carry,$carry,-1        # "move" top-most carry to carry bit
1715         adde    $t0,$acc0,$a0
1716         $LD     $acc0,$SIZE_T*8($rp)
1717         $LD     $a0,$SIZE_T*1($t2)      # recall that $t2 is &n[-1]
1718         adde    $t1,$acc1,$a1
1719         $LD     $acc1,$SIZE_T*9($rp)
1720         $LD     $a1,$SIZE_T*2($t2)
1721         adde    $acc2,$acc2,$a2
1722         $LD     $a2,$SIZE_T*3($t2)
1723         adde    $acc3,$acc3,$a3
1724         $LD     $a3,$SIZE_T*4($t2)
1725         adde    $acc4,$acc4,$a4
1726         $LD     $a4,$SIZE_T*5($t2)
1727         adde    $acc5,$acc5,$a5
1728         $LD     $a5,$SIZE_T*6($t2)
1729         adde    $acc6,$acc6,$a6
1730         $LD     $a6,$SIZE_T*7($t2)
1731         adde    $acc7,$acc7,$a7
1732         $LD     $a7,$SIZE_T*8($t2)
1733         addi    $np,$t2,$SIZE_T*8
1734         addze   $t2,$zero               # top-most carry
1735         $UMULL  $na0,$n0,$acc0
1736         $ST     $t0,$SIZE_T*1($tp)
1737         $UCMP   $cnt,$t3                # did we hit the bottom?
1738         $ST     $t1,$SIZE_T*2($tp)
1739         li      $cnt,8
1740         $ST     $acc2,$SIZE_T*3($tp)
1741         $LD     $acc2,$SIZE_T*10($rp)
1742         $ST     $acc3,$SIZE_T*4($tp)
1743         $LD     $acc3,$SIZE_T*11($rp)
1744         $ST     $acc4,$SIZE_T*5($tp)
1745         $LD     $acc4,$SIZE_T*12($rp)
1746         $ST     $acc5,$SIZE_T*6($tp)
1747         $LD     $acc5,$SIZE_T*13($rp)
1748         $ST     $acc6,$SIZE_T*7($tp)
1749         $LD     $acc6,$SIZE_T*14($rp)
1750         $ST     $acc7,$SIZE_T*8($tp)
1751         $LD     $acc7,$SIZE_T*15($rp)
1752         $PUSH   $t2,$SIZE_T*10($sp)     # off-load top-most carry
1753         addi    $tp,$rp,$SIZE_T*7       # slide the window
1754         mtctr   $cnt
1755         bne     .Lsqr8x_reduction
1756
1757         ################################################################
1758         # Final step. We see if result is larger than modulus, and
1759         # if it is, subtract the modulus. But comparison implies
1760         # subtraction. So we subtract modulus, see if it borrowed,
1761         # and conditionally copy original value.
1762         $POP    $rp,$SIZE_T*6($sp)      # pull &rp[-1]
1763         srwi    $cnt,$num,`log($SIZE_T)/log(2)+3`
1764         mr      $n0,$tp                 # put tp aside
1765         addi    $tp,$tp,$SIZE_T*8
1766         subi    $cnt,$cnt,1
1767         subfc   $t0,$a0,$acc0
1768         subfe   $t1,$a1,$acc1
1769         mr      $carry,$t2
1770         mr      $ap_end,$rp             # $rp copy
1771
1772         mtctr   $cnt
1773         b       .Lsqr8x_sub
1774
1775 .align  5
1776 .Lsqr8x_sub:
1777         $LD     $a0,$SIZE_T*1($np)
1778         $LD     $acc0,$SIZE_T*1($tp)
1779         $LD     $a1,$SIZE_T*2($np)
1780         $LD     $acc1,$SIZE_T*2($tp)
1781         subfe   $t2,$a2,$acc2
1782         $LD     $a2,$SIZE_T*3($np)
1783         $LD     $acc2,$SIZE_T*3($tp)
1784         subfe   $t3,$a3,$acc3
1785         $LD     $a3,$SIZE_T*4($np)
1786         $LD     $acc3,$SIZE_T*4($tp)
1787         $ST     $t0,$SIZE_T*1($rp)
1788         subfe   $t0,$a4,$acc4
1789         $LD     $a4,$SIZE_T*5($np)
1790         $LD     $acc4,$SIZE_T*5($tp)
1791         $ST     $t1,$SIZE_T*2($rp)
1792         subfe   $t1,$a5,$acc5
1793         $LD     $a5,$SIZE_T*6($np)
1794         $LD     $acc5,$SIZE_T*6($tp)
1795         $ST     $t2,$SIZE_T*3($rp)
1796         subfe   $t2,$a6,$acc6
1797         $LD     $a6,$SIZE_T*7($np)
1798         $LD     $acc6,$SIZE_T*7($tp)
1799         $ST     $t3,$SIZE_T*4($rp)
1800         subfe   $t3,$a7,$acc7
1801         $LDU    $a7,$SIZE_T*8($np)
1802         $LDU    $acc7,$SIZE_T*8($tp)
1803         $ST     $t0,$SIZE_T*5($rp)
1804         subfe   $t0,$a0,$acc0
1805         $ST     $t1,$SIZE_T*6($rp)
1806         subfe   $t1,$a1,$acc1
1807         $ST     $t2,$SIZE_T*7($rp)
1808         $STU    $t3,$SIZE_T*8($rp)
1809         bdnz    .Lsqr8x_sub
1810
1811         srwi    $cnt,$num,`log($SIZE_T)/log(2)+2`
1812          $LD    $a0,$SIZE_T*1($ap_end)  # original $rp
1813          $LD    $acc0,$SIZE_T*1($n0)    # original $tp
1814         subi    $cnt,$cnt,1
1815          $LD    $a1,$SIZE_T*2($ap_end)
1816          $LD    $acc1,$SIZE_T*2($n0)
1817         subfe   $t2,$a2,$acc2
1818          $LD    $a2,$SIZE_T*3($ap_end)
1819          $LD    $acc2,$SIZE_T*3($n0)
1820         subfe   $t3,$a3,$acc3
1821          $LD    $a3,$SIZE_T*4($ap_end)
1822          $LDU   $acc3,$SIZE_T*4($n0)
1823         $ST     $t0,$SIZE_T*1($rp)
1824         subfe   $t0,$a4,$acc4
1825         $ST     $t1,$SIZE_T*2($rp)
1826         subfe   $t1,$a5,$acc5
1827         $ST     $t2,$SIZE_T*3($rp)
1828         subfe   $t2,$a6,$acc6
1829         $ST     $t3,$SIZE_T*4($rp)
1830         subfe   $t3,$a7,$acc7
1831         $ST     $t0,$SIZE_T*5($rp)
1832         subfe   $carry,$zero,$carry     # did it borrow?
1833         $ST     $t1,$SIZE_T*6($rp)
1834         $ST     $t2,$SIZE_T*7($rp)
1835         $ST     $t3,$SIZE_T*8($rp)
1836
1837         addi    $tp,$sp,$SIZE_T*11
1838         mtctr   $cnt
1839
1840 .Lsqr4x_cond_copy:
1841         andc    $a0,$a0,$carry
1842          $ST    $zero,-$SIZE_T*3($n0)   # wipe stack clean
1843         and     $acc0,$acc0,$carry
1844          $ST    $zero,-$SIZE_T*2($n0)
1845         andc    $a1,$a1,$carry
1846          $ST    $zero,-$SIZE_T*1($n0)
1847         and     $acc1,$acc1,$carry
1848          $ST    $zero,-$SIZE_T*0($n0)
1849         andc    $a2,$a2,$carry
1850          $ST    $zero,$SIZE_T*1($tp)
1851         and     $acc2,$acc2,$carry
1852          $ST    $zero,$SIZE_T*2($tp)
1853         andc    $a3,$a3,$carry
1854          $ST    $zero,$SIZE_T*3($tp)
1855         and     $acc3,$acc3,$carry
1856          $STU   $zero,$SIZE_T*4($tp)
1857         or      $t0,$a0,$acc0
1858         $LD     $a0,$SIZE_T*5($ap_end)
1859         $LD     $acc0,$SIZE_T*1($n0)
1860         or      $t1,$a1,$acc1
1861         $LD     $a1,$SIZE_T*6($ap_end)
1862         $LD     $acc1,$SIZE_T*2($n0)
1863         or      $t2,$a2,$acc2
1864         $LD     $a2,$SIZE_T*7($ap_end)
1865         $LD     $acc2,$SIZE_T*3($n0)
1866         or      $t3,$a3,$acc3
1867         $LD     $a3,$SIZE_T*8($ap_end)
1868         $LDU    $acc3,$SIZE_T*4($n0)
1869         $ST     $t0,$SIZE_T*1($ap_end)
1870         $ST     $t1,$SIZE_T*2($ap_end)
1871         $ST     $t2,$SIZE_T*3($ap_end)
1872         $STU    $t3,$SIZE_T*4($ap_end)
1873         bdnz    .Lsqr4x_cond_copy
1874
1875         $POP    $ap,0($sp)              # pull saved sp
1876         andc    $a0,$a0,$carry
1877         and     $acc0,$acc0,$carry
1878         andc    $a1,$a1,$carry
1879         and     $acc1,$acc1,$carry
1880         andc    $a2,$a2,$carry
1881         and     $acc2,$acc2,$carry
1882         andc    $a3,$a3,$carry
1883         and     $acc3,$acc3,$carry
1884         or      $t0,$a0,$acc0
1885         or      $t1,$a1,$acc1
1886         or      $t2,$a2,$acc2
1887         or      $t3,$a3,$acc3
1888         $ST     $t0,$SIZE_T*1($ap_end)
1889         $ST     $t1,$SIZE_T*2($ap_end)
1890         $ST     $t2,$SIZE_T*3($ap_end)
1891         $ST     $t3,$SIZE_T*4($ap_end)
1892
1893         b       .Lsqr8x_done
1894
1895 .align  5
1896 .Lsqr8x8_post_condition:
1897         $POP    $rp,$SIZE_T*6($sp)      # pull rp
1898         $POP    $ap,0($sp)              # pull saved sp
1899         addze   $carry,$zero
1900
1901         # $acc0-7,$carry hold result, $a0-7 hold modulus
1902         subfc   $acc0,$a0,$acc0
1903         subfe   $acc1,$a1,$acc1
1904          $ST    $zero,$SIZE_T*12($sp)   # wipe stack clean
1905          $ST    $zero,$SIZE_T*13($sp)
1906         subfe   $acc2,$a2,$acc2
1907          $ST    $zero,$SIZE_T*14($sp)
1908          $ST    $zero,$SIZE_T*15($sp)
1909         subfe   $acc3,$a3,$acc3
1910          $ST    $zero,$SIZE_T*16($sp)
1911          $ST    $zero,$SIZE_T*17($sp)
1912         subfe   $acc4,$a4,$acc4
1913          $ST    $zero,$SIZE_T*18($sp)
1914          $ST    $zero,$SIZE_T*19($sp)
1915         subfe   $acc5,$a5,$acc5
1916          $ST    $zero,$SIZE_T*20($sp)
1917          $ST    $zero,$SIZE_T*21($sp)
1918         subfe   $acc6,$a6,$acc6
1919          $ST    $zero,$SIZE_T*22($sp)
1920          $ST    $zero,$SIZE_T*23($sp)
1921         subfe   $acc7,$a7,$acc7
1922          $ST    $zero,$SIZE_T*24($sp)
1923          $ST    $zero,$SIZE_T*25($sp)
1924         subfe   $carry,$zero,$carry     # did it borrow?
1925          $ST    $zero,$SIZE_T*26($sp)
1926          $ST    $zero,$SIZE_T*27($sp)
1927
1928         and     $a0,$a0,$carry
1929         and     $a1,$a1,$carry
1930         addc    $acc0,$acc0,$a0         # add modulus back if borrowed
1931         and     $a2,$a2,$carry
1932         adde    $acc1,$acc1,$a1
1933         and     $a3,$a3,$carry
1934         adde    $acc2,$acc2,$a2
1935         and     $a4,$a4,$carry
1936         adde    $acc3,$acc3,$a3
1937         and     $a5,$a5,$carry
1938         adde    $acc4,$acc4,$a4
1939         and     $a6,$a6,$carry
1940         adde    $acc5,$acc5,$a5
1941         and     $a7,$a7,$carry
1942         adde    $acc6,$acc6,$a6
1943         adde    $acc7,$acc7,$a7
1944         $ST     $acc0,$SIZE_T*1($rp)
1945         $ST     $acc1,$SIZE_T*2($rp)
1946         $ST     $acc2,$SIZE_T*3($rp)
1947         $ST     $acc3,$SIZE_T*4($rp)
1948         $ST     $acc4,$SIZE_T*5($rp)
1949         $ST     $acc5,$SIZE_T*6($rp)
1950         $ST     $acc6,$SIZE_T*7($rp)
1951         $ST     $acc7,$SIZE_T*8($rp)
1952
1953 .Lsqr8x_done:
1954         $PUSH   $zero,$SIZE_T*8($sp)
1955         $PUSH   $zero,$SIZE_T*10($sp)
1956
1957         $POP    r14,-$SIZE_T*18($ap)
1958         li      r3,1                    # signal "done"
1959         $POP    r15,-$SIZE_T*17($ap)
1960         $POP    r16,-$SIZE_T*16($ap)
1961         $POP    r17,-$SIZE_T*15($ap)
1962         $POP    r18,-$SIZE_T*14($ap)
1963         $POP    r19,-$SIZE_T*13($ap)
1964         $POP    r20,-$SIZE_T*12($ap)
1965         $POP    r21,-$SIZE_T*11($ap)
1966         $POP    r22,-$SIZE_T*10($ap)
1967         $POP    r23,-$SIZE_T*9($ap)
1968         $POP    r24,-$SIZE_T*8($ap)
1969         $POP    r25,-$SIZE_T*7($ap)
1970         $POP    r26,-$SIZE_T*6($ap)
1971         $POP    r27,-$SIZE_T*5($ap)
1972         $POP    r28,-$SIZE_T*4($ap)
1973         $POP    r29,-$SIZE_T*3($ap)
1974         $POP    r30,-$SIZE_T*2($ap)
1975         $POP    r31,-$SIZE_T*1($ap)
1976         mr      $sp,$ap
1977         blr
1978         .long   0
1979         .byte   0,12,4,0x20,0x80,18,6,0
1980         .long   0
1981 .size   __bn_sqr8x_mont,.-__bn_sqr8x_mont
1982 ___
1983 }
1984 $code.=<<___;
1985 .asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1986 ___
1987
1988 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1989 print $code;
1990 close STDOUT;