#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise it's a straightforward
# implementation with the X vector kept in the register bank. The
# module is big-endian [which is not a big deal, as there are no
# little-endian targets left around].

#                       sha256          |       sha512
#                       -m64    -m32    |       -m64    -m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0      +50%    +38%    |       +40%    +410%(*)
# Power6,xlc-7          +150%   +90%    |       +100%   +430%(*)
#
# (*)   64-bit code in 32-bit application context, which actually is
#       on the TODO list. It should be noted that for safe deployment
#       in a 32-bit *multi-threaded* context asynchronous signals
#       should be blocked upon entry to the SHA512 block routine. This
#       is because the 32-bit signaling procedure invalidates the
#       upper halves of GPRs. The context switch procedure preserves
#       them, but signaling does not:-(

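# A sketch of that caller-side discipline, expressed here with POSIX
# signal masking (the real caller is C; the call below is only a
# placeholder for however the block routine is reached):
#
#       use POSIX qw(:signal_h);
#       my $all = POSIX::SigSet->new; $all->fillset();
#       sigprocmask(SIG_BLOCK, $all, my $saved = POSIX::SigSet->new);
#       sha512_block_data_order($ctx, $inp, $num);  # placeholder call
#       sigprocmask(SIG_SETMASK, $saved);
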
# The second version is truly multi-thread safe. The trouble with the
# original version was that it used the thread-local storage pointer
# register. It scrupulously preserved it, but the problem would arise
# the moment an asynchronous signal was delivered and the signal
# handler dereferenced the TLS pointer. While that is never the case
# in the openssl application or test suite, we have to respect this
# scenario and not use the TLS pointer register. The alternative would
# be to require the caller to block signals prior to calling this
# routine. For the record, in 32-bit context R2 serves as the TLS
# pointer, while in 64-bit context it's R13.

$flavour=shift;
$output =shift;

if ($flavour =~ /64/) {
        $SIZE_T=8;
        $LRSAVE=2*$SIZE_T;
        $STU="stdu";
        $UCMP="cmpld";
        $SHL="sldi";
        $POP="ld";
        $PUSH="std";
} elsif ($flavour =~ /32/) {
        $SIZE_T=4;
        $LRSAVE=$SIZE_T;
        $STU="stwu";
        $UCMP="cmplw";
        $SHL="slwi";
        $POP="lwz";
        $PUSH="stw";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN=0;
if ($flavour =~ /le$/) {
        die "little-endian is 64-bit only: $flavour" if ($SIZE_T==4);
        $LITTLE_ENDIAN=1;
}

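# Typical invocation (illustrative; the build system supplies the real
# arguments): perl sha512-ppc.pl linux64 sha512-ppc.s, where the
# flavour picks 32- vs 64-bit conventions, with an optional "le" suffix.
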
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

if ($output =~ /512/) {
        $func="sha512_block_data_order";
        $SZ=8;
        @Sigma0=(28,34,39);
        @Sigma1=(14,18,41);
        @sigma0=(1,  8, 7);
        @sigma1=(19,61, 6);
        $rounds=80;
        $LD="ld";
        $ST="std";
        $ROR="rotrdi";
        $SHR="srdi";
} else {
        $func="sha256_block_data_order";
        $SZ=4;
        @Sigma0=( 2,13,22);
        @Sigma1=( 6,11,25);
        @sigma0=( 7,18, 3);
        @sigma1=(17,19,10);
        $rounds=64;
        $LD="lwz";
        $ST="stw";
        $ROR="rotrwi";
        $SHR="srwi";
}

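# A pure-Perl cross-check of the rotation/shift constants above, for
# the 32-bit case ($SZ==4) only; ref_sigma0 is an illustrative name of
# ours, never called during generation, kept as executable
# documentation:
sub ref_sigma0 {
        my $x = shift;                  # 32-bit message word
        my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff; };
        return ($rotr->($x,$sigma0[0]) ^ $rotr->($x,$sigma0[1])
                ^ ($x>>$sigma0[2])) & 0xffffffff;
}
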
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;

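# Frame layout, offsets from $sp after the $STU adjustment below:
# [0,6*$SIZE_T) is the ABI linkage area, [$LOCALS,$LOCALS+16*$SZ) an
# aligned copy buffer for input blocks that cross a page boundary, and
# the top 26 $SIZE_T-sized slots (a few of them unused) hold saved
# r14-r31 plus the ctx/inp/num spills.
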
$sp ="r1";
$toc="r2";
$ctx="r3";      # zapped by $a0
$inp="r4";      # zapped by $a1
$num="r5";      # zapped by $t0

$T  ="r0";
$a0 ="r3";
$a1 ="r4";
$t0 ="r5";
$t1 ="r6";
$Tbl="r7";

$A  ="r8";
$B  ="r9";
$C  ="r10";
$D  ="r11";
$E  ="r12";
$F  =$t1;       $t1 = "r0";     # stay away from "r13";
$G  ="r14";
$H  ="r15";

@V=($A,$B,$C,$D,$E,$F,$G,$H);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

$inp="r31" if($SZ==4 || $SIZE_T==8);    # reassigned $inp! aliases with @X[15]

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
        $ROR    $a0,$e,$Sigma1[0]
        $ROR    $a1,$e,$Sigma1[1]
        and     $t0,$f,$e
        xor     $a0,$a0,$a1
        add     $h,$h,$t1
        andc    $t1,$g,$e
        $ROR    $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
        or      $t0,$t0,$t1             ; Ch(e,f,g)
        add     $h,$h,@X[$i%16]
        xor     $a0,$a0,$a1             ; Sigma1(e)
        add     $h,$h,$t0
        add     $h,$h,$a0

        $ROR    $a0,$a,$Sigma0[0]
        $ROR    $a1,$a,$Sigma0[1]
        and     $t0,$a,$b
        and     $t1,$a,$c
        xor     $a0,$a0,$a1
        $ROR    $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
        xor     $t0,$t0,$t1
        and     $t1,$b,$c
        xor     $a0,$a0,$a1             ; Sigma0(a)
        add     $d,$d,$h
        xor     $t0,$t0,$t1             ; Maj(a,b,c)
___
$code.=<<___ if ($i<15);
        $LD     $t1,`($i+1)*$SZ`($Tbl)
___
$code.=<<___;
        add     $h,$h,$a0
        add     $h,$h,$t0

___
}

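# The bitwise identities the round above relies on, as plain Perl
# (and/andc/or for Ch, three ands and two xors for Maj); ref_ch_maj is
# an illustrative helper of ours and is never called:
sub ref_ch_maj {
        my ($e,$f,$g,$a,$b,$c) = @_;
        my $ch  = ($f & $e) | ($g & ~$e);               # Ch(e,f,g) = (e&f)^(~e&g)
        my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);    # Maj(a,b,c)
        return ($ch,$maj);
}
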
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
        $ROR    $a0,@X[($i+1)%16],$sigma0[0]
        $ROR    $a1,@X[($i+1)%16],$sigma0[1]
        $ROR    $t0,@X[($i+14)%16],$sigma1[0]
        $ROR    $t1,@X[($i+14)%16],$sigma1[1]
        xor     $a0,$a0,$a1
        $SHR    $a1,@X[($i+1)%16],$sigma0[2]
        xor     $t0,$t0,$t1
        $SHR    $t1,@X[($i+14)%16],$sigma1[2]
        add     @X[$i],@X[$i],@X[($i+9)%16]
        xor     $a0,$a0,$a1             ; sigma0(X[(i+1)&0x0f])
        xor     $t0,$t0,$t1             ; sigma1(X[(i+14)&0x0f])
        $LD     $t1,`$i*$SZ`($Tbl)
        add     @X[$i],@X[$i],$a0
        add     @X[$i],@X[$i],$t0
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}

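# ROUND_16_xx implements the standard SHA-2 message schedule over a
# 16-entry rolling window: with j = i mod 16 it computes
#       X[j] += sigma1(X[(j+14) mod 16]) + X[(j+9) mod 16]
#             + sigma0(X[(j+1) mod 16])
# which is just W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# with the window indices reduced mod 16.
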
$code=<<___;
.machine        "any"
.text

.globl  $func
.align  6
$func:
        $STU    $sp,-$FRAME($sp)
        mflr    r0
        $SHL    $num,$num,`log(16*$SZ)/log(2)`

        $PUSH   $ctx,`$FRAME-$SIZE_T*22`($sp)

        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
        $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
        $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
        $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
        $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
        $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
        $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
        $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
        $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
        $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
        $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
        $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
        $PUSH   r0,`$FRAME+$LRSAVE`($sp)
___

if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
        $LD     $A,`0*$SZ`($ctx)
        mr      $inp,r4                         ; incarnate $inp
        $LD     $B,`1*$SZ`($ctx)
        $LD     $C,`2*$SZ`($ctx)
        $LD     $D,`3*$SZ`($ctx)
        $LD     $E,`4*$SZ`($ctx)
        $LD     $F,`5*$SZ`($ctx)
        $LD     $G,`6*$SZ`($ctx)
        $LD     $H,`7*$SZ`($ctx)
___
} else {
  for ($i=16;$i<32;$i++) {
    $code.=<<___;
        lwz     r$i,`4*($i-16)`($ctx)
___
  }
}

$code.=<<___;
        bl      LPICmeup
LPICedup:
        andi.   r0,$inp,3
        bne     Lunaligned
Laligned:
        add     $num,$inp,$num
        $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
        b       Ldone

; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access that crosses a page boundary. The "better
; safe than sorry" principle makes me treat that case specially. I
; don't look for the particular offending word, though, but rather for
; the input block that crosses the boundary. Once found, that block is
; copied to an aligned buffer and hashed separately...
.align  4
Lunaligned:
        subfic  $t1,$inp,4096
        andi.   $t1,$t1,`4096-16*$SZ`   ; distance to closest page boundary
        beq     Lcross_page
        $UCMP   $num,$t1
        ble-    Laligned                ; didn't cross the page boundary
        subfc   $num,$t1,$num
        add     $t1,$inp,$t1
        $PUSH   $num,`$FRAME-$SIZE_T*25`($sp)   ; save real remaining num
        $PUSH   $t1,`$FRAME-$SIZE_T*24`($sp)    ; intermediate end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
        ; $inp equals the intermediate end pointer here
        $POP    $num,`$FRAME-$SIZE_T*25`($sp)   ; restore real remaining num
Lcross_page:
        li      $t1,`16*$SZ/4`
        mtctr   $t1
___
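# In Perl terms the Lunaligned arithmetic above computes
#       $dist = (4096 - $inp) & (4096 - 16*$SZ);
# i.e. the distance to the next 4KB page boundary rounded down to a
# whole number of input blocks; zero means the very next block would
# straddle the boundary.
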
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
        addi    r20,$sp,$LOCALS                 ; aligned spot below the frame
Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
        lbz     r18,2($inp)
        lbz     r19,3($inp)
        addi    $inp,$inp,4
        stb     r16,0(r20)
        stb     r17,1(r20)
        stb     r18,2(r20)
        stb     r19,3(r20)
        addi    r20,r20,4
        bdnz    Lmemcpy
___
} else {
$code.=<<___;
        addi    r12,$sp,$LOCALS                 ; aligned spot below the frame
Lmemcpy:
        lbz     r8,0($inp)
        lbz     r9,1($inp)
        lbz     r10,2($inp)
        lbz     r11,3($inp)
        addi    $inp,$inp,4
        stb     r8,0(r12)
        stb     r9,1(r12)
        stb     r10,2(r12)
        stb     r11,3(r12)
        addi    r12,r12,4
        bdnz    Lmemcpy
___
}

$code.=<<___;
        $PUSH   $inp,`$FRAME-$SIZE_T*26`($sp)   ; save real inp
        addi    $t1,$sp,`$LOCALS+16*$SZ`        ; fictitious end pointer
        addi    $inp,$sp,$LOCALS                ; fictitious inp pointer
        $PUSH   $num,`$FRAME-$SIZE_T*25`($sp)   ; save real num
        $PUSH   $t1,`$FRAME-$SIZE_T*24`($sp)    ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
        $POP    $inp,`$FRAME-$SIZE_T*26`($sp)   ; restore real inp
        $POP    $num,`$FRAME-$SIZE_T*25`($sp)   ; restore real num
        addic.  $num,$num,`-16*$SZ`             ; num--
        bne-    Lunaligned

Ldone:
        $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
        $POP    r15,`$FRAME-$SIZE_T*17`($sp)
        $POP    r16,`$FRAME-$SIZE_T*16`($sp)
        $POP    r17,`$FRAME-$SIZE_T*15`($sp)
        $POP    r18,`$FRAME-$SIZE_T*14`($sp)
        $POP    r19,`$FRAME-$SIZE_T*13`($sp)
        $POP    r20,`$FRAME-$SIZE_T*12`($sp)
        $POP    r21,`$FRAME-$SIZE_T*11`($sp)
        $POP    r22,`$FRAME-$SIZE_T*10`($sp)
        $POP    r23,`$FRAME-$SIZE_T*9`($sp)
        $POP    r24,`$FRAME-$SIZE_T*8`($sp)
        $POP    r25,`$FRAME-$SIZE_T*7`($sp)
        $POP    r26,`$FRAME-$SIZE_T*6`($sp)
        $POP    r27,`$FRAME-$SIZE_T*5`($sp)
        $POP    r28,`$FRAME-$SIZE_T*4`($sp)
        $POP    r29,`$FRAME-$SIZE_T*3`($sp)
        $POP    r30,`$FRAME-$SIZE_T*2`($sp)
        $POP    r31,`$FRAME-$SIZE_T*1`($sp)
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
        .long   0
        .byte   0,12,4,1,0x80,18,3,0
        .long   0
___

if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
.align  4
Lsha2_block_private:
        $LD     $t1,0($Tbl)
___
for($i=0;$i<16;$i++) {
$code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN);
        lwz     @X[$i],`$i*$SZ`($inp)
___
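# The rotlwi/rlwimi triplet below is the classic 3-instruction 32-bit
# byte swap: rotating left by 8 moves every byte one position, then the
# two rlwimi inserts patch bytes 0 and 2 from the value rotated left by
# 24, yielding bswap32 without an extra scratch register.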
$code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN);
        lwz     $a0,`$i*$SZ`($inp)
        rotlwi  @X[$i],$a0,8
        rlwimi  @X[$i],$a0,24,0,7
        rlwimi  @X[$i],$a0,24,16,23
___
# 64-bit loads are split into two 32-bit ones, as the CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
$code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN);
        lwz     $t0,`$i*$SZ`($inp)
        lwz     @X[$i],`$i*$SZ+4`($inp)
        insrdi  @X[$i],$t0,32,0
___
$code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN);
        lwz     $a0,`$i*$SZ`($inp)
         lwz    $a1,`$i*$SZ+4`($inp)
        rotlwi  $t0,$a0,8
         rotlwi @X[$i],$a1,8
        rlwimi  $t0,$a0,24,0,7
         rlwimi @X[$i],$a1,24,0,7
        rlwimi  $t0,$a0,24,16,23
         rlwimi @X[$i],$a1,24,16,23
        insrdi  @X[$i],$t0,32,0
___
        &ROUND_00_15($i,@V);
        unshift(@V,pop(@V));
}
$code.=<<___;
        li      $t0,`$rounds/16-1`
        mtctr   $t0
.align  4
Lrounds:
        addi    $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
        &ROUND_16_xx($i,@V);
        unshift(@V,pop(@V));
}
$code.=<<___;
        bdnz-   Lrounds

        $POP    $ctx,`$FRAME-$SIZE_T*22`($sp)
        $POP    $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        $POP    $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        subi    $Tbl,$Tbl,`($rounds-16)*$SZ`    ; rewind Tbl

        $LD     r16,`0*$SZ`($ctx)
        $LD     r17,`1*$SZ`($ctx)
        $LD     r18,`2*$SZ`($ctx)
        $LD     r19,`3*$SZ`($ctx)
        $LD     r20,`4*$SZ`($ctx)
        $LD     r21,`5*$SZ`($ctx)
        $LD     r22,`6*$SZ`($ctx)
        addi    $inp,$inp,`16*$SZ`              ; advance inp
        $LD     r23,`7*$SZ`($ctx)
        add     $A,$A,r16
        add     $B,$B,r17
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)
        add     $C,$C,r18
        $ST     $A,`0*$SZ`($ctx)
        add     $D,$D,r19
        $ST     $B,`1*$SZ`($ctx)
        add     $E,$E,r20
        $ST     $C,`2*$SZ`($ctx)
        add     $F,$F,r21
        $ST     $D,`3*$SZ`($ctx)
        add     $G,$G,r22
        $ST     $E,`4*$SZ`($ctx)
        add     $H,$H,r23
        $ST     $F,`5*$SZ`($ctx)
        $ST     $G,`6*$SZ`($ctx)
        $UCMP   $inp,$num
        $ST     $H,`7*$SZ`($ctx)
        bne     Lsha2_block_private
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
.size   $func,.-$func
___
} else {
########################################################################
# SHA512 for PPC32, the X vector is off-loaded to the stack...
#
#                       |       sha512
#                       |       -m32
# ----------------------+-----------------------
# PPC74x0,gcc-4.0.1     |       +48%
# POWER6,gcc-4.4.6      |       +124%(*)
# POWER7,gcc-4.4.6      |       +79%(*)
# e300,gcc-4.1.0        |       +167%
#
# (*)   ~1/3 of the -m64 result [and ~20% better than -m32 code
#       generated by xlc-12.1]

my $XOFF=$LOCALS;

my @V=map("r$_",(16..31));      # A..H

my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
my ($x0,$x1)=("r3","r4");       # zaps $ctx and $inp

sub ROUND_00_15_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
        $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;

$code.=<<___;
        lwz     $t2,`$SZ*($i%16)+4`($Tbl)
         xor    $a0,$flo,$glo
        lwz     $t3,`$SZ*($i%16)+0`($Tbl)
         xor    $a1,$fhi,$ghi
        addc    $hlo,$hlo,$t0                   ; h+=x[i]
        stw     $t0,`$XOFF+0+$SZ*($i%16)`($sp)  ; save x[i]

        srwi    $s0,$elo,$Sigma1[0]
        srwi    $s1,$ehi,$Sigma1[0]
         and    $a0,$a0,$elo
        adde    $hhi,$hhi,$t1
         and    $a1,$a1,$ehi
        stw     $t1,`$XOFF+4+$SZ*($i%16)`($sp)
        srwi    $t0,$elo,$Sigma1[1]
        srwi    $t1,$ehi,$Sigma1[1]
         addc   $hlo,$hlo,$t2                   ; h+=K512[i]
        insrwi  $s0,$ehi,$Sigma1[0],0
        insrwi  $s1,$elo,$Sigma1[0],0
         xor    $a0,$a0,$glo                    ; Ch(e,f,g)
         adde   $hhi,$hhi,$t3
         xor    $a1,$a1,$ghi
        insrwi  $t0,$ehi,$Sigma1[1],0
        insrwi  $t1,$elo,$Sigma1[1],0
         addc   $hlo,$hlo,$a0                   ; h+=Ch(e,f,g)
        srwi    $t2,$ehi,$Sigma1[2]-32
        srwi    $t3,$elo,$Sigma1[2]-32
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
        insrwi  $t2,$elo,$Sigma1[2]-32,0
        insrwi  $t3,$ehi,$Sigma1[2]-32,0
         xor    $a0,$alo,$blo                   ; a^b, b^c in next round
         adde   $hhi,$hhi,$a1
         xor    $a1,$ahi,$bhi
        xor     $s0,$s0,$t2                     ; Sigma1(e)
        xor     $s1,$s1,$t3

        srwi    $t0,$alo,$Sigma0[0]
         and    $a2,$a2,$a0
         addc   $hlo,$hlo,$s0                   ; h+=Sigma1(e)
         and    $a3,$a3,$a1
        srwi    $t1,$ahi,$Sigma0[0]
        srwi    $s0,$ahi,$Sigma0[1]-32
         adde   $hhi,$hhi,$s1
        srwi    $s1,$alo,$Sigma0[1]-32
        insrwi  $t0,$ahi,$Sigma0[0],0
        insrwi  $t1,$alo,$Sigma0[0],0
         xor    $a2,$a2,$blo                    ; Maj(a,b,c)
         addc   $dlo,$dlo,$hlo                  ; d+=h
         xor    $a3,$a3,$bhi
        insrwi  $s0,$alo,$Sigma0[1]-32,0
        insrwi  $s1,$ahi,$Sigma0[1]-32,0
         adde   $dhi,$dhi,$hhi
        srwi    $t2,$ahi,$Sigma0[2]-32
        srwi    $t3,$alo,$Sigma0[2]-32
        xor     $s0,$s0,$t0
         addc   $hlo,$hlo,$a2                   ; h+=Maj(a,b,c)
        xor     $s1,$s1,$t1
        insrwi  $t2,$alo,$Sigma0[2]-32,0
        insrwi  $t3,$ahi,$Sigma0[2]-32,0
         adde   $hhi,$hhi,$a3
___
$code.=<<___ if ($i>=15);
        lwz     $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
        lwz     $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
___
$code.=<<___ if ($i<15);
        lwz     $t1,`$SZ*($i+1)+0`($inp)
        lwz     $t0,`$SZ*($i+1)+4`($inp)
___
$code.=<<___;
        xor     $s0,$s0,$t2                     ; Sigma0(a)
        xor     $s1,$s1,$t3
        addc    $hlo,$hlo,$s0                   ; h+=Sigma0(a)
        adde    $hhi,$hhi,$s1
___
$code.=<<___ if ($i==15);
        lwz     $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
        lwz     $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
___
}
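# A note on the 64-bit emulation above: additions are addc/adde carry
# chains, and a 64-bit rotate-right by n<32 of a hi:lo pair is done per
# half as "srwi dst,src,n" followed by "insrwi dst,other,n,0", i.e. the
# n bits shifted out of one half re-enter at the top of the other. For
# n>=32 the two halves simply swap roles and n-32 is used instead.
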
sub ROUND_16_xx_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
        $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;

$code.=<<___;
        srwi    $s0,$t0,$sigma0[0]
        srwi    $s1,$t1,$sigma0[0]
        srwi    $t2,$t0,$sigma0[1]
        srwi    $t3,$t1,$sigma0[1]
        insrwi  $s0,$t1,$sigma0[0],0
        insrwi  $s1,$t0,$sigma0[0],0
        srwi    $a0,$t0,$sigma0[2]
        insrwi  $t2,$t1,$sigma0[1],0
        insrwi  $t3,$t0,$sigma0[1],0
        insrwi  $a0,$t1,$sigma0[2],0
        xor     $s0,$s0,$t2
         lwz    $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
        srwi    $a1,$t1,$sigma0[2]
        xor     $s1,$s1,$t3
         lwz    $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
        xor     $a0,$a0,$s0
         srwi   $s0,$t2,$sigma1[0]
        xor     $a1,$a1,$s1
         srwi   $s1,$t3,$sigma1[0]
        addc    $x0,$x0,$a0                     ; x[i]+=sigma0(x[i+1])
         srwi   $a0,$t3,$sigma1[1]-32
        insrwi  $s0,$t3,$sigma1[0],0
        insrwi  $s1,$t2,$sigma1[0],0
        adde    $x1,$x1,$a1
         srwi   $a1,$t2,$sigma1[1]-32

        insrwi  $a0,$t2,$sigma1[1]-32,0
        srwi    $t2,$t2,$sigma1[2]
        insrwi  $a1,$t3,$sigma1[1]-32,0
        insrwi  $t2,$t3,$sigma1[2],0
        xor     $s0,$s0,$a0
         lwz    $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
        srwi    $t3,$t3,$sigma1[2]
        xor     $s1,$s1,$a1
         lwz    $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
        xor     $s0,$s0,$t2
         addc   $x0,$x0,$a0                     ; x[i]+=x[i+9]
        xor     $s1,$s1,$t3
         adde   $x1,$x1,$a1
        addc    $x0,$x0,$s0                     ; x[i]+=sigma1(x[i+14])
        adde    $x1,$x1,$s1
___
        ($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
        &ROUND_00_15_ppc32(@_);
}

$code.=<<___;
.align  4
Lsha2_block_private:
        lwz     $t1,0($inp)
        xor     $a2,@V[3],@V[5]         ; B^C, magic seed
        lwz     $t0,4($inp)
        xor     $a3,@V[2],@V[4]
___
for($i=0;$i<16;$i++) {
        &ROUND_00_15_ppc32($i,@V);
        unshift(@V,pop(@V));    unshift(@V,pop(@V));
        ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
        li      $a0,`$rounds/16-1`
        mtctr   $a0
.align  4
Lrounds:
        addi    $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
        &ROUND_16_xx_ppc32($i,@V);
        unshift(@V,pop(@V));    unshift(@V,pop(@V));
        ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
        bdnz-   Lrounds

        $POP    $ctx,`$FRAME-$SIZE_T*22`($sp)
        $POP    $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        $POP    $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        subi    $Tbl,$Tbl,`($rounds-16)*$SZ`    ; rewind Tbl

        lwz     $t0,0($ctx)
        lwz     $t1,4($ctx)
        lwz     $t2,8($ctx)
        lwz     $t3,12($ctx)
        lwz     $a0,16($ctx)
        lwz     $a1,20($ctx)
        lwz     $a2,24($ctx)
        addc    @V[1],@V[1],$t1
        lwz     $a3,28($ctx)
        adde    @V[0],@V[0],$t0
        lwz     $t0,32($ctx)
        addc    @V[3],@V[3],$t3
        lwz     $t1,36($ctx)
        adde    @V[2],@V[2],$t2
        lwz     $t2,40($ctx)
        addc    @V[5],@V[5],$a1
        lwz     $t3,44($ctx)
        adde    @V[4],@V[4],$a0
        lwz     $a0,48($ctx)
        addc    @V[7],@V[7],$a3
        lwz     $a1,52($ctx)
        adde    @V[6],@V[6],$a2
        lwz     $a2,56($ctx)
        addc    @V[9],@V[9],$t1
        lwz     $a3,60($ctx)
        adde    @V[8],@V[8],$t0
        stw     @V[0],0($ctx)
        stw     @V[1],4($ctx)
        addc    @V[11],@V[11],$t3
        stw     @V[2],8($ctx)
        stw     @V[3],12($ctx)
        adde    @V[10],@V[10],$t2
        stw     @V[4],16($ctx)
        stw     @V[5],20($ctx)
        addc    @V[13],@V[13],$a1
        stw     @V[6],24($ctx)
        stw     @V[7],28($ctx)
        adde    @V[12],@V[12],$a0
        stw     @V[8],32($ctx)
        stw     @V[9],36($ctx)
        addc    @V[15],@V[15],$a3
        stw     @V[10],40($ctx)
        stw     @V[11],44($ctx)
        adde    @V[14],@V[14],$a2
        stw     @V[12],48($ctx)
        stw     @V[13],52($ctx)
        stw     @V[14],56($ctx)
        stw     @V[15],60($ctx)

        addi    $inp,$inp,`16*$SZ`              ; advance inp
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)
        $UCMP   $inp,$num
        bne     Lsha2_block_private
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
.size   $func,.-$func
___
}

# Ugly hack here, because PPC assembler syntax seems to vary too much
# from platform to platform...
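# LPICmeup is position-independent table addressing: "bcl 20,31" is an
# always-taken branch-and-link to the next instruction (the idiom is
# recognized by processors and doesn't disturb the return-address
# predictor), so the following mflr picks up the current address, and
# the addi skips the remaining `64-8` bytes of code and padding to land
# on the first K table entry.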
$code.=<<___;
.align  6
LPICmeup:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
        addi    $Tbl,$Tbl,`64-8`
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
        .space  `64-9*4`
___
$code.=<<___ if ($SZ==8);
        .quad   0x428a2f98d728ae22,0x7137449123ef65cd
        .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
        .quad   0x3956c25bf348b538,0x59f111f1b605d019
        .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
        .quad   0xd807aa98a3030242,0x12835b0145706fbe
        .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
        .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
        .quad   0x9bdc06a725c71235,0xc19bf174cf692694
        .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
        .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
        .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
        .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
        .quad   0x983e5152ee66dfab,0xa831c66d2db43210
        .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
        .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
        .quad   0x06ca6351e003826f,0x142929670a0e6e70
        .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
        .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
        .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
        .quad   0x81c2c92e47edaee6,0x92722c851482353b
        .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
        .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
        .quad   0xd192e819d6ef5218,0xd69906245565a910
        .quad   0xf40e35855771202a,0x106aa07032bbd1b8
        .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
        .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
        .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
        .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
        .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
        .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
        .quad   0x90befffa23631e28,0xa4506cebde82bde9
        .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
        .quad   0xca273eceea26619c,0xd186b8c721c0c207
        .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
        .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
        .quad   0x113f9804bef90dae,0x1b710b35131c471b
        .quad   0x28db77f523047d84,0x32caab7b40c72493
        .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
        .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
        .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___ if ($SZ==4);
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;