#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let the hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise it's a straightforward
# implementation with the X vector in the register bank. The module is
# big-endian [which is no big deal, as there are no little-endian
# targets left around].

#			sha256		|	sha512
#			-m64	-m32	|	-m64	-m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0	+50%	+38%	|	+40%	+410%(*)
# Power6,xlc-7		+150%	+90%	|	+100%	+430%(*)
#
# (*)	64-bit code in 32-bit application context, which actually is
#	on the TODO list. It should be noted that for safe deployment in
#	a 32-bit *multi-threaded* context asynchronous signals should be
#	blocked upon entry to the SHA512 block routine. This is because
#	the 32-bit signaling procedure invalidates the upper halves of
#	the GPRs. The context switch procedure preserves them, but
#	signaling does not:-(
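#
#	A minimal sketch (C, hypothetical caller) of the signal blocking
#	the note above asks for; only POSIX pthread_sigmask() and the
#	entry point this module emits are assumed:
#
#		sigset_t set, old;
#		sigfillset(&set);
#		pthread_sigmask(SIG_BLOCK, &set, &old);	/* block async signals */
#		sha512_block_data_order(ctx, inp, num);	/* this module */
#		pthread_sigmask(SIG_SETMASK, &old, NULL);	/* restore */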

# The second version is truly multi-thread safe. The trouble with the
# original version was that it used the thread-local storage pointer
# register. It scrupulously preserved it, but the problem would arise
# the moment an asynchronous signal was delivered and the signal
# handler dereferenced the TLS pointer. While this never happens in the
# OpenSSL application or test suite, we have to respect this scenario
# and not use the TLS pointer register. An alternative would be to
# require the caller to block signals prior to calling this routine.
# For the record, in 32-bit context R2 serves as the TLS pointer, while
# in 64-bit context it's R13.

$flavour=shift;
$output =shift;

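# A hedged usage note: like other perlasm modules this script is driven
# by the build system, taking a flavour tag and an output file name,
# e.g. (exact flavour strings are whatever ppc-xlate.pl and the block
# below accept):
#
#	perl sha512-ppc.pl linux64 sha512-ppc.s	# sha512_block_data_order
#	perl sha512-ppc.pl linux32 sha256-ppc.s	# sha256_block_data_order
#
# The output file name selects SHA-256 vs SHA-512, see the
# `$output =~ /512/` test below.
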
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$UCMP="cmpld";
	$SHL="sldi";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$UCMP="cmplw";
	$SHL="slwi";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$LD="ld";
	$ST="std";
	$ROR="rotrdi";
	$SHR="srdi";
} else {
	$func="sha256_block_data_order";
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$LD="lwz";
	$ST="stw";
	$ROR="rotrwi";
	$SHR="srwi";
}

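# For reference (FIPS 180-4): the triplets above encode the rotate and
# shift amounts of the SHA-2 functions, e.g. for SHA-512
#
#	Sigma0(x) = ROTR^28(x) ^ ROTR^34(x) ^ ROTR^39(x)
#	Sigma1(x) = ROTR^14(x) ^ ROTR^18(x) ^ ROTR^41(x)
#	sigma0(x) = ROTR^1(x)  ^ ROTR^8(x)  ^ SHR^7(x)
#	sigma1(x) = ROTR^19(x) ^ ROTR^61(x) ^ SHR^6(x)
#
# Note that the last element of @sigma0/@sigma1 is a plain shift, not a
# rotate, which is why ROUND_16_xx below uses $SHR for it.
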
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;
if ($SZ==8 && $SIZE_T==4) {
	$FRAME+=16*$SZ;
	$XOFF=$LOCALS+16*$SZ;
}

$sp ="r1";
$toc="r2";
$ctx="r3";	# zapped by $a0
$inp="r4";	# zapped by $a1
$num="r5";	# zapped by $t0

$T  ="r0";
$a0 ="r3";
$a1 ="r4";
$t0 ="r5";
$t1 ="r6";
$Tbl="r7";

$A  ="r8";
$B  ="r9";
$C  ="r10";
$D  ="r11";
$E  ="r12";
$F  =$t1;	$t1 = "r0";	# stay away from "r13";
$G  ="r14";
$H  ="r15";

@V=($A,$B,$C,$D,$E,$F,$G,$H);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

$inp="r31" if($SZ==4 || $SIZE_T==8);	# reassigned $inp! aliases with @X[15]

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
	$ROR	$a0,$e,$Sigma1[0]
	$ROR	$a1,$e,$Sigma1[1]
	and	$t0,$f,$e
	xor	$a0,$a0,$a1
	add	$h,$h,$t1
	andc	$t1,$g,$e
	$ROR	$a1,$a1,`$Sigma1[2]-$Sigma1[1]`
	or	$t0,$t0,$t1		; Ch(e,f,g)
	add	$h,$h,@X[$i%16]
	xor	$a0,$a0,$a1		; Sigma1(e)
	add	$h,$h,$t0
	add	$h,$h,$a0

	$ROR	$a0,$a,$Sigma0[0]
	$ROR	$a1,$a,$Sigma0[1]
	and	$t0,$a,$b
	and	$t1,$a,$c
	xor	$a0,$a0,$a1
	$ROR	$a1,$a1,`$Sigma0[2]-$Sigma0[1]`
	xor	$t0,$t0,$t1
	and	$t1,$b,$c
	xor	$a0,$a0,$a1		; Sigma0(a)
	add	$d,$d,$h
	xor	$t0,$t0,$t1		; Maj(a,b,c)
___
$code.=<<___ if ($i<15);
	$LD	$t1,`($i+1)*$SZ`($Tbl)
___
$code.=<<___;
	add	$h,$h,$a0
	add	$h,$h,$t0

___
}

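# For reference, each ROUND_00_15 invocation computes one FIPS 180-4
# round (K[i] is expected in $t1, preloaded by the previous round or by
# the caller):
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1; h = T1 + T2
#
# Ch(e,f,g) = (e&f)^(~e&g) is computed as (f&e)|(g&~e), which is
# equivalent because the two terms are bitwise disjoint.
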
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
	$ROR	$a0,@X[($i+1)%16],$sigma0[0]
	$ROR	$a1,@X[($i+1)%16],$sigma0[1]
	$ROR	$t0,@X[($i+14)%16],$sigma1[0]
	$ROR	$t1,@X[($i+14)%16],$sigma1[1]
	xor	$a0,$a0,$a1
	$SHR	$a1,@X[($i+1)%16],$sigma0[2]
	xor	$t0,$t0,$t1
	$SHR	$t1,@X[($i+14)%16],$sigma1[2]
	add	@X[$i],@X[$i],@X[($i+9)%16]
	xor	$a0,$a0,$a1		; sigma0(X[(i+1)&0x0f])
	xor	$t0,$t0,$t1		; sigma1(X[(i+14)&0x0f])
	$LD	$t1,`$i*$SZ`($Tbl)
	add	@X[$i],@X[$i],$a0
	add	@X[$i],@X[$i],$t0
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}

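# The message schedule is kept in a rolling 16-entry window: with
# j = i mod 16, @X[j] holds W[i-16] on entry and W[i] on exit, per
#
#	W[i] = sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]) + W[i-16]
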
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$SHL	$num,$num,`log(16*$SZ)/log(2)`

	$PUSH	$ctx,`$FRAME-$SIZE_T*22`($sp)

	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
___

if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
	$LD	$A,`0*$SZ`($ctx)
	mr	$inp,r4				; incarnate $inp
	$LD	$B,`1*$SZ`($ctx)
	$LD	$C,`2*$SZ`($ctx)
	$LD	$D,`3*$SZ`($ctx)
	$LD	$E,`4*$SZ`($ctx)
	$LD	$F,`5*$SZ`($ctx)
	$LD	$G,`6*$SZ`($ctx)
	$LD	$H,`7*$SZ`($ctx)
___
} else {
  for ($i=16;$i<32;$i++) {
    $code.=<<___;
	lwz	r$i,`4*($i-16)`($ctx)
___
  }
}

$code.=<<___;
	bl	LPICmeup
LPICedup:
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	add	$num,$inp,$num
	$PUSH	$num,`$FRAME-$SIZE_T*24`($sp)	; end pointer
	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
	bl	Lsha2_block_private
	b	Ldone

; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access that crosses a page boundary. The "better
; safe than sorry" principle makes me treat it specially. I don't look
; for the particular offending word, but rather for the input block
; that crosses the boundary; once found, that block is copied to an
; aligned buffer and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,`4096-16*$SZ`	; distance to closest page boundary
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned		; didn't cross the page boundary
	subfc	$num,$t1,$num
	add	$t1,$inp,$t1
	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real remaining num
	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; intermediate end pointer
	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
	bl	Lsha2_block_private
	; $inp equals the intermediate end pointer here
	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real remaining num
Lcross_page:
	li	$t1,`16*$SZ/4`
	mtctr	$t1
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
	addi	r20,$sp,$LOCALS			; aligned spot below the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy
___
} else {
$code.=<<___;
	addi	r12,$sp,$LOCALS			; aligned spot below the frame
Lmemcpy:
	lbz	r8,0($inp)
	lbz	r9,1($inp)
	lbz	r10,2($inp)
	lbz	r11,3($inp)
	addi	$inp,$inp,4
	stb	r8,0(r12)
	stb	r9,1(r12)
	stb	r10,2(r12)
	stb	r11,3(r12)
	addi	r12,r12,4
	bdnz	Lmemcpy
___
}

$code.=<<___;
	$PUSH	$inp,`$FRAME-$SIZE_T*26`($sp)	; save real inp
	addi	$t1,$sp,`$LOCALS+16*$SZ`	; fictitious end pointer
	addi	$inp,$sp,$LOCALS		; fictitious inp pointer
	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real num
	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; end pointer
	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
	bl	Lsha2_block_private
	$POP	$inp,`$FRAME-$SIZE_T*26`($sp)	; restore real inp
	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real num
	addic.	$num,$num,`-16*$SZ`		; num--
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___

if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
.align	4
Lsha2_block_private:
	$LD	$t1,0($Tbl)
___
for($i=0;$i<16;$i++) {
$code.=<<___ if ($SZ==4);
	lwz	@X[$i],`$i*$SZ`($inp)
___
# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
$code.=<<___ if ($SZ==8);
	lwz	$t0,`$i*$SZ`($inp)
	lwz	@X[$i],`$i*$SZ+4`($inp)
	insrdi	@X[$i],$t0,32,0
___
	&ROUND_00_15($i,@V);
	unshift(@V,pop(@V));
}
$code.=<<___;
	li	$t0,`$rounds/16-1`
	mtctr	$t0
.align	4
Lrounds:
	addi	$Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
	&ROUND_16_xx($i,@V);
	unshift(@V,pop(@V));
}
$code.=<<___;
	bdnz-	Lrounds

	$POP	$ctx,`$FRAME-$SIZE_T*22`($sp)
	$POP	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
	$POP	$num,`$FRAME-$SIZE_T*24`($sp)	; end pointer
	subi	$Tbl,$Tbl,`($rounds-16)*$SZ`	; rewind Tbl

	$LD	r16,`0*$SZ`($ctx)
	$LD	r17,`1*$SZ`($ctx)
	$LD	r18,`2*$SZ`($ctx)
	$LD	r19,`3*$SZ`($ctx)
	$LD	r20,`4*$SZ`($ctx)
	$LD	r21,`5*$SZ`($ctx)
	$LD	r22,`6*$SZ`($ctx)
	addi	$inp,$inp,`16*$SZ`		; advance inp
	$LD	r23,`7*$SZ`($ctx)
	add	$A,$A,r16
	add	$B,$B,r17
	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)
	add	$C,$C,r18
	$ST	$A,`0*$SZ`($ctx)
	add	$D,$D,r19
	$ST	$B,`1*$SZ`($ctx)
	add	$E,$E,r20
	$ST	$C,`2*$SZ`($ctx)
	add	$F,$F,r21
	$ST	$D,`3*$SZ`($ctx)
	add	$G,$G,r22
	$ST	$E,`4*$SZ`($ctx)
	add	$H,$H,r23
	$ST	$F,`5*$SZ`($ctx)
	$ST	$G,`6*$SZ`($ctx)
	$UCMP	$inp,$num
	$ST	$H,`7*$SZ`($ctx)
	bne	Lsha2_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
___
} else {
########################################################################
# SHA512 for PPC32, X vector is off-loaded to stack...
#
#			|	sha512
#			|	-m32
# ----------------------+-----------------------
# PPC74x0,gcc-4.0.1	|	+48%
# POWER6,gcc-4.4.6	|	+124%(*)
# POWER7,gcc-4.4.6	|	+79%(*)
# e300,gcc-4.1.0	|	+167%
#
# (*)	~1/3 of -m64 result [and ~20% better than -m32 code generated
#	by xlc-12.1]

my @V=map("r$_",(16..31));	# A..H

my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
my ($x0,$x1)=("r3","r4");	# zaps $ctx and $inp

sub ROUND_00_15_ppc32 {
my ($i,	$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
	$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;

$code.=<<___;
	lwz	$t2,`$SZ*($i%16)+4`($Tbl)
	 xor	$a0,$flo,$glo
	lwz	$t3,`$SZ*($i%16)+0`($Tbl)
	 xor	$a1,$fhi,$ghi
	addc	$hlo,$hlo,$t0			; h+=x[i]
	stw	$t0,`$XOFF+0+$SZ*($i%16)`($sp)	; save x[i]

	srwi	$s0,$elo,$Sigma1[0]
	srwi	$s1,$ehi,$Sigma1[0]
	 and	$a0,$a0,$elo
	adde	$hhi,$hhi,$t1
	 and	$a1,$a1,$ehi
	stw	$t1,`$XOFF+4+$SZ*($i%16)`($sp)
	srwi	$t0,$elo,$Sigma1[1]
	srwi	$t1,$ehi,$Sigma1[1]
	 addc	$hlo,$hlo,$t2			; h+=K512[i]
	insrwi	$s0,$ehi,$Sigma1[0],0
	insrwi	$s1,$elo,$Sigma1[0],0
	 xor	$a0,$a0,$glo			; Ch(e,f,g)
	 adde	$hhi,$hhi,$t3
	 xor	$a1,$a1,$ghi
	insrwi	$t0,$ehi,$Sigma1[1],0
	insrwi	$t1,$elo,$Sigma1[1],0
	 addc	$hlo,$hlo,$a0			; h+=Ch(e,f,g)
	srwi	$t2,$ehi,$Sigma1[2]-32
	srwi	$t3,$elo,$Sigma1[2]-32
	xor	$s0,$s0,$t0
	xor	$s1,$s1,$t1
	insrwi	$t2,$elo,$Sigma1[2]-32,0
	insrwi	$t3,$ehi,$Sigma1[2]-32,0
	 xor	$a0,$alo,$blo			; a^b, b^c in next round
	 adde	$hhi,$hhi,$a1
	 xor	$a1,$ahi,$bhi
	xor	$s0,$s0,$t2			; Sigma1(e)
	xor	$s1,$s1,$t3

	srwi	$t0,$alo,$Sigma0[0]
	 and	$a2,$a2,$a0
	 addc	$hlo,$hlo,$s0			; h+=Sigma1(e)
	 and	$a3,$a3,$a1
	srwi	$t1,$ahi,$Sigma0[0]
	srwi	$s0,$ahi,$Sigma0[1]-32
	 adde	$hhi,$hhi,$s1
	srwi	$s1,$alo,$Sigma0[1]-32
	insrwi	$t0,$ahi,$Sigma0[0],0
	insrwi	$t1,$alo,$Sigma0[0],0
	 xor	$a2,$a2,$blo			; Maj(a,b,c)
	 addc	$dlo,$dlo,$hlo			; d+=h
	 xor	$a3,$a3,$bhi
	insrwi	$s0,$alo,$Sigma0[1]-32,0
	insrwi	$s1,$ahi,$Sigma0[1]-32,0
	 adde	$dhi,$dhi,$hhi
	srwi	$t2,$ahi,$Sigma0[2]-32
	srwi	$t3,$alo,$Sigma0[2]-32
	xor	$s0,$s0,$t0
	 addc	$hlo,$hlo,$a2			; h+=Maj(a,b,c)
	xor	$s1,$s1,$t1
	insrwi	$t2,$alo,$Sigma0[2]-32,0
	insrwi	$t3,$ahi,$Sigma0[2]-32,0
	 adde	$hhi,$hhi,$a3
___
$code.=<<___ if ($i>=15);
	lwz	$t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
	lwz	$t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
___
$code.=<<___ if ($i<15);
	lwz	$t1,`$SZ*($i+1)+0`($inp)
	lwz	$t0,`$SZ*($i+1)+4`($inp)
___
$code.=<<___;
	xor	$s0,$s0,$t2			; Sigma0(a)
	xor	$s1,$s1,$t3
	addc	$hlo,$hlo,$s0			; h+=Sigma0(a)
	adde	$hhi,$hhi,$s1
___
$code.=<<___ if ($i==15);
	lwz	$x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
	lwz	$x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
___
}
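# A note on the 32-bit code above: each 64-bit addition is synthesized
# as an addc (low word, sets carry) / adde (high word, consumes carry)
# pair, and each 64-bit rotate as a srwi/insrwi pair that assembles
# each result word from bits of both input halves.
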
sub ROUND_16_xx_ppc32 {
my ($i,	$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
	$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;

$code.=<<___;
	srwi	$s0,$t0,$sigma0[0]
	srwi	$s1,$t1,$sigma0[0]
	srwi	$t2,$t0,$sigma0[1]
	srwi	$t3,$t1,$sigma0[1]
	insrwi	$s0,$t1,$sigma0[0],0
	insrwi	$s1,$t0,$sigma0[0],0
	srwi	$a0,$t0,$sigma0[2]
	insrwi	$t2,$t1,$sigma0[1],0
	insrwi	$t3,$t0,$sigma0[1],0
	insrwi	$a0,$t1,$sigma0[2],0
	xor	$s0,$s0,$t2
	 lwz	$t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
	srwi	$a1,$t1,$sigma0[2]
	xor	$s1,$s1,$t3
	 lwz	$t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
	xor	$a0,$a0,$s0
	 srwi	$s0,$t2,$sigma1[0]
	xor	$a1,$a1,$s1
	 srwi	$s1,$t3,$sigma1[0]
	addc	$x0,$x0,$a0			; x[i]+=sigma0(x[i+1])
	 srwi	$a0,$t3,$sigma1[1]-32
	insrwi	$s0,$t3,$sigma1[0],0
	insrwi	$s1,$t2,$sigma1[0],0
	adde	$x1,$x1,$a1
	 srwi	$a1,$t2,$sigma1[1]-32

	insrwi	$a0,$t2,$sigma1[1]-32,0
	srwi	$t2,$t2,$sigma1[2]
	insrwi	$a1,$t3,$sigma1[1]-32,0
	insrwi	$t2,$t3,$sigma1[2],0
	xor	$s0,$s0,$a0
	 lwz	$a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
	srwi	$t3,$t3,$sigma1[2]
	xor	$s1,$s1,$a1
	 lwz	$a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
	xor	$s0,$s0,$t2
	 addc	$x0,$x0,$a0			; x[i]+=x[i+9]
	xor	$s1,$s1,$t3
	 adde	$x1,$x1,$a1
	addc	$x0,$x0,$s0			; x[i]+=sigma1(x[i+14])
	adde	$x1,$x1,$s1
___
	($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
	&ROUND_00_15_ppc32(@_);
}

$code.=<<___;
.align	4
Lsha2_block_private:
	lwz	$t1,0($inp)
	xor	$a2,@V[3],@V[5]		; B^C, magic seed
	lwz	$t0,4($inp)
	xor	$a3,@V[2],@V[4]
___
for($i=0;$i<16;$i++) {
	&ROUND_00_15_ppc32($i,@V);
	unshift(@V,pop(@V));	unshift(@V,pop(@V));
	($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
	li	$a0,`$rounds/16-1`
	mtctr	$a0
.align	4
Lrounds:
	addi	$Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
	&ROUND_16_xx_ppc32($i,@V);
	unshift(@V,pop(@V));	unshift(@V,pop(@V));
	($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
	bdnz-	Lrounds

	$POP	$ctx,`$FRAME-$SIZE_T*22`($sp)
	$POP	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
	$POP	$num,`$FRAME-$SIZE_T*24`($sp)	; end pointer
	subi	$Tbl,$Tbl,`($rounds-16)*$SZ`	; rewind Tbl

	lwz	$t0,0($ctx)
	lwz	$t1,4($ctx)
	lwz	$t2,8($ctx)
	lwz	$t3,12($ctx)
	lwz	$a0,16($ctx)
	lwz	$a1,20($ctx)
	lwz	$a2,24($ctx)
	addc	@V[1],@V[1],$t1
	lwz	$a3,28($ctx)
	adde	@V[0],@V[0],$t0
	lwz	$t0,32($ctx)
	addc	@V[3],@V[3],$t3
	lwz	$t1,36($ctx)
	adde	@V[2],@V[2],$t2
	lwz	$t2,40($ctx)
	addc	@V[5],@V[5],$a1
	lwz	$t3,44($ctx)
	adde	@V[4],@V[4],$a0
	lwz	$a0,48($ctx)
	addc	@V[7],@V[7],$a3
	lwz	$a1,52($ctx)
	adde	@V[6],@V[6],$a2
	lwz	$a2,56($ctx)
	addc	@V[9],@V[9],$t1
	lwz	$a3,60($ctx)
	adde	@V[8],@V[8],$t0
	stw	@V[0],0($ctx)
	stw	@V[1],4($ctx)
	addc	@V[11],@V[11],$t3
	stw	@V[2],8($ctx)
	stw	@V[3],12($ctx)
	adde	@V[10],@V[10],$t2
	stw	@V[4],16($ctx)
	stw	@V[5],20($ctx)
	addc	@V[13],@V[13],$a1
	stw	@V[6],24($ctx)
	stw	@V[7],28($ctx)
	adde	@V[12],@V[12],$a0
	stw	@V[8],32($ctx)
	stw	@V[9],36($ctx)
	addc	@V[15],@V[15],$a3
	stw	@V[10],40($ctx)
	stw	@V[11],44($ctx)
	adde	@V[14],@V[14],$a2
	stw	@V[12],48($ctx)
	stw	@V[13],52($ctx)
	stw	@V[14],56($ctx)
	stw	@V[15],60($ctx)

	addi	$inp,$inp,`16*$SZ`		; advance inp
	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)
	$UCMP	$inp,$num
	bne	Lsha2_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
___
}

# An ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___
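# For reference: "bcl 20,31,\$+4" is the standard PowerPC PC-relative
# idiom; it branches to the next instruction while loading LR with the
# current address, so the mflr/addi pair above can compute the address
# of the K table that follows without any relocations.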
$code.=<<___ if ($SZ==8);
	.long	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;