Further synchronizations with md32_common.h update, consistent naming
[openssl.git] / crypto / sha / asm / sha512-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8
9 # I let hardware handle unaligned input, except on page boundaries
10 # (see below for details). Otherwise straightforward implementation
11 # with X vector in register bank. The module is big-endian [which is
12 # not big deal as there're no little-endian targets left around].
13
14 #                       sha256          |       sha512
15 #                       -m64    -m32    |       -m64    -m32
16 # --------------------------------------+-----------------------
17 # PPC970,gcc-4.0.0      +50%    +38%    |       +40%    +410%(*)
18 #
19 # (*)   64-bit code in 32-bit application context, which actually is
20 #       on TODO list
21
22 $output=shift;
23
24 if ($output =~ /64/) {
25         $SIZE_T=8;
26         $STU="stdu";
27         $UCMP="cmpld";
28         $SHL="sldi";
29         $POP="ld";
30         $PUSH="std";
31 } elsif ($output =~ /32/) {
32         $SIZE_T=4;
33         $STU="stwu";
34         $UCMP="cmplw";
35         $SHL="slwi";
36         $POP="lwz";
37         $PUSH="stw";
38 } else { die "nonsense $output"; }
39
40 ( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
41         die "can't call ../perlasm/ppc-xlate.pl: $!";
42
43 if ($output =~ /512/) {
44         $func="sha512_block_data_order";
45         $SZ=8;
46         @Sigma0=(28,34,39);
47         @Sigma1=(14,18,41);
48         @sigma0=(1,  8, 7);
49         @sigma1=(19,61, 6);
50         $rounds=80;
51         $LD="ld";
52         $ST="std";
53         $ROR="rotrdi";
54         $SHR="srdi";
55 } else {
56         $func="sha256_block_data_order";
57         $SZ=4;
58         @Sigma0=( 2,13,22);
59         @Sigma1=( 6,11,25);
60         @sigma0=( 7,18, 3);
61         @sigma1=(17,19,10);
62         $rounds=64;
63         $LD="lwz";
64         $ST="stw";
65         $ROR="rotrwi";
66         $SHR="srwi";
67 }
68
69 $FRAME=32*$SIZE_T;
70
71 $sp ="r1";
72 $toc="r2";      # zapped by $Tbl
73 $ctx="r3";      # zapped by $a0
74 $inp="r4";
75 $num="r5";      # zapped by $a1
76
77 $T  ="r0";
78 $Tbl="r2";
79 $a0 ="r3";
80 $a1 ="r5";
81 $t0 ="r6";
82 $t1 ="r7";
83
84 $A  ="r8";
85 $B  ="r9";
86 $C  ="r10";
87 $D  ="r11";
88 $E  ="r12";
89 $F  ="r13";
90 $G  ="r14";
91 $H  ="r15";
92
93 @V=($A,$B,$C,$D,$E,$F,$G,$H);
94 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
95     "r24","r25","r26","r27","r28","r29","r30","r31");
96
97 sub ROUND_00_15 {
98 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
99 $code.=<<___;
100         $LD     $T,`$i*$SZ`($Tbl)
101         $ROR    $a0,$e,$Sigma1[0]
102         $ROR    $a1,$e,$Sigma1[1]
103         and     $t0,$f,$e
104         andc    $t1,$g,$e
105         add     $T,$T,$h
106         xor     $a0,$a0,$a1
107         $ROR    $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
108         or      $t0,$t0,$t1             ; Ch(e,f,g)
109         add     $T,$T,@X[$i]
110         xor     $a0,$a0,$a1             ; Sigma1(e)
111         add     $T,$T,$t0
112         add     $T,$T,$a0
113
114         $ROR    $a0,$a,$Sigma0[0]
115         $ROR    $a1,$a,$Sigma0[1]
116         and     $t0,$a,$b
117         and     $t1,$a,$c
118         xor     $a0,$a0,$a1
119         $ROR    $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
120         xor     $t0,$t0,$t1
121         and     $t1,$b,$c
122         xor     $a0,$a0,$a1             ; Sigma0(a)
123         add     $d,$d,$T
124         xor     $t0,$t0,$t1             ; Maj(a,b,c)
125         add     $h,$T,$a0
126         add     $h,$h,$t0
127
128 ___
129 }
130
131 sub ROUND_16_xx {
132 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
133 $i-=16;
134 $code.=<<___;
135         $ROR    $a0,@X[($i+1)%16],$sigma0[0]
136         $ROR    $a1,@X[($i+1)%16],$sigma0[1]
137         $ROR    $t0,@X[($i+14)%16],$sigma1[0]
138         $ROR    $t1,@X[($i+14)%16],$sigma1[1]
139         xor     $a0,$a0,$a1
140         $SHR    $a1,@X[($i+1)%16],$sigma0[2]
141         xor     $t0,$t0,$t1
142         $SHR    $t1,@X[($i+14)%16],$sigma1[2]
143         add     @X[$i],@X[$i],@X[($i+9)%16]
144         xor     $a0,$a0,$a1             ; sigma0(X[(i+1)&0x0f])
145         xor     $t0,$t0,$t1             ; sigma1(X[(i+14)&0x0f])
146         add     @X[$i],@X[$i],$a0
147         add     @X[$i],@X[$i],$t0
148 ___
149 &ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
150 }
151
152 $code=<<___;
153 .text
154
155 .globl  $func
156 .align  6
157 $func:
158         mflr    r0
159         $STU    $sp,`-($FRAME+16*$SZ)`($sp)
160         $SHL    $num,$num,`log(16*$SZ)/log(2)`
161
162         $PUSH   $ctx,`$FRAME-$SIZE_T*22`($sp)
163
164         $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
165         $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
166         $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
167         $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
168         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
169         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
170         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
171         $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
172         $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
173         $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
174         $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
175         $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
176         $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
177         $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
178         $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
179         $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
180         $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
181         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
182         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
183         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
184         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
185
186         $LD     $A,`0*$SZ`($ctx)
187         $LD     $B,`1*$SZ`($ctx)
188         $LD     $C,`2*$SZ`($ctx)
189         $LD     $D,`3*$SZ`($ctx)
190         $LD     $E,`4*$SZ`($ctx)
191         $LD     $F,`5*$SZ`($ctx)
192         $LD     $G,`6*$SZ`($ctx)
193         $LD     $H,`7*$SZ`($ctx)
194
195         b       LPICmeup
196 LPICedup:
197         andi.   r0,$inp,3
198         bne     Lunaligned
199 Laligned:
200         add     $t0,$inp,$num
201         $PUSH   $t0,`$FRAME-$SIZE_T*23`($sp)    ; end pointer
202         bl      Lsha2_block_private
203 Ldone:
204         $POP    r0,`$FRAME-$SIZE_T*21`($sp)
205         $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
206         $POP    r13,`$FRAME-$SIZE_T*19`($sp)
207         $POP    r14,`$FRAME-$SIZE_T*18`($sp)
208         $POP    r15,`$FRAME-$SIZE_T*17`($sp)
209         $POP    r16,`$FRAME-$SIZE_T*16`($sp)
210         $POP    r17,`$FRAME-$SIZE_T*15`($sp)
211         $POP    r18,`$FRAME-$SIZE_T*14`($sp)
212         $POP    r19,`$FRAME-$SIZE_T*13`($sp)
213         $POP    r20,`$FRAME-$SIZE_T*12`($sp)
214         $POP    r21,`$FRAME-$SIZE_T*11`($sp)
215         $POP    r22,`$FRAME-$SIZE_T*10`($sp)
216         $POP    r23,`$FRAME-$SIZE_T*9`($sp)
217         $POP    r24,`$FRAME-$SIZE_T*8`($sp)
218         $POP    r25,`$FRAME-$SIZE_T*7`($sp)
219         $POP    r26,`$FRAME-$SIZE_T*6`($sp)
220         $POP    r27,`$FRAME-$SIZE_T*5`($sp)
221         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
222         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
223         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
224         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
225         mtlr    r0
226         addi    $sp,$sp,`$FRAME+16*$SZ`
227         blr
228 ___
229
230 # PowerPC specification allows an implementation to be ill-behaved
231 # upon unaligned access which crosses page boundary. "Better safe
232 # than sorry" principle makes me treat it specially. But I don't
233 # look for particular offending word, but rather for the input
234 # block which crosses the boundary. Once found that block is aligned
235 # and hashed separately...
236 $code.=<<___;
237 .align  4
238 Lunaligned:
239         subfic  $t1,$inp,4096
240         andi.   $t1,$t1,`4096-16*$SZ`   ; distance to closest page boundary
241         beq     Lcross_page
242         $UCMP   $num,$t1
243         ble-    Laligned                ; didn't cross the page boundary
244         subfc   $num,$t1,$num
245         add     $t0,$inp,$t1
246         $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)
247         $PUSH   $t0,`$FRAME-$SIZE_T*23`($sp)    ; end pointer
248         bl      Lsha2_block_private
249         $POP    $num,`$FRAME-$SIZE_T*24`($sp)
250 Lcross_page:
251         li      $t1,`16*$SZ/4`
252         mtctr   $t1
253         addi    r20,$sp,$FRAME  ; spot below the frame
254 Lmemcpy:
255         lbz     r16,0($inp)
256         lbz     r17,1($inp)
257         lbz     r18,2($inp)
258         lbz     r19,3($inp)
259         addi    $inp,$inp,4
260         stb     r16,0(r20)
261         stb     r17,1(r20)
262         stb     r18,2(r20)
263         stb     r19,3(r20)
264         addi    r20,r20,4
265         bdnz    Lmemcpy
266
267         $PUSH   $inp,`$FRAME-$SIZE_T*25`($sp)
268         addi    $inp,$sp,$FRAME
269         addi    $t0,$sp,`$FRAME+16*$SZ`
270         $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)
271         $PUSH   $t0,`$FRAME-$SIZE_T*23`($sp)    ; end pointer
272         bl      Lsha2_block_private
273         $POP    $inp,`$FRAME-$SIZE_T*25`($sp)
274         $POP    $num,`$FRAME-$SIZE_T*24`($sp)
275         addic.  $num,$num,`-16*$SZ`
276         bne-    Lunaligned
277         b       Ldone
278 ___
279
280 $code.=<<___;
281 .align  4
282 Lsha2_block_private:
283 ___
284 for($i=0;$i<16;$i++) {
285 $code.=<<___ if ($SZ==4);
286         lwz     @X[$i],`$i*$SZ`($inp)
287 ___
288 # 64-bit loads are split to 2x32-bit ones, as CPU can't handle
289 # unaligned 64-bit loads, only 32-bit ones...
290 $code.=<<___ if ($SZ==8);
291         lwz     $t0,`$i*$SZ`($inp)
292         lwz     @X[$i],`$i*$SZ+4`($inp)
293         insrdi  @X[$i],$t0,32,0
294 ___
295         &ROUND_00_15($i,@V);
296         unshift(@V,pop(@V));
297 }
298 $code.=<<___;
299         li      $T,`$rounds/16-1`
300         mtctr   $T
301 .align  4
302 Lrounds:
303         addi    $Tbl,$Tbl,`16*$SZ`
304 ___
305 for(;$i<32;$i++) {
306         &ROUND_16_xx($i,@V);
307         unshift(@V,pop(@V));
308 }
309 $code.=<<___;
310         bdnz-   Lrounds
311
312         subi    $Tbl,$Tbl,`($rounds-16)*$SZ`
313         $POP    $ctx,`$FRAME-$SIZE_T*22`($sp)
314         $POP    $num,`$FRAME-$SIZE_T*23`($sp)   ; end pointer
315
316         $LD     r16,`0*$SZ`($ctx)
317         $LD     r17,`1*$SZ`($ctx)
318         $LD     r18,`2*$SZ`($ctx)
319         $LD     r19,`3*$SZ`($ctx)
320         $LD     r20,`4*$SZ`($ctx)
321         $LD     r21,`5*$SZ`($ctx)
322         $LD     r22,`6*$SZ`($ctx)
323         $LD     r23,`7*$SZ`($ctx)
324         add     $A,$A,r16
325         add     $B,$B,r17
326         add     $C,$C,r18
327         $ST     $A,`0*$SZ`($ctx)
328         add     $D,$D,r19
329         $ST     $B,`1*$SZ`($ctx)
330         add     $E,$E,r20
331         $ST     $C,`2*$SZ`($ctx)
332         add     $F,$F,r21
333         $ST     $D,`3*$SZ`($ctx)
334         add     $G,$G,r22
335         $ST     $E,`4*$SZ`($ctx)
336         add     $H,$H,r23
337         $ST     $F,`5*$SZ`($ctx)
338         addi    $inp,$inp,`16*$SZ`
339         $ST     $G,`6*$SZ`($ctx)
340         $UCMP   $inp,$num
341         $ST     $H,`7*$SZ`($ctx)
342         bne     Lsha2_block_private
343         blr
344 ___
345
346 # Ugly hack here, because PPC assembler syntax seem to vary too
347 # much from platforms to platform...
348 $code.=<<___;
349 .align  6
350 LPICmeup:
351         bl      LPIC
352         b       LPICedup
353         nop
354         nop
355         nop
356         nop
357         nop
358         nop
359 LPIC:   mflr    $Tbl
360         addi    $Tbl,$Tbl,`64-4`        ; "distance" between bl and last nop
361         blr
362         nop
363         nop
364         nop
365         nop
366         nop
367 ___
368 $code.=<<___ if ($SZ==8);
369         .long   0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
370         .long   0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
371         .long   0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
372         .long   0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
373         .long   0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
374         .long   0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
375         .long   0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
376         .long   0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
377         .long   0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
378         .long   0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
379         .long   0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
380         .long   0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
381         .long   0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
382         .long   0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
383         .long   0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
384         .long   0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
385         .long   0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
386         .long   0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
387         .long   0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
388         .long   0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
389         .long   0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
390         .long   0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
391         .long   0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
392         .long   0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
393         .long   0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
394         .long   0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
395         .long   0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
396         .long   0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
397         .long   0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
398         .long   0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
399         .long   0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
400         .long   0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
401         .long   0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
402         .long   0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
403         .long   0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
404         .long   0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
405         .long   0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
406         .long   0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
407         .long   0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
408         .long   0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
409 ___
410 $code.=<<___ if ($SZ==4);
411         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
412         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
413         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
414         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
415         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
416         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
417         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
418         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
419         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
420         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
421         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
422         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
423         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
424         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
425         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
426         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
427 ___
428
429 $code =~ s/\`([^\`]*)\`/eval $1/gem;
430 print $code;
431 close STDOUT;