Synchronize SHA1 assembler with md32_common.h update.
[openssl.git] / crypto / sha / asm / sha1-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # I let hardware handle unaligned input(*), except on page boundaries
11 # (see below for details). Otherwise straightforward implementation
12 # with X vector in register bank. The module is big-endian [which is
13 # not big deal as there're no little-endian targets left around].
14 #
15 # (*) this means that this module is inappropriate for PPC403? Does
16 #     anybody know if pre-POWER3 can sustain unaligned load?
17
18 #                       -m64    -m32
19 # ----------------------------------
20 # PPC970,gcc-4.0.0      +76%    +59%
21
# Command line: the first argument is the output file name.  It also
# selects the flavour: a name containing "64.s" picks the 64-bit
# mnemonics, one containing "32.s" the 32-bit ones, anything else is
# rejected.
$output = shift;

if ($output =~ /64\.s/) {
        $SIZE_T =8;
        $UCMP   ="cmpld";
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
} elsif ($output =~ /32\.s/) {
        $SIZE_T =4;
        $UCMP   ="cmplw";
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
} else { die "nonsense $output"; }

# Look for ppc-xlate.pl next to this script, then in ../../perlasm/.
# The capture is used only when the match succeeded; if $0 carries no
# directory component the lookup is relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Unless a second argument is present, everything printed to STDOUT is
# piped through ppc-xlate.pl, which is handed the output file name.
( defined shift || open STDOUT,"| $^X $xlate $output" ) ||
        die "can't call $xlate: $!";

# Base size used for all stack frame offsets in the generated code.
$FRAME=24*$SIZE_T;
47
# Register allocation for the generated code.
#
#   $K    r0   current round constant
#   $sp   r1   stack pointer
#   $toc  r2   not referenced by the code generated in this file
#   $ctx  r3   pointer to the five 32-bit chaining values
#   $inp  r4   input pointer
#   $num  r5   number of 64-byte blocks to process
($K,$sp,$toc,$ctx,$inp,$num)=map { "r$_" } (0..5);

# Scratch registers used while evaluating the round function.
($t0,$t1)=("r15","r6");

# Working variables; $T is the sixth register that receives the result
# of a round.
($A,$B,$C,$D,$E,$T)=map { "r$_" } (7..12);

# @V is rotated by one position after every round, @X is the 16-word
# message schedule kept in r16..r31.
@V=($A,$B,$C,$D,$E,$T);
@X=map { "r$_" } (16..31);
67
# Append the code for one of rounds 0..19 to $code.  In these rounds
# F(b,c,d) = (b & c) | (d & ~b).
#
# Arguments: round number $i followed by six register names.  $a..$e
# hold the working variables on entry.  On exit $f holds
# K + e + rol(a,5) + X[i] + F(b,c,d), $b has been rotated left by 30 in
# place and $e has been overwritten with rol(a,5).
#
# Rounds 0..14 also load the next input word from $inp (round 0 loads
# the very first word as well); rounds 15 and up compute the next
# message schedule word in place instead.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my ($Xi,$Xj)=($X[$i%16],$X[$j%16]);

if ($i==0) {
$code.=<<___;
        lwz     $Xi,`$i*4`($inp)
___
}

if ($i<15) {
$code.=<<___;
        lwz     $Xj,`$j*4`($inp)
        add     $f,$K,$e
        rotlwi  $e,$a,5
        add     $f,$f,$Xi
        and     $t0,$c,$b
        add     $f,$f,$e
        andc    $t1,$d,$b
        rotlwi  $b,$b,30
        or      $t0,$t0,$t1
        add     $f,$f,$t0
___
} else {
# X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1), indices mod 16
my ($X2,$X8,$X13)=@X[($j+2)%16,($j+8)%16,($j+13)%16];
$code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        xor     $Xj,$Xj,$X2
        add     $f,$f,$Xi
        and     $t0,$c,$b
        xor     $Xj,$Xj,$X8
        add     $f,$f,$e
        andc    $t1,$d,$b
        rotlwi  $b,$b,30
        or      $t0,$t0,$t1
        xor     $Xj,$Xj,$X13
        add     $f,$f,$t0
        rotlwi  $Xj,$Xj,1
___
}
}
102
# Append the code for one round with F(b,c,d) = b ^ c ^ d to $code.
# The driver loops use this for rounds 20..39 and again for 60..79.
#
# Arguments and register usage are the same as for BODY_00_19: $f
# receives K + e + rol(a,5) + X[i] + F(b,c,d), $b is rotated left by 30
# in place and $e is overwritten with rol(a,5).
#
# Every round before 79 also computes the next message schedule word.
# Round 79 has no next word, so its spare slots are used to load the
# five context words from $ctx into r16..r20.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $Xi=$X[$i%16];

if ($i<79) {
my ($Xj,$X2,$X8,$X13)=@X[$j%16,($j+2)%16,($j+8)%16,($j+13)%16];
$code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        xor     $Xj,$Xj,$X2
        add     $f,$f,$Xi
        xor     $t0,$b,$c
        xor     $Xj,$Xj,$X8
        add     $f,$f,$e
        rotlwi  $b,$b,30
        xor     $t0,$t0,$d
        xor     $Xj,$Xj,$X13
        add     $f,$f,$t0
        rotlwi  $Xj,$Xj,1
___
} elsif ($i==79) {
$code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        lwz     r16,0($ctx)
        add     $f,$f,$Xi
        xor     $t0,$b,$c
        lwz     r17,4($ctx)
        add     $f,$f,$e
        rotlwi  $b,$b,30
        lwz     r18,8($ctx)
        xor     $t0,$t0,$d
        lwz     r19,12($ctx)
        add     $f,$f,$t0
        lwz     r20,16($ctx)
___
}
}
136
# Append the code for one of rounds 40..59 to $code.  In these rounds
# F(b,c,d) = (b & c) | ((b | c) & d).
#
# Arguments and register usage are the same as for the other round
# generators: $f receives K + e + rol(a,5) + X[i] + F(b,c,d), $b is
# rotated left by 30 in place and $e is overwritten with rol(a,5).
# The next message schedule word is computed in every round.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $Xi=$X[$i%16];
my ($Xj,$X2,$X8,$X13)=@X[$j%16,($j+2)%16,($j+8)%16,($j+13)%16];

$code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        xor     $Xj,$Xj,$X2
        add     $f,$f,$Xi
        and     $t0,$b,$c
        xor     $Xj,$Xj,$X8
        add     $f,$f,$e
        or      $t1,$b,$c
        rotlwi  $b,$b,30
        xor     $Xj,$Xj,$X13
        and     $t1,$t1,$d
        or      $t0,$t0,$t1
        rotlwi  $Xj,$Xj,1
        add     $f,$f,$t0
___
}
157
# Public entry point.  The prologue allocates a frame of $FRAME+64
# bytes, stores the link register (via r0) and r15..r31 in it and loads
# the five chaining values from $ctx.  Input that is 4-byte aligned is
# passed straight to Lsha1_block_private with the block count in CTR;
# anything else branches to Lunaligned.  Ldone is the common epilogue.
#
# Register rN (N=15..31) lives at offset $FRAME-$SIZE_T*(32-N); the
# save and restore sequences are generated from that rule.
my ($save_gprs,$restore_gprs)=("","");
foreach my $r (15..31) {
        my $slot="`$FRAME-$SIZE_T*".(32-$r)."`($sp)";
        $save_gprs   .="        $PUSH   r$r,$slot\n";
        $restore_gprs.="        $POP    r$r,$slot\n";
}

$code=<<___;
.text

.globl  .sha1_block_asm_data_order
.align  4
.sha1_block_asm_data_order:
        mflr    r0
        $STU    $sp,`-($FRAME+64)`($sp)
        $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
___
$code.=$save_gprs;
$code.=<<___;
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
        lwz     $D,12($ctx)
        lwz     $E,16($ctx)
        andi.   r0,$inp,3
        bne     Lunaligned
Laligned:
        mtctr   $num
        bl      Lsha1_block_private
Ldone:
        $POP    r0,`$FRAME-$SIZE_T*18`($sp)
___
$code.=$restore_gprs;
$code.=<<___;
        mtlr    r0
        addi    $sp,$sp,`$FRAME+64`
        blr
___
217
# The PowerPC specification permits an implementation to misbehave on
# an unaligned access that crosses a page boundary, so such accesses
# are avoided altogether.  Rather than hunting for the individual
# offending word, the code looks for the 64-byte input block that
# straddles a 4096-byte boundary, copies that block to an aligned
# buffer and hashes it from there.
#
# First part: compute how many whole 64-byte blocks fit before the next
# page boundary.  If none do, go straight to the copy.  If all remaining
# blocks fit, hash them in place via Laligned.  Otherwise hash only the
# blocks that fit and deduct them from the block count.
$code.=<<___;
.align  4
Lunaligned:
        subfic  $t1,$inp,4096
        andi.   $t1,$t1,4095    ; distance to closest page boundary
        srwi.   $t1,$t1,6       ; t1/=64
        beq     Lcross_page
        $UCMP   $num,$t1
        ble-    Laligned        ; didn't cross the page boundary
        mtctr   $t1
        subfc   $num,$t1,$num
        bl      Lsha1_block_private
___

# Second part: copy the straddling block byte by byte (16 iterations of
# 4 bytes) into the 64-byte area at offset $FRAME of the stack frame.
# The advanced input pointer is parked on the stack while that single
# block is hashed from the copy, then the block count is decremented
# and the whole procedure repeats until no blocks are left.
$code.=<<___;
Lcross_page:
        li      $t1,16
        mtctr   $t1
        addi    r20,$sp,$FRAME  ; spot below the frame
Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
        lbz     r18,2($inp)
        lbz     r19,3($inp)
        addi    $inp,$inp,4
        stb     r16,0(r20)
        stb     r17,1(r20)
        stb     r18,2(r20)
        stb     r19,3(r20)
        addi    r20,r20,4
        bdnz    Lmemcpy

        $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
        li      $t1,1
        addi    $inp,$sp,$FRAME
        mtctr   $t1
        bl      Lsha1_block_private
        $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned
        b       Ldone
___
263
# Private block function with a tailored calling convention: on entry
# the chaining values are already in $A..$E and the count register
# holds the number of 64-byte blocks to digest.  It loops on itself
# with bdnz until the count is exhausted.
$code.=<<___;
.align  4
Lsha1_block_private:
___

# The 80 rounds come in four stages of 20.  Each stage first loads its
# round constant into $K (upper half with lis, lower half with ori) and
# then emits its rounds, rotating the register list @V by one position
# after every round.
$i=0;
foreach my $stage (
        [ "0x5a82","0x7999",\&BODY_00_19,20 ],  # K_00_19
        [ "0x6ed9","0xeba1",\&BODY_20_39,40 ],  # K_20_39
        [ "0x8f1b","0xbcdc",\&BODY_40_59,60 ],  # K_40_59
        [ "0xca62","0xc1d6",\&BODY_20_39,80 ],  # K_60_79
) {
        my ($hi,$lo,$round,$end)=@$stage;
$code.=<<___;
        lis     $K,$hi
        ori     $K,$K,$lo
___
        for(;$i<$end;$i++)      { $round->($i,@V); unshift(@V,pop(@V)); }
}

# After 80 rotations of the six-entry list the working variables a..e
# sit in $E,$T,$A,$B,$C.  They are added to the context words that
# round 79 loaded into r16..r20; the sums are stored back to $ctx and
# copied into $A..$E for the next block, and $inp moves on by 64 bytes.
$code.=<<___;
        add     r16,r16,$E
        add     r17,r17,$T
        add     r18,r18,$A
        add     r19,r19,$B
        add     r20,r20,$C
        stw     r16,0($ctx)
        mr      $A,r16
        stw     r17,4($ctx)
        mr      $B,r17
        stw     r18,8($ctx)
        mr      $C,r18
        stw     r19,12($ctx)
        mr      $D,r19
        stw     r20,16($ctx)
        mr      $E,r20
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
___
# Identification string embedded in the generated object.
$code.=<<___;
.asciz  "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

# Replace every `expr` in the generated text with the value of the Perl
# expression, i.e. fold the frame offsets into plain numbers.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT may be a pipe into ppc-xlate.pl.  Closing it flushes the data
# and waits for the translator, so a failure here means the output file
# is missing or incomplete and must not go unnoticed.
close STDOUT or die "error closing STDOUT: $! (child status $?)";