# 3aa9655a825b9b5c08154c2cc90c8355f329ae2b
# [openssl.git] / crypto / sha / asm / sha1-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8
9 # I let hardware handle unaligned input, except on page boundaries
10 # (see below for details). Otherwise straightforward implementation
11 # with X vector in register bank. The module is big-endian [which is
12 # not big deal as there're no little-endian targets left around].
13
14 # gcc-4.0.0     -m64    -m32
15 # --------------------------
16 # sha1          +76%    +59%
17
# First argument names the output file; its suffix selects the ABI:
# "...64.s" -> 64-bit ELF flavour, "...32.s" -> 32-bit ELF flavour.
$output = shift;

# Per-ABI assembler vocabulary: register width, red-zone size, and the
# mnemonics that differ between the two ABIs (unsigned compare,
# store-with-update for frame allocation, register load/store).
my %flavour = (
	64 => [8,288,"cmpld","stdu","ld","std"],
	32 => [4,224,"cmplw","stwu","lwz","stw"],
);
my $bits = $output =~ /64\.s/ ? 64
	 : $output =~ /32\.s/ ? 32
	 : die "nonsense $output";
($SIZE_T,$RZONE,$UCMP,$STU,$POP,$PUSH) = @{$flavour{$bits}};

# Unless a second argument suppresses it, pipe everything we print
# through the perlasm PPC translator.
unless (defined shift) {
	open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output"
		or die "can't call ../perlasm/ppc-xlate.pl: $!";
}

$FRAME=24*$SIZE_T;	# linkage + parameter save area
40
# Register assignments.  r0 holds the current round constant, r1 is the
# stack pointer, r3..r5 carry the (ctx,inp,num) function arguments,
# r6/r15 are scratch, r7..r12 hold the rotating working variables and
# r16..r31 form the 16-word message-schedule ring.
($K,$sp,$toc)		= ("r0","r1","r2");
($ctx,$inp,$num)	= ("r3","r4","r5");
($t0,$t1)		= ("r15","r6");

($A,$B,$C,$D,$E,$T)	= map("r$_",7..12);

@V=($A,$B,$C,$D,$E,$T);
@X=map("r$_",16..31);
60
# Emit one round of rounds 0..19: F(b,c,d) = (b&c)|(~b&d), the "Ch"
# selection function.  Each round computes
#   f = ROTL(a,5) + F(b,c,d) + e + K + X[i],  b = ROTL(b,30),
# with input loads / schedule updates interleaved into the arithmetic
# to hide load latency.  $f receives the new working value; the caller
# rotates the register set between rounds instead of moving data.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
# Round 0 additionally has to fetch its own input word X[0].
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
# Rounds 0..14: prefetch the next input word X[j] alongside the round.
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
# Rounds 15..19: input block fully loaded; update the schedule in the
# 16-word ring instead: X[j] = ROTL(X[j-16]^X[j-14]^X[j-8]^X[j-3],1).
# Modulo 16, the offsets j+2, j+8, j+13 are j-14, j-8, j-3, and the
# slot X[j%16] itself still holds X[j-16].
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}
95
# Emit one round of rounds 20..39 and 60..79: F(b,c,d) = b^c^d
# (parity).  Same dataflow as BODY_00_19, with the schedule update
# interleaved into the round arithmetic.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
# Final round (79): no further schedule update is needed; instead,
# start reloading the previous chaining values from SHA_CTX into
# r16..r20 so the fold-in at the end of Lsha1_block_private does not
# stall on the loads.
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}
129
# Emit one round of rounds 40..59: F(b,c,d) = (b&c)|(b&d)|(c&d), the
# "Maj" majority function, computed here as (b&c)|((b|c)&d).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}
150
# Exported entry point.  Prologue: allocate the stack frame (linkage
# area + 64 bytes of scratch used by the page-crossing path + the ABI
# red zone), save lr and r15..r31, load the five chaining words from
# SHA_CTX (r3), then dispatch on the low two bits of the input pointer
# (r4): word-aligned input goes straight to Lsha1_block_private,
# anything else via Lunaligned.  Epilogue (Ldone) restores and returns.
$code=<<___;
.machine "any"
.text

.globl	.sha1_block_asm_data_order
.align	4
.sha1_block_asm_data_order:
	mflr	r0
	$STU	$sp,`-($FRAME+64+$RZONE)`($sp)
	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
Ldone:
	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+64+$RZONE`
	blr
___
211
# The PowerPC specification allows an implementation to be ill-behaved
# upon an unaligned access that crosses a page boundary, so that case
# is treated specially.  Rather than locating the particular offending
# word, the code looks for the first 64-byte input block that crosses
# a page boundary: blocks before it are hashed in place (hardware
# handles in-page unaligned loads), the crossing block is copied
# byte-by-byte to the aligned scratch area just below the register
# save area and hashed from there, and the loop repeats for the rest
# of the input.
$code.=<<___;
.align	4
Lunaligned:
	li	$t1,4096
	subf	$t1,$inp,$t1
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subf	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$FRAME	; spot below the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
	li	$t1,1
	addi	$inp,$sp,$FRAME
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned
	b	Ldone
___
258
# Private block function with a tailored calling convention: on entry
# the five chaining values are pre-loaded into $A..$E, the counter
# register holds the number of 64-byte chunks to digest, and $inp
# points at (effectively) aligned input.
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
# The six working registers rotate through @V one position per round
# (unshift/pop), so no inter-round register moves are ever emitted.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# After 80 rounds @V has been rotated by 80 mod 6 = 2 positions, so the
# final a..e values reside in $E,$T,$A,$B,$C.  BODY_20_39(79) already
# preloaded the previous chaining values into r16..r20; fold the new
# state in, store it back to SHA_CTX, re-seed $A..$E for the next
# chunk, advance $inp by 64 bytes and loop while ctr is non-zero.
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___
307
# Resolve all `...` expressions (compile-time arithmetic such as frame
# offsets) via eval, then emit the generated assembler — through the
# ppc-xlate.pl pipe when STDOUT was redirected above — and flush.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;