crypto/sha/asm/sha*-x86_64.pl: comply with Win64 ABI.
[openssl.git] / crypto / sha / asm / sha1-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # I let hardware handle unaligned input(*), except on page boundaries
11 # (see below for details). Otherwise straightforward implementation
12 # with X vector in register bank. The module is big-endian [which is
13 # not a big deal as there are no little-endian targets left around].
14 #
15 # (*) this means that this module is inappropriate for PPC403? Does
16 #     anybody know if pre-POWER3 can sustain unaligned load?
17
18 #                       -m64    -m32
19 # ----------------------------------
20 # PPC970,gcc-4.0.0      +76%    +59%
21 # Power6,xlc-7          +68%    +33%
22
# Target selection. The first command-line argument is the flavour
# string; it selects the word size and the mnemonics that the assembly
# templates further down interpolate:
#   $SIZE_T  - size in bytes of one register save slot in the frame
#   $LRSAVE  - offset, added to $FRAME, at which the link register is stored
#   $UCMP    - unsigned compare used on the block counter
#   $STU     - store-with-update used to allocate the stack frame
#   $POP     - load of one $SIZE_T-wide slot
#   $PUSH    - store of one $SIZE_T-wide slot
23 $flavour = shift;
24
# A flavour containing "64" is tested first, so it wins over "32";
# anything matching neither aborts the script.
25 if ($flavour =~ /64/) {
26         $SIZE_T =8;
27         $LRSAVE =2*$SIZE_T;
28         $UCMP   ="cmpld";
29         $STU    ="stdu";
30         $POP    ="ld";
31         $PUSH   ="std";
32 } elsif ($flavour =~ /32/) {
33         $SIZE_T =4;
34         $LRSAVE =$SIZE_T;
35         $UCMP   ="cmplw";
36         $STU    ="stwu";
37         $POP    ="lwz";
38         $PUSH   ="stw";
39 } else { die "nonsense $flavour"; }
40
# Locate the ppc-xlate.pl translator relative to this script: $dir is the
# directory part of $0 (everything up to the last '/' or '\'). The
# translator is looked for next to this script first, then under
# ../../perlasm/; the script aborts if neither path is a plain file.
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
44 die "can't locate ppc-xlate.pl";
45
# Everything printed from here on is piped through the translator, which
# is run with the same perl binary ($^X), the flavour and the next
# command-line argument. The check uses low-precedence "or": with "||"
# the die attached to the (always true) command string rather than to
# open(), so a failed open was never reported.
46 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
47
# Stack frame layout used by the entry point below. $FRAME is
# 24*$SIZE_T+64 bytes: the top 17 slots hold r15-r31, slot 18 from the
# top parks $inp while a copied block is hashed, and the 64-byte copy
# buffer starts at offset $LOCALS (6*$SIZE_T) from the stack pointer.
48 $FRAME=24*$SIZE_T+64;
49 $LOCALS=6*$SIZE_T;
50
# Register assignment. $K holds the current round constant. $ctx, $inp
# and $num are read by the entry point as the state pointer, the input
# pointer and the block count. $t0/$t1 are scratch registers ($t0 is r15,
# which the prologue saves). $toc is not referenced by the code shown in
# this file.
51 $K  ="r0";
52 $sp ="r1";
53 $toc="r2";
54 $ctx="r3";
55 $inp="r4";
56 $num="r5";
57 $t0 ="r15";
58 $t1 ="r6";
59
# The five SHA-1 working variables plus one spare register ($T) that
# receives the result of each round.
60 $A  ="r7";
61 $B  ="r8";
62 $C  ="r9";
63 $D  ="r10";
64 $E  ="r11";
65 $T  ="r12";
66
# @V is rotated by one position after every round, so the register that
# just received the new value becomes "a" for the next round. @X is the
# 16-word message schedule window kept entirely in r16-r31; note that
# r16-r20 are also named directly in the templates below (byte copy
# loop, last round, and state update).
67 @V=($A,$B,$C,$D,$E,$T);
68 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
69     "r24","r25","r26","r27","r28","r29","r30","r31");
70
# BODY_00_19($i,$a,$b,$c,$d,$e,$f) - append the assembly for round $i
# (0..19) to the global $code.
#
#   $i        round number
#   $a..$e    names of the registers currently holding the five state words
#   $f        name of the register that receives the round result
#
# Emitted computation: $f = $K + $e + rotl($a,5) + X[$i] + (($b & $c) |
# ($d & ~$b)), and $b is rotated left by 30 in place. $e is overwritten
# with rotl($a,5) along the way. The message-schedule instructions are
# interleaved with the round arithmetic, so the instruction order inside
# the templates should not be rearranged casually.
71 sub BODY_00_19 {
72 my ($i,$a,$b,$c,$d,$e,$f)=@_;
73 my $j=$i+1;
# Round 0 additionally loads the first input word; every round below 15
# loads the word needed by the following round.
74 $code.=<<___ if ($i==0);
75         lwz     @X[$i],`$i*4`($inp)
76 ___
77 $code.=<<___ if ($i<15);
78         lwz     @X[$j],`$j*4`($inp)
79         add     $f,$K,$e
80         rotlwi  $e,$a,5
81         add     $f,$f,@X[$i]
82         and     $t0,$c,$b
83         add     $f,$f,$e
84         andc    $t1,$d,$b
85         rotlwi  $b,$b,30
86         or      $t0,$t0,$t1
87         add     $f,$f,$t0
88 ___
# From round 15 on, the word for the following round is computed in place
# in the 16-entry window: X[j] = rotl(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1),
# all indices taken modulo 16.
89 $code.=<<___ if ($i>=15);
90         add     $f,$K,$e
91         rotlwi  $e,$a,5
92         xor     @X[$j%16],@X[$j%16],@X[($j+2)%16]
93         add     $f,$f,@X[$i%16]
94         and     $t0,$c,$b
95         xor     @X[$j%16],@X[$j%16],@X[($j+8)%16]
96         add     $f,$f,$e
97         andc    $t1,$d,$b
98         rotlwi  $b,$b,30
99         or      $t0,$t0,$t1
100         xor     @X[$j%16],@X[$j%16],@X[($j+13)%16]
101         add     $f,$f,$t0
102         rotlwi  @X[$j%16],@X[$j%16],1
103 ___
104 }
105
# BODY_20_39($i,$a,$b,$c,$d,$e,$f) - append the assembly for one round
# that uses the parity function; the driver below calls it for rounds
# 20..39 and again for rounds 60..79. Arguments have the same meaning as
# in BODY_00_19.
#
# Emitted computation: $f = $K + $e + rotl($a,5) + X[$i] + ($b ^ $c ^ $d),
# and $b is rotated left by 30 in place. $e is overwritten with
# rotl($a,5).
106 sub BODY_20_39 {
107 my ($i,$a,$b,$c,$d,$e,$f)=@_;
108 my $j=$i+1;
# All rounds but the last also compute the schedule word for the next
# round in place (indices modulo 16).
109 $code.=<<___ if ($i<79);
110         add     $f,$K,$e
111         rotlwi  $e,$a,5
112         xor     @X[$j%16],@X[$j%16],@X[($j+2)%16]
113         add     $f,$f,@X[$i%16]
114         xor     $t0,$b,$c
115         xor     @X[$j%16],@X[$j%16],@X[($j+8)%16]
116         add     $f,$f,$e
117         rotlwi  $b,$b,30
118         xor     $t0,$t0,$d
119         xor     @X[$j%16],@X[$j%16],@X[($j+13)%16]
120         add     $f,$f,$t0
121         rotlwi  @X[$j%16],@X[$j%16],1
122 ___
# Round 79 needs no further schedule word; its free slots are used to
# load the five state words from $ctx into r16-r20, which the code after
# the last round adds to the working variables.
123 $code.=<<___ if ($i==79);
124         add     $f,$K,$e
125         rotlwi  $e,$a,5
126         lwz     r16,0($ctx)
127         add     $f,$f,@X[$i%16]
128         xor     $t0,$b,$c
129         lwz     r17,4($ctx)
130         add     $f,$f,$e
131         rotlwi  $b,$b,30
132         lwz     r18,8($ctx)
133         xor     $t0,$t0,$d
134         lwz     r19,12($ctx)
135         add     $f,$f,$t0
136         lwz     r20,16($ctx)
137 ___
138 }
139
# BODY_40_59($i,$a,$b,$c,$d,$e,$f) - append the assembly for round $i
# (the driver below calls it for rounds 40..59). Arguments have the same
# meaning as in BODY_00_19.
#
# Emitted computation: $f = $K + $e + rotl($a,5) + X[$i] +
# (($b & $c) | (($b | $c) & $d)), and $b is rotated left by 30 in place.
# $e is overwritten with rotl($a,5). The schedule word for the next
# round is computed in place, interleaved with the round arithmetic.
140 sub BODY_40_59 {
141 my ($i,$a,$b,$c,$d,$e,$f)=@_;
142 my $j=$i+1;
143 $code.=<<___;
144         add     $f,$K,$e
145         rotlwi  $e,$a,5
146         xor     @X[$j%16],@X[$j%16],@X[($j+2)%16]
147         add     $f,$f,@X[$i%16]
148         and     $t0,$b,$c
149         xor     @X[$j%16],@X[$j%16],@X[($j+8)%16]
150         add     $f,$f,$e
151         or      $t1,$b,$c
152         rotlwi  $b,$b,30
153         xor     @X[$j%16],@X[$j%16],@X[($j+13)%16]
154         and     $t1,$t1,$d
155         or      $t0,$t0,$t1
156         rotlwi  @X[$j%16],@X[$j%16],1
157         add     $f,$f,$t0
158 ___
159 }
160
# Template for the public entry point, .sha1_block_data_order. As used
# here, $ctx points at the five 32-bit state words, $inp at the input and
# $num is the number of 64-byte blocks to hash.
#
# Prologue: allocate $FRAME bytes, save r15-r31 in the top 17 slots of
# the frame and the link register at $FRAME+$LRSAVE, then load the state
# into $A..$E.
#
# Dispatch on the low two bits of $inp:
#  - aligned input (Laligned): the counter register is set to $num and
#    all blocks are hashed in place by Lsha1_block_private.
#  - unaligned input (Lunaligned): $t1 is the number of whole 64-byte
#    blocks before the next 4096-byte boundary. If $num does not exceed
#    $t1, the input is hashed in place via Laligned. Otherwise the first
#    $t1 blocks (possibly none) are hashed in place and the following
#    block is copied byte by byte into the buffer at $LOCALS and hashed
#    from there (Lcross_page), with $inp parked at $FRAME-$SIZE_T*18
#    meanwhile. This repeats from Lunaligned until $num reaches zero.
#
# Epilogue (Ldone): restore the link register and r15-r31, release the
# frame and return.
# NOTE(review): the trailing .long/.byte words look like a traceback
# table - confirm against the ABI documentation before changing them.
#
# Backtick-quoted expressions in the template are evaluated by the
# substitution at the end of the script, not by the heredoc itself.
161 $code=<<___;
162 .machine        "any"
163 .text
164
165 .globl  .sha1_block_data_order
166 .align  4
167 .sha1_block_data_order:
168         $STU    $sp,-$FRAME($sp)
169         mflr    r0
170         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
171         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
172         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
173         $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
174         $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
175         $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
176         $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
177         $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
178         $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
179         $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
180         $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
181         $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
182         $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
183         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
184         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
185         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
186         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
187         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
188         lwz     $A,0($ctx)
189         lwz     $B,4($ctx)
190         lwz     $C,8($ctx)
191         lwz     $D,12($ctx)
192         lwz     $E,16($ctx)
193         andi.   r0,$inp,3
194         bne     Lunaligned
195 Laligned:
196         mtctr   $num
197         bl      Lsha1_block_private
198         b       Ldone
199
200 ; PowerPC specification allows an implementation to be ill-behaved
201 ; upon unaligned access which crosses page boundary. "Better safe
202 ; than sorry" principle makes me treat it specially. But I don't
203 ; look for particular offending word, but rather for 64-byte input
204 ; block which crosses the boundary. Once found that block is aligned
205 ; and hashed separately...
206 .align  4
207 Lunaligned:
208         subfic  $t1,$inp,4096
209         andi.   $t1,$t1,4095    ; distance to closest page boundary
210         srwi.   $t1,$t1,6       ; t1/=64
211         beq     Lcross_page
212         $UCMP   $num,$t1
213         ble-    Laligned        ; didn't cross the page boundary
214         mtctr   $t1
215         subfc   $num,$t1,$num
216         bl      Lsha1_block_private
217 Lcross_page:
218         li      $t1,16
219         mtctr   $t1
220         addi    r20,$sp,$LOCALS ; spot within the frame
221 Lmemcpy:
222         lbz     r16,0($inp)
223         lbz     r17,1($inp)
224         lbz     r18,2($inp)
225         lbz     r19,3($inp)
226         addi    $inp,$inp,4
227         stb     r16,0(r20)
228         stb     r17,1(r20)
229         stb     r18,2(r20)
230         stb     r19,3(r20)
231         addi    r20,r20,4
232         bdnz    Lmemcpy
233
234         $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
235         li      $t1,1
236         addi    $inp,$sp,$LOCALS
237         mtctr   $t1
238         bl      Lsha1_block_private
239         $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
240         addic.  $num,$num,-1
241         bne-    Lunaligned
242
243 Ldone:
244         $POP    r0,`$FRAME+$LRSAVE`($sp)
245         $POP    r15,`$FRAME-$SIZE_T*17`($sp)
246         $POP    r16,`$FRAME-$SIZE_T*16`($sp)
247         $POP    r17,`$FRAME-$SIZE_T*15`($sp)
248         $POP    r18,`$FRAME-$SIZE_T*14`($sp)
249         $POP    r19,`$FRAME-$SIZE_T*13`($sp)
250         $POP    r20,`$FRAME-$SIZE_T*12`($sp)
251         $POP    r21,`$FRAME-$SIZE_T*11`($sp)
252         $POP    r22,`$FRAME-$SIZE_T*10`($sp)
253         $POP    r23,`$FRAME-$SIZE_T*9`($sp)
254         $POP    r24,`$FRAME-$SIZE_T*8`($sp)
255         $POP    r25,`$FRAME-$SIZE_T*7`($sp)
256         $POP    r26,`$FRAME-$SIZE_T*6`($sp)
257         $POP    r27,`$FRAME-$SIZE_T*5`($sp)
258         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
259         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
260         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
261         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
262         mtlr    r0
263         addi    $sp,$sp,$FRAME
264         blr
265         .long   0
266         .byte   0,12,4,1,0x80,18,3,0
267         .long   0
268 ___
269
# Generates Lsha1_block_private, the internal routine that hashes a run
# of consecutive 64-byte blocks starting at $inp.
#
# Calling convention (set up by the entry point above): the five state
# words are already in $A..$E and the counter register holds the number
# of blocks; $ctx is still needed because the state is re-read and
# written back for every block.
#
# The 80 rounds are fully unrolled. After each round @V is rotated right
# by one, so the register that received the round result plays "a" in
# the next round. 80 rotations of a six-element list leave it shifted by
# two, which is why the state update after the last round reads the
# working variables from $E,$T,$A,$B,$C rather than from $A..$E.
270 # This is private block function, which uses tailored calling
271 # interface, namely upon entry SHA_CTX is pre-loaded to given
272 # registers and counter register contains amount of chunks to
273 # digest...
274 $code.=<<___;
275 .align  4
276 Lsha1_block_private:
277 ___
# Each 32-bit round constant is built in $K with a lis/ori pair before
# the group of rounds that uses it.
278 $code.=<<___;   # load K_00_19
279         lis     $K,0x5a82
280         ori     $K,$K,0x7999
281 ___
282 for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
283 $code.=<<___;   # load K_20_39
284         lis     $K,0x6ed9
285         ori     $K,$K,0xeba1
286 ___
287 for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
288 $code.=<<___;   # load K_40_59
289         lis     $K,0x8f1b
290         ori     $K,$K,0xbcdc
291 ___
292 for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
293 $code.=<<___;   # load K_60_79
294         lis     $K,0xca62
295         ori     $K,$K,0xc1d6
296 ___
# Rounds 60..79 reuse the round body of rounds 20..39; its round-79
# variant loads the previous state from $ctx into r16-r20.
297 for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the working variables to the previous state, store the result back
# to $ctx, move it into $A..$E for the next block, advance $inp by one
# block and loop while the counter register is non-zero.
298 $code.=<<___;
299         add     r16,r16,$E
300         add     r17,r17,$T
301         add     r18,r18,$A
302         add     r19,r19,$B
303         add     r20,r20,$C
304         stw     r16,0($ctx)
305         mr      $A,r16
306         stw     r17,4($ctx)
307         mr      $B,r17
308         stw     r18,8($ctx)
309         mr      $C,r18
310         stw     r19,12($ctx)
311         mr      $D,r19
312         stw     r20,16($ctx)
313         mr      $E,r20
314         addi    $inp,$inp,`16*4`
315         bdnz-   Lsha1_block_private
316         blr
317         .long   0
318         .byte   0,12,0x14,0,0,0,0,0
319 ___
# Identification string embedded in the generated object.
320 $code.=<<___;
321 .asciz  "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
322 ___
323
# Evaluate every backtick-quoted expression left in the templates (frame
# offsets such as `$FRAME-$SIZE_T*17`) and replace it with its value,
# then send the finished assembly to the translator.
324 $code =~ s/\`([^\`]*)\`/eval $1/gem;
325 print $code;
# STDOUT is a pipe to the translator, so close() waits for it and returns
# false if flushing fails or the translator exits with a non-zero status
# (in that case $! may be 0 and the status is in $?). Failing here keeps
# a broken or truncated output from being silently accepted by the build.
326 close STDOUT or die "error closing STDOUT: $! (child status $?)";