crypto/sha/asm/sha1-ppc.pl
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise it's a straightforward implementation
# with the X vector kept in the register bank. The core is written
# big-endian; little-endian flavours are supported by byte-swapping the
# input words (see the loadbe subroutine below).
#
# (*) This presumably means that this module is inappropriate for PPC403.
#     Does anybody know whether pre-POWER3 can sustain unaligned loads?
#			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

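# Typical invocation (an assumption about how these CRYPTOGAMS modules are
# usually driven, not spelled out in this file): the first argument selects
# the ABI flavour and the second names the output file, e.g.
#
#	perl sha1-ppc.pl linux64le sha1-ppc.s
#
# The generated code is piped through ppc-xlate.pl (located below) on its
# way to the output.
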
# Define endianness based on flavour,
# e.g.: linux64le
$LITTLE_ENDIAN=0;
if ($flavour =~ /le$/) {
	die "little-endian is 64-bit only: $flavour" if ($SIZE_T == 4);
	$LITTLE_ENDIAN=1;
}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

$output = shift;
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
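# A reading of the frame layout implied by the offsets used below (added
# commentary): the 24 pointer-sized slots cover the 6-slot linkage/local
# area, a spill slot for $inp and the 17 callee-saved GPRs r15..r31, while
# the extra 64 bytes at offset $LOCALS hold an aligned copy of an input
# block that straddles a page boundary.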

$K  ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

sub loadbe {
my ($dst, $src, $temp_reg) = @_;
$code.=<<___ if (!$LITTLE_ENDIAN);
	lwz	$dst,$src
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$temp_reg,$src
	rotlwi	$dst,$temp_reg,8
	rlwimi	$dst,$temp_reg,24,0,7
	rlwimi	$dst,$temp_reg,24,16,23
___
}
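# The rotlwi/rlwimi triplet above is the usual PowerPC idiom for a 32-bit
# byte swap: rotating left by 8 lays the bytes out as 1,2,3,0, and the two
# rlwimi instructions then patch bit ranges 0..7 and 16..23 from a 24-bit
# rotation, giving the fully reversed word. As a reference only (this
# helper is never called by the generator), the same transform in plain
# Perl:
sub bswap32_ref { return unpack("V",pack("N",$_[0])); }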

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;

	# Since the last value of $f is discarded, we can use
	# it as a temp reg to swap byte-order when needed.
	loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
	loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
$code.=<<___ if ($i<15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}
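# Rounds 0..19 use the "choose" function Ch(b,c,d) = (b & c) | (~b & d),
# mapped above onto and/andc/or, and compute the new working value as
# rotl(a,5) + Ch(b,c,d) + e + K + W[i], with b then rotated left by 30.
# From schedule index 16 on, W[j mod 16] is recomputed in place as
# rotl(W[j-3]^W[j-8]^W[j-14]^W[j-16],1); the xor/rotlwi instructions
# interleave that with the round arithmetic. Pure-Perl reference helpers
# (never called here, documentation only):
sub rotl32_ref { my ($v,$n)=@_; return (($v<<$n)|($v>>(32-$n))) & 0xffffffff; }
sub Ch_ref { my ($b,$c,$d)=@_; return (($b & $c) | (~$b & $d)) & 0xffffffff; }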

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}
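# Rounds 20..39 (and, with a different K, rounds 60..79) use the parity
# function F(b,c,d) = b ^ c ^ d. The i==79 variant additionally pre-loads
# the five context words into r16..r20 so the final update can start
# without waiting on memory. Pure-Perl reference (never called here,
# documentation only):
sub Parity_ref { my ($b,$c,$d)=@_; return ($b ^ $c ^ $d) & 0xffffffff; }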

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}
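# Rounds 40..59 use the majority function Maj(b,c,d) = (b&c)|(b&d)|(c&d),
# computed above in the equivalent form (b&c)|((b|c)&d), which needs one
# fewer logical instruction. Pure-Perl reference (never called here,
# documentation only):
sub Maj_ref { my ($b,$c,$d)=@_; return (($b & $c) | (($b | $c) & $d)) & 0xffffffff; }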

$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access that crosses a page boundary. The "better safe
; than sorry" principle makes me treat it specially. But I don't look for
; the particular offending word; rather I look for the 64-byte input block
; that crosses the boundary. Once found, that block is copied to an
; aligned buffer and hashed separately...
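; A worked reading of the arithmetic below (added commentary): subfic
; computes 4096-$inp, andi. with 4095 reduces that to the byte distance
; from $inp to the next 4KB page boundary, and srwi. by 6 turns it into
; the number of whole 64-byte blocks that fit before the boundary. E.g.
; an input whose low 12 address bits are 0xfc0 is 64 bytes short of the
; boundary, so exactly one block is hashed in place before the copy path
; below runs.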
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___


# This is the private block function, which uses a tailored calling
# interface: upon entry the SHA_CTX words are pre-loaded into the
# designated registers and the counter register holds the number of
# 64-byte blocks to digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
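# lis/ori assemble the 32-bit round constant from its high and low 16-bit
# halves: here $K becomes (0x5a82<<16)|0x7999 = 0x5a827999, the FIPS 180
# constant for rounds 0..19; the three later pairs load 0x6ed9eba1,
# 0x8f1bbcdc and 0xca62c1d6 the same way.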
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
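# After 80 rounds the six-register queue @V has rotated by 80 mod 6 = 2
# positions, so ($E,$T,$A,$B,$C) now hold the working a..e values; below
# they are added to the context words r16..r20 that round 79 pre-loaded,
# and the sums are both stored back and copied into $A..$E for the next
# block.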
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	.sha1_block_data_order,.-.sha1_block_data_order
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;