sha1-ppc.pl: shave off one cycle from BODY_20_39
[openssl.git] / crypto / sha / asm / sha1-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # I let hardware handle unaligned input(*), except on page boundaries
11 # (see below for details). Otherwise straightforward implementation
12 # with X vector in register bank.
13 #
14 # (*) this means that this module is inappropriate for PPC403? Does
15 #     anybody know if pre-POWER3 can sustain unaligned load?
16
17 #                       -m64    -m32
18 # ----------------------------------
19 # PPC970,gcc-4.0.0      +76%    +59%
20 # Power6,xlc-7          +68%    +33%
21
# The first command-line argument names the target flavour (for example
# "linux64le").  It selects $SIZE_T, the link-register save offset and the
# width-specific mnemonics used by the generated code.
$flavour = shift;

if ($flavour =~ /64/) {
        ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (8, "cmpld", "stdu", "ld", "std");
        $LRSAVE = 2*$SIZE_T;
}
elsif ($flavour =~ /32/) {
        ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (4, "cmplw", "stwu", "lwz", "stw");
        $LRSAVE = $SIZE_T;
}
else {
        die "nonsense $flavour";
}

# Endianness is taken from the flavour suffix, e.g. linux64le.  The flag is
# $SIZE_T (hence non-zero) for little-endian flavours and 0 otherwise.
$LITTLE_ENDIAN = ($flavour =~ /le$/) ? $SIZE_T : 0;
43
# Locate ppc-xlate.pl, first in the directory of this script and then in
# ../../perlasm relative to it, and give up if neither file exists.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator; the next command-line argument is passed on to it.
# The call is parenthesised and checked with "or": with "||" the test
# attached to the (always true) command string rather than to open(),
# so a failure to start the pipe was never reported.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
50
# Stack frame size and the offset of the local scratch area within it,
# both expressed in terms of $SIZE_T.
$FRAME  = 24*$SIZE_T+64;
$LOCALS = 6*$SIZE_T;

# Fixed register roles used throughout the generated code.
($K, $sp, $toc, $ctx, $inp, $num) = ("r0", "r1", "r2", "r3", "r4", "r5");
($t0, $t1) = ("r15", "r6");

# Working variables.  @V lists them in the order handed to the round
# generators; the driver loops rotate @V after every round.
($A, $B, $C, $D, $E, $T) = map("r$_", 7..12);
@V = ($A, $B, $C, $D, $E, $T);

# The X vector lives in the register bank r16..r31.
@X = map("r$_", 16..31);
73
# loadbe($dst, $src, $temp_reg)
#
# Appends to $code the instructions that load the 32-bit word at memory
# operand $src into register $dst.  When $LITTLE_ENDIAN is false a single
# lwz is emitted.  Otherwise the word is loaded into $temp_reg and its
# byte-reversed form is assembled in $dst with one rotate and two
# rotate-and-insert instructions, so $temp_reg is clobbered in that case.
sub loadbe {
    my ($to, $mem, $scratch) = @_;

    if ($LITTLE_ENDIAN) {
        $code .= <<___;
        lwz     $scratch,$mem
        rotlwi  $to,$scratch,8
        rlwimi  $to,$scratch,24,0,7
        rlwimi  $to,$scratch,24,16,23
___
    }
    else {
        $code .= <<___;
        lwz     $to,$mem
___
    }
}
86
# BODY_00_19($i, $a, $b, $c, $d, $e, $f)
#
# Appends the code for round $i of the first twenty rounds to $code.
# $a..$e name the registers holding the current working variables and $f
# the register that receives the new value:
#     $f = $K + $e + rotl($a,5) + X[$i] + (($b & $c) | (~$b & $d))
# $b is rotated left by 30 in place and $e is overwritten with rotl($a,5).
# For $i < 15 the next input word is loaded via loadbe() (round 0 also
# loads word 0); from round 15 on the next X entry is computed from three
# other entries and rotated left by one.
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e,$f)=@_;
    my $nxt=$i+1;

    if ($i<15) {
        # $f is overwritten by the first add below, so until then it is
        # free to serve as the byte-swap scratch register.
        loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
        loadbe("@X[$nxt]","`$nxt*4`($inp)",$f);

        my $w=$X[$i];
        $code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        add     $f,$f,$w
        and     $t0,$c,$b
        add     $f,$f,$e
        andc    $t1,$d,$b
        rotlwi  $b,$b,30
        or      $t0,$t0,$t1
        add     $f,$f,$t0
___
    }
    else {
        # Current word, the word being produced for the next round, and
        # the three entries xor-ed into it.
        my ($w,$wn,$w2,$w8,$w13)=
            @X[$i%16,$nxt%16,($nxt+2)%16,($nxt+8)%16,($nxt+13)%16];
        $code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        xor     $wn,$wn,$w2
        add     $f,$f,$w
        and     $t0,$c,$b
        xor     $wn,$wn,$w8
        add     $f,$f,$e
        andc    $t1,$d,$b
        rotlwi  $b,$b,30
        or      $t0,$t0,$t1
        xor     $wn,$wn,$w13
        add     $f,$f,$t0
        rotlwi  $wn,$wn,1
___
    }
}
122
# BODY_20_39($i, $a, $b, $c, $d, $e, $f)
#
# Appends the code for one parity round to $code; the driver uses it for
# rounds 20..39 and again for rounds 60..79.  $a..$e name the registers
# holding the current working variables and $f the register receiving
#     $f = $K + $e + rotl($a,5) + X[$i] + ($b ^ $c ^ $d)
# $b is rotated left by 30 in place and $e is overwritten with rotl($a,5).
# Every round before 79 also produces the next X entry.  Round 79 has no
# successor, so its free slots instead reload the five context words from
# $ctx into r16..r20.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e,$f)=@_;
    my $nxt=$i+1;
    my $w=$X[$i%16];

    if ($i<79) {
        # Word being produced for the next round and the three entries
        # xor-ed into it.
        my ($wn,$w2,$w8,$w13)=
            @X[$nxt%16,($nxt+2)%16,($nxt+8)%16,($nxt+13)%16];
        $code.=<<___;
        add     $f,$K,$e
        xor     $t0,$b,$d
        rotlwi  $e,$a,5
        xor     $wn,$wn,$w2
        add     $f,$f,$w
        xor     $t0,$t0,$c
        xor     $wn,$wn,$w8
        add     $f,$f,$t0
        rotlwi  $b,$b,30
        xor     $wn,$wn,$w13
        add     $f,$f,$e
        rotlwi  $wn,$wn,1
___
    }
    elsif ($i==79) {
        $code.=<<___;
        add     $f,$K,$e
        xor     $t0,$b,$d
        rotlwi  $e,$a,5
        lwz     r16,0($ctx)
        add     $f,$f,$w
        xor     $t0,$t0,$c
        lwz     r17,4($ctx)
        add     $f,$f,$t0
        rotlwi  $b,$b,30
        lwz     r18,8($ctx)
        lwz     r19,12($ctx)
        add     $f,$f,$e
        lwz     r20,16($ctx)
___
    }
}
156
# BODY_40_59($i, $a, $b, $c, $d, $e, $f)
#
# Appends the code for one majority round to $code.  $a..$e name the
# registers holding the current working variables and $f the register
# receiving
#     $f = $K + $e + rotl($a,5) + X[$i] + (($b & $c) | (($b | $c) & $d))
# $b is rotated left by 30 in place, $e is overwritten with rotl($a,5),
# and the next X entry is produced alongside.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e,$f)=@_;
    my $nxt=$i+1;

    # Current word, the word being produced for the next round, and the
    # three entries xor-ed into it.
    my ($w,$wn,$w2,$w8,$w13)=
        @X[$i%16,$nxt%16,($nxt+2)%16,($nxt+8)%16,($nxt+13)%16];

    $code.=<<___;
        add     $f,$K,$e
        rotlwi  $e,$a,5
        xor     $wn,$wn,$w2
        add     $f,$f,$w
        and     $t0,$b,$c
        xor     $wn,$wn,$w8
        add     $f,$f,$e
        or      $t1,$b,$c
        rotlwi  $b,$b,30
        xor     $wn,$wn,$w13
        and     $t1,$t1,$d
        or      $t0,$t0,$t1
        rotlwi  $wn,$wn,1
        add     $f,$f,$t0
___
}
177
# Public entry point .sha1_block_data_order.  The prologue allocates the
# frame, saves r15..r31 and the link register, and loads the five state
# words from $ctx.  Input that is 4-byte aligned goes straight to
# Lsha1_block_private with the block count in CTR; otherwise the
# Lunaligned path below is taken.  The epilogue at Ldone restores the
# saved registers and returns.
#
# r15..r31 are stored in consecutive $SIZE_T slots at the top of the
# frame (r31 highest), so the 17 save and 17 restore lines are generated
# here rather than written out one by one.
my (@gpr_save, @gpr_restore);
foreach my $gpr (15..31) {
    my $slot = "`$FRAME-$SIZE_T*" . (32-$gpr) . "`($sp)";
    push @gpr_save,    "        $PUSH   r$gpr,$slot";
    push @gpr_restore, "        $POP    r$gpr,$slot";
}
my $gpr_save    = join("\n", @gpr_save);
my $gpr_restore = join("\n", @gpr_restore);

$code=<<___;
.machine        "any"
.text

.globl  .sha1_block_data_order
.align  4
.sha1_block_data_order:
        $STU    $sp,-$FRAME($sp)
        mflr    r0
$gpr_save
        $PUSH   r0,`$FRAME+$LRSAVE`($sp)
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
        lwz     $D,12($ctx)
        lwz     $E,16($ctx)
        andi.   r0,$inp,3
        bne     Lunaligned
Laligned:
        mtctr   $num
        bl      Lsha1_block_private
        b       Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align  4
Lunaligned:
        subfic  $t1,$inp,4096
        andi.   $t1,$t1,4095    ; distance to closest page boundary
        srwi.   $t1,$t1,6       ; t1/=64
        beq     Lcross_page
        $UCMP   $num,$t1
        ble-    Laligned        ; didn't cross the page boundary
        mtctr   $t1
        subfc   $num,$t1,$num
        bl      Lsha1_block_private
Lcross_page:
        li      $t1,16
        mtctr   $t1
        addi    r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
        lbz     r18,2($inp)
        lbz     r19,3($inp)
        addi    $inp,$inp,4
        stb     r16,0(r20)
        stb     r17,1(r20)
        stb     r18,2(r20)
        stb     r19,3(r20)
        addi    r20,r20,4
        bdnz    Lmemcpy

        $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
        li      $t1,1
        addi    $inp,$sp,$LOCALS
        mtctr   $t1
        bl      Lsha1_block_private
        $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned

Ldone:
        $POP    r0,`$FRAME+$LRSAVE`($sp)
$gpr_restore
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
        .long   0
        .byte   0,12,4,1,0x80,18,3,0
        .long   0
___
286
# Lsha1_block_private is the internal block function.  It uses a tailored
# calling interface: on entry the SHA_CTX words are already loaded into
# the working registers and the counter register holds the number of
# 64-byte chunks to digest.
$code.=<<___;
.align  4
Lsha1_block_private:
___

# One entry per group of twenty rounds: the two 16-bit halves of the round
# constant loaded into $K, the generator for the rounds of that group, and
# the first round index that no longer belongs to it.
my @stages = (
    [ "0x5a82", "0x7999", \&BODY_00_19, 20 ],       # K_00_19
    [ "0x6ed9", "0xeba1", \&BODY_20_39, 40 ],       # K_20_39
    [ "0x8f1b", "0xbcdc", \&BODY_40_59, 60 ],       # K_40_59
    [ "0xca62", "0xc1d6", \&BODY_20_39, 80 ],       # K_60_79
);

$i=0;
foreach my $stage (@stages) {
    my ($k_hi, $k_lo, $emit_round, $limit) = @$stage;
    $code.=<<___;
        lis     $K,$k_hi
        ori     $K,$K,$k_lo
___
    for (;$i<$limit;$i++) {
        $emit_round->($i,@V);
        # Rotate the register roles for the next round: the last element
        # of @V moves to the front.
        unshift(@V,pop(@V));
    }
}

# r16..r20 were reloaded with the context words by round 79.  Add the
# results of this block to them, store the sums back to $ctx, copy them
# into the working registers for the next chunk, advance the input pointer
# by one 64-byte block and loop while the counter is non-zero.
$code.=<<___;
        add     r16,r16,$E
        add     r17,r17,$T
        add     r18,r18,$A
        add     r19,r19,$B
        add     r20,r20,$C
        stw     r16,0($ctx)
        mr      $A,r16
        stw     r17,4($ctx)
        mr      $B,r17
        stw     r18,8($ctx)
        mr      $C,r18
        stw     r19,12($ctx)
        mr      $D,r19
        stw     r20,16($ctx)
        mr      $E,r20
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
.size   .sha1_block_data_order,.-.sha1_block_data_order
.asciz  "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
341
# Replace every `...` span in the generated text with the result of
# evaluating it as a Perl expression, then write the text to STDOUT,
# which at this point is the pipe into the ppc-xlate.pl translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a pipe waits for the child process and returns false when it
# exited with a failure status, so the result is checked here; otherwise
# a failing translator would leave this script exiting successfully.
close STDOUT or die "error closing STDOUT: $!";