#!/usr/bin/env perl
# Extracted from openssl.git: crypto/sha/asm/sha512p8-ppc.pl
# (blob 47189502c6cc9587170aea1a0a7591f7622c488e)

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because the
# setup is always virtualized, with a possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is the degree of utilization of computational resources.
# POWER8 is a "massively multi-threaded chip", and the difference
# between single- and maximum multi-process benchmark results tells
# that utilization is a whopping 94%. For sha512-ppc.pl we get a [not
# unimpressive] 84% and for sha1-ppc.pl - 73%. 100% means that the
# multi-process result equals the single-process one, given that all
# threads end up on the same physical core.

# Command line: <flavour> <output>
#   flavour - target selector, passed on to ppc-xlate.pl.  It must
#             contain "64" or "32"; "le" selects little-endian code and
#             "osx" changes how a zero index register is written.
#   output  - output file name, passed on to ppc-xlate.pl.  It is also
#             inspected further down to choose SHA-256 or SHA-512.
$flavour=shift;
$output =shift;

# Pointer-size dependent parameters:
#   $SIZE_T - size of one general-purpose register save slot
#   $LRSAVE - offset of the link-register save slot in the caller's frame
#   $STU    - store-with-update mnemonic used to allocate the stack frame
#   $POP    - mnemonic used to reload a saved general-purpose register
#   $PUSH   - mnemonic used to save a general-purpose register
if ($flavour =~ /64/) {
        $SIZE_T=8;
        $LRSAVE=2*$SIZE_T;
        $STU="stdu";
        $POP="ld";
        $PUSH="std";
} elsif ($flavour =~ /32/) {
        $SIZE_T=4;
        $LRSAVE=$SIZE_T;
        $STU="stwu";
        $POP="lwz";
        $PUSH="stw";
} else { die "nonsense $flavour"; }

$LENDIAN=($flavour=~/le/);

# Locate ppc-xlate.pl: first next to this script, then in
# ../../perlasm/ relative to it.  $dir is empty when $0 carries no
# directory component, so the search then starts in the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  "or" is required here: "||" would bind to the command
# string, which is always true, so a failed open would never be reported.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Choose the hash flavour from the output file name: a name containing
# "512" selects SHA-512, anything else selects SHA-256.
#   $bits   - 512 or 256; used in the function name and the ident string
#   $SZ     - size of one hash word in bytes
#   $sz     - element-size letter spliced into vector mnemonics
#   $rounds - number of rounds per input block
if ($output =~ /512/) {
        $bits=512;
        $SZ=8;
        $sz="d";
        $rounds=80;
} else {
        $bits=256;
        $SZ=4;
        $sz="w";
        $rounds=64;
}

$func="sha${bits}_block_p8";    # name of the generated entry point
$FRAME=8*$SIZE_T;               # offset of the first private stack slot

# General-purpose register allocation.
$sp ="r1";              # stack pointer
$toc="r2";
$ctx="r3";              # 1st argument: hash state, loaded into A..H
$inp="r4";              # 2nd argument: input pointer, advanced by 16 per load
$num="r5";              # 3rd argument: counter decremented once per block
$Tbl="r6";              # address of the constant table, set by LPICmeup
$idx="r7";              # byte offset of the next table row to load
$lrsave="r8";           # copy of the link register at entry
$offload="r11";         # pointer to the stack area holding the saved A..H
$vrsave="r12";          # copy of the caller's VRSAVE (SPR 256)
# Registers that the generated prologue loads with 0x10..0x70; $x00 is
# the matching zero index and is written as a literal 0 for "osx" flavours.
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
 $x00=0 if ($flavour =~ /osx/);

# Vector register allocation: the eight working variables, sixteen
# message-schedule words, and scratch registers.  The range 24..31 has
# one element more than the list of names, so v31 is left unassigned.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));

# ROUND($i, $a, $b, $c, $d, $e, $f, $g, $h)
#
# Appends the instructions for round $i to $code.
#   $i       - round number; it decides whether input is loaded
#              ($i < 16) or the message schedule is extended ($i >= 15)
#   $a..$h   - names of the vector registers that hold the eight working
#              variables for this round (the caller rotates them)
#
# Back-quoted fragments in the emitted text are Perl expressions.  They
# are left in $code verbatim here and evaluated when the script
# post-processes $code, so a fragment guarded by "if ($i>=15)" vanishes
# for the rounds where the condition is false.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;        # schedule slot that is prepared for the next round

# One 16-byte load supplies 16/$SZ message words.  Fetch the next vector
# during the round that consumes the last word of the current one.
$code.=<<___            if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
        lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
        addi            $inp,$inp,16
___
# Words that are not the first of a loaded vector are derived from the
# previous X register by rotating it $SZ bytes.
$code.=<<___            if ($i<16 && ($i%(16/$SZ)));
        vsldoi          @X[$i],@X[$i-1],@X[$i-1],$SZ
___
# Little-endian only: permute each freshly loaded vector with $lemask.
$code.=<<___            if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
        vperm           @X[$i],@X[$i],@X[$i],$lemask
___
# The round proper.  From round 15 on, the interleaved back-quoted
# lines update X[$j]: it accumulates X[($j+9)%16], sigma0 of
# X[($j+1)%16] and sigma1 of X[($j+14)%16].
$code.=<<___;
        `"vshasigma${sz}        $s0,@X[($j+1)%16],0,0"          if ($i>=15)`
        vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
        vshasigma${sz}  $S1,$e,1,15             ; Sigma1(e)
        vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
        vshasigma${sz}  $S0,$a,1,0              ; Sigma0(a)
        `"vshasigma${sz}        $s1,@X[($j+14)%16],0,15"        if ($i>=15)`
        vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
        vxor            $Func,$a,$b
        `"vaddu${sz}m           @X[$j],@X[$j],@X[($j+9)%16]"    if ($i>=15)`
        vaddu${sz}m     $h,$h,$S1               ; h+=Sigma1(e)
        vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
        vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $d,$d,$h                ; d+=h
        vaddu${sz}m     $S0,$S0,$Func           ; Sigma0(a)+Maj(a,b,c)
        `"vaddu${sz}m           @X[$j],@X[$j],$s0"              if ($i>=15)`
        lvx             $Ki,$idx,$Tbl           ; load next K[i]
        addi            $idx,$idx,16
        vaddu${sz}m     $h,$h,$S0               ; h+=Sigma0(a)+Maj(a,b,c)
        `"vaddu${sz}m           @X[$j],@X[$j],$s1"              if ($i>=15)`
___
}

# Function prologue.  Stack frame layout, as offsets from the new $sp:
#   $FRAME          .. $FRAME+8*16   area addressed through $offload,
#                                    used to save A..H for each block
#   $FRAME+8*16     .. $FRAME+20*16  v20-v31
#   $FRAME+21*16-4                   caller's VRSAVE
#   $FRAME+21*16    .. +6*$SIZE_T    r26-r31
# The link register is stored $LRSAVE bytes above the end of the frame,
# i.e. in the caller's frame, and is also kept in $lrsave.
#
# NOTE(review): the +15/+31 biases on the vector save offsets and on
# $offload presumably rely on lvx/stvx ignoring the low four address
# bits, which yields 16-byte aligned slots - confirm against the ISA.
#
# The prologue also sets VRSAVE to all ones, loads 0x10..0x70 into
# $x10..$x70 and calls LPICmeup to obtain the table address in $Tbl.
$code=<<___;
.machine        "any"
.text

.globl  $func
.align  6
$func:
        $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
        mflr            $lrsave
        li              r10,`$FRAME+8*16+15`
        li              r11,`$FRAME+8*16+31`
        stvx            v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        mfspr           $vrsave,256
        stvx            v21,r11,$sp
        addi            r11,r11,32
        stvx            v22,r10,$sp
        addi            r10,r10,32
        stvx            v23,r11,$sp
        addi            r11,r11,32
        stvx            v24,r10,$sp
        addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
        addi            r10,r10,32
        stvx            v27,r11,$sp
        addi            r11,r11,32
        stvx            v28,r10,$sp
        addi            r10,r10,32
        stvx            v29,r11,$sp
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
        li              r11,-1
        stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
        li              $x10,0x10
        $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        li              $x20,0x20
        $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        li              $x30,0x30
        $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        li              $x40,0x40
        $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        li              $x50,0x50
        $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        li              $x60,0x60
        $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        li              $x70,0x70
        $PUSH           $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
        mtspr           256,r11

        bl              LPICmeup
        addi            $offload,$sp,$FRAME+15
___
# Little-endian only: build $lemask, the permutation that ROUND applies
# with vperm to every freshly loaded input vector.
$code.=<<___            if ($LENDIAN);
        li              $idx,8
        lvsl            $lemask,0,$idx
        vspltisb        $Ki,0x0f
        vxor            $lemask,$lemask,$Ki
___
# SHA-256: the state is read as two 16-byte vectors of four words each.
# Rotating by 4, 8 and 12 bytes brings each word to the front of its
# own register, giving A..D and E..H.
$code.=<<___            if ($SZ==4);
        lvx_4w          $A,$x00,$ctx
        lvx_4w          $E,$x10,$ctx
        vsldoi          $B,$A,$A,4              # unpack
        vsldoi          $C,$A,$A,8
        vsldoi          $D,$A,$A,12
        vsldoi          $F,$E,$E,4
        vsldoi          $G,$E,$E,8
        vsldoi          $H,$E,$E,12
___
# SHA-512: the state is read as four 16-byte vectors of two doublewords
# each; rotating by 8 bytes yields the second word of every pair.
$code.=<<___            if ($SZ==8);
        lvx_u           $A,$x00,$ctx
        lvx_u           $C,$x10,$ctx
        lvx_u           $E,$x20,$ctx
        vsldoi          $B,$A,$A,8              # unpack
        lvx_u           $G,$x30,$ctx
        vsldoi          $D,$C,$C,8
        vsldoi          $F,$E,$E,8
        vsldoi          $H,$G,$G,8
___
# Per-block loop.  r0 receives the trip count of the inner L16_xx loop.
# At the top of each block the first table row is loaded into $Ki and
# added to H in advance, the first input vector is fetched, and A..H
# are saved to the area at $offload for the final accumulation.
#
# NOTE(review): r0 doubles as $x00 on non-osx flavours; this presumably
# relies on r0 in the base position of an indexed load/store reading as
# zero on PowerPC - confirm against the ISA.
$code.=<<___;
        li              r0,`($rounds-16)/16`    # inner loop counter
        b               Loop
.align  5
Loop:
        lvx             $Ki,$x00,$Tbl
        li              $idx,16
        lvx_u           @X[0],0,$inp
        addi            $inp,$inp,16
        stvx            $A,$x00,$offload        # offload $A-$H
        stvx            $B,$x10,$offload
        stvx            $C,$x20,$offload
        stvx            $D,$x30,$offload
        stvx            $E,$x40,$offload
        stvx            $F,$x50,$offload
        stvx            $G,$x60,$offload
        stvx            $H,$x70,$offload
        vaddu${sz}m     $H,$H,$Ki               # h+K[i]
        lvx             $Ki,$idx,$Tbl
        addi            $idx,$idx,16
___
# Rounds 0..15 are emitted once.  After every round the register list
# is rotated by one, so each working variable moves to the next role.
for ($i=0;$i<16;$i++)   { ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        mtctr           r0
        b               L16_xx
.align  5
L16_xx:
___
# Rounds 16..31 form the body of L16_xx, which executes
# ($rounds-16)/16 times and so covers all remaining rounds.
for (;$i<32;$i++)       { ROUND($i,@V); unshift(@V,pop(@V)); }
# Add the state saved at the top of the block back into A..H, count
# the block off $num and loop while blocks remain.
$code.=<<___;
        bdnz            L16_xx

        lvx             @X[2],$x00,$offload
        subic.          $num,$num,1
        lvx             @X[3],$x10,$offload
        vaddu${sz}m     $A,$A,@X[2]
        lvx             @X[4],$x20,$offload
        vaddu${sz}m     $B,$B,@X[3]
        lvx             @X[5],$x30,$offload
        vaddu${sz}m     $C,$C,@X[4]
        lvx             @X[6],$x40,$offload
        vaddu${sz}m     $D,$D,@X[5]
        lvx             @X[7],$x50,$offload
        vaddu${sz}m     $E,$E,@X[6]
        lvx             @X[8],$x60,$offload
        vaddu${sz}m     $F,$F,@X[7]
        lvx             @X[9],$x70,$offload
        vaddu${sz}m     $G,$G,@X[8]
        vaddu${sz}m     $H,$H,@X[9]
        bne             Loop
___
# Pack A..H back into memory layout and store them to $ctx.  When the
# loop ends, $Ki holds the table row that the last round loaded ahead
# of use; that row is the first vperm mask placed after the round
# constants.
#
# SHA-256: two further mask rows are loaded, then four words are merged
# into each of two vectors in three vperm steps.
$code.=<<___            if ($SZ==4);
        lvx             @X[0],$idx,$Tbl
        addi            $idx,$idx,16
        vperm           $A,$A,$B,$Ki            # pack the answer
        lvx             @X[1],$idx,$Tbl
        vperm           $E,$E,$F,$Ki
        vperm           $A,$A,$C,@X[0]
        vperm           $E,$E,$G,@X[0]
        vperm           $A,$A,$D,@X[1]
        vperm           $E,$E,$H,@X[1]
        stvx_4w         $A,$x00,$ctx
        stvx_4w         $E,$x10,$ctx
___
# SHA-512: a single mask merges each pair of doublewords into one vector.
$code.=<<___            if ($SZ==8);
        vperm           $A,$A,$B,$Ki            # pack the answer
        vperm           $C,$C,$D,$Ki
        vperm           $E,$E,$F,$Ki
        vperm           $G,$G,$H,$Ki
        stvx_u          $A,$x00,$ctx
        stvx_u          $C,$x10,$ctx
        stvx_u          $E,$x20,$ctx
        stvx_u          $G,$x30,$ctx
___
# Function epilogue: restore the link register from $lrsave and VRSAVE
# from $vrsave, reload v20-v31 and r26-r31 from the slots the prologue
# filled, release the frame and return.  The words after "blr" are
# data, not instructions.
$code.=<<___;
        li              r10,`$FRAME+8*16+15`
        mtlr            $lrsave
        li              r11,`$FRAME+8*16+31`
        mtspr           256,$vrsave
        lvx             v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        lvx             v21,r11,$sp
        addi            r11,r11,32
        lvx             v22,r10,$sp
        addi            r10,r10,32
        lvx             v23,r11,$sp
        addi            r11,r11,32
        lvx             v24,r10,$sp
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
        lvx             v26,r10,$sp
        addi            r10,r10,32
        lvx             v27,r11,$sp
        addi            r11,r11,32
        lvx             v28,r10,$sp
        addi            r10,r10,32
        lvx             v29,r11,$sp
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
        $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
        blr
        .long           0
        .byte           0,12,4,1,0x80,6,3,0
        .long           0
.size   $func,.-$func
___

# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup returns, in $Tbl, the address of the constant table that is
# emitted right after it.  "bcl 20,31,\$+4" leaves the address of the
# following mflr in the link register.  That mflr sits 8 bytes into the
# routine, and the routine is padded to 64 bytes (nine 4-byte words of
# code and data plus 64-9*4 bytes of .space), so adding 64-8 gives the
# first table entry.  The caller's link register is carried in r0 and
# restored before returning.
$code.=<<___;
.align  6
LPICmeup:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
        addi    $Tbl,$Tbl,`64-8`
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
        .space  `64-9*4`
___

# Constant table, emitted directly after LPICmeup.
#
# Every round constant fills one 16-byte row, replicated across the row
# so that all elements of the vector hold the same value.  The trailing
# "0" row is what the last round adds when it pre-adds the constant for
# a round that does not exist.  The rows after it are the vperm masks
# used to pack the result; they are emitted in reversed element order
# for little-endian flavours.
if ($SZ==8) {
    # SHA-512: 80 constants, two 64-bit copies per row.
    local *table = sub {
        foreach(@_) { $code.=".quad     $_,$_\n"; }
    };
    table(
        "0x428a2f98d728ae22","0x7137449123ef65cd",
        "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
        "0x3956c25bf348b538","0x59f111f1b605d019",
        "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
        "0xd807aa98a3030242","0x12835b0145706fbe",
        "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
        "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
        "0x9bdc06a725c71235","0xc19bf174cf692694",
        "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
        "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
        "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
        "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
        "0x983e5152ee66dfab","0xa831c66d2db43210",
        "0xb00327c898fb213f","0xbf597fc7beef0ee4",
        "0xc6e00bf33da88fc2","0xd5a79147930aa725",
        "0x06ca6351e003826f","0x142929670a0e6e70",
        "0x27b70a8546d22ffc","0x2e1b21385c26c926",
        "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
        "0x650a73548baf63de","0x766a0abb3c77b2a8",
        "0x81c2c92e47edaee6","0x92722c851482353b",
        "0xa2bfe8a14cf10364","0xa81a664bbc423001",
        "0xc24b8b70d0f89791","0xc76c51a30654be30",
        "0xd192e819d6ef5218","0xd69906245565a910",
        "0xf40e35855771202a","0x106aa07032bbd1b8",
        "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
        "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
        "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
        "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
        "0x748f82ee5defb2fc","0x78a5636f43172f60",
        "0x84c87814a1f0ab72","0x8cc702081a6439ec",
        "0x90befffa23631e28","0xa4506cebde82bde9",
        "0xbef9a3f7b2c67915","0xc67178f2e372532b",
        "0xca273eceea26619c","0xd186b8c721c0c207",
        "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
        "0x06f067aa72176fba","0x0a637dc5a2c898a6",
        "0x113f9804bef90dae","0x1b710b35131c471b",
        "0x28db77f523047d84","0x32caab7b40c72493",
        "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
        "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
        "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
# Single pack mask for SHA-512.
$code.=<<___    if (!$LENDIAN);
.quad   0x0001020304050607,0x1011121314151617
___
$code.=<<___    if ($LENDIAN);  # quad-swapped
.quad   0x1011121314151617,0x0001020304050607
___
} else {
    # SHA-256: 64 constants, four 32-bit copies per row.
    local *table = sub {
        foreach(@_) { $code.=".long     $_,$_,$_,$_\n"; }
    };
    table(
        "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
        "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
        "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
        "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
        "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
        "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
        "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
        "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
        "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
        "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
        "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
        "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
        "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
        "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
        "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
        "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
# Three pack masks for SHA-256, one per vperm merge step.
$code.=<<___    if (!$LENDIAN);
.long   0x00010203,0x10111213,0x10111213,0x10111213
.long   0x00010203,0x04050607,0x10111213,0x10111213
.long   0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___    if ($LENDIAN);  # word-swapped
.long   0x10111213,0x10111213,0x10111213,0x00010203
.long   0x10111213,0x10111213,0x04050607,0x00010203
.long   0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
# Identification string embedded in the object file.
$code.=<<___;
.asciz  "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Replace every back-quoted fragment in $code with the value of the
# Perl expression it contains, then emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe into the translator.  close() waits for it and
# returns false if buffered output could not be written ($! set) or the
# translator exited with a non-zero status ($? set), so check the result.
close STDOUT
    or die($! ? "error closing STDOUT: $!"
              : "translator exited with status $?");