Add Keccak-1600 modules for PPC64 and POWER8.
[openssl.git] / crypto / sha / asm / sha512p8-ppc.pl
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is the degree of computational resources' utilization. POWER8
# is a "massively multi-threaded chip" and the difference between
# single- and maximum multi-process benchmark results shows that
# utilization is a whopping 94%. For sha512-ppc.pl we get [not
# unimpressive] 84% and for sha1-ppc.pl - 73%. 100% means that the
# multi-process result equals the single-process one, given that all
# threads end up on the same physical core.

# Command line: <flavour> <output>.  Both are handed on to ppc-xlate.pl
# later; <flavour> additionally selects the word size and byte order
# here, and <output> is matched against /512/ to pick the digest size.
($flavour,$output)=(shift,shift);

# $SIZE_T is the pointer/GPR size in bytes; $STU, $POP and $PUSH are the
# store-with-update, load and store mnemonics of that width, used for
# stack frame handling.  A flavour naming neither 64 nor 32 is rejected.
if    ($flavour =~ /64/) { ($SIZE_T,$STU,$POP,$PUSH)=(8,"stdu","ld","std"); }
elsif ($flavour =~ /32/) { ($SIZE_T,$STU,$POP,$PUSH)=(4,"stwu","lwz","stw"); }
else                     { die "nonsense $flavour"; }

# Offset of the link register save slot: two words for 64-bit flavours,
# one word for 32-bit ones.
$LRSAVE=($SIZE_T==8) ? 2*$SIZE_T : $SIZE_T;

# True for little-endian flavours (flavour string contains "le").
$LENDIAN=($flavour=~/le/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm/ relative to it.  $dir is the directory part of $0
# (including the trailing separator) or empty when $0 has none, so the
# lookup never depends on a stale $1 from an earlier match.
$dir=($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Low-precedence `or` is required: with `||` the test binds
# to the (always true) command string and a failed open() goes unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Digest selection is driven by the output file name: a name containing
# "512" selects SHA-512, anything else SHA-256.
#   $bits   - digest family, used in the function name and ident string
#   $SZ     - size of one word of the hash state in bytes
#   $sz     - element-size letter spliced into mnemonics (vaddu${sz}m, ...)
#   $rounds - number of rounds per block
($bits,$SZ,$sz,$rounds)=($output =~ /512/) ? (512,8,"d",80)
                                           : (256,4,"w",64);

$func="sha${bits}_block_p8";
$FRAME=8*$SIZE_T;

# General purpose registers.
($sp,$toc,$ctx,$inp,$num,$Tbl,$idx,$lrsave)=map("r$_",(1..8));
($offload,$vrsave)=("r11","r12");

# Registers preloaded with the constants 0x00..0x70 for indexed vector
# loads and stores; on "osx" flavours the zero index is spelled as a
# literal 0 instead of r0.
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
$x00=0 if ($flavour =~ /osx/);

# Vector registers: working variables a..h, the 16-entry message
# schedule window, and temporaries.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));

# Append the assembly for one SHA-2 round to $code.
#
#   $i      - round number; rounds 0..15 also fetch and prepare message
#             words, rounds >= 15 also emit the schedule update for X[$j]
#   $a..$h  - names of the vector registers currently holding the eight
#             working variables (the caller rotates them between rounds)
#
# The back-quoted fragments are not executed here: they are interpolated
# into the text (so "$i>=15" becomes e.g. "3>=15") and resolved when the
# accumulated $code is post-processed before printing.
sub ROUND {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;

if ($i<16) {
    my $per_vec=16/$SZ;         # message words per 16-byte vector
    my $lane=$i%$per_vec;       # position of word $i within its vector

    # Last word of a vector: load the next 16 bytes of input ahead of use.
    if ($i<15 && $lane==$per_vec-1) {
$code.=<<___;
        lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
        addi            $inp,$inp,16
___
    }
    # Not the first word of a vector: derive it by rotating the previous one.
    if ($lane) {
$code.=<<___;
        vsldoi          @X[$i],@X[$i-1],@X[$i-1],$SZ
___
    }
    # First word of a vector on little-endian: permute through $lemask.
    if ($LENDIAN && $lane==0) {
$code.=<<___;
        vperm           @X[$i],@X[$i],@X[$i],$lemask
___
    }
}

# The round proper, interleaved (from round 15 on) with the message
# schedule update X[j] += sigma0(X[j+1]) + sigma1(X[j+14]) + X[j+9].
$code.=<<___;
        `"vshasigma${sz}        $s0,@X[($j+1)%16],0,0"          if ($i>=15)`
        vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
        vshasigma${sz}  $S1,$e,1,15             ; Sigma1(e)
        vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
        vshasigma${sz}  $S0,$a,1,0              ; Sigma0(a)
        `"vshasigma${sz}        $s1,@X[($j+14)%16],0,15"        if ($i>=15)`
        vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
        vxor            $Func,$a,$b
        `"vaddu${sz}m           @X[$j],@X[$j],@X[($j+9)%16]"    if ($i>=15)`
        vaddu${sz}m     $h,$h,$S1               ; h+=Sigma1(e)
        vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
        vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $d,$d,$h                ; d+=h
        vaddu${sz}m     $S0,$S0,$Func           ; Sigma0(a)+Maj(a,b,c)
        `"vaddu${sz}m           @X[$j],@X[$j],$s0"              if ($i>=15)`
        lvx             $Ki,$idx,$Tbl           ; load next K[i]
        addi            $idx,$idx,16
        vaddu${sz}m     $h,$h,$S0               ; h+=Sigma0(a)+Maj(a,b,c)
        `"vaddu${sz}m           @X[$j],@X[$j],$s1"              if ($i>=15)`
___
}

# Function prologue.  Starts the output text ($code is assigned, not
# appended) and emits, in order:
#   - the $func entry point and a stack frame of
#     $FRAME+21*16+6*$SIZE_T bytes;
#   - saves of v20-v31 (alternating r10/r11 as index registers), of the
#     original SPR 256 (vrsave) value and of r26-r31;
#   - the index constants 0x10..0x70 in $x10..$x70 (r10, r26-r31), loaded
#     only after r10 is no longer needed as a save index;
#   - the link register, stored $LRSAVE bytes above the new frame;
#   - SPR 256 set to -1 (presumably marking every vector register as in
#     use - TODO confirm against the ABI);
#   - a call to LPICmeup, which leaves the constant table address in
#     $Tbl, and $offload pointing at $sp+$FRAME+15 for spilling $A-$H.
# Back-quoted expressions are evaluated when $code is post-processed.
$code=<<___;
.machine        "any"
.text

.globl  $func
.align  6
$func:
        $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
        mflr            $lrsave
        li              r10,`$FRAME+8*16+15`
        li              r11,`$FRAME+8*16+31`
        stvx            v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        mfspr           $vrsave,256
        stvx            v21,r11,$sp
        addi            r11,r11,32
        stvx            v22,r10,$sp
        addi            r10,r10,32
        stvx            v23,r11,$sp
        addi            r11,r11,32
        stvx            v24,r10,$sp
        addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
        addi            r10,r10,32
        stvx            v27,r11,$sp
        addi            r11,r11,32
        stvx            v28,r10,$sp
        addi            r10,r10,32
        stvx            v29,r11,$sp
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
        li              r11,-1
        stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
        li              $x10,0x10
        $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        li              $x20,0x20
        $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        li              $x30,0x30
        $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        li              $x40,0x40
        $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        li              $x50,0x50
        $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        li              $x60,0x60
        $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        li              $x70,0x70
        $PUSH           $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
        mtspr           256,r11

        bl              LPICmeup
        addi            $offload,$sp,$FRAME+15
___
# Little-endian only: build the permute mask $lemask (lvsl of the
# constant 8, XORed with a splat of 0x0f) that the rounds apply to each
# freshly loaded input vector.
if ($LENDIAN) {
$code.=<<___;
        li              $idx,8
        lvsl            $lemask,0,$idx
        vspltisb        $Ki,0x0f
        vxor            $lemask,$lemask,$Ki
___
}

# Load the hash state from $ctx and spread it over $A..$H, one working
# variable per vector register.
# 32-bit words: two loads cover a..d and e..h; the other three variables
# of each group are obtained by rotating by 4, 8 and 12 bytes.
if ($SZ==4) {
$code.=<<___;
        lvx_4w          $A,$x00,$ctx
        lvx_4w          $E,$x10,$ctx
        vsldoi          $B,$A,$A,4              # unpack
        vsldoi          $C,$A,$A,8
        vsldoi          $D,$A,$A,12
        vsldoi          $F,$E,$E,4
        vsldoi          $G,$E,$E,8
        vsldoi          $H,$E,$E,12
___
}
# 64-bit words: four loads, each holding a pair of variables; the second
# of each pair is obtained by rotating by 8 bytes.
if ($SZ==8) {
$code.=<<___;
        lvx_u           $A,$x00,$ctx
        lvx_u           $C,$x10,$ctx
        lvx_u           $E,$x20,$ctx
        vsldoi          $B,$A,$A,8              # unpack
        lvx_u           $G,$x30,$ctx
        vsldoi          $D,$C,$C,8
        vsldoi          $F,$E,$E,8
        vsldoi          $H,$G,$G,8
___
}
# Per-block loop header: r0 holds the number of passes through the
# 16-round inner loop, the first table entry and the first 16 bytes of
# input are loaded, $A..$H are spilled to the $offload area for the
# final accumulation, and K[0] is pre-added to $H.
$code.=<<___;
        li              r0,`($rounds-16)/16`    # inner loop counter
        b               Loop
.align  5
Loop:
        lvx             $Ki,$x00,$Tbl
        li              $idx,16
        lvx_u           @X[0],0,$inp
        addi            $inp,$inp,16
        stvx            $A,$x00,$offload        # offload $A-$H
        stvx            $B,$x10,$offload
        stvx            $C,$x20,$offload
        stvx            $D,$x30,$offload
        stvx            $E,$x40,$offload
        stvx            $F,$x50,$offload
        stvx            $G,$x60,$offload
        stvx            $H,$x70,$offload
        vaddu${sz}m     $H,$H,$Ki               # h+K[i]
        lvx             $Ki,$idx,$Tbl
        addi            $idx,$idx,16
___

# Rounds 0..15, emitted straight-line.  After each round the register
# list is rotated by one so the next round sees the shifted roles.
$i=0;
while ($i<16) {
    ROUND($i,@V);
    unshift(@V,pop(@V));
    $i++;
}

$code.=<<___;
        mtctr           r0
        b               L16_xx
.align  5
L16_xx:
___

# Rounds 16..31 form the body of the counted loop L16_xx, which the
# generated code repeats r0 times to cover the remaining rounds.
while ($i<32) {
    ROUND($i,@V);
    unshift(@V,pop(@V));
    $i++;
}

# Close the inner loop, add the spilled state back into $A..$H and go
# round again while $num is not exhausted.
$code.=<<___;
        bdnz            L16_xx

        lvx             @X[2],$x00,$offload
        subic.          $num,$num,1
        lvx             @X[3],$x10,$offload
        vaddu${sz}m     $A,$A,@X[2]
        lvx             @X[4],$x20,$offload
        vaddu${sz}m     $B,$B,@X[3]
        lvx             @X[5],$x30,$offload
        vaddu${sz}m     $C,$C,@X[4]
        lvx             @X[6],$x40,$offload
        vaddu${sz}m     $D,$D,@X[5]
        lvx             @X[7],$x50,$offload
        vaddu${sz}m     $E,$E,@X[6]
        lvx             @X[8],$x60,$offload
        vaddu${sz}m     $F,$F,@X[7]
        lvx             @X[9],$x70,$offload
        vaddu${sz}m     $G,$G,@X[8]
        vaddu${sz}m     $H,$H,@X[9]
        bne             Loop
___
# Merge the per-register working variables back into the packed layout
# and store the result to $ctx.  $Ki is used as the first permute
# control (presumably the mask entry that follows the K table was the
# last thing loaded into it - verify against the table layout).
# 32-bit words: two further masks are fetched from the table and the
# eight registers are folded into $A and $E.
if ($SZ==4) {
$code.=<<___;
        lvx             @X[0],$idx,$Tbl
        addi            $idx,$idx,16
        vperm           $A,$A,$B,$Ki            # pack the answer
        lvx             @X[1],$idx,$Tbl
        vperm           $E,$E,$F,$Ki
        vperm           $A,$A,$C,@X[0]
        vperm           $E,$E,$G,@X[0]
        vperm           $A,$A,$D,@X[1]
        vperm           $E,$E,$H,@X[1]
        stvx_4w         $A,$x00,$ctx
        stvx_4w         $E,$x10,$ctx
___
}
# 64-bit words: neighbouring registers are paired with a single mask.
if ($SZ==8) {
$code.=<<___;
        vperm           $A,$A,$B,$Ki            # pack the answer
        vperm           $C,$C,$D,$Ki
        vperm           $E,$E,$F,$Ki
        vperm           $G,$G,$H,$Ki
        stvx_u          $A,$x00,$ctx
        stvx_u          $C,$x10,$ctx
        stvx_u          $E,$x20,$ctx
        stvx_u          $G,$x30,$ctx
___
}
# Function epilogue, mirroring the prologue: put the saved link register
# value ($lrsave) and the original SPR 256 value ($vrsave) back, reload
# v20-v31 and r26-r31 from the frame, release the frame and return.
# The .long/.byte/.long words after blr are emitted as data; presumably
# a traceback table - TODO confirm the encoding.
$code.=<<___;
        li              r10,`$FRAME+8*16+15`
        mtlr            $lrsave
        li              r11,`$FRAME+8*16+31`
        mtspr           256,$vrsave
        lvx             v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        lvx             v21,r11,$sp
        addi            r11,r11,32
        lvx             v22,r10,$sp
        addi            r10,r10,32
        lvx             v23,r11,$sp
        addi            r11,r11,32
        lvx             v24,r10,$sp
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
        lvx             v26,r10,$sp
        addi            r10,r10,32
        lvx             v27,r11,$sp
        addi            r11,r11,32
        lvx             v28,r10,$sp
        addi            r10,r10,32
        lvx             v29,r11,$sp
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
        $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
        blr
        .long           0
        .byte           0,12,4,1,0x80,6,3,0
        .long           0
.size   $func,.-$func
___

# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
#
# LPICmeup returns the address of the constant table in $Tbl without
# relying on assembler-specific relocation syntax: the bcl to the next
# instruction puts that instruction's address in the link register, and
# a fixed byte distance (64-8) is added to it.  The .space directive pads
# the routine so that the data emitted right after it sits at that
# distance.  The caller's link register is kept in r0 and put back
# before returning.
$code.=<<___;
.align  6
LPICmeup:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
        addi    $Tbl,$Tbl,`64-8`
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
        .space  `64-9*4`
___

# Constant table that follows LPICmeup.  Every round constant is
# replicated to fill a whole 16-byte vector (two .quad or four .long
# copies), the list is closed by an all-zero entry, and the permute
# masks used when packing the result come last, in an endian-specific
# order.
if ($SZ==8) {
    my @K512=qw(
        0x428a2f98d728ae22 0x7137449123ef65cd
        0xb5c0fbcfec4d3b2f 0xe9b5dba58189dbbc
        0x3956c25bf348b538 0x59f111f1b605d019
        0x923f82a4af194f9b 0xab1c5ed5da6d8118
        0xd807aa98a3030242 0x12835b0145706fbe
        0x243185be4ee4b28c 0x550c7dc3d5ffb4e2
        0x72be5d74f27b896f 0x80deb1fe3b1696b1
        0x9bdc06a725c71235 0xc19bf174cf692694
        0xe49b69c19ef14ad2 0xefbe4786384f25e3
        0x0fc19dc68b8cd5b5 0x240ca1cc77ac9c65
        0x2de92c6f592b0275 0x4a7484aa6ea6e483
        0x5cb0a9dcbd41fbd4 0x76f988da831153b5
        0x983e5152ee66dfab 0xa831c66d2db43210
        0xb00327c898fb213f 0xbf597fc7beef0ee4
        0xc6e00bf33da88fc2 0xd5a79147930aa725
        0x06ca6351e003826f 0x142929670a0e6e70
        0x27b70a8546d22ffc 0x2e1b21385c26c926
        0x4d2c6dfc5ac42aed 0x53380d139d95b3df
        0x650a73548baf63de 0x766a0abb3c77b2a8
        0x81c2c92e47edaee6 0x92722c851482353b
        0xa2bfe8a14cf10364 0xa81a664bbc423001
        0xc24b8b70d0f89791 0xc76c51a30654be30
        0xd192e819d6ef5218 0xd69906245565a910
        0xf40e35855771202a 0x106aa07032bbd1b8
        0x19a4c116b8d2d0c8 0x1e376c085141ab53
        0x2748774cdf8eeb99 0x34b0bcb5e19b48a8
        0x391c0cb3c5c95a63 0x4ed8aa4ae3418acb
        0x5b9cca4f7763e373 0x682e6ff3d6b2b8a3
        0x748f82ee5defb2fc 0x78a5636f43172f60
        0x84c87814a1f0ab72 0x8cc702081a6439ec
        0x90befffa23631e28 0xa4506cebde82bde9
        0xbef9a3f7b2c67915 0xc67178f2e372532b
        0xca273eceea26619c 0xd186b8c721c0c207
        0xeada7dd6cde0eb1e 0xf57d4f7fee6ed178
        0x06f067aa72176fba 0x0a637dc5a2c898a6
        0x113f9804bef90dae 0x1b710b35131c471b
        0x28db77f523047d84 0x32caab7b40c72493
        0x3c9ebe0a15c9bebc 0x431d67c49c100d4c
        0x4cc5d4becb3e42b6 0x597f299cfc657e2a
        0x5fcb6fab3ad6faec 0x6c44198c4a475817
    );
    # Two copies per vector; the trailing "0" is the terminating entry.
    foreach my $k (@K512,"0") {
        $code.=".quad     $k,$k\n";
    }
    if ($LENDIAN) {             # quad-swapped
$code.=<<___;
.quad   0x1011121314151617,0x0001020304050607
___
    } else {
$code.=<<___;
.quad   0x0001020304050607,0x1011121314151617
___
    }
} else {
    my @K256=qw(
        0x428a2f98 0x71374491 0xb5c0fbcf 0xe9b5dba5
        0x3956c25b 0x59f111f1 0x923f82a4 0xab1c5ed5
        0xd807aa98 0x12835b01 0x243185be 0x550c7dc3
        0x72be5d74 0x80deb1fe 0x9bdc06a7 0xc19bf174
        0xe49b69c1 0xefbe4786 0x0fc19dc6 0x240ca1cc
        0x2de92c6f 0x4a7484aa 0x5cb0a9dc 0x76f988da
        0x983e5152 0xa831c66d 0xb00327c8 0xbf597fc7
        0xc6e00bf3 0xd5a79147 0x06ca6351 0x14292967
        0x27b70a85 0x2e1b2138 0x4d2c6dfc 0x53380d13
        0x650a7354 0x766a0abb 0x81c2c92e 0x92722c85
        0xa2bfe8a1 0xa81a664b 0xc24b8b70 0xc76c51a3
        0xd192e819 0xd6990624 0xf40e3585 0x106aa070
        0x19a4c116 0x1e376c08 0x2748774c 0x34b0bcb5
        0x391c0cb3 0x4ed8aa4a 0x5b9cca4f 0x682e6ff3
        0x748f82ee 0x78a5636f 0x84c87814 0x8cc70208
        0x90befffa 0xa4506ceb 0xbef9a3f7 0xc67178f2
    );
    # Four copies per vector; the trailing "0" is the terminating entry.
    foreach my $k (@K256,"0") {
        $code.=".long     $k,$k,$k,$k\n";
    }
    if ($LENDIAN) {             # word-swapped
$code.=<<___;
.long   0x10111213,0x10111213,0x10111213,0x00010203
.long   0x10111213,0x10111213,0x04050607,0x00010203
.long   0x10111213,0x08090a0b,0x04050607,0x00010203
___
    } else {
$code.=<<___;
.long   0x00010203,0x10111213,0x10111213,0x10111213
.long   0x00010203,0x04050607,0x10111213,0x10111213
.long   0x00010203,0x04050607,0x08090a0b,0x10111213
___
    }
}
# Identification string embedded in the object file.
$code.=<<___;
.asciz  "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Replace every back-quoted fragment in the accumulated text with the
# result of evaluating it as Perl (frame offsets, loop counts and the
# conditional instructions deferred by ROUND), then emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to ppc-xlate.pl: buffered write errors and a failing
# translator are only reported by close(), so its result must be checked
# or a truncated/empty output file would be produced silently.
close STDOUT or die "error closing STDOUT: $!";