5457c4aa1662f3d00c0299f2fdc848ccfaeabfd2
[openssl.git] / crypto / sha / asm / sha512p8-ppc.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # SHA256/512 for PowerISA v2.07.
18 #
19 # Accurate performance measurements are problematic, because it's
20 # always virtualized setup with possibly throttled processor.
21 # Relative comparison is therefore more informative. This module is
22 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
23 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
24 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
25 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
26 # result is degree of computational resources' utilization. POWER8 is
27 # "massively multi-threaded chip" and difference between single- and
28 # maximum multi-process benchmark results tells that utlization is
29 # whooping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
30 # for sha1-ppc.pl - 73%. 100% means that multi-process result equals
31 # to single-process one, given that all threads end up on the same
32 # physical core.
33 #
34 ######################################################################
35 # Believed-to-be-accurate results in cycles per processed byte [on
36 # little-endian system]. Numbers in square brackets are for 64-bit
37 # build of sha512-ppc.pl, presented for reference.
38 #
39 #               POWER8
40 # SHA256        9.9 [15.8]
41 # SHA512        6.3 [10.3]
42
43 $flavour=shift;
44 $output =shift;
45
46 if ($flavour =~ /64/) {
47         $SIZE_T=8;
48         $LRSAVE=2*$SIZE_T;
49         $STU="stdu";
50         $POP="ld";
51         $PUSH="std";
52 } elsif ($flavour =~ /32/) {
53         $SIZE_T=4;
54         $LRSAVE=$SIZE_T;
55         $STU="stwu";
56         $POP="lwz";
57         $PUSH="stw";
58 } else { die "nonsense $flavour"; }
59
60 $LENDIAN=($flavour=~/le/);
61
62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
64 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
65 die "can't locate ppc-xlate.pl";
66
67 open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
68
69 if ($output =~ /512/) {
70         $bits=512;
71         $SZ=8;
72         $sz="d";
73         $rounds=80;
74 } else {
75         $bits=256;
76         $SZ=4;
77         $sz="w";
78         $rounds=64;
79 }
80
81 $func="sha${bits}_block_p8";
82 $FRAME=8*$SIZE_T;
83
84 $sp ="r1";
85 $toc="r2";
86 $ctx="r3";
87 $inp="r4";
88 $num="r5";
89 $Tbl="r6";
90 $idx="r7";
91 $lrsave="r8";
92 $offload="r11";
93 $vrsave="r12";
94 ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
95  $x00=0 if ($flavour =~ /osx/);
96
97 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
98 @X=map("v$_",(8..23));
99 ($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
100
101 sub ROUND {
102 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
103 my $j=($i+1)%16;
104
105 $code.=<<___            if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
106         lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
107         addi            $inp,$inp,16
108 ___
109 $code.=<<___            if ($i<16 && ($i%(16/$SZ)));
110         vsldoi          @X[$i],@X[$i-1],@X[$i-1],$SZ
111 ___
112 $code.=<<___            if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
113         vperm           @X[$i],@X[$i],@X[$i],$lemask
114 ___
115 $code.=<<___;
116         `"vshasigma${sz}        $s0,@X[($j+1)%16],0,0"          if ($i>=15)`
117         vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
118         vshasigma${sz}  $S1,$e,1,15             ; Sigma1(e)
119         vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
120         vshasigma${sz}  $S0,$a,1,0              ; Sigma0(a)
121         `"vshasigma${sz}        $s1,@X[($j+14)%16],0,15"        if ($i>=15)`
122         vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
123         vxor            $Func,$a,$b
124         `"vaddu${sz}m           @X[$j],@X[$j],@X[($j+9)%16]"    if ($i>=15)`
125         vaddu${sz}m     $h,$h,$S1               ; h+=Sigma1(e)
126         vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
127         vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
128         vaddu${sz}m     $d,$d,$h                ; d+=h
129         vaddu${sz}m     $S0,$S0,$Func           ; Sigma0(a)+Maj(a,b,c)
130         `"vaddu${sz}m           @X[$j],@X[$j],$s0"              if ($i>=15)`
131         lvx             $Ki,$idx,$Tbl           ; load next K[i]
132         addi            $idx,$idx,16
133         vaddu${sz}m     $h,$h,$S0               ; h+=Sigma0(a)+Maj(a,b,c)
134         `"vaddu${sz}m           @X[$j],@X[$j],$s1"              if ($i>=15)`
135 ___
136 }
137
138 $code=<<___;
139 .machine        "any"
140 .text
141
142 .globl  $func
143 .align  6
144 $func:
145         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
146         mflr            $lrsave
147         li              r10,`$FRAME+8*16+15`
148         li              r11,`$FRAME+8*16+31`
149         stvx            v20,r10,$sp             # ABI says so
150         addi            r10,r10,32
151         mfspr           $vrsave,256
152         stvx            v21,r11,$sp
153         addi            r11,r11,32
154         stvx            v22,r10,$sp
155         addi            r10,r10,32
156         stvx            v23,r11,$sp
157         addi            r11,r11,32
158         stvx            v24,r10,$sp
159         addi            r10,r10,32
160         stvx            v25,r11,$sp
161         addi            r11,r11,32
162         stvx            v26,r10,$sp
163         addi            r10,r10,32
164         stvx            v27,r11,$sp
165         addi            r11,r11,32
166         stvx            v28,r10,$sp
167         addi            r10,r10,32
168         stvx            v29,r11,$sp
169         addi            r11,r11,32
170         stvx            v30,r10,$sp
171         stvx            v31,r11,$sp
172         li              r11,-1
173         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
174         li              $x10,0x10
175         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
176         li              $x20,0x20
177         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
178         li              $x30,0x30
179         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
180         li              $x40,0x40
181         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
182         li              $x50,0x50
183         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
184         li              $x60,0x60
185         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
186         li              $x70,0x70
187         $PUSH           $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
188         mtspr           256,r11
189
190         bl              LPICmeup
191         addi            $offload,$sp,$FRAME+15
192 ___
193 $code.=<<___            if ($LENDIAN);
194         li              $idx,8
195         lvsl            $lemask,0,$idx
196         vspltisb        $Ki,0x0f
197         vxor            $lemask,$lemask,$Ki
198 ___
199 $code.=<<___            if ($SZ==4);
200         lvx_4w          $A,$x00,$ctx
201         lvx_4w          $E,$x10,$ctx
202         vsldoi          $B,$A,$A,4              # unpack
203         vsldoi          $C,$A,$A,8
204         vsldoi          $D,$A,$A,12
205         vsldoi          $F,$E,$E,4
206         vsldoi          $G,$E,$E,8
207         vsldoi          $H,$E,$E,12
208 ___
209 $code.=<<___            if ($SZ==8);
210         lvx_u           $A,$x00,$ctx
211         lvx_u           $C,$x10,$ctx
212         lvx_u           $E,$x20,$ctx
213         vsldoi          $B,$A,$A,8              # unpack
214         lvx_u           $G,$x30,$ctx
215         vsldoi          $D,$C,$C,8
216         vsldoi          $F,$E,$E,8
217         vsldoi          $H,$G,$G,8
218 ___
219 $code.=<<___;
220         li              r0,`($rounds-16)/16`    # inner loop counter
221         b               Loop
222 .align  5
223 Loop:
224         lvx             $Ki,$x00,$Tbl
225         li              $idx,16
226         lvx_u           @X[0],0,$inp
227         addi            $inp,$inp,16
228         stvx            $A,$x00,$offload        # offload $A-$H
229         stvx            $B,$x10,$offload
230         stvx            $C,$x20,$offload
231         stvx            $D,$x30,$offload
232         stvx            $E,$x40,$offload
233         stvx            $F,$x50,$offload
234         stvx            $G,$x60,$offload
235         stvx            $H,$x70,$offload
236         vaddu${sz}m     $H,$H,$Ki               # h+K[i]
237         lvx             $Ki,$idx,$Tbl
238         addi            $idx,$idx,16
239 ___
240 for ($i=0;$i<16;$i++)   { &ROUND($i,@V); unshift(@V,pop(@V)); }
241 $code.=<<___;
242         mtctr           r0
243         b               L16_xx
244 .align  5
245 L16_xx:
246 ___
247 for (;$i<32;$i++)       { &ROUND($i,@V); unshift(@V,pop(@V)); }
248 $code.=<<___;
249         bdnz            L16_xx
250
251         lvx             @X[2],$x00,$offload
252         subic.          $num,$num,1
253         lvx             @X[3],$x10,$offload
254         vaddu${sz}m     $A,$A,@X[2]
255         lvx             @X[4],$x20,$offload
256         vaddu${sz}m     $B,$B,@X[3]
257         lvx             @X[5],$x30,$offload
258         vaddu${sz}m     $C,$C,@X[4]
259         lvx             @X[6],$x40,$offload
260         vaddu${sz}m     $D,$D,@X[5]
261         lvx             @X[7],$x50,$offload
262         vaddu${sz}m     $E,$E,@X[6]
263         lvx             @X[8],$x60,$offload
264         vaddu${sz}m     $F,$F,@X[7]
265         lvx             @X[9],$x70,$offload
266         vaddu${sz}m     $G,$G,@X[8]
267         vaddu${sz}m     $H,$H,@X[9]
268         bne             Loop
269 ___
270 $code.=<<___            if ($SZ==4);
271         lvx             @X[0],$idx,$Tbl
272         addi            $idx,$idx,16
273         vperm           $A,$A,$B,$Ki            # pack the answer
274         lvx             @X[1],$idx,$Tbl
275         vperm           $E,$E,$F,$Ki
276         vperm           $A,$A,$C,@X[0]
277         vperm           $E,$E,$G,@X[0]
278         vperm           $A,$A,$D,@X[1]
279         vperm           $E,$E,$H,@X[1]
280         stvx_4w         $A,$x00,$ctx
281         stvx_4w         $E,$x10,$ctx
282 ___
283 $code.=<<___            if ($SZ==8);
284         vperm           $A,$A,$B,$Ki            # pack the answer
285         vperm           $C,$C,$D,$Ki
286         vperm           $E,$E,$F,$Ki
287         vperm           $G,$G,$H,$Ki
288         stvx_u          $A,$x00,$ctx
289         stvx_u          $C,$x10,$ctx
290         stvx_u          $E,$x20,$ctx
291         stvx_u          $G,$x30,$ctx
292 ___
293 $code.=<<___;
294         li              r10,`$FRAME+8*16+15`
295         mtlr            $lrsave
296         li              r11,`$FRAME+8*16+31`
297         mtspr           256,$vrsave
298         lvx             v20,r10,$sp             # ABI says so
299         addi            r10,r10,32
300         lvx             v21,r11,$sp
301         addi            r11,r11,32
302         lvx             v22,r10,$sp
303         addi            r10,r10,32
304         lvx             v23,r11,$sp
305         addi            r11,r11,32
306         lvx             v24,r10,$sp
307         addi            r10,r10,32
308         lvx             v25,r11,$sp
309         addi            r11,r11,32
310         lvx             v26,r10,$sp
311         addi            r10,r10,32
312         lvx             v27,r11,$sp
313         addi            r11,r11,32
314         lvx             v28,r10,$sp
315         addi            r10,r10,32
316         lvx             v29,r11,$sp
317         addi            r11,r11,32
318         lvx             v30,r10,$sp
319         lvx             v31,r11,$sp
320         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
321         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
322         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
323         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
324         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
325         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
326         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
327         blr
328         .long           0
329         .byte           0,12,4,1,0x80,6,3,0
330         .long           0
331 .size   $func,.-$func
332 ___
333
334 # Ugly hack here, because PPC assembler syntax seem to vary too
335 # much from platforms to platform...
336 $code.=<<___;
337 .align  6
338 LPICmeup:
339         mflr    r0
340         bcl     20,31,\$+4
341         mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
342         addi    $Tbl,$Tbl,`64-8`
343         mtlr    r0
344         blr
345         .long   0
346         .byte   0,12,0x14,0,0,0,0,0
347         .space  `64-9*4`
348 ___
349
350 if ($SZ==8) {
351     local *table = sub {
352         foreach(@_) { $code.=".quad     $_,$_\n"; }
353     };
354     table(
355         "0x428a2f98d728ae22","0x7137449123ef65cd",
356         "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
357         "0x3956c25bf348b538","0x59f111f1b605d019",
358         "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
359         "0xd807aa98a3030242","0x12835b0145706fbe",
360         "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
361         "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
362         "0x9bdc06a725c71235","0xc19bf174cf692694",
363         "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
364         "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
365         "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
366         "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
367         "0x983e5152ee66dfab","0xa831c66d2db43210",
368         "0xb00327c898fb213f","0xbf597fc7beef0ee4",
369         "0xc6e00bf33da88fc2","0xd5a79147930aa725",
370         "0x06ca6351e003826f","0x142929670a0e6e70",
371         "0x27b70a8546d22ffc","0x2e1b21385c26c926",
372         "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
373         "0x650a73548baf63de","0x766a0abb3c77b2a8",
374         "0x81c2c92e47edaee6","0x92722c851482353b",
375         "0xa2bfe8a14cf10364","0xa81a664bbc423001",
376         "0xc24b8b70d0f89791","0xc76c51a30654be30",
377         "0xd192e819d6ef5218","0xd69906245565a910",
378         "0xf40e35855771202a","0x106aa07032bbd1b8",
379         "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
380         "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
381         "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
382         "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
383         "0x748f82ee5defb2fc","0x78a5636f43172f60",
384         "0x84c87814a1f0ab72","0x8cc702081a6439ec",
385         "0x90befffa23631e28","0xa4506cebde82bde9",
386         "0xbef9a3f7b2c67915","0xc67178f2e372532b",
387         "0xca273eceea26619c","0xd186b8c721c0c207",
388         "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
389         "0x06f067aa72176fba","0x0a637dc5a2c898a6",
390         "0x113f9804bef90dae","0x1b710b35131c471b",
391         "0x28db77f523047d84","0x32caab7b40c72493",
392         "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
393         "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
394         "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
395 $code.=<<___    if (!$LENDIAN);
396 .quad   0x0001020304050607,0x1011121314151617
397 ___
398 $code.=<<___    if ($LENDIAN);  # quad-swapped
399 .quad   0x1011121314151617,0x0001020304050607
400 ___
401 } else {
402     local *table = sub {
403         foreach(@_) { $code.=".long     $_,$_,$_,$_\n"; }
404     };
405     table(
406         "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
407         "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
408         "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
409         "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
410         "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
411         "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
412         "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
413         "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
414         "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
415         "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
416         "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
417         "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
418         "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
419         "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
420         "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
421         "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
422 $code.=<<___    if (!$LENDIAN);
423 .long   0x00010203,0x10111213,0x10111213,0x10111213
424 .long   0x00010203,0x04050607,0x10111213,0x10111213
425 .long   0x00010203,0x04050607,0x08090a0b,0x10111213
426 ___
427 $code.=<<___    if ($LENDIAN);  # word-swapped
428 .long   0x10111213,0x10111213,0x10111213,0x00010203
429 .long   0x10111213,0x10111213,0x04050607,0x00010203
430 .long   0x10111213,0x08090a0b,0x04050607,0x00010203
431 ___
432 }
433 $code.=<<___;
434 .asciz  "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
435 .align  2
436 ___
437
438 $code =~ s/\`([^\`]*)\`/eval $1/gem;
439 print $code;
440 close STDOUT;