sha/asm/sha512p8-ppc.pl: optimize epilogue.
[openssl.git] / crypto / sha / asm / sha512p8-ppc.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # SHA256/512 for PowerISA v2.07.
18 #
19 # Accurate performance measurements are problematic, because it's
20 # always a virtualized setup with a possibly throttled processor.
21 # Relative comparison is therefore more informative. This module is
22 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
23 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
24 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
25 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
26 # result is the degree of computational resources' utilization. POWER8 is
27 # a "massively multi-threaded chip" and the difference between single- and
28 # maximum multi-process benchmark results tells that utilization is
29 # a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
30 # for sha1-ppc.pl - 73%. 100% means that multi-process result equals
31 # to single-process one, given that all threads end up on the same
32 # physical core.
33 #
34 ######################################################################
35 # Believed-to-be-accurate results in cycles per processed byte [on
36 # little-endian system]. Numbers in square brackets are for 64-bit
37 # build of sha512-ppc.pl, presented for reference.
38 #
39 #               POWER8          POWER9
40 # SHA256        9.7 [15.8]      11.2 [12.5]
41 # SHA512        6.1 [10.3]      7.0 [7.9]
42
# Command line: <flavour> <output>. The flavour string selects the ABI
# word size (it must contain "64" or "32") and, when it contains "le",
# little-endian code generation. The output name is passed on to
# ppc-xlate.pl and later also decides between SHA-256 and SHA-512.
($flavour,$output)=(shift,shift);

# Pointer size and the offset of the link-register save slot.
if    ($flavour =~ /64/) { $SIZE_T=8; $LRSAVE=2*$SIZE_T; }
elsif ($flavour =~ /32/) { $SIZE_T=4; $LRSAVE=$SIZE_T;   }
else                     { die "nonsense $flavour"; }

# Store-with-update / load / store mnemonics matching the pointer size.
($STU,$POP,$PUSH) = $SIZE_T==8 ? ("stdu","ld","std")
                               : ("stwu","lwz","stw");

# True when the target is little-endian.
$LENDIAN=($flavour=~/le/);
61
# Locate the perlasm translator: first in the directory of this script,
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# The low-precedence "or" is essential: "||" would bind to the command
# string (always true), so a failed open would never reach the die.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
68
# The output file name selects the digest: a name containing "512" gives
# SHA-512 (8-byte words, "d" instruction suffix, 80 rounds), anything else
# SHA-256 (4-byte words, "w" suffix, 64 rounds).
($bits,$SZ,$sz,$rounds) = ($output =~ /512/) ? (512,8,"d",80)
                                             : (256,4,"w",64);

$func  ="sha${bits}_block_p8";
# $LOCALS covers 8 pointer-sized slots plus the 8 vectors used to offload
# the working variables; $FRAME adds room for the saved v24-v31 (with
# alignment slack) and six saved general-purpose registers.
$LOCALS=8*$SIZE_T + 8*16;
$FRAME =$LOCALS + 9*16 + 6*$SIZE_T;

# General-purpose register roles: r1..r8 in order, then two scratch roles.
($sp,$toc,$ctx,$inp,$num,$Tbl,$idx,$lrsave)=map("r$_",(1..8));
$offload="r11";
$vrsave ="r12";

# Index operands for offsets 0x00..0x70. Offset 0 is the literal 0, the
# remaining seven are registers that the prologue loads with constants.
@I=(0,"r10",map("r$_",(26..31)));
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=@I;

# Vector registers: the eight working variables, sixteen message-schedule
# slots, and four helpers (current K, Ch/Maj scratch, Sigma scratch,
# little-endian byte-swap mask).
@V=map("v$_",(0..7));
($A,$B,$C,$D,$E,$F,$G,$H)=@V;
@X=map("v$_",(8..19,24..27));
($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
100
# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for one round to $code. $i is the round index as
# seen by the generator (0..15 for the rounds that consume the input
# block, 16..31 for the rounds emitted inside the L16_xx loop body);
# $a..$h are the vector registers currently holding the eight working
# variables. The caller rotates that register list between calls.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $per_vec=16/$SZ;             # message words per 16-byte vector
my $lane=$i%$per_vec;           # position of word $i within its vector
my $j=($i+1)%16;                # schedule slot refreshed for the next round
my $k=($i+2)%8;                 # picks the offset operand for the next K

if ($i<16) {
    # Last word of the current input vector: start loading the next one.
    if ($i<15 && $lane==$per_vec-1) {
$code.=<<___;
        lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
        addi            $inp,$inp,16
___
    }
    if ($lane) {
        # Not the first word of a vector: rotate it out of the previous slot.
$code.=<<___;
        vsldoi          @X[$i],@X[$i-1],@X[$i-1],$SZ
___
    } elsif ($LENDIAN) {
        # First word of a freshly loaded vector: fix the byte order.
$code.=<<___;
        vperm           @X[$i],@X[$i],@X[$i],$lemask
___
    }
}

# From round 15 on, extend the message schedule for the following round.
if ($i>=15) {
$code.=<<___;
        vshasigma${sz}  $Sigma,@X[($j+1)%16],0,0
        vaddu${sz}m     @X[$j],@X[$j],$Sigma
        vshasigma${sz}  $Sigma,@X[($j+14)%16],0,15
        vaddu${sz}m     @X[$j],@X[$j],$Sigma
        vaddu${sz}m     @X[$j],@X[$j],@X[($j+9)%16]
___
}

# The round proper. K for this round was already added to $h one round
# earlier (see "future h+=K[i]"), and the next K is fetched at the end.
$code.=<<___;
        vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
        vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
        vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
        vshasigma${sz}  $Sigma,$e,1,15          ; Sigma1(e)
        vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma1(e)
        vxor            $Func,$a,$b
        vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
        vaddu${sz}m     $d,$d,$h                ; d+=h
        vshasigma${sz}  $Sigma,$a,1,0           ; Sigma0(a)
        vaddu${sz}m     $Sigma,$Sigma,$Func     ; Sigma0(a)+Maj(a,b,c)
        vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma0(a)+Maj(a,b,c)
        lvx             $Ki,@I[$k],$idx         ; load next K[i]
___

# All eight offset operands used: advance the table pointer by 8 vectors.
if ($k == 7) {
$code.=<<___;
        addi            $idx,$idx,0x80
___
}
}
142
# Function entry: allocate the frame, save v24-v31, vrsave, r26-r31 and
# the link register, load the 0x10..0x70 offset registers, and fetch the
# address of the constant table via LPICmeup. $offload is pointed at the
# scratch area used to park the working variables.
$code=<<___;
.machine        "any"
.text

.globl  $func
.align  6
$func:
        $STU            $sp,-$FRAME($sp)
        mflr            $lrsave
        li              r10,`$LOCALS+15`
        li              r11,`$LOCALS+31`
        stvx            v24,r10,$sp             # ABI says so
        addi            r10,r10,32
        mfspr           $vrsave,256
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
        addi            r10,r10,32
        stvx            v27,r11,$sp
        addi            r11,r11,32
        stvx            v28,r10,$sp
        addi            r10,r10,32
        stvx            v29,r11,$sp
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
        li              r11,-4096+255           # 0xfffff0ff
        stw             $vrsave,`$FRAME-6*$SIZE_T-4`($sp)       # save vrsave
        li              $x10,0x10
        $PUSH           r26,`$FRAME-6*$SIZE_T`($sp)
        li              $x20,0x20
        $PUSH           r27,`$FRAME-5*$SIZE_T`($sp)
        li              $x30,0x30
        $PUSH           r28,`$FRAME-4*$SIZE_T`($sp)
        li              $x40,0x40
        $PUSH           r29,`$FRAME-3*$SIZE_T`($sp)
        li              $x50,0x50
        $PUSH           r30,`$FRAME-2*$SIZE_T`($sp)
        li              $x60,0x60
        $PUSH           r31,`$FRAME-1*$SIZE_T`($sp)
        li              $x70,0x70
        $PUSH           $lrsave,`$FRAME+$LRSAVE`($sp)
        mtspr           256,r11

        bl              LPICmeup
        addi            $offload,$sp,`8*$SIZE_T+15`
___

# Little-endian only: build the permute mask used to byte-swap the input.
if ($LENDIAN) {
$code.=<<___;
        li              $idx,8
        lvsl            $lemask,0,$idx
        vspltisb        $Ki,0x0f
        vxor            $lemask,$lemask,$Ki
___
}

# Load the hash state from the context and spread it so that each of
# $A..$H carries one state word in its leading element.
if ($SZ==4) {
$code.=<<___;
        lvx_4w          $A,$x00,$ctx
        lvx_4w          $E,$x10,$ctx
        vsldoi          $B,$A,$A,4              # unpack
        vsldoi          $C,$A,$A,8
        vsldoi          $D,$A,$A,12
        vsldoi          $F,$E,$E,4
        vsldoi          $G,$E,$E,8
        vsldoi          $H,$E,$E,12
___
} elsif ($SZ==8) {
$code.=<<___;
        lvx_u           $A,$x00,$ctx
        lvx_u           $C,$x10,$ctx
        lvx_u           $E,$x20,$ctx
        vsldoi          $B,$A,$A,8              # unpack
        lvx_u           $G,$x30,$ctx
        vsldoi          $D,$C,$C,8
        vsldoi          $F,$E,$E,8
        vsldoi          $H,$G,$G,8
___
}
# Top of the per-block loop: load the first K and the first input vector,
# park the incoming state in the offload area, and pre-add K[0] to h.
$code.=<<___;
        li              r0,`($rounds-16)/16`    # inner loop counter
        b               Loop
.align  5
Loop:
        lvx             $Ki,$x00,$Tbl
        lvx_u           @X[0],0,$inp
        addi            $inp,$inp,16
        mr              $idx,$Tbl               # copy $Tbl
        stvx            $A,$x00,$offload        # offload $A-$H
        stvx            $B,$x10,$offload
        stvx            $C,$x20,$offload
        stvx            $D,$x30,$offload
        stvx            $E,$x40,$offload
        stvx            $F,$x50,$offload
        stvx            $G,$x60,$offload
        stvx            $H,$x70,$offload
        vaddu${sz}m     $H,$H,$Ki               # h+K[i]
        lvx             $Ki,$x10,$Tbl
___

# Rounds 0..15, fully unrolled. After each round the register list is
# rotated right by one so the roles a..h move on to the next registers.
foreach my $round (0..15) {
    ROUND($round,@V);
    @V=@V[7,0..6];
}

$code.=<<___;
        mtctr           r0
        b               L16_xx
.align  5
L16_xx:
___

# Sixteen schedule-extending rounds form the body of the counted loop.
foreach my $round (16..31) {
    ROUND($round,@V);
    @V=@V[7,0..6];
}

# Add the parked state back in and go around again while blocks remain.
$code.=<<___;
        bdnz            L16_xx

        lvx             @X[2],$x00,$offload
        subic.          $num,$num,1
        lvx             @X[3],$x10,$offload
        vaddu${sz}m     $A,$A,@X[2]
        lvx             @X[4],$x20,$offload
        vaddu${sz}m     $B,$B,@X[3]
        lvx             @X[5],$x30,$offload
        vaddu${sz}m     $C,$C,@X[4]
        lvx             @X[6],$x40,$offload
        vaddu${sz}m     $D,$D,@X[5]
        lvx             @X[7],$x50,$offload
        vaddu${sz}m     $E,$E,@X[6]
        lvx             @X[8],$x60,$offload
        vaddu${sz}m     $F,$F,@X[7]
        lvx             @X[9],$x70,$offload
        vaddu${sz}m     $G,$G,@X[8]
        vaddu${sz}m     $H,$H,@X[9]
        bne             Loop
___
# Pack the eight working registers back into the context layout and store
# them. $Ki (and, for SHA-256, two further vectors loaded relative to
# $idx) serve as the permute masks here.
if ($SZ==4) {
$code.=<<___;
        lvx             @X[0],$x20,$idx
        vperm           $A,$A,$B,$Ki            # pack the answer
        lvx             @X[1],$x30,$idx
        vperm           $E,$E,$F,$Ki
        vperm           $A,$A,$C,@X[0]
        vperm           $E,$E,$G,@X[0]
        vperm           $A,$A,$D,@X[1]
        vperm           $E,$E,$H,@X[1]
        stvx_4w         $A,$x00,$ctx
        stvx_4w         $E,$x10,$ctx
___
} elsif ($SZ==8) {
$code.=<<___;
        vperm           $A,$A,$B,$Ki            # pack the answer
        vperm           $C,$C,$D,$Ki
        vperm           $E,$E,$F,$Ki
        vperm           $G,$G,$H,$Ki
        stvx_u          $A,$x00,$ctx
        stvx_u          $C,$x10,$ctx
        stvx_u          $E,$x20,$ctx
        stvx_u          $G,$x30,$ctx
___
}

# Function exit. The link register and vrsave are restored straight from
# the registers that have held them since the prologue; v24-v31 are
# reloaded through one base pointer plus the 0x00..0x70 offset operands.
$code.=<<___;
        addi            $offload,$sp,`$LOCALS+15`
        mtlr            $lrsave
        mtspr           256,$vrsave
        lvx             v24,$x00,$offload       # ABI says so
        lvx             v25,$x10,$offload
        lvx             v26,$x20,$offload
        lvx             v27,$x30,$offload
        lvx             v28,$x40,$offload
        lvx             v29,$x50,$offload
        lvx             v30,$x60,$offload
        lvx             v31,$x70,$offload
        $POP            r26,`$FRAME-6*$SIZE_T`($sp)
        $POP            r27,`$FRAME-5*$SIZE_T`($sp)
        $POP            r28,`$FRAME-4*$SIZE_T`($sp)
        $POP            r29,`$FRAME-3*$SIZE_T`($sp)
        $POP            r30,`$FRAME-2*$SIZE_T`($sp)
        $POP            r31,`$FRAME-1*$SIZE_T`($sp)
        addi            $sp,$sp,$FRAME
        blr
        .long           0
        .byte           0,12,4,1,0x80,6,3,0
        .long           0
.size   $func,.-$func
___
313
314 # Ugly hack here, because PPC assembler syntax seems to vary too
315 # much from platform to platform...
# LPICmeup returns the address of the constant table in $Tbl. It reads
# its own address with a branch-and-link and adds the fixed distance to
# the data, which the .space padding below keeps in step with the addi.
$code.=<<___;
.align  6
LPICmeup:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
        addi    $Tbl,$Tbl,`64-8`
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
        .space  `64-9*4`
___

# Round constants, each replicated to fill one 16-byte vector. The
# trailing 0 entry is what the final round picks up as its "future" K,
# and the vector(s) that follow are the permute masks used when packing
# the result; their element order depends on the target endianness.
if ($SZ==8) {
    my @K512=qw(
        0x428a2f98d728ae22 0x7137449123ef65cd
        0xb5c0fbcfec4d3b2f 0xe9b5dba58189dbbc
        0x3956c25bf348b538 0x59f111f1b605d019
        0x923f82a4af194f9b 0xab1c5ed5da6d8118
        0xd807aa98a3030242 0x12835b0145706fbe
        0x243185be4ee4b28c 0x550c7dc3d5ffb4e2
        0x72be5d74f27b896f 0x80deb1fe3b1696b1
        0x9bdc06a725c71235 0xc19bf174cf692694
        0xe49b69c19ef14ad2 0xefbe4786384f25e3
        0x0fc19dc68b8cd5b5 0x240ca1cc77ac9c65
        0x2de92c6f592b0275 0x4a7484aa6ea6e483
        0x5cb0a9dcbd41fbd4 0x76f988da831153b5
        0x983e5152ee66dfab 0xa831c66d2db43210
        0xb00327c898fb213f 0xbf597fc7beef0ee4
        0xc6e00bf33da88fc2 0xd5a79147930aa725
        0x06ca6351e003826f 0x142929670a0e6e70
        0x27b70a8546d22ffc 0x2e1b21385c26c926
        0x4d2c6dfc5ac42aed 0x53380d139d95b3df
        0x650a73548baf63de 0x766a0abb3c77b2a8
        0x81c2c92e47edaee6 0x92722c851482353b
        0xa2bfe8a14cf10364 0xa81a664bbc423001
        0xc24b8b70d0f89791 0xc76c51a30654be30
        0xd192e819d6ef5218 0xd69906245565a910
        0xf40e35855771202a 0x106aa07032bbd1b8
        0x19a4c116b8d2d0c8 0x1e376c085141ab53
        0x2748774cdf8eeb99 0x34b0bcb5e19b48a8
        0x391c0cb3c5c95a63 0x4ed8aa4ae3418acb
        0x5b9cca4f7763e373 0x682e6ff3d6b2b8a3
        0x748f82ee5defb2fc 0x78a5636f43172f60
        0x84c87814a1f0ab72 0x8cc702081a6439ec
        0x90befffa23631e28 0xa4506cebde82bde9
        0xbef9a3f7b2c67915 0xc67178f2e372532b
        0xca273eceea26619c 0xd186b8c721c0c207
        0xeada7dd6cde0eb1e 0xf57d4f7fee6ed178
        0x06f067aa72176fba 0x0a637dc5a2c898a6
        0x113f9804bef90dae 0x1b710b35131c471b
        0x28db77f523047d84 0x32caab7b40c72493
        0x3c9ebe0a15c9bebc 0x431d67c49c100d4c
        0x4cc5d4becb3e42b6 0x597f299cfc657e2a
        0x5fcb6fab3ad6faec 0x6c44198c4a475817
        0
    );
    foreach my $k (@K512) { $code.=".quad     $k,$k\n"; }

    if ($LENDIAN) {             # quad-swapped
$code.=<<___;
.quad   0x1011121314151617,0x0001020304050607
___
    } else {
$code.=<<___;
.quad   0x0001020304050607,0x1011121314151617
___
    }
} else {
    my @K256=qw(
        0x428a2f98 0x71374491 0xb5c0fbcf 0xe9b5dba5
        0x3956c25b 0x59f111f1 0x923f82a4 0xab1c5ed5
        0xd807aa98 0x12835b01 0x243185be 0x550c7dc3
        0x72be5d74 0x80deb1fe 0x9bdc06a7 0xc19bf174
        0xe49b69c1 0xefbe4786 0x0fc19dc6 0x240ca1cc
        0x2de92c6f 0x4a7484aa 0x5cb0a9dc 0x76f988da
        0x983e5152 0xa831c66d 0xb00327c8 0xbf597fc7
        0xc6e00bf3 0xd5a79147 0x06ca6351 0x14292967
        0x27b70a85 0x2e1b2138 0x4d2c6dfc 0x53380d13
        0x650a7354 0x766a0abb 0x81c2c92e 0x92722c85
        0xa2bfe8a1 0xa81a664b 0xc24b8b70 0xc76c51a3
        0xd192e819 0xd6990624 0xf40e3585 0x106aa070
        0x19a4c116 0x1e376c08 0x2748774c 0x34b0bcb5
        0x391c0cb3 0x4ed8aa4a 0x5b9cca4f 0x682e6ff3
        0x748f82ee 0x78a5636f 0x84c87814 0x8cc70208
        0x90befffa 0xa4506ceb 0xbef9a3f7 0xc67178f2
        0
    );
    foreach my $k (@K256) { $code.=".long     $k,$k,$k,$k\n"; }

    if ($LENDIAN) {             # word-swapped
$code.=<<___;
.long   0x10111213,0x10111213,0x10111213,0x00010203
.long   0x10111213,0x10111213,0x04050607,0x00010203
.long   0x10111213,0x08090a0b,0x04050607,0x00010203
___
    } else {
$code.=<<___;
.long   0x00010203,0x10111213,0x10111213,0x10111213
.long   0x00010203,0x04050607,0x10111213,0x10111213
.long   0x00010203,0x04050607,0x08090a0b,0x10111213
___
    }
}

# Identification string embedded after the table.
$code.=<<___;
.asciz  "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___
417
# Replace every `...` span left in the generated text with the value of
# the Perl expression it contains (frame offsets, loop counts, ...).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to ppc-xlate.pl. close() flushes the buffered output
# and waits for the translator, returning false if the write failed or
# the translator exited with an error, so the result must be checked.
close STDOUT or die "error closing STDOUT: $!";