9ee3ed7734e91851fc032caf5bc2cb18b60cc1a5
[openssl.git] / crypto / sha / asm / sha512p8-ppc.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # SHA256/512 for PowerISA v2.07.
18 #
19 # Accurate performance measurements are problematic, because it's
20 # always a virtualized setup with a possibly throttled processor.
21 # Relative comparison is therefore more informative. This module is
22 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
23 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
24 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
25 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
26 # result is degree of computational resources' utilization. POWER8 is
27 # "massively multi-threaded chip" and difference between single- and
28 # maximum multi-process benchmark results tells that utilization is
29 # whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
30 # for sha1-ppc.pl - 73%. 100% means that the multi-process result equals
31 # the single-process one, given that all threads end up on the same
32 # physical core.
33 #
34 ######################################################################
35 # Believed-to-be-accurate results in cycles per processed byte [on
36 # little-endian system]. Numbers in square brackets are for 64-bit
37 # build of sha512-ppc.pl, presented for reference.
38 #
39 #               POWER8          POWER9
40 # SHA256        9.7 [15.8]      11.2 [12.5]
41 # SHA512        6.1 [10.3]      7.0 [7.9]
42
# Command line: [flavour] [output].  The trailing argument is taken as
# $output when it ends in a ".ext" suffix; the leading argument is taken
# as $flavour when it contains no dot at all.  Order matters: the output
# name is popped before the flavour is examined.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# Pointer size, link-register save slot and the load/store mnemonics
# that go with the selected ABI.
if ($flavour =~ /64/) {
    ($SIZE_T, $STU, $POP, $PUSH) = (8, "stdu", "ld", "std");
    $LRSAVE = 2*$SIZE_T;
}
elsif ($flavour =~ /32/) {
    ($SIZE_T, $STU, $POP, $PUSH) = (4, "stwu", "lwz", "stw");
    $LRSAVE = $SIZE_T;
}
else {
    die "nonsense $flavour";
}

# True for little-endian flavours (any flavour string containing "le").
$LENDIAN = ($flavour =~ /le/);

# Find ppc-xlate.pl: first next to this script, then in ../../perlasm
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}ppc-xlate.pl";
if (!-f $xlate) {
    $xlate = "${dir}../../perlasm/ppc-xlate.pl";
    die "can't locate ppc-xlate.pl" if (!-f $xlate);
}

# Everything printed from here on is piped through the translator.
open(STDOUT, "| $^X $xlate $flavour \"$output\"")
    or die "can't call $xlate: $!";
71
# Choose the hash variant from the output file name: a name containing
# "512" selects the 512-bit variant (8-byte words, 80 rounds), anything
# else the 256-bit one (4-byte words, 64 rounds).  $sz is the element
# size suffix spliced into mnemonics such as vaddu${sz}m.
($bits, $SZ, $sz, $rounds) = ($output =~ /512/)
                           ? (512, 8, "d", 80)
                           : (256, 4, "w", 64);

$func   = "sha${bits}_block_p8";
$LOCALS = 8*$SIZE_T + 8*16;
$FRAME  = $LOCALS + 9*16 + 6*$SIZE_T;

# General-purpose register roles.
($sp, $toc, $ctx, $inp, $num, $Tbl, $idx, $lrsave) = map("r$_", 1..8);
($offload, $vrsave) = ("r11", "r12");

# Index operands for lvx/stvx: a literal 0 followed by seven registers.
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)
   = (0, "r10", map("r$_", 26..31));

# Vector registers: working variables, the 16-entry message window, and
# four scratch/constant registers.
@V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("v$_", 0..7);
@X = (map("v$_", 8..19), map("v$_", 24..27));
($Ki, $Func, $Sigma, $lemask) = map("v$_", 28..31);
103
# ROUND($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends to $code the instructions for round number $i.  $a..$h are the
# names of the vector registers currently playing the roles of the eight
# working variables; the callers rotate @V by one position between calls.
# Rounds below 16 also fetch and position the input words in @X, rounds
# 15 and up update the 16-entry window @X in place.  Returns nothing
# useful; the only effect is on the global $code.
104 sub ROUND {
105 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# @X slot that receives its schedule update in this round ($i>=15 only).
106 my $j=($i+1)%16;
# Selects the offset operand @I[$k] for the constant load that ends the
# round; that load runs two constants ahead of the round number.
107 my $k=($i+2)%8;
108
# A 16-byte vector holds 16/$SZ input words.  When $i is the last word of
# a vector (and more input is still needed) load the next 16 bytes.
109 $code.=<<___            if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
110         lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
111         addi            $inp,$inp,16
112 ___
# For every word that is not the first of its vector, derive @X[$i] by
# rotating the previous entry left by one word ($SZ bytes).
113 $code.=<<___            if ($i<16 && ($i%(16/$SZ)));
114         vsldoi          @X[$i],@X[$i-1],@X[$i-1],$SZ
115 ___
# Little-endian flavours only: run each freshly loaded vector through
# $lemask.  NOTE(review): presumably a byte-order fix-up; the mask itself
# is built outside this sub - confirm.
116 $code.=<<___            if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
117         vperm           @X[$i],@X[$i],@X[$i],$lemask
118 ___
# From round 15 on, update @X[$j] by adding the vshasigma results for
# @X[$j+1] and @X[$j+14] plus the word @X[$j+9] (indices taken mod 16).
# NOTE(review): the 0,0 and 0,15 operands presumably select the two
# message-schedule sigma functions - confirm against the ISA manual.
119 $code.=<<___            if ($i>=15);
120         vshasigma${sz}  $Sigma,@X[($j+1)%16],0,0
121         vaddu${sz}m     @X[$j],@X[$j],$Sigma
122         vshasigma${sz}  $Sigma,@X[($j+14)%16],0,15
123         vaddu${sz}m     @X[$j],@X[$j],$Sigma
124         vaddu${sz}m     @X[$j],@X[$j],@X[($j+9)%16]
125 ___
# The round proper.  The constant in $Ki is added to $g rather than $h:
# after the callers' rotation of @V, this round's $g is the next round's
# $h, so the constant is folded in one round early.
126 $code.=<<___;
127         vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
128         vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
129         vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
130         vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
131         vshasigma${sz}  $Sigma,$e,1,15          ; Sigma1(e)
132         vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma1(e)
133         vxor            $Func,$a,$b
134         vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
135         vaddu${sz}m     $d,$d,$h                ; d+=h
136         vshasigma${sz}  $Sigma,$a,1,0           ; Sigma0(a)
137         vaddu${sz}m     $Sigma,$Sigma,$Func     ; Sigma0(a)+Maj(a,b,c)
138         vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma0(a)+Maj(a,b,c)
139         lvx             $Ki,@I[$k],$idx         ; load next K[i]
140 ___
# Once the load above has used the last offset operand (0x70), advance
# $idx past the eight 16-byte table rows reachable through @I.
141 $code.=<<___            if ($k == 7);
142         addi            $idx,$idx,0x80
143 ___
144 }
145
# Function prologue.  Allocates the frame, saves v24-v31, vrsave, r26-r31
# and the link register, installs a new vrsave value, preloads the offset
# registers $x10..$x70 with 0x10..0x70, and calls LPICmeup to obtain the
# constant table address in $Tbl.  Finally $offload is pointed at the
# scratch area (8*$SIZE_T+15 above $sp) where the main loop stashes the
# incoming hash state.
# NOTE(review): the "+15" biases presumably rely on lvx/stvx ignoring the
# low four address bits to get 16-byte aligned slots - confirm.
146 $code=<<___;
147 .machine        "any"
148 .text
149
150 .globl  $func
151 .align  6
152 $func:
153         $STU            $sp,-$FRAME($sp)
154         mflr            $lrsave
155         li              r10,`$LOCALS+15`
156         li              r11,`$LOCALS+31`
157         stvx            v24,r10,$sp             # ABI says so
158         addi            r10,r10,32
159         mfspr           $vrsave,256
160         stvx            v25,r11,$sp
161         addi            r11,r11,32
162         stvx            v26,r10,$sp
163         addi            r10,r10,32
164         stvx            v27,r11,$sp
165         addi            r11,r11,32
166         stvx            v28,r10,$sp
167         addi            r10,r10,32
168         stvx            v29,r11,$sp
169         addi            r11,r11,32
170         stvx            v30,r10,$sp
171         stvx            v31,r11,$sp
172         li              r11,-4096+255           # 0xfffff0ff
173         stw             $vrsave,`$FRAME-6*$SIZE_T-4`($sp)       # save vrsave
174         li              $x10,0x10
175         $PUSH           r26,`$FRAME-6*$SIZE_T`($sp)
176         li              $x20,0x20
177         $PUSH           r27,`$FRAME-5*$SIZE_T`($sp)
178         li              $x30,0x30
179         $PUSH           r28,`$FRAME-4*$SIZE_T`($sp)
180         li              $x40,0x40
181         $PUSH           r29,`$FRAME-3*$SIZE_T`($sp)
182         li              $x50,0x50
183         $PUSH           r30,`$FRAME-2*$SIZE_T`($sp)
184         li              $x60,0x60
185         $PUSH           r31,`$FRAME-1*$SIZE_T`($sp)
186         li              $x70,0x70
187         $PUSH           $lrsave,`$FRAME+$LRSAVE`($sp)
188         mtspr           256,r11
189
190         bl              LPICmeup
191         addi            $offload,$sp,`8*$SIZE_T+15`
192 ___
# Little-endian flavours only: build $lemask from an lvsl of address 8
# XORed with a vector of 0x0f bytes.  ROUND applies this mask to every
# input vector it loads.
193 $code.=<<___            if ($LENDIAN);
194         li              $idx,8
195         lvsl            $lemask,0,$idx
196         vspltisb        $Ki,0x0f
197         vxor            $lemask,$lemask,$Ki
198 ___
# 4-byte words: the state is read as two vectors ($A and $E) and the
# remaining six registers are filled by rotating those by 4, 8 and 12
# bytes.
199 $code.=<<___            if ($SZ==4);
200         lvx_4w          $A,$x00,$ctx
201         lvx_4w          $E,$x10,$ctx
202         vsldoi          $B,$A,$A,4              # unpack
203         vsldoi          $C,$A,$A,8
204         vsldoi          $D,$A,$A,12
205         vsldoi          $F,$E,$E,4
206         vsldoi          $G,$E,$E,8
207         vsldoi          $H,$E,$E,12
208 ___
# 8-byte words: the state is read as four vectors ($A, $C, $E, $G) and
# each of $B, $D, $F, $H is the 8-byte rotation of its predecessor.
209 $code.=<<___            if ($SZ==8);
210         lvx_u           $A,$x00,$ctx
211         lvx_u           $C,$x10,$ctx
212         lvx_u           $E,$x20,$ctx
213         vsldoi          $B,$A,$A,8              # unpack
214         lvx_u           $G,$x30,$ctx
215         vsldoi          $D,$C,$C,8
216         vsldoi          $F,$E,$E,8
217         vsldoi          $H,$G,$G,8
218 ___
# Outer loop, one pass per input block.  r0 is set once to the trip count
# of the inner L16_xx loop.  Each pass reloads the first two table rows
# (the first is added to $H straight away, the second is left in $Ki),
# loads the first input vector, resets $idx to the table start and stashes
# the incoming state $A-$H at $offload.
219 $code.=<<___;
220         li              r0,`($rounds-16)/16`    # inner loop counter
221         b               Loop
222 .align  5
223 Loop:
224         lvx             $Ki,$x00,$Tbl
225         lvx_u           @X[0],0,$inp
226         addi            $inp,$inp,16
227         mr              $idx,$Tbl               # copy $Tbl
228         stvx            $A,$x00,$offload        # offload $A-$H
229         stvx            $B,$x10,$offload
230         stvx            $C,$x20,$offload
231         stvx            $D,$x30,$offload
232         stvx            $E,$x40,$offload
233         stvx            $F,$x50,$offload
234         stvx            $G,$x60,$offload
235         stvx            $H,$x70,$offload
236         vaddu${sz}m     $H,$H,$Ki               # h+K[i]
237         lvx             $Ki,$x10,$Tbl
238 ___
# Rounds 0..15 are emitted inline.  After each round @V is rotated right
# by one so the register that held h becomes the next round's a.
239 for ($i=0;$i<16;$i++)   { &ROUND($i,@V); unshift(@V,pop(@V)); }
240 $code.=<<___;
241         mtctr           r0
242         b               L16_xx
243 .align  5
244 L16_xx:
245 ___
# Sixteen more rounds form the body of the counted loop; it runs
# ($rounds-16)/16 times, covering rounds 16..$rounds-1.  $i carries over
# from the loop above.
246 for (;$i<32;$i++)       { &ROUND($i,@V); unshift(@V,pop(@V)); }
# After the last round, add the stashed state back into $A-$H (using
# @X[2..9] as temporaries), decrement $num and loop while it is non-zero.
247 $code.=<<___;
248         bdnz            L16_xx
249
250         lvx             @X[2],$x00,$offload
251         subic.          $num,$num,1
252         lvx             @X[3],$x10,$offload
253         vaddu${sz}m     $A,$A,@X[2]
254         lvx             @X[4],$x20,$offload
255         vaddu${sz}m     $B,$B,@X[3]
256         lvx             @X[5],$x30,$offload
257         vaddu${sz}m     $C,$C,@X[4]
258         lvx             @X[6],$x40,$offload
259         vaddu${sz}m     $D,$D,@X[5]
260         lvx             @X[7],$x50,$offload
261         vaddu${sz}m     $E,$E,@X[6]
262         lvx             @X[8],$x60,$offload
263         vaddu${sz}m     $F,$F,@X[7]
264         lvx             @X[9],$x70,$offload
265         vaddu${sz}m     $G,$G,@X[8]
266         vaddu${sz}m     $H,$H,@X[9]
267         bne             Loop
268 ___
# 4-byte words: merge the eight registers back into two vectors with
# three rounds of vperm and store them.  $Ki already holds the first
# permutation mask: the constant load that ends the final round reads the
# table row just past the zero terminator.  The other two masks are
# fetched from the two rows that follow (offsets 0x20 and 0x30 from $idx).
269 $code.=<<___            if ($SZ==4);
270         lvx             @X[0],$x20,$idx
271         vperm           $A,$A,$B,$Ki            # pack the answer
272         lvx             @X[1],$x30,$idx
273         vperm           $E,$E,$F,$Ki
274         vperm           $A,$A,$C,@X[0]
275         vperm           $E,$E,$G,@X[0]
276         vperm           $A,$A,$D,@X[1]
277         vperm           $E,$E,$H,@X[1]
278         stvx_4w         $A,$x00,$ctx
279         stvx_4w         $E,$x10,$ctx
280 ___
# 8-byte words: a single mask (left in $Ki by the final round, as above)
# merges each register pair into one vector; four vectors are stored.
281 $code.=<<___            if ($SZ==8);
282         vperm           $A,$A,$B,$Ki            # pack the answer
283         vperm           $C,$C,$D,$Ki
284         vperm           $E,$E,$F,$Ki
285         vperm           $G,$G,$H,$Ki
286         stvx_u          $A,$x00,$ctx
287         stvx_u          $C,$x10,$ctx
288         stvx_u          $E,$x20,$ctx
289         stvx_u          $G,$x30,$ctx
290 ___
# Epilogue: restore the link register from $lrsave, vrsave, v24-v31 (from
# the same frame slots the prologue wrote, now addressed via $offload)
# and r26-r31, then pop the frame and return.
291 $code.=<<___;
292         addi            $offload,$sp,`$LOCALS+15`
293         mtlr            $lrsave
294         mtspr           256,$vrsave
295         lvx             v24,$x00,$offload       # ABI says so
296         lvx             v25,$x10,$offload
297         lvx             v26,$x20,$offload
298         lvx             v27,$x30,$offload
299         lvx             v28,$x40,$offload
300         lvx             v29,$x50,$offload
301         lvx             v30,$x60,$offload
302         lvx             v31,$x70,$offload
303         $POP            r26,`$FRAME-6*$SIZE_T`($sp)
304         $POP            r27,`$FRAME-5*$SIZE_T`($sp)
305         $POP            r28,`$FRAME-4*$SIZE_T`($sp)
306         $POP            r29,`$FRAME-3*$SIZE_T`($sp)
307         $POP            r30,`$FRAME-2*$SIZE_T`($sp)
308         $POP            r31,`$FRAME-1*$SIZE_T`($sp)
309         addi            $sp,$sp,$FRAME
310         blr
311         .long           0
312         .byte           0,12,4,1,0x80,6,3,0
313         .long           0
314 .size   $func,.-$func
315 ___
316
317 # Ugly hack here, because PPC assembler syntax seems to vary too
318 # much from platform to platform...
# LPICmeup: position-independent lookup of the constant table emitted
# right after it.  The bcl to the next instruction leaves that
# instruction's address in LR; it is copied to $Tbl and advanced by 64-8.
# The caller's LR is kept in r0 and restored before returning.  The code
# and trailer words occupy 9*4 bytes, and .space pads the routine to 64
# bytes so that the first table row sits exactly where $Tbl points.
319 $code.=<<___;
320 .align  6
321 LPICmeup:
322         mflr    r0
323         bcl     20,31,\$+4
324         mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
325         addi    $Tbl,$Tbl,`64-8`
326         mtlr    r0
327         blr
328         .long   0
329         .byte   0,12,0x14,0,0,0,0,0
330         .space  `64-9*4`
331 ___
332
# Constant table.  Every value passed to table() becomes one 16-byte row
# with the value replicated across the row (.quad K,K or .long K,K,K,K).
# The list ends with a "0" row: ROUND loads constants two rounds ahead,
# so the next-to-last round fetches this zero row and the last round adds
# it harmlessly.  The row(s) emitted after the table are vperm masks used
# to pack the result; the last round's load leaves the first of them in
# $Ki.  Mask contents differ between big- and little-endian flavours.
333 if ($SZ==8) {
334     local *table = sub {
335         foreach(@_) { $code.=".quad     $_,$_\n"; }
336     };
337     table(
338         "0x428a2f98d728ae22","0x7137449123ef65cd",
339         "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
340         "0x3956c25bf348b538","0x59f111f1b605d019",
341         "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
342         "0xd807aa98a3030242","0x12835b0145706fbe",
343         "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
344         "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
345         "0x9bdc06a725c71235","0xc19bf174cf692694",
346         "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
347         "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
348         "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
349         "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
350         "0x983e5152ee66dfab","0xa831c66d2db43210",
351         "0xb00327c898fb213f","0xbf597fc7beef0ee4",
352         "0xc6e00bf33da88fc2","0xd5a79147930aa725",
353         "0x06ca6351e003826f","0x142929670a0e6e70",
354         "0x27b70a8546d22ffc","0x2e1b21385c26c926",
355         "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
356         "0x650a73548baf63de","0x766a0abb3c77b2a8",
357         "0x81c2c92e47edaee6","0x92722c851482353b",
358         "0xa2bfe8a14cf10364","0xa81a664bbc423001",
359         "0xc24b8b70d0f89791","0xc76c51a30654be30",
360         "0xd192e819d6ef5218","0xd69906245565a910",
361         "0xf40e35855771202a","0x106aa07032bbd1b8",
362         "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
363         "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
364         "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
365         "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
366         "0x748f82ee5defb2fc","0x78a5636f43172f60",
367         "0x84c87814a1f0ab72","0x8cc702081a6439ec",
368         "0x90befffa23631e28","0xa4506cebde82bde9",
369         "0xbef9a3f7b2c67915","0xc67178f2e372532b",
370         "0xca273eceea26619c","0xd186b8c721c0c207",
371         "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
372         "0x06f067aa72176fba","0x0a637dc5a2c898a6",
373         "0x113f9804bef90dae","0x1b710b35131c471b",
374         "0x28db77f523047d84","0x32caab7b40c72493",
375         "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
376         "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
377         "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
# Single pack mask for the 8-byte-word variant.
378 $code.=<<___    if (!$LENDIAN);
379 .quad   0x0001020304050607,0x1011121314151617
380 ___
381 $code.=<<___    if ($LENDIAN);  # quad-swapped
382 .quad   0x1011121314151617,0x0001020304050607
383 ___
384 } else {
385     local *table = sub {
386         foreach(@_) { $code.=".long     $_,$_,$_,$_\n"; }
387     };
388     table(
389         "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
390         "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
391         "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
392         "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
393         "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
394         "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
395         "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
396         "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
397         "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
398         "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
399         "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
400         "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
401         "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
402         "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
403         "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
404         "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
# Three pack masks for the 4-byte-word variant, applied in this order by
# the packing code.
405 $code.=<<___    if (!$LENDIAN);
406 .long   0x00010203,0x10111213,0x10111213,0x10111213
407 .long   0x00010203,0x04050607,0x10111213,0x10111213
408 .long   0x00010203,0x04050607,0x08090a0b,0x10111213
409 ___
410 $code.=<<___    if ($LENDIAN);  # word-swapped
411 .long   0x10111213,0x10111213,0x10111213,0x00010203
412 .long   0x10111213,0x10111213,0x04050607,0x00010203
413 .long   0x10111213,0x08090a0b,0x04050607,0x00010203
414 ___
415 }
# Identification string embedded in the object, then realign.
416 $code.=<<___;
417 .asciz  "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
418 .align  2
419 ___
420
# Evaluate every `...` span in the generated text as a Perl expression
# (frame offsets and similar compile-time arithmetic), then send the
# result down the pipe to ppc-xlate.pl.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT is a pipe to the translator, so close() is where a buffered
# write failure or a non-zero exit from the child shows up.  Treat either
# as fatal instead of silently producing truncated or missing output.
close STDOUT or die "error closing STDOUT: $! (child status $?)";