sha1-mb-x86_64.pl: add commentary.
[openssl.git] / crypto / sha / asm / sha512p8-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA256/512 for PowerISA v2.07.
11 #
12 # Accurate performance measurements are problematic, because it's
13 # always virtualized setup with possibly throttled processor.
14 # Relative comparison is therefore more informative. This module is
15 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
16 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
17 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
18 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
19 # result is degree of computational resources' utilization. POWER8 is
20 # "massively multi-threaded chip" and difference between single- and
21 # maximum multi-process benchmark results tells that utilization is
22 # a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
23 # for sha1-ppc.pl - 73%. 100% means that multi-process result equals
24 # to single-process one, given that all threads end up on the same
25 # physical core.
26
27 $flavour=shift;         # 1st command-line argument; matched against /64/, /32/ and /le/ below
28 $output =shift;         # 2nd command-line argument; handed to ppc-xlate.pl and tested for /512/ later
29
30 if ($flavour =~ /64/) { # 64-bit flavour: 8-byte $SIZE_T and doubleword load/store mnemonics
31         $SIZE_T=8;
32         $LRSAVE=2*$SIZE_T;      # added to the frame size when the prologue stores $lrsave
33         $STU="stdu";            # store-with-update used to allocate the stack frame
34         $POP="ld";              # mnemonic used to reload r26-r31 in the epilogue
35         $PUSH="std";            # mnemonic used to save r26-r31 and the link register
36 } elsif ($flavour =~ /32/) {    # 32-bit flavour: 4-byte $SIZE_T and word load/store mnemonics
37         $SIZE_T=4;
38         $LRSAVE=$SIZE_T;
39         $STU="stwu";
40         $POP="lwz";
41         $PUSH="stw";
42 } else { die "nonsense $flavour"; }     # flavour names neither 64 nor 32: refuse to continue
43
44 $LENDIAN=($flavour=~/le/);      # true when the flavour contains "le"; gates the $lemask code and swapped mask tables
45
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # directory part (with trailing separator) of this script's path
47 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or                # first look next to this script,
48 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or   # then in ../../perlasm/
49 die "can't locate ppc-xlate.pl";
50 # Pipe everything printed to STDOUT through ppc-xlate.pl. Use low-precedence 'or': with '||' the die would bind to the (always true) command string and never run.
51 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
52
53 if ($output =~ /512/) { # an output name containing "512" selects the SHA-512 variant
54         $bits=512;
55         $SZ=8;          # element size in bytes; 16/$SZ elements fit in one 16-byte vector load
56         $sz="d";        # mnemonic suffix, as in vshasigma${sz} and vaddu${sz}m
57         $rounds=80;
58 } else {                # any other output name selects the SHA-256 variant
59         $bits=256;
60         $SZ=4;
61         $sz="w";
62         $rounds=64;
63 }
64
65 $func="sha${bits}_block_p8";    # name of the generated global symbol
66 $FRAME=8*$SIZE_T;               # offset from $sp at which this function's own save areas begin
67
68 $sp ="r1";
69 $toc="r2";
70 $ctx="r3";              # base address the hash state is loaded from and stored back to
71 $inp="r4";              # input pointer, advanced by 16 after every vector load
72 $num="r5";              # counter decremented with subic. once per pass of Loop
73 $Tbl="r6";              # table base address, set by LPICmeup
74 $idx="r7";              # running byte offset into the table
75 $lrsave="r8";           # keeps the link register value between prologue and epilogue
76 $offload="r11";         # $sp+$FRAME+15, base of the area where $A-$H are spilled
77 $vrsave="r12";          # value read from SPR 256 in the prologue and written back in the epilogue
78 ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));    # index registers; $x10..$x70 get 0x10..0x70. NOTE(review): $x00 is r0 and is never zeroed here - presumably relies on r0 as base register reading as 0; confirm against the ISA
79
80 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));         # eight working variables; @V is rotated after every round
81 @X=map("v$_",(8..23));                                  # sixteen vectors holding the message schedule window
82 ($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));        # temporaries v24..v30; the map also yields v31, which gets no name
83
84 sub ROUND {             # append the assembly for one round to $code; args: round index, then the registers currently playing a..h
85 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
86 my $j=($i+1)%16;        # slot of @X that rounds with $i>=15 refresh for use by the following round
87
88 $code.=<<___            if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));       # on the last element of a vector: fetch the next 16 input bytes into @X[$i+1]
89         lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
90         addi            $inp,$inp,16
91 ___
92 $code.=<<___            if ($i<16 && ($i%(16/$SZ)));   # not the first element of a vector: derive @X[$i] by rotating @X[$i-1] by $SZ bytes
93         vsldoi          @X[$i],@X[$i-1],@X[$i-1],$SZ
94 ___
95 $code.=<<___            if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);    # little-endian only: permute each freshly loaded vector through $lemask
96         vperm           @X[$i],@X[$i],@X[$i],$lemask
97 ___
98 $code.=<<___;           # round body; the back-ticked lines are eval'ed at the end of the script and yield schedule-update instructions only when $i>=15
99         `"vshasigma${sz}        $s0,@X[($j+1)%16],0,0"          if ($i>=15)`
100         vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
101         vshasigma${sz}  $S1,$e,1,15             ; Sigma1(e)
102         vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
103         vshasigma${sz}  $S0,$a,1,0              ; Sigma0(a)
104         `"vshasigma${sz}        $s1,@X[($j+14)%16],0,15"        if ($i>=15)`
105         vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
106         vxor            $Func,$a,$b
107         `"vaddu${sz}m           @X[$j],@X[$j],@X[($j+9)%16]"    if ($i>=15)`
108         vaddu${sz}m     $h,$h,$S1               ; h+=Sigma1(e)
109         vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
110         vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
111         vaddu${sz}m     $d,$d,$h                ; d+=h
112         vaddu${sz}m     $S0,$S0,$Func           ; Sigma0(a)+Maj(a,b,c)
113         `"vaddu${sz}m           @X[$j],@X[$j],$s0"              if ($i>=15)`
114         lvx             $Ki,$idx,$Tbl           ; load next K[i]
115         addi            $idx,$idx,16
116         vaddu${sz}m     $h,$h,$S0               ; h+=Sigma0(a)+Maj(a,b,c)
117         `"vaddu${sz}m           @X[$j],@X[$j],$s1"              if ($i>=15)`
118 ___
119 }
120
121 $code=<<___;           # function entry: allocate the frame, save v20-v31, vrsave, r26-r31 and LR, load the index registers, then call LPICmeup for the table address
122 .machine        "any"
123 .text
124
125 .globl  $func
126 .align  6
127 $func:
128         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
129         mflr            $lrsave
130         li              r10,`$FRAME+8*16+15`
131         li              r11,`$FRAME+8*16+31`
132         stvx            v20,r10,$sp             # ABI says so
133         addi            r10,r10,32
134         mfspr           $vrsave,256
135         stvx            v21,r11,$sp
136         addi            r11,r11,32
137         stvx            v22,r10,$sp
138         addi            r10,r10,32
139         stvx            v23,r11,$sp
140         addi            r11,r11,32
141         stvx            v24,r10,$sp
142         addi            r10,r10,32
143         stvx            v25,r11,$sp
144         addi            r11,r11,32
145         stvx            v26,r10,$sp
146         addi            r10,r10,32
147         stvx            v27,r11,$sp
148         addi            r11,r11,32
149         stvx            v28,r10,$sp
150         addi            r10,r10,32
151         stvx            v29,r11,$sp
152         addi            r11,r11,32
153         stvx            v30,r10,$sp
154         stvx            v31,r11,$sp
155         li              r11,-1
156         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
157         li              $x10,0x10
158         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
159         li              $x20,0x20
160         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
161         li              $x30,0x30
162         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
163         li              $x40,0x40
164         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
165         li              $x50,0x50
166         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
167         li              $x60,0x60
168         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
169         li              $x70,0x70
170         $PUSH           $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
171         mtspr           256,r11
172
173         bl              LPICmeup
174         addi            $offload,$sp,$FRAME+15
175 ___
176 $code.=<<___            if ($LENDIAN);         # little-endian only: build the $lemask permute vector used by ROUND
177         li              $idx,8
178         lvsl            $lemask,0,$idx
179         vspltisb        $Ki,0x0f
180         vxor            $lemask,$lemask,$Ki
181 ___
182 $code.=<<___            if ($SZ==4);           # SHA-256: two vector loads from $ctx, spread into $A-$H with vsldoi
183         lvx_4w          $A,$x00,$ctx
184         lvx_4w          $E,$x10,$ctx
185         vsldoi          $B,$A,$A,4              # unpack
186         vsldoi          $C,$A,$A,8
187         vsldoi          $D,$A,$A,12
188         vsldoi          $F,$E,$E,4
189         vsldoi          $G,$E,$E,8
190         vsldoi          $H,$E,$E,12
191 ___
192 $code.=<<___            if ($SZ==8);           # SHA-512: four vector loads from $ctx, spread into $A-$H with vsldoi
193         lvx_u           $A,$x00,$ctx
194         lvx_u           $C,$x10,$ctx
195         lvx_u           $E,$x20,$ctx
196         vsldoi          $B,$A,$A,8              # unpack
197         lvx_u           $G,$x30,$ctx
198         vsldoi          $D,$C,$C,8
199         vsldoi          $F,$E,$E,8
200         vsldoi          $H,$G,$G,8
201 ___
202 $code.=<<___;          # per-block loop head: load the first table entry and input vector, spill $A-$H to the offload area, pre-add the table entry to $H
203         li              r0,`($rounds-16)/16`    # inner loop counter
204         b               Loop
205 .align  5
206 Loop:
207         lvx             $Ki,$x00,$Tbl
208         li              $idx,16
209         lvx_u           @X[0],0,$inp
210         addi            $inp,$inp,16
211         stvx            $A,$x00,$offload        # offload $A-$H
212         stvx            $B,$x10,$offload
213         stvx            $C,$x20,$offload
214         stvx            $D,$x30,$offload
215         stvx            $E,$x40,$offload
216         stvx            $F,$x50,$offload
217         stvx            $G,$x60,$offload
218         stvx            $H,$x70,$offload
219         vaddu${sz}m     $H,$H,$Ki               # h+K[i]
220         lvx             $Ki,$idx,$Tbl
221         addi            $idx,$idx,16
222 ___
223 for ($i=0;$i<16;$i++)   { &ROUND($i,@V); unshift(@V,pop(@V)); }        # rounds 0..15, emitted once; @V is rotated by one position after each round
224 $code.=<<___;          # move the inner loop counter from r0 to CTR and enter L16_xx
225         mtctr           r0
226         b               L16_xx
227 .align  5
228 L16_xx:
229 ___
230 for (;$i<32;$i++)       { &ROUND($i,@V); unshift(@V,pop(@V)); }        # rounds 16..31 form the L16_xx body, which bdnz repeats ($rounds-16)/16 times
231 $code.=<<___;          # after the last round: add the spilled $A-$H back in, decrement $num and loop while it is non-zero
232         bdnz            L16_xx
233
234         lvx             @X[2],$x00,$offload
235         subic.          $num,$num,1
236         lvx             @X[3],$x10,$offload
237         vaddu${sz}m     $A,$A,@X[2]
238         lvx             @X[4],$x20,$offload
239         vaddu${sz}m     $B,$B,@X[3]
240         lvx             @X[5],$x30,$offload
241         vaddu${sz}m     $C,$C,@X[4]
242         lvx             @X[6],$x40,$offload
243         vaddu${sz}m     $D,$D,@X[5]
244         lvx             @X[7],$x50,$offload
245         vaddu${sz}m     $E,$E,@X[6]
246         lvx             @X[8],$x60,$offload
247         vaddu${sz}m     $F,$F,@X[7]
248         lvx             @X[9],$x70,$offload
249         vaddu${sz}m     $G,$G,@X[8]
250         vaddu${sz}m     $H,$H,@X[9]
251         bne             Loop
252 ___
253 $code.=<<___            if ($SZ==4);           # SHA-256: merge $A-$D and $E-$H into two vectors using $Ki plus two more masks read from the table, then store to $ctx
254         lvx             @X[0],$idx,$Tbl
255         addi            $idx,$idx,16
256         vperm           $A,$A,$B,$Ki            # pack the answer
257         lvx             @X[1],$idx,$Tbl
258         vperm           $E,$E,$F,$Ki
259         vperm           $A,$A,$C,@X[0]
260         vperm           $E,$E,$G,@X[0]
261         vperm           $A,$A,$D,@X[1]
262         vperm           $E,$E,$H,@X[1]
263         stvx_4w         $A,$x00,$ctx
264         stvx_4w         $E,$x10,$ctx
265 ___
266 $code.=<<___            if ($SZ==8);           # SHA-512: merge register pairs with the mask left in $Ki, then store four vectors to $ctx
267         vperm           $A,$A,$B,$Ki            # pack the answer
268         vperm           $C,$C,$D,$Ki
269         vperm           $E,$E,$F,$Ki
270         vperm           $G,$G,$H,$Ki
271         stvx_u          $A,$x00,$ctx
272         stvx_u          $C,$x10,$ctx
273         stvx_u          $E,$x20,$ctx
274         stvx_u          $G,$x30,$ctx
275 ___
276 $code.=<<___;          # function exit: restore LR and vrsave from their registers, reload v20-v31 and r26-r31 from the frame, release the frame and return
277         li              r10,`$FRAME+8*16+15`
278         mtlr            $lrsave
279         li              r11,`$FRAME+8*16+31`
280         mtspr           256,$vrsave
281         lvx             v20,r10,$sp             # ABI says so
282         addi            r10,r10,32
283         lvx             v21,r11,$sp
284         addi            r11,r11,32
285         lvx             v22,r10,$sp
286         addi            r10,r10,32
287         lvx             v23,r11,$sp
288         addi            r11,r11,32
289         lvx             v24,r10,$sp
290         addi            r10,r10,32
291         lvx             v25,r11,$sp
292         addi            r11,r11,32
293         lvx             v26,r10,$sp
294         addi            r10,r10,32
295         lvx             v27,r11,$sp
296         addi            r11,r11,32
297         lvx             v28,r10,$sp
298         addi            r10,r10,32
299         lvx             v29,r11,$sp
300         addi            r11,r11,32
301         lvx             v30,r10,$sp
302         lvx             v31,r11,$sp
303         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
304         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
305         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
306         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
307         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
308         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
309         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
310         blr
311         .long           0
312         .byte           0,12,4,1,0x80,6,3,0
313         .long           0
314 .size   $func,.-$func
315 ___
316
317 # Ugly hack here, because PPC assembler syntax seems to vary too
318 # much from platform to platform...
319 $code.=<<___;          # LPICmeup: returns the table address in $Tbl without absolute addressing; `64-8` and the `.space` padding are sized so the table emitted next starts 64 bytes past the label
320 .align  6
321 LPICmeup:
322         mflr    r0
323         bcl     20,31,\$+4
324         mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
325         addi    $Tbl,$Tbl,`64-8`
326         mtlr    r0
327         blr
328         .long   0
329         .byte   0,12,0x14,0,0,0,0,0
330         .space  `64-9*4`
331 ___
332
333 if ($SZ==8) {          # SHA-512: 64-bit constants, each emitted twice to fill one 16-byte table entry
334     local *table = sub {       # table(LIST): append one ".quad v,v" line to $code per list element
335         foreach(@_) { $code.=".quad     $_,$_\n"; }
336     };
337     table(
338         "0x428a2f98d728ae22","0x7137449123ef65cd",
339         "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
340         "0x3956c25bf348b538","0x59f111f1b605d019",
341         "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
342         "0xd807aa98a3030242","0x12835b0145706fbe",
343         "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
344         "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
345         "0x9bdc06a725c71235","0xc19bf174cf692694",
346         "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
347         "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
348         "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
349         "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
350         "0x983e5152ee66dfab","0xa831c66d2db43210",
351         "0xb00327c898fb213f","0xbf597fc7beef0ee4",
352         "0xc6e00bf33da88fc2","0xd5a79147930aa725",
353         "0x06ca6351e003826f","0x142929670a0e6e70",
354         "0x27b70a8546d22ffc","0x2e1b21385c26c926",
355         "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
356         "0x650a73548baf63de","0x766a0abb3c77b2a8",
357         "0x81c2c92e47edaee6","0x92722c851482353b",
358         "0xa2bfe8a14cf10364","0xa81a664bbc423001",
359         "0xc24b8b70d0f89791","0xc76c51a30654be30",
360         "0xd192e819d6ef5218","0xd69906245565a910",
361         "0xf40e35855771202a","0x106aa07032bbd1b8",
362         "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
363         "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
364         "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
365         "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
366         "0x748f82ee5defb2fc","0x78a5636f43172f60",
367         "0x84c87814a1f0ab72","0x8cc702081a6439ec",
368         "0x90befffa23631e28","0xa4506cebde82bde9",
369         "0xbef9a3f7b2c67915","0xc67178f2e372532b",
370         "0xca273eceea26619c","0xd186b8c721c0c207",
371         "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
372         "0x06f067aa72176fba","0x0a637dc5a2c898a6",
373         "0x113f9804bef90dae","0x1b710b35131c471b",
374         "0x28db77f523047d84","0x32caab7b40c72493",
375         "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
376         "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
377         "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");       # the final "0" adds an all-zero entry after the last constant
378 $code.=<<___    if (!$LENDIAN);        # big-endian: permute mask emitted right after the table (read into $Ki by the last round's table load)
379 .quad   0x0001020304050607,0x1011121314151617
380 ___
381 $code.=<<___    if ($LENDIAN);  # quad-swapped
382 .quad   0x1011121314151617,0x0001020304050607
383 ___
384 } else {               # SHA-256: 32-bit constants, each emitted four times to fill one 16-byte table entry
385     local *table = sub {       # table(LIST): append one ".long v,v,v,v" line to $code per list element
386         foreach(@_) { $code.=".long     $_,$_,$_,$_\n"; }
387     };
388     table(
389         "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
390         "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
391         "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
392         "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
393         "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
394         "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
395         "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
396         "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
397         "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
398         "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
399         "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
400         "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
401         "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
402         "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
403         "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
404         "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");      # the final "0" adds an all-zero entry after the last constant
405 $code.=<<___    if (!$LENDIAN);        # big-endian: three permute masks emitted right after the table, used when packing the result
406 .long   0x00010203,0x10111213,0x10111213,0x10111213
407 .long   0x00010203,0x04050607,0x10111213,0x10111213
408 .long   0x00010203,0x04050607,0x08090a0b,0x10111213
409 ___
410 $code.=<<___    if ($LENDIAN);  # word-swapped
411 .long   0x10111213,0x10111213,0x10111213,0x00010203
412 .long   0x10111213,0x10111213,0x04050607,0x00010203
413 .long   0x10111213,0x08090a0b,0x04050607,0x00010203
414 ___
415 }
416 $code.=<<___;          # identification string appended after the tables
417 .asciz  "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
418 .align  2
419 ___
420 # Replace every back-ticked expression in the generated text with the result of eval'ing it, then emit the text.
421 $code =~ s/\`([^\`]*)\`/eval $1/gem;
422 print $code;
423 close STDOUT or die "error closing STDOUT: $!";        # STDOUT is a pipe to ppc-xlate.pl: close is where write errors and a failing child surface