3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256/512 for PowerISA v2.07.
12 # Accurate performance measurements are problematic, because it's
13 # always virtualized setup with possibly throttled processor.
14 # Relative comparison is therefore more informative. This module is
15 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
16 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
17 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
18 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
19 # result is degree of computational resources' utilization. POWER8 is
20 # "massively multi-threaded chip" and difference between single- and
21 # maximum multi-process benchmark results tell that utilization is
22 # a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
23 # for sha1-ppc.pl - 73%. 100% means that multi-process result equals
24 # to single-process one, given that all threads end up on the same
# Select settings by target flavour: $flavour is tested against /64/ first,
# then /32/; any other value is fatal.
30 if ($flavour =~ /64/) {
36 } elsif ($flavour =~ /32/) {
42 } else { die "nonsense $flavour"; }
44 $LENDIAN=($flavour=~/le/);  # true when the flavour name contains "le"; gates the $lemask byte-swap code and the swapped permutation constants
# Locate the perlasm translator: take this script's directory from $0, then
# look for ppc-xlate.pl next to the script or under ../../perlasm/.
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
49 die "can't locate ppc-xlate.pl";
# Pipe everything subsequently printed to STDOUT through the translator.
# Low-precedence "or" is required here: "||" binds to the (always true)
# command string, so a failed open would never reach the die.
51 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
53 if ($output =~ /512/) {  # taken when the output file name contains "512"
65 $func="sha${bits}_block_p8";  # "sha<bits>_block_p8"; presumably the name of the emitted routine -- confirm at its use
78 ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));  # GPR names r0,r10,r26-r31, used as index operands when storing/reloading $A-$H via $offload
79 $x00=0 if ($flavour =~ /osx/);  # osx flavours get a literal 0 in place of r0
81 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));  # eight working variables a..h live in v0-v7
82 @X=map("v$_",(8..23));  # sixteen message-schedule registers X[0..15] in v8-v23
83 ($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));  # round constant, Ch/Maj and sigma temporaries, little-endian permute mask (v24-v30)
86 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
89 $code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
90 lvx_u @X[$i+1],0,$inp ; load X[i] in advance
93 $code.=<<___ if ($i<16 && ($i%(16/$SZ)));
94 vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
96 $code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
97 vperm @X[$i],@X[$i],@X[$i],$lemask
100 `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
101 vsel $Func,$g,$f,$e ; Ch(e,f,g)
102 vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
103 vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
104 vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
105 `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
106 vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
108 `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
109 vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
110 vsel $Func,$b,$c,$Func ; Maj(a,b,c)
111 vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
112 vaddu${sz}m $d,$d,$h ; d+=h
113 vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
114 `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
115 lvx $Ki,$idx,$Tbl ; load next K[i]
117 vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
118 `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
129 $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
131 li r10,`$FRAME+8*16+15`
132 li r11,`$FRAME+8*16+31`
133 stvx v20,r10,$sp # ABI says so
157 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
159 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
161 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
163 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
165 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
167 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
169 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
171 $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
175 addi $offload,$sp,$FRAME+15
177 $code.=<<___ if ($LENDIAN);
181 vxor $lemask,$lemask,$Ki
183 $code.=<<___ if ($SZ==4);
186 vsldoi $B,$A,$A,4 # unpack
193 $code.=<<___ if ($SZ==8);
197 vsldoi $B,$A,$A,8 # unpack
204 li r0,`($rounds-16)/16` # inner loop counter
212 stvx $A,$x00,$offload # offload $A-$H
213 stvx $B,$x10,$offload
214 stvx $C,$x20,$offload
215 stvx $D,$x30,$offload
216 stvx $E,$x40,$offload
217 stvx $F,$x50,$offload
218 stvx $G,$x60,$offload
219 stvx $H,$x70,$offload
220 vaddu${sz}m $H,$H,$Ki # h+K[i]
224 for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }  # emit rounds 0..15; @V is rotated right by one after each round so the a..h roles shift
231 for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }  # emit rounds 16..31, continuing from the $i left by the previous loop
235 lvx @X[2],$x00,$offload
237 lvx @X[3],$x10,$offload
238 vaddu${sz}m $A,$A,@X[2]
239 lvx @X[4],$x20,$offload
240 vaddu${sz}m $B,$B,@X[3]
241 lvx @X[5],$x30,$offload
242 vaddu${sz}m $C,$C,@X[4]
243 lvx @X[6],$x40,$offload
244 vaddu${sz}m $D,$D,@X[5]
245 lvx @X[7],$x50,$offload
246 vaddu${sz}m $E,$E,@X[6]
247 lvx @X[8],$x60,$offload
248 vaddu${sz}m $F,$F,@X[7]
249 lvx @X[9],$x70,$offload
250 vaddu${sz}m $G,$G,@X[8]
251 vaddu${sz}m $H,$H,@X[9]
254 $code.=<<___ if ($SZ==4);
257 vperm $A,$A,$B,$Ki # pack the answer
267 $code.=<<___ if ($SZ==8);
268 vperm $A,$A,$B,$Ki # pack the answer
278 li r10,`$FRAME+8*16+15`
280 li r11,`$FRAME+8*16+31`
282 lvx v20,r10,$sp # ABI says so
304 $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
305 $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
306 $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
307 $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
308 $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
309 $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
310 addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
313 .byte 0,12,4,1,0x80,6,3,0
318 # Ugly hack here, because PPC assembler syntax seems to vary too
319 # much from platform to platform...
325 mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
326 addi $Tbl,$Tbl,`64-8`
330 .byte 0,12,0x14,0,0,0,0,0
336 foreach(@_) { $code.=".quad $_,$_\n"; }
339 "0x428a2f98d728ae22","0x7137449123ef65cd",
340 "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
341 "0x3956c25bf348b538","0x59f111f1b605d019",
342 "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
343 "0xd807aa98a3030242","0x12835b0145706fbe",
344 "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
345 "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
346 "0x9bdc06a725c71235","0xc19bf174cf692694",
347 "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
348 "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
349 "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
350 "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
351 "0x983e5152ee66dfab","0xa831c66d2db43210",
352 "0xb00327c898fb213f","0xbf597fc7beef0ee4",
353 "0xc6e00bf33da88fc2","0xd5a79147930aa725",
354 "0x06ca6351e003826f","0x142929670a0e6e70",
355 "0x27b70a8546d22ffc","0x2e1b21385c26c926",
356 "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
357 "0x650a73548baf63de","0x766a0abb3c77b2a8",
358 "0x81c2c92e47edaee6","0x92722c851482353b",
359 "0xa2bfe8a14cf10364","0xa81a664bbc423001",
360 "0xc24b8b70d0f89791","0xc76c51a30654be30",
361 "0xd192e819d6ef5218","0xd69906245565a910",
362 "0xf40e35855771202a","0x106aa07032bbd1b8",
363 "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
364 "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
365 "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
366 "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
367 "0x748f82ee5defb2fc","0x78a5636f43172f60",
368 "0x84c87814a1f0ab72","0x8cc702081a6439ec",
369 "0x90befffa23631e28","0xa4506cebde82bde9",
370 "0xbef9a3f7b2c67915","0xc67178f2e372532b",
371 "0xca273eceea26619c","0xd186b8c721c0c207",
372 "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
373 "0x06f067aa72176fba","0x0a637dc5a2c898a6",
374 "0x113f9804bef90dae","0x1b710b35131c471b",
375 "0x28db77f523047d84","0x32caab7b40c72493",
376 "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
377 "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
378 "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
379 $code.=<<___ if (!$LENDIAN);
380 .quad 0x0001020304050607,0x1011121314151617
382 $code.=<<___ if ($LENDIAN); # quad-swapped
383 .quad 0x1011121314151617,0x0001020304050607
387 foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
390 "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
391 "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
392 "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
393 "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
394 "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
395 "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
396 "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
397 "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
398 "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
399 "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
400 "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
401 "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
402 "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
403 "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
404 "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
405 "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
406 $code.=<<___ if (!$LENDIAN);
407 .long 0x00010203,0x10111213,0x10111213,0x10111213
408 .long 0x00010203,0x04050607,0x10111213,0x10111213
409 .long 0x00010203,0x04050607,0x08090a0b,0x10111213
411 $code.=<<___ if ($LENDIAN); # word-swapped
412 .long 0x10111213,0x10111213,0x10111213,0x00010203
413 .long 0x10111213,0x10111213,0x04050607,0x00010203
414 .long 0x10111213,0x08090a0b,0x04050607,0x00010203
418 .asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
422 $code =~ s/\`([^\`]*)\`/eval $1/gem;  # replace every `...` span in $code with the result of eval'ing the Perl expression inside it