# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
# sha256/512_block procedure for x86_64.
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate equally fast
# code. The only thing which is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, in
# the latter on 64-bit ones. All I had to do was get one flavor right;
# the other one passed the test right away:-)
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], sustains close to 4 instructions per
# CPU clock cycle and runs in 1003 cycles, 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be
# to offload the X[16] updates to the SSE unit, but that would require
# a "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, this
# holds only *if* it's actually possible to noticeably improve overall
# ILP, instruction level parallelism, on the given CPU implementation.
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are apparently not single micro-op instructions, but
# are implemented in microcode.
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# anymore].
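#
# The alternative Maj relies on the identity Maj(a,b,c)=Ch(a^b,c,b),
# which lets two consecutive rounds share their a^b/b^c terms. The
# identity is easy to check exhaustively; an illustrative one-liner,
# not part of the generated code:
#
#   perl -e 'for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
#       my $maj = ($a&$b)^($a&$c)^($b&$c);      # majority vote
#       my $alt = (($c^$b)&($a^$b))^$b;         # Ch(a^b,c,b)
#       die "mismatch" if $maj != $alt;
#   }}} print "ok\n"'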
# Add SIMD code paths, see below for improvement coefficients. The
# SSSE3 code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, not on pre-AVX processors. [Obviously with the exclusion
# of VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for a
# separate discussion].
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512, and 1.2KB of code.
# Add support for Intel SHA Extensions.
######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake       11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen         11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
# Silvermont    27.4    20.6(+33%)  -               17.5    -
# Knights L     27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont      18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)   whichever is best applicable, including SHAEXT;
# (**)  the switch from ror to shrd accounts for a fair share of the
#       improvement;
# (***) execution time is fully determined by the remaining
#       integer-only part, body_00_15; reducing the number of SIMD
#       instructions below a certain limit makes no difference; to
#       conserve space the SHA256 XOP code path is therefore omitted;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
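# Typical CRYPTOGAMS invocation (illustrative; the flavour names are
# the ones understood by x86_64-xlate.pl, and the output file name
# selects the SHA512 flavor via the /512/ test below):
#
#	perl sha512-x86_64.pl elf    sha512-x86_64.s	# Linux, GNU as
#	perl sha512-x86_64.pl macosx sha256-x86_64.s	# generates SHA256 code
#	perl sha512-x86_64.pl nasm   sha512-x86_64.asm	# Win64, NASM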
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	$func="sha256_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";
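# Resulting stack frame layout for the scalar code path (offsets from
# the aligned %rsp, per the definitions just above):
#
#	0 .. 16*$SZ-1	X[16] ring buffer for the message schedule
#	16*$SZ+0*8	saved ctx, 1st arg
#	16*$SZ+1*8	saved inp, 2nd arg
#	16*$SZ+2*8	end-of-input pointer, inp+num*16*$SZ
#	16*$SZ+3*8	saved original %rsp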
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	add	$a2,$T1			# T1+=Ch(e,f,g)
	add	($Tbl),$T1		# T1+=K[round]
	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)
	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	lea	$STRIDE($Tbl),$Tbl	# round++
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
($a2,$a3) = ($a3,$a2);
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
	add	`$SZ*($i&0xf)`(%rsp),$T1
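# The two adds above complete the standard SHA-2 message schedule
# recurrence, evaluated in place in the 16-entry ring buffer on the
# stack:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# with all indices taken modulo 16, hence the (($i+NN)&0xf) addressing.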
.extern	OPENSSL_ia32cap_P
.type	$func,\@function,3
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
$code.=<<___ if ($SZ==4);
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
	lea	$TABLE(%rip),$Tbl
for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));
	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));
	cmpb	\$0,`$SZ-1`($Tbl)
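# The cmpb above is the loop-termination sentinel: the K table below
# is immediately followed by a byte-swap mask whose byte at offset
# $SZ-1 is zero, whereas every round constant has a non-zero byte in
# that position. Once $Tbl has marched past the last K[round], the
# compare finally succeeds.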
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp
.cfi_def_cfa_register	%rsp
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.type	$TABLE,\@object
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));
.type	sha256_block_data_order_shaext,\@function,3
sha256_block_data_order_shaext:
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask
	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
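# sha256rnds2 wants the state split across two registers in ABEF/CDGH
# word order, while SHA256_CTX keeps A..H sequentially in memory
# (hence the DCBA/HGFE comments on the little-endian loads above); the
# pshufd/palignr/punpcklqdq sequence performs that repacking once per
# call, and the mirror-image shuffle at the end undoes it before
# storing.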
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF
	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF
	movdqa	2*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF
	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
	push(@MSG,shift(@MSG));
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	14*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
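# Any call to an undefined &opcode is caught by AUTOLOAD and appended
# to $code as a text instruction, with the final argument ($arg)
# turned into an immediate when it is a bare number; e.g.
# &ror($a0,$Sigma1[0]) comes out as "ror \$6,%r13d" in the SHA256
# build.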
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
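# body_00_15 returns the round as a list of single-instruction strings
# so the SIMD Xupdate code can be interleaved with it one instruction
# at a time. Note the modulo scheduling: h+=Sigma0(a) is only queued
# in $a1 here and folded into the following round; at the very end the
# leftover $a1 still has to be added by hand.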
######################################################################
if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
.type	${func}_ssse3,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	movdqa	0x40($Tbl),$t2
	movdqa	0x60($Tbl),$t3
	movdqa	$t0,0x00(%rsp)
	movdqa	$t1,0x10(%rsp)
	movdqa	$t2,0x20(%rsp)
	movdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
sub Xupdate_256_SSSE3 () {
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t2,$t0);',
	'&psrld		($t0,$sigma0[2])',
	'&paddd		(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld		($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
	'&psrld		($t3,$sigma1[2])',
	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq		($t2,$sigma1[0])',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld		($t3,$sigma1[2])',
	'&psrlq		($t2,$sigma1[0])',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
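# SSE has no packed rotate, so the sigma functions above are
# synthesized from shifts: a right-rotate by n becomes
# (x >> n) ^ (x << 8*$SZ-n), hence the psrld/pslld/pxor chains for
# sigma0(x) = ROR(x,$sigma0[0]) ^ ROR(x,$sigma0[1]) ^ SHR(x,$sigma0[2]).
# sigma1 touches only two message words at a time, so its rotates are
# carried out in 64-bit lanes with psrlq, with pshufd/pshufb moving
# the two results back into the proper 32-bit slots.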
sub SSSE3_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
    foreach (Xupdate_256_SSSE3()) {		# 36 instructions
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	&palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrld		($t0,$sigma0[2]);
	&paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[0]);
	&pshufd		($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));	#@
	&pxor		($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	&psrld		($t3,$sigma1[2]);
	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	#&pshufb	($t3,$t4);		# sigma1(X[14..15])
	&pshufd		($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&pshufd		($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
    foreach (@insns) { eval; }		# remaining instructions
    &movdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
.cfi_def_cfa_register	%rsp
.size	${func}_ssse3,.-${func}_ssse3
######################################################################
if ($SZ==8) {	# SHA512 only
.type	${func}_xop,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
if ($SZ==4) {	# SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
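# XOP provides true SIMD rotates (vprotd/vprotq), so unlike the SSSE3
# and AVX code paths the sigma functions below need no shift/shift/xor
# emulation; a right-rotate by n is expressed as a left-rotate by
# 8*$SZ-n.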
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
	add	\$`16*2*$SZ`,$Tbl
sub XOP_512_00_47 () {
my @insns = (&$body,&$body);			# 52 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrlq	($t2,@X[7],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.cfi_def_cfa_register	%rsp
.size	${func}_xop,.-${func}_xop
######################################################################
# AVX+shrd code path
local *ror = sub { &shrd(@_[0],@_) };
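# On some Intel CPUs shrd is faster than ror (see the (**) footnote to
# the performance table above), so the AVX path aliases &ror to &shrd.
# With both register operands the same, "shrd \$n,%reg,%reg" is
# exactly a right-rotate of %reg by n bits.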
.type	${func}_avx,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
if ($SZ==4) {	# SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
sub Xupdate_256_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
	add	\$`16*2*$SZ`,$Tbl
sub Xupdate_512_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor		($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor		($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor		($t3,$t3,$t2)',
	'&vpxor		($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
sub AVX_512_00_47 () {
my @insns = (&$body,&$body);			# 52 instructions
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.cfi_def_cfa_register	%rsp
.size	${func}_avx,.-${func}_avx
######################################################################
# AVX2+BMI code path
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	# and at the finish one has to $a+=$a1
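# bodyx_00_15 leans on BMI1/BMI2: rorx computes a rotate into a third
# register without touching the flags, andn delivers ~e&g in a single
# step, and the 3-operand lea does the additions non-destructively, so
# the Ch/Sigma chains of adjacent rounds can overlap freely.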
.type	${func}_avx2,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$inp,%r12		# borrow $T1
	cmp	%rdx,$inp		# $_end
	cmove	%rsp,%r12		# next block or random data
if ($SZ==4) {	# SHA256
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	lea	-$PUSH8(%rsp),%rsp
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
.cfi_cfa_expression	%rsp-8,deref,+8
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
.cfi_cfa_expression	%rsp-8,deref,+8
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
my @X = map("%ymm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	lea	-$PUSH8(%rsp),%rsp
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
.cfi_cfa_expression	%rsp-8,deref,+8
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	vmovdqa	$t2,0x40(%rsp)
	vmovdqa	$t3,0x60(%rsp)
sub AVX2_512_00_47 () {
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
.cfi_cfa_expression	%rsp-8,deref,+8
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	lea	-$PUSH8($Tbl),$Tbl
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
# restore frame pointer to original location at $_rsp
.cfi_cfa_expression	$_rsp,deref,+8
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	cmove	%rsp,%r12		# next block or stale data
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8
	mov	`16*$SZ+3*8`($Tbl),%rsi
$code.=<<___ if ($win64);
	movaps	16*$SZ+32($Tbl),%xmm6
	movaps	16*$SZ+48($Tbl),%xmm7
	movaps	16*$SZ+64($Tbl),%xmm8
	movaps	16*$SZ+80($Tbl),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96($Tbl),%xmm10
	movaps	16*$SZ+112($Tbl),%xmm11
.cfi_def_cfa_register	%rsp
.size	${func}_avx2,.-${func}_avx2
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	.Lepilogue(%rip),%r10
	jb	.Lin_prologue		# non-AVX code
	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
.size	shaext_handler,.-shaext_handler
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);
    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    return $instr."\t".@_[0];
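# sha256op38 exists because older assemblers do not know the SHA
# Extensions mnemonics, so the instructions are emitted as raw bytes:
# 0x0f,0x38 is the two-byte opcode escape, the third byte is the
# instruction-specific opcode from %opcodelet, and the last byte is a
# register-to-register ModR/M (mod=11, reg=destination, r/m=source).
# E.g. "sha256rnds2 %xmm0,%xmm2" becomes ".byte 0x0f,0x38,0xcb,0xd0".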
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;
	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;