3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
9 # sha256/512_block procedure for x86_64.
11 # 40% improvement over compiler-generated code on Opteron. On EM64T
12 # sha256 was observed to run >80% faster and sha512 >40% faster. No magical
13 # tricks, just straight implementation... I really wonder why gcc
14 # [being armed with inline assembler] fails to generate code as fast.
15 # The only thing which is cool about this module is that the very
16 # same instruction sequence is used for both SHA-256 and SHA-512. In
17 # the former case the instructions operate on 32-bit operands, in the
18 # latter on 64-bit ones. All I had to do was get one flavor right;
19 # the other one passed the test right away:-)
21 # sha256_block runs in ~1005 cycles on Opteron, which gives you
22 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
24 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
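#
# (Sanity check on the arithmetic above: one 64-byte block per 1005
# cycles is 64/1005 bytes per cycle, i.e. 63.7MB for every 10^9 cycles,
# hence 63.7MBps per GHz of clock; 128/1275 bytes per cycle likewise
# gives ~100MBps per GHz for sha512_block.)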
25 # Well, if you compare it to IA-64 implementation, which maintains
26 # X[16] in register bank[!], tends to 4 instructions per CPU clock
27 # cycle and runs in 1003 cycles, 1275 is a very good result for the
28 # 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
29 # there is a way to improve it, *then* the only way would be to try to
30 # offload X[16] updates to SSE unit, but that would require "deeper"
31 # loop unroll, which in turn would naturally cause size blow-up, not
32 # to mention increased complexity! And once again, this holds only *if*
33 # it's actually possible to noticeably improve overall ILP, i.e.
34 # instruction-level parallelism, on the given CPU implementation.
36 # Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
37 # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38 # [currently available] EM64T CPUs apparently are far from it. On the
39 # contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
40 # 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
41 # are not hardwired instructions, but are implemented in microcode.
45 # An optimization including one of Pavel Semjanov's ideas, an alternative
46 # Maj, resulted in >=5% improvement on most CPUs: +20% for SHA256 and,
47 # unfortunately, -2% for SHA512 on P4 [which nobody should care about these days].
52 # Add SIMD code paths, see below for improvement coefficients. SSSE3
53 # code path was not attempted for SHA512, because the improvement is
54 # not estimated to be high enough, noticeably less than 9%, to justify
55 # the effort, at least not on pre-AVX processors. [Obviously with the
56 # exception of VIA Nano, but it has a SHA512 instruction that is faster
57 # and should be used instead.] For reference, the corresponding
58 # estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
59 # fact that higher coefficients are observed on VIA Nano and Bulldozer
60 # has more to do with the specifics of their architecture [which is a
61 # topic for a separate discussion].
65 # Add AVX2 code path. Two consecutive input blocks are loaded into
66 # 256-bit %ymm registers, with data from the first block in the least
67 # significant 128-bit halves and data from the second in the most
68 # significant. The data is then processed with the same SIMD
69 # instruction sequence as for AVX, but with %ymm as operands. The side
70 # effect is an increased stack frame, 448 extra bytes in SHA256 and 1152 in SHA512, and a 1.2KB code size increase.
75 # Add support for Intel SHA Extensions.
77 ######################################################################
78 # Current performance in cycles per processed byte (less is better):
80 # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
82 # AMD K8 14.9 - - 9.57 -
84 # Core 2 15.6 13.8(+13%) - 9.97 -
85 # Westmere 14.8 12.3(+19%) - 9.58 -
86 # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
87 # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
88 # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
89 # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
90 # VIA Nano 23.0 16.5(+39%) - 14.7 -
91 # Atom 23.0 18.9(+22%) - 14.7 -
93 # (*) whichever best applicable;
94 # (**)	the switch from ror to shrd accounts for a fair share of the improvement;
95 # (***) execution time is fully determined by remaining integer-only
96 # part, body_00_15; reducing the amount of SIMD instructions
97 # below certain limit makes no difference/sense; to conserve
98 # space SHA256 XOP code path is therefore omitted;
102 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109 die "can't locate x86_64-xlate.pl";
111 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
113 $avx = ($1>=2.19) + ($1>=2.22);
116 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
117 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
118 $avx = ($1>=2.09) + ($1>=2.10);
121 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
122 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
123 $avx = ($1>=10) + ($1>=11);
126 $shaext=1; ### set to zero if compiling for 1.0.1
127 $avx=1 if (!$shaext && $avx);
129 open OUT,"| \"$^X\" $xlate $flavour $output";
132 if ($output =~ /512/) {
133 $func="sha512_block_data_order";
136 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
137 "%r8", "%r9", "%r10","%r11");
138 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
145 $func="sha256_block_data_order";
148 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
149 "%r8d","%r9d","%r10d","%r11d");
150 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
158 $ctx="%rdi"; # 1st arg, zapped by $a3
159 $inp="%rsi"; # 2nd arg
162 $_ctx="16*$SZ+0*8(%rsp)";
163 $_inp="16*$SZ+1*8(%rsp)";
164 $_end="16*$SZ+2*8(%rsp)";
165 $_rsp="16*$SZ+3*8(%rsp)";
166 $framesz="16*$SZ+4*8";
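# A sketch of the resulting stack frame, derived from the definitions
# above (illustration only, nothing here is emitted):
#
#	(%rsp)			X[0..15], 16*$SZ bytes of message schedule
#	16*$SZ+0*8(%rsp)	saved $ctx (1st arg)
#	16*$SZ+1*8(%rsp)	saved $inp (2nd arg)
#	16*$SZ+2*8(%rsp)	end-of-input pointer ("3rd" arg)
#	16*$SZ+3*8(%rsp)	caller's %rsp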
170 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
172 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
175 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
179 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
182 mov $T1,`$SZ*($i&0xf)`(%rsp)
186 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
188 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
190 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
192 add $a2,$T1 # T1+=Ch(e,f,g)
195 add ($Tbl),$T1 # T1+=K[round]
198 xor $b,$a2 # a^b, b^c in next round
199 ror \$$Sigma1[0],$a0 # Sigma1(e)
203 ror \$$Sigma0[0],$a1 # Sigma0(a)
204 add $a0,$T1 # T1+=Sigma1(e)
206 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
210 lea $STRIDE($Tbl),$Tbl # round++
212 $code.=<<___ if ($i<15);
213 add $a1,$h # h+=Sigma0(a)
215 ($a2,$a3) = ($a3,$a2);
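# A minimal Perl reference model of the two tricks above (hypothetical
# helpers for illustration only, never called by the generator; 32-bit
# SHA-256 flavor shown).  Sigma1(e)=ror(e,s0)^ror(e,s1)^ror(e,s2) is
# evaluated with staged rotates by the deltas s2-s1, s1-s0 and finally
# s0, interleaved with xors of e, so a single temporary suffices.  Maj
# is computed as Ch(a^b,c,b)==((a^b)&(b^c))^b, which lets b^c be
# carried over from the previous round.
sub _ror32	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
sub _Sigma1_ref	{ my $e=shift;
		  my $t=_ror32($e,$Sigma1[2]-$Sigma1[1]);	# staged...
		  $t=_ror32($t^$e,$Sigma1[1]-$Sigma1[0]);
		  _ror32($t^$e,$Sigma1[0]);			# ...rotates
		}
sub _Maj_ref	{ my ($a,$b,$c)=@_; (($a^$b)&($b^$c))^$b }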
219 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
222 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
223 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
226 ror \$`$sigma0[1]-$sigma0[0]`,$a0
227 add $a1,$a # modulo-scheduled h+=Sigma0(a)
229 ror \$`$sigma1[1]-$sigma1[0]`,$a2
238 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
239 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
240 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
242 add `$SZ*($i&0xf)`(%rsp),$T1
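	# (X[0..15] on the stack is a 16-entry ring buffer implementing
	#  W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16];
	#  mod 16 those are slots (i+14), (i+9), (i+1) and i, exactly the
	#  four offsets loaded and added above)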
253 .extern OPENSSL_ia32cap_P
255 .type $func,\@function,3
259 $code.=<<___ if ($SZ==4 || $avx);
260 lea OPENSSL_ia32cap_P(%rip),%r11
265 $code.=<<___ if ($SZ==4 && $shaext);
266 test \$`1<<29`,%r11d # check for SHA
269 $code.=<<___ if ($avx && $SZ==8);
270 test \$`1<<11`,%r10d # check for XOP
273 $code.=<<___ if ($avx>1);
274 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
275 cmp \$`1<<8|1<<5|1<<3`,%r11d
278 $code.=<<___ if ($avx);
279 and \$`1<<30`,%r9d # mask "Intel CPU" bit
280 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
282 cmp \$`1<<28|1<<9|1<<30`,%r10d
285 $code.=<<___ if ($SZ==4);
296 mov %rsp,%r11 # copy %rsp
297 shl \$4,%rdx # num*16
299 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
300 and \$-64,%rsp # align stack frame
301 mov $ctx,$_ctx # save ctx, 1st arg
302	mov	$inp,$_inp		# save inp, 2nd arg
303 mov %rdx,$_end # save end pointer, "3rd" arg
304 mov %r11,$_rsp # save copy of %rsp
320 lea $TABLE(%rip),$Tbl
323 for($i=0;$i<16;$i++) {
324 $code.=" mov $SZ*$i($inp),$T1\n";
325 $code.=" mov @ROT[4],$a0\n";
326 $code.=" mov @ROT[0],$a1\n";
327 $code.=" bswap $T1\n";
328 &ROUND_00_15($i,@ROT);
329 unshift(@ROT,pop(@ROT));
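# (unshift(@ROT,pop(@ROT)) rotates the register *names* rather than the
# data: (a,b,c,d,e,f,g,h) becomes (h,a,b,c,d,e,f,g), so no mov's are
# spent shuffling state between rounds.)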
337 &ROUND_16_XX($i,@ROT);
338 unshift(@ROT,pop(@ROT));
342 cmpb \$0,`$SZ-1`($Tbl)
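	# (every K word has a non-zero most significant byte, while the
	#  byte-swap mask stored right after the table has a zero byte in
	#  that position, so the cmpb above conveniently detects that all
	#  round constants have been consumed)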
346 add $a1,$A # modulo-scheduled h+=Sigma0(a)
347 lea 16*$SZ($inp),$inp
386 .type $TABLE,\@object
388 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
389 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
390 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
391 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
392 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
393 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
394 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
395 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
396 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
397 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
398 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
399 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
400 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
401 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
402 .long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
403 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
404 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
405 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
406 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
407 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
408 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
409 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
410 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
411 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
412 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
413 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
414 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
415 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
416 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
417 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
418 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
419 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
421 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
422 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
423 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
424 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
425 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
426 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
427 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
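# (Every K256 row above appears twice: the AVX2 code path keeps two
# blocks in one %ymm register and wants identical round constants in
# both 128-bit halves, so the duplication permits plain 256-bit loads.
# The trailing rows are the byte-swap and shuffle masks for the SIMD
# paths.  K512 below is laid out the same way.)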
432 .type $TABLE,\@object
434 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
435 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
436 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
437 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
438 .quad 0x3956c25bf348b538,0x59f111f1b605d019
439 .quad 0x3956c25bf348b538,0x59f111f1b605d019
440 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
441 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
442 .quad 0xd807aa98a3030242,0x12835b0145706fbe
443 .quad 0xd807aa98a3030242,0x12835b0145706fbe
444 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
445 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
446 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
447 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
448 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
449 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
450 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
451 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
452 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
453 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
454 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
455 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
456 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
457 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
458 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
459 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
460 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
461 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
462 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
463 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
464 .quad 0x06ca6351e003826f,0x142929670a0e6e70
465 .quad 0x06ca6351e003826f,0x142929670a0e6e70
466 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
467 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
468 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
469 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
470 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
471 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
472 .quad 0x81c2c92e47edaee6,0x92722c851482353b
473 .quad 0x81c2c92e47edaee6,0x92722c851482353b
474 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
475 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
476 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
477 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
478 .quad 0xd192e819d6ef5218,0xd69906245565a910
479 .quad 0xd192e819d6ef5218,0xd69906245565a910
480 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
481 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
482 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
483 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
484 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
485 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
486 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
487 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
488 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
489 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
490 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
491 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
492 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
493 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
494 .quad 0x90befffa23631e28,0xa4506cebde82bde9
495 .quad 0x90befffa23631e28,0xa4506cebde82bde9
496 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
497 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
498 .quad 0xca273eceea26619c,0xd186b8c721c0c207
499 .quad 0xca273eceea26619c,0xd186b8c721c0c207
500 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
501 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
502 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
503 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
504 .quad 0x113f9804bef90dae,0x1b710b35131c471b
505 .quad 0x113f9804bef90dae,0x1b710b35131c471b
506 .quad 0x28db77f523047d84,0x32caab7b40c72493
507 .quad 0x28db77f523047d84,0x32caab7b40c72493
508 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
509 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
510 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
511 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
512 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
513 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
515 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
516 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
517 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
521 ######################################################################
524 if ($SZ==4 && $shaext) {{{
525 ######################################################################
526 # Intel SHA Extensions implementation of SHA256 update function.
528 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
530 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
531 my @MSG=map("%xmm$_",(3..6));
534 .type sha256_block_data_order_shaext,\@function,3
536 sha256_block_data_order_shaext:
539 $code.=<<___ if ($win64);
540 lea `-8-5*16`(%rsp),%rsp
541 movaps %xmm6,-8-5*16(%rax)
542 movaps %xmm7,-8-4*16(%rax)
543 movaps %xmm8,-8-3*16(%rax)
544 movaps %xmm9,-8-2*16(%rax)
545 movaps %xmm10,-8-1*16(%rax)
549 lea K256+0x80(%rip),$Tbl
550 movdqu ($ctx),$ABEF # DCBA
551 movdqu 16($ctx),$CDGH # HGFE
552 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
554 pshufd \$0x1b,$ABEF,$Wi # ABCD
555 pshufd \$0xb1,$ABEF,$ABEF # CDAB
556 pshufd \$0x1b,$CDGH,$CDGH # EFGH
557 movdqa $TMP,$BSWAP # offload
558 palignr \$8,$CDGH,$ABEF # ABEF
559 punpcklqdq $Wi,$CDGH # CDGH
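	# (sha256rnds2 wants the state split across two registers as
	#  ABEF and CDGH rather than the A..H order of SHA256_CTX; the
	#  pshufd/palignr/punpcklqdq dance above performs that repacking,
	#  and is mirrored at the end before the state is stored back)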
564 movdqu ($inp),@MSG[0]
565 movdqu 0x10($inp),@MSG[1]
566 movdqu 0x20($inp),@MSG[2]
568 movdqu 0x30($inp),@MSG[3]
570 movdqa 0*32-0x80($Tbl),$Wi
573 movdqa $CDGH,$CDGH_SAVE # offload
574 sha256rnds2 $ABEF,$CDGH # 0-3
575 pshufd \$0x0e,$Wi,$Wi
577 movdqa $ABEF,$ABEF_SAVE # offload
578 sha256rnds2 $CDGH,$ABEF
580 movdqa 1*32-0x80($Tbl),$Wi
583 sha256rnds2 $ABEF,$CDGH # 4-7
584 pshufd \$0x0e,$Wi,$Wi
586 sha256msg1 @MSG[1],@MSG[0]
587 sha256rnds2 $CDGH,$ABEF
589 movdqa 2*32-0x80($Tbl),$Wi
592 sha256rnds2 $ABEF,$CDGH # 8-11
593 pshufd \$0x0e,$Wi,$Wi
595 palignr \$4,@MSG[2],$TMP
598 sha256msg1 @MSG[2],@MSG[1]
599 sha256rnds2 $CDGH,$ABEF
601 movdqa 3*32-0x80($Tbl),$Wi
603 sha256msg2 @MSG[3],@MSG[0]
604 sha256rnds2 $ABEF,$CDGH # 12-15
605 pshufd \$0x0e,$Wi,$Wi
607 palignr \$4,@MSG[3],$TMP
610 sha256msg1 @MSG[3],@MSG[2]
611 sha256rnds2 $CDGH,$ABEF
613 for($i=4;$i<16-3;$i++) {
615 movdqa $i*32-0x80($Tbl),$Wi
617 sha256msg2 @MSG[0],@MSG[1]
618 sha256rnds2 $ABEF,$CDGH # 16-19...
619 pshufd \$0x0e,$Wi,$Wi
621 palignr \$4,@MSG[0],$TMP
624 sha256msg1 @MSG[0],@MSG[3]
625 sha256rnds2 $CDGH,$ABEF
627 push(@MSG,shift(@MSG));
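# (push(@MSG,shift(@MSG)) rotates the four message registers, so one
# code template serves rounds 16-51, four rounds per loop iteration.)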
630 movdqa 13*32-0x80($Tbl),$Wi
632 sha256msg2 @MSG[0],@MSG[1]
633 sha256rnds2 $ABEF,$CDGH # 52-55
634 pshufd \$0x0e,$Wi,$Wi
636 palignr \$4,@MSG[0],$TMP
637 sha256rnds2 $CDGH,$ABEF
640 movdqa 14*32-0x80($Tbl),$Wi
642 sha256rnds2 $ABEF,$CDGH # 56-59
643 pshufd \$0x0e,$Wi,$Wi
644 sha256msg2 @MSG[1],@MSG[2]
646 sha256rnds2 $CDGH,$ABEF
648 movdqa 15*32-0x80($Tbl),$Wi
651 sha256rnds2 $ABEF,$CDGH # 60-63
652 pshufd \$0x0e,$Wi,$Wi
655 sha256rnds2 $CDGH,$ABEF
657 paddd $CDGH_SAVE,$CDGH
658 paddd $ABEF_SAVE,$ABEF
661 pshufd \$0xb1,$CDGH,$CDGH # DCHG
662 pshufd \$0x1b,$ABEF,$TMP # FEBA
663 pshufd \$0xb1,$ABEF,$ABEF # BAFE
664 punpckhqdq $CDGH,$ABEF # DCBA
665 palignr \$8,$TMP,$CDGH # HGFE
668 movdqu $CDGH,16($ctx)
670 $code.=<<___ if ($win64);
671 movaps -8-5*16(%rax),%xmm6
672 movaps -8-4*16(%rax),%xmm7
673 movaps -8-3*16(%rax),%xmm8
674 movaps -8-2*16(%rax),%xmm9
675 movaps -8-1*16(%rax),%xmm10
681 .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
687 my ($a,$b,$c,$d,$e,$f,$g,$h);
689 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
690 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
692 $arg = "\$$arg" if ($arg*1 eq $arg);
693 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
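# For example, &ror($a0,14) lands here and appends "\tror\t\$14,%r13d\n"
# to $code (assuming the SHA-256 flavor, where $a0 is "%r13d"): the last
# argument, if numeric, grows the "\$" immediate prefix, and the
# argument order is flipped to AT&T convention.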
698 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
700 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
704 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
706 '&xor ($a4,$g)', # f^g
708 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
710 '&and ($a4,$e)', # (f^g)&e
713 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
716 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
717 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
718 '&xor ($a2,$b)', # a^b, b^c in next round
720 '&add ($h,$a4)', # h+=Ch(e,f,g)
721 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
722 '&and ($a3,$a2)', # (b^c)&(a^b)
725 '&add ($h,$a0)', # h+=Sigma1(e)
726 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
728 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
729 '&add ($d,$h)', # d+=h
730 '&add ($h,$a3)', # h+=Maj(a,b,c)
733 '&add ($a1,$h);'. # h+=Sigma0(a)
734 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
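# (Note the modulo scheduling above: the closing '&add ($a1,$h)' folds
# Sigma0(a) into the freshly computed h one round late, $a1 carrying the
# result into the next round; after the final round one such add is
# still owed, cf. the "modulo-scheduled h+=Sigma0(a)" adds in the
# scalar code.)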
738 ######################################################################
741 if ($SZ==4) { # SHA256 only
742 my @X = map("%xmm$_",(0..3));
743 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
746 .type ${func}_ssse3,\@function,3
756 mov %rsp,%r11 # copy %rsp
757 shl \$4,%rdx # num*16
758 sub \$`$framesz+$win64*16*4`,%rsp
759 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
760 and \$-64,%rsp # align stack frame
761 mov $ctx,$_ctx # save ctx, 1st arg
762	mov	$inp,$_inp		# save inp, 2nd arg
763 mov %rdx,$_end # save end pointer, "3rd" arg
764 mov %r11,$_rsp # save copy of %rsp
766 $code.=<<___ if ($win64);
767 movaps %xmm6,16*$SZ+32(%rsp)
768 movaps %xmm7,16*$SZ+48(%rsp)
769 movaps %xmm8,16*$SZ+64(%rsp)
770 movaps %xmm9,16*$SZ+80(%rsp)
786 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
787 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
791 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
792 movdqu 0x00($inp),@X[0]
793 movdqu 0x10($inp),@X[1]
794 movdqu 0x20($inp),@X[2]
796 movdqu 0x30($inp),@X[3]
797 lea $TABLE(%rip),$Tbl
799 movdqa 0x00($Tbl),$t0
800 movdqa 0x20($Tbl),$t1
803 movdqa 0x40($Tbl),$t2
805 movdqa 0x60($Tbl),$t3
809 movdqa $t0,0x00(%rsp)
811 movdqa $t1,0x10(%rsp)
813 movdqa $t2,0x20(%rsp)
815 movdqa $t3,0x30(%rsp)
821 sub \$`-16*2*$SZ`,$Tbl # size optimization
823 sub Xupdate_256_SSSE3 () {
825 '&movdqa ($t0,@X[1]);',
826 '&movdqa ($t3,@X[3])',
827 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
828 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
830 '&movdqa ($t2,$t0);',
831 '&psrld ($t0,$sigma0[2])',
832 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
833 '&psrld ($t2,$sigma0[0])',
834 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
835 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
837 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
839 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
842 '&pxor ($t0,$t1);', # sigma0(X[1..4])
843 '&psrld ($t3,$sigma1[2])',
844 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
845 '&psrlq ($t2,$sigma1[0])',
847 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
849 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
850 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
851 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
852 '&movdqa ($t2,$t3);',
853 '&psrld ($t3,$sigma1[2])',
854 '&psrlq ($t2,$sigma1[0])',
856 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
858 '&movdqa ($t2,16*2*$j."($Tbl)")',
860 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
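# (SSE has no packed rotate, so each rotation above is synthesized from
# a shift pair folded into the pxor chain: for one 32-bit lane,
#	ror(x,n) == (x>>n) | (x<<(32-n))
# sigma1(X[14..15]) only has two live lanes, hence the pshufd/pshufb
# games that position and then compact them.)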
864 sub SSSE3_256_00_47 () {
868 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
871 foreach (Xupdate_256_SSSE3()) { # 36 instructions
877 } else { # squeeze extra 4% on Westmere and 19% on Atom
878 eval(shift(@insns)); #@
883 eval(shift(@insns)); #@
886 eval(shift(@insns)); #@
888 &palignr ($t0,@X[0],$SZ); # X[1..4]
891 &palignr ($t3,@X[2],$SZ); # X[9..12]
895 eval(shift(@insns)); #@
900 eval(shift(@insns)); #@
902 &psrld ($t0,$sigma0[2]);
906 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
907 eval(shift(@insns)); #@
909 &psrld ($t2,$sigma0[0]);
912	&pshufd	($t3,@X[3],0b11111010);	# X[14..15]
914 eval(shift(@insns)); #@
915 &pslld ($t1,8*$SZ-$sigma0[1]);
919 eval(shift(@insns)); #@
922 eval(shift(@insns)); #@
923 &psrld ($t2,$sigma0[1]-$sigma0[0]);
928 &pslld ($t1,$sigma0[1]-$sigma0[0]);
933 eval(shift(@insns)); #@
937 &pxor ($t0,$t1); # sigma0(X[1..4])
938 eval(shift(@insns)); #@
941 &psrld ($t3,$sigma1[2]);
944 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
945 eval(shift(@insns)); #@
947 &psrlq ($t2,$sigma1[0]);
952 eval(shift(@insns)); #@
955 eval(shift(@insns)); #@
956 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
960 eval(shift(@insns)); #@
963 #&pshufb ($t3,$t4); # sigma1(X[14..15])
964 &pshufd ($t3,$t3,0b10000000);
970 eval(shift(@insns)); #@
973 eval(shift(@insns)); #@
974 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
978 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
980 eval(shift(@insns)); #@
985 &psrld ($t3,$sigma1[2]);
987 eval(shift(@insns)); #@
988 &psrlq ($t2,$sigma1[0]);
992 eval(shift(@insns)); #@
995 eval(shift(@insns)); #@
997 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1000 eval(shift(@insns));
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1004 eval(shift(@insns)); #@
1006 &pshufd ($t3,$t3,0b00001000);
1007 eval(shift(@insns));
1008 eval(shift(@insns));
1009 &movdqa ($t2,16*2*$j."($Tbl)");
1010 eval(shift(@insns)); #@
1011 eval(shift(@insns));
1013 eval(shift(@insns));
1014 eval(shift(@insns));
1015 eval(shift(@insns));
1016 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1017 eval(shift(@insns)); #@
1018 eval(shift(@insns));
1019 eval(shift(@insns));
1022 foreach (@insns) { eval; } # remaining instructions
1023 &movdqa (16*$j."(%rsp)",$t2);
1026 for ($i=0,$j=0; $j<4; $j++) {
1027 &SSSE3_256_00_47($j,\&body_00_15,@X);
1028 push(@X,shift(@X)); # rotate(@X)
1030 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1031 &jne (".Lssse3_00_47");
1033 for ($i=0; $i<16; ) {
1034 foreach(body_00_15()) { eval; }
1041 lea 16*$SZ($inp),$inp
1064 $code.=<<___ if ($win64);
1065 movaps 16*$SZ+32(%rsp),%xmm6
1066 movaps 16*$SZ+48(%rsp),%xmm7
1067 movaps 16*$SZ+64(%rsp),%xmm8
1068 movaps 16*$SZ+80(%rsp),%xmm9
1080 .size ${func}_ssse3,.-${func}_ssse3
1085 ######################################################################
1088 if ($SZ==8) { # SHA512 only
1090 .type ${func}_xop,\@function,3
1100 mov %rsp,%r11 # copy %rsp
1101 shl \$4,%rdx # num*16
1102 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1103 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1104 and \$-64,%rsp # align stack frame
1105 mov $ctx,$_ctx # save ctx, 1st arg
1106	mov	$inp,$_inp		# save inp, 2nd arg
1107 mov %rdx,$_end # save end pointer, "3rd" arg
1108 mov %r11,$_rsp # save copy of %rsp
1110 $code.=<<___ if ($win64);
1111 movaps %xmm6,16*$SZ+32(%rsp)
1112 movaps %xmm7,16*$SZ+48(%rsp)
1113 movaps %xmm8,16*$SZ+64(%rsp)
1114 movaps %xmm9,16*$SZ+80(%rsp)
1116 $code.=<<___ if ($win64 && $SZ>4);
1117 movaps %xmm10,16*$SZ+96(%rsp)
1118 movaps %xmm11,16*$SZ+112(%rsp)
1134 if ($SZ==4) { # SHA256
1135 my @X = map("%xmm$_",(0..3));
1136 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1141 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1142 vmovdqu 0x00($inp),@X[0]
1143 vmovdqu 0x10($inp),@X[1]
1144 vmovdqu 0x20($inp),@X[2]
1145 vmovdqu 0x30($inp),@X[3]
1146 vpshufb $t3,@X[0],@X[0]
1147 lea $TABLE(%rip),$Tbl
1148 vpshufb $t3,@X[1],@X[1]
1149 vpshufb $t3,@X[2],@X[2]
1150 vpaddd 0x00($Tbl),@X[0],$t0
1151 vpshufb $t3,@X[3],@X[3]
1152 vpaddd 0x20($Tbl),@X[1],$t1
1153 vpaddd 0x40($Tbl),@X[2],$t2
1154 vpaddd 0x60($Tbl),@X[3],$t3
1155 vmovdqa $t0,0x00(%rsp)
1157 vmovdqa $t1,0x10(%rsp)
1159 vmovdqa $t2,0x20(%rsp)
1161 vmovdqa $t3,0x30(%rsp)
1167 sub \$`-16*2*$SZ`,$Tbl # size optimization
1169 sub XOP_256_00_47 () {
1173 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1175 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1176 eval(shift(@insns));
1177 eval(shift(@insns));
1178 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1179 eval(shift(@insns));
1180 eval(shift(@insns));
1181 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1182 eval(shift(@insns));
1183 eval(shift(@insns));
1184 &vpsrld ($t0,$t0,$sigma0[2]);
1185 eval(shift(@insns));
1186 eval(shift(@insns));
1187 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1188 eval(shift(@insns));
1189 eval(shift(@insns));
1190 eval(shift(@insns));
1191 eval(shift(@insns));
1192 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1193 eval(shift(@insns));
1194 eval(shift(@insns));
1195 &vpxor ($t0,$t0,$t1);
1196 eval(shift(@insns));
1197 eval(shift(@insns));
1198 eval(shift(@insns));
1199 eval(shift(@insns));
1200 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1201 eval(shift(@insns));
1202 eval(shift(@insns));
1203 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1204 eval(shift(@insns));
1205 eval(shift(@insns));
1206 &vpsrld ($t2,@X[3],$sigma1[2]);
1207 eval(shift(@insns));
1208 eval(shift(@insns));
1209 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1210 eval(shift(@insns));
1211 eval(shift(@insns));
1212 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1213 eval(shift(@insns));
1214 eval(shift(@insns));
1215 &vpxor ($t3,$t3,$t2);
1216 eval(shift(@insns));
1217 eval(shift(@insns));
1218 eval(shift(@insns));
1219 eval(shift(@insns));
1220 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1221 eval(shift(@insns));
1222 eval(shift(@insns));
1223 eval(shift(@insns));
1224 eval(shift(@insns));
1225 &vpsrldq ($t3,$t3,8);
1226 eval(shift(@insns));
1227 eval(shift(@insns));
1228 eval(shift(@insns));
1229 eval(shift(@insns));
1230 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1231 eval(shift(@insns));
1232 eval(shift(@insns));
1233 eval(shift(@insns));
1234 eval(shift(@insns));
1235 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1236 eval(shift(@insns));
1237 eval(shift(@insns));
1238 &vpsrld ($t2,@X[0],$sigma1[2]);
1239 eval(shift(@insns));
1240 eval(shift(@insns));
1241 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1242 eval(shift(@insns));
1243 eval(shift(@insns));
1244 &vpxor ($t3,$t3,$t2);
1245 eval(shift(@insns));
1246 eval(shift(@insns));
1247 eval(shift(@insns));
1248 eval(shift(@insns));
1249 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1250 eval(shift(@insns));
1251 eval(shift(@insns));
1252 eval(shift(@insns));
1253 eval(shift(@insns));
1254 &vpslldq ($t3,$t3,8); # 22 instructions
1255 eval(shift(@insns));
1256 eval(shift(@insns));
1257 eval(shift(@insns));
1258 eval(shift(@insns));
1259 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1260 eval(shift(@insns));
1261 eval(shift(@insns));
1262 eval(shift(@insns));
1263 eval(shift(@insns));
1264 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1265 foreach (@insns) { eval; } # remaining instructions
1266 &vmovdqa (16*$j."(%rsp)",$t2);
1269 for ($i=0,$j=0; $j<4; $j++) {
1270 &XOP_256_00_47($j,\&body_00_15,@X);
1271 push(@X,shift(@X)); # rotate(@X)
1273 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1274 &jne (".Lxop_00_47");
1276 for ($i=0; $i<16; ) {
1277 foreach(body_00_15()) { eval; }
1281 my @X = map("%xmm$_",(0..7));
1282 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1287 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1288 vmovdqu 0x00($inp),@X[0]
1289 lea $TABLE+0x80(%rip),$Tbl # size optimization
1290 vmovdqu 0x10($inp),@X[1]
1291 vmovdqu 0x20($inp),@X[2]
1292 vpshufb $t3,@X[0],@X[0]
1293 vmovdqu 0x30($inp),@X[3]
1294 vpshufb $t3,@X[1],@X[1]
1295 vmovdqu 0x40($inp),@X[4]
1296 vpshufb $t3,@X[2],@X[2]
1297 vmovdqu 0x50($inp),@X[5]
1298 vpshufb $t3,@X[3],@X[3]
1299 vmovdqu 0x60($inp),@X[6]
1300 vpshufb $t3,@X[4],@X[4]
1301 vmovdqu 0x70($inp),@X[7]
1302 vpshufb $t3,@X[5],@X[5]
1303 vpaddq -0x80($Tbl),@X[0],$t0
1304 vpshufb $t3,@X[6],@X[6]
1305 vpaddq -0x60($Tbl),@X[1],$t1
1306 vpshufb $t3,@X[7],@X[7]
1307 vpaddq -0x40($Tbl),@X[2],$t2
1308 vpaddq -0x20($Tbl),@X[3],$t3
1309 vmovdqa $t0,0x00(%rsp)
1310 vpaddq 0x00($Tbl),@X[4],$t0
1311 vmovdqa $t1,0x10(%rsp)
1312 vpaddq 0x20($Tbl),@X[5],$t1
1313 vmovdqa $t2,0x20(%rsp)
1314 vpaddq 0x40($Tbl),@X[6],$t2
1315 vmovdqa $t3,0x30(%rsp)
1316 vpaddq 0x60($Tbl),@X[7],$t3
1317 vmovdqa $t0,0x40(%rsp)
1319 vmovdqa $t1,0x50(%rsp)
1321 vmovdqa $t2,0x60(%rsp)
1323 vmovdqa $t3,0x70(%rsp)
1329 add \$`16*2*$SZ`,$Tbl
1331 sub XOP_512_00_47 () {
1335 my @insns = (&$body,&$body); # 52 instructions
1337 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1338 eval(shift(@insns));
1339 eval(shift(@insns));
1340 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1341 eval(shift(@insns));
1342 eval(shift(@insns));
1343 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1344 eval(shift(@insns));
1345 eval(shift(@insns));
1346 &vpsrlq ($t0,$t0,$sigma0[2]);
1347 eval(shift(@insns));
1348 eval(shift(@insns));
1349 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1350 eval(shift(@insns));
1351 eval(shift(@insns));
1352 eval(shift(@insns));
1353 eval(shift(@insns));
1354 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1355 eval(shift(@insns));
1356 eval(shift(@insns));
1357 &vpxor ($t0,$t0,$t1);
1358 eval(shift(@insns));
1359 eval(shift(@insns));
1360 eval(shift(@insns));
1361 eval(shift(@insns));
1362 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1363 eval(shift(@insns));
1364 eval(shift(@insns));
1365 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1366 eval(shift(@insns));
1367 eval(shift(@insns));
1368 &vpsrlq ($t2,@X[7],$sigma1[2]);
1369 eval(shift(@insns));
1370 eval(shift(@insns));
1371 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1372 eval(shift(@insns));
1373 eval(shift(@insns));
1374 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1375 eval(shift(@insns));
1376 eval(shift(@insns));
1377 &vpxor ($t3,$t3,$t2);
1378 eval(shift(@insns));
1379 eval(shift(@insns));
1380 eval(shift(@insns));
1381 eval(shift(@insns));
1382 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1383 eval(shift(@insns));
1384 eval(shift(@insns));
1385 eval(shift(@insns));
1386 eval(shift(@insns));
1387 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1388 eval(shift(@insns));
1389 eval(shift(@insns));
1390 eval(shift(@insns));
1391 eval(shift(@insns));
1392 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1393 foreach (@insns) { eval; } # remaining instructions
1394 &vmovdqa (16*$j."(%rsp)",$t2);
1397 for ($i=0,$j=0; $j<8; $j++) {
1398 &XOP_512_00_47($j,\&body_00_15,@X);
1399 push(@X,shift(@X)); # rotate(@X)
1401 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1402 &jne (".Lxop_00_47");
1404 for ($i=0; $i<16; ) {
1405 foreach(body_00_15()) { eval; }
1413 lea 16*$SZ($inp),$inp
1437 $code.=<<___ if ($win64);
1438 movaps 16*$SZ+32(%rsp),%xmm6
1439 movaps 16*$SZ+48(%rsp),%xmm7
1440 movaps 16*$SZ+64(%rsp),%xmm8
1441 movaps 16*$SZ+80(%rsp),%xmm9
1443 $code.=<<___ if ($win64 && $SZ>4);
1444 movaps 16*$SZ+96(%rsp),%xmm10
1445 movaps 16*$SZ+112(%rsp),%xmm11
1457 .size ${func}_xop,.-${func}_xop
1460 ######################################################################
1461 # AVX+shrd code path
1463 local *ror = sub { &shrd(@_[0],@_) };
1466 .type ${func}_avx,\@function,3
1476 mov %rsp,%r11 # copy %rsp
1477 shl \$4,%rdx # num*16
1478 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1479 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1480 and \$-64,%rsp # align stack frame
1481 mov $ctx,$_ctx # save ctx, 1st arg
1482	mov	$inp,$_inp		# save inp, 2nd arg
1483 mov %rdx,$_end # save end pointer, "3rd" arg
1484 mov %r11,$_rsp # save copy of %rsp
1486 $code.=<<___ if ($win64);
1487 movaps %xmm6,16*$SZ+32(%rsp)
1488 movaps %xmm7,16*$SZ+48(%rsp)
1489 movaps %xmm8,16*$SZ+64(%rsp)
1490 movaps %xmm9,16*$SZ+80(%rsp)
1492 $code.=<<___ if ($win64 && $SZ>4);
1493 movaps %xmm10,16*$SZ+96(%rsp)
1494 movaps %xmm11,16*$SZ+112(%rsp)
1509 if ($SZ==4) { # SHA256
1510 my @X = map("%xmm$_",(0..3));
1511 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1514 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1515 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1519 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1520 vmovdqu 0x00($inp),@X[0]
1521 vmovdqu 0x10($inp),@X[1]
1522 vmovdqu 0x20($inp),@X[2]
1523 vmovdqu 0x30($inp),@X[3]
1524 vpshufb $t3,@X[0],@X[0]
1525 lea $TABLE(%rip),$Tbl
1526 vpshufb $t3,@X[1],@X[1]
1527 vpshufb $t3,@X[2],@X[2]
1528 vpaddd 0x00($Tbl),@X[0],$t0
1529 vpshufb $t3,@X[3],@X[3]
1530 vpaddd 0x20($Tbl),@X[1],$t1
1531 vpaddd 0x40($Tbl),@X[2],$t2
1532 vpaddd 0x60($Tbl),@X[3],$t3
1533 vmovdqa $t0,0x00(%rsp)
1535 vmovdqa $t1,0x10(%rsp)
1537 vmovdqa $t2,0x20(%rsp)
1539 vmovdqa $t3,0x30(%rsp)
1545 sub \$`-16*2*$SZ`,$Tbl # size optimization
1547 sub Xupdate_256_AVX () {
1549 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1550 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1551 '&vpsrld ($t2,$t0,$sigma0[0]);',
1552 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1553 '&vpsrld ($t3,$t0,$sigma0[2])',
1554 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1555 '&vpxor ($t0,$t3,$t2)',
1556 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1557 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1558 '&vpxor ($t0,$t0,$t1)',
1559 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1560 '&vpxor ($t0,$t0,$t2)',
1561 '&vpsrld ($t2,$t3,$sigma1[2]);',
1562 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1563 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1564 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1565 '&vpxor ($t2,$t2,$t3);',
1566 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1567 '&vpxor ($t2,$t2,$t3)',
1568 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1569 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1570 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1571 '&vpsrld ($t2,$t3,$sigma1[2])',
1572 '&vpsrlq ($t3,$t3,$sigma1[0])',
1573 '&vpxor ($t2,$t2,$t3);',
1574 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1575 '&vpxor ($t2,$t2,$t3)',
1576 '&vpshufb ($t2,$t2,$t5)',
1577 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1581 sub AVX_256_00_47 () {
1585 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1587 foreach (Xupdate_256_AVX()) { # 29 instructions
1589 eval(shift(@insns));
1590 eval(shift(@insns));
1591 eval(shift(@insns));
1593 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1594 foreach (@insns) { eval; } # remaining instructions
1595 &vmovdqa (16*$j."(%rsp)",$t2);
1598 for ($i=0,$j=0; $j<4; $j++) {
1599 &AVX_256_00_47($j,\&body_00_15,@X);
1600 push(@X,shift(@X)); # rotate(@X)
1602 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1603 &jne (".Lavx_00_47");
1605 for ($i=0; $i<16; ) {
1606 foreach(body_00_15()) { eval; }
1610 my @X = map("%xmm$_",(0..7));
1611 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1617 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1618 vmovdqu 0x00($inp),@X[0]
1619 lea $TABLE+0x80(%rip),$Tbl # size optimization
1620 vmovdqu 0x10($inp),@X[1]
1621 vmovdqu 0x20($inp),@X[2]
1622 vpshufb $t3,@X[0],@X[0]
1623 vmovdqu 0x30($inp),@X[3]
1624 vpshufb $t3,@X[1],@X[1]
1625 vmovdqu 0x40($inp),@X[4]
1626 vpshufb $t3,@X[2],@X[2]
1627 vmovdqu 0x50($inp),@X[5]
1628 vpshufb $t3,@X[3],@X[3]
1629 vmovdqu 0x60($inp),@X[6]
1630 vpshufb $t3,@X[4],@X[4]
1631 vmovdqu 0x70($inp),@X[7]
1632 vpshufb $t3,@X[5],@X[5]
1633 vpaddq -0x80($Tbl),@X[0],$t0
1634 vpshufb $t3,@X[6],@X[6]
1635 vpaddq -0x60($Tbl),@X[1],$t1
1636 vpshufb $t3,@X[7],@X[7]
1637 vpaddq -0x40($Tbl),@X[2],$t2
1638 vpaddq -0x20($Tbl),@X[3],$t3
1639 vmovdqa $t0,0x00(%rsp)
1640 vpaddq 0x00($Tbl),@X[4],$t0
1641 vmovdqa $t1,0x10(%rsp)
1642 vpaddq 0x20($Tbl),@X[5],$t1
1643 vmovdqa $t2,0x20(%rsp)
1644 vpaddq 0x40($Tbl),@X[6],$t2
1645 vmovdqa $t3,0x30(%rsp)
1646 vpaddq 0x60($Tbl),@X[7],$t3
1647 vmovdqa $t0,0x40(%rsp)
1649 vmovdqa $t1,0x50(%rsp)
1651 vmovdqa $t2,0x60(%rsp)
1653 vmovdqa $t3,0x70(%rsp)
1659 add \$`16*2*$SZ`,$Tbl
1661 sub Xupdate_512_AVX () {
1663 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1664 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1665 '&vpsrlq ($t2,$t0,$sigma0[0])',
1666 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1667 '&vpsrlq ($t3,$t0,$sigma0[2])',
1668 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1669 '&vpxor ($t0,$t3,$t2)',
1670 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1671 '&vpxor ($t0,$t0,$t1)',
1672 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1673 '&vpxor ($t0,$t0,$t2)',
1674 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1675 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1676 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1677 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1678 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1679 '&vpxor ($t3,$t3,$t2)',
1680 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1681 '&vpxor ($t3,$t3,$t1)',
1682 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1683 '&vpxor ($t3,$t3,$t2)',
1684 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1685 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1689 sub AVX_512_00_47 () {
1693 my @insns = (&$body,&$body); # 52 instructions
1695 foreach (Xupdate_512_AVX()) { # 23 instructions
1697 eval(shift(@insns));
1698 eval(shift(@insns));
1700 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1701 foreach (@insns) { eval; } # remaining instructions
1702 &vmovdqa (16*$j."(%rsp)",$t2);
1705 for ($i=0,$j=0; $j<8; $j++) {
1706 &AVX_512_00_47($j,\&body_00_15,@X);
1707 push(@X,shift(@X)); # rotate(@X)
1709 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1710 &jne (".Lavx_00_47");
1712 for ($i=0; $i<16; ) {
1713 foreach(body_00_15()) { eval; }
1721 lea 16*$SZ($inp),$inp
1745 $code.=<<___ if ($win64);
1746 movaps 16*$SZ+32(%rsp),%xmm6
1747 movaps 16*$SZ+48(%rsp),%xmm7
1748 movaps 16*$SZ+64(%rsp),%xmm8
1749 movaps 16*$SZ+80(%rsp),%xmm9
1751 $code.=<<___ if ($win64 && $SZ>4);
1752 movaps 16*$SZ+96(%rsp),%xmm10
1753 movaps 16*$SZ+112(%rsp),%xmm11
1765 .size ${func}_avx,.-${func}_avx
1769 ######################################################################
1770 # AVX2+BMI code path
1772 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1776 sub bodyx_00_15 () {
1777 	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1779 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1781 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1782 '&and ($a4,$e)', # f&e
1783 '&rorx ($a0,$e,$Sigma1[2])',
1784 '&rorx ($a2,$e,$Sigma1[1])',
1786 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1787 '&lea ($h,"($h,$a4)")',
1788 '&andn ($a4,$e,$g)', # ~e&g
1791 '&rorx ($a1,$e,$Sigma1[0])',
1792 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1793 '&xor ($a0,$a1)', # Sigma1(e)
1796 '&rorx ($a4,$a,$Sigma0[2])',
1797 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1798 '&xor ($a2,$b)', # a^b, b^c in next round
1799 '&rorx ($a1,$a,$Sigma0[1])',
1801 '&rorx ($a0,$a,$Sigma0[0])',
1802 '&lea ($d,"($d,$h)")', # d+=h
1803 '&and ($a3,$a2)', # (b^c)&(a^b)
1806 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1807 '&xor ($a1,$a0)', # Sigma0(a)
1808 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1809 '&mov ($a4,$e)', # copy of f in future
1811 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1813 # and at the finish one has to $a+=$a1
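# (rorx writes a separate destination and leaves the flags alone, so
# the three Sigma rotations above can issue back to back without mov's,
# and andn delivers ~e&g in a single instruction.  Ch(e,f,g) is
# accumulated as (e&f)+(~e&g) via lea: '+' is safe in place of '^'
# because the two terms never have a bit set in common.)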
1817 .type ${func}_avx2,\@function,3
1827 mov %rsp,%r11 # copy %rsp
1828 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1829 shl \$4,%rdx # num*16
1830 and \$-256*$SZ,%rsp # align stack frame
1831 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1832 add \$`2*$SZ*($rounds-8)`,%rsp
1833 mov $ctx,$_ctx # save ctx, 1st arg
1834	mov	$inp,$_inp		# save inp, 2nd arg
1835 mov %rdx,$_end # save end pointer, "3rd" arg
1836 mov %r11,$_rsp # save copy of %rsp
1838 $code.=<<___ if ($win64);
1839 movaps %xmm6,16*$SZ+32(%rsp)
1840 movaps %xmm7,16*$SZ+48(%rsp)
1841 movaps %xmm8,16*$SZ+64(%rsp)
1842 movaps %xmm9,16*$SZ+80(%rsp)
1844 $code.=<<___ if ($win64 && $SZ>4);
1845 movaps %xmm10,16*$SZ+96(%rsp)
1846 movaps %xmm11,16*$SZ+112(%rsp)
1852 sub \$-16*$SZ,$inp # inp++, size optimization
1854 mov $inp,%r12 # borrow $T1
1856 cmp %rdx,$inp # $_end
1858 cmove %rsp,%r12 # next block or random data
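	# (the AVX2 path consumes two blocks per iteration; on a final
	#  odd block %r12 is redirected at the stack, so the phantom
	#  second block is harmless garbage whose precomputed schedule
	#  is simply never consumed)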
1865 if ($SZ==4) { # SHA256
1866 my @X = map("%ymm$_",(0..3));
1867 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1870 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1871 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1875 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1876 vmovdqu -16*$SZ+0($inp),%xmm0
1877 vmovdqu -16*$SZ+16($inp),%xmm1
1878 vmovdqu -16*$SZ+32($inp),%xmm2
1879 vmovdqu -16*$SZ+48($inp),%xmm3
1880 #mov $inp,$_inp # offload $inp
1881 vinserti128 \$1,(%r12),@X[0],@X[0]
1882 vinserti128 \$1,16(%r12),@X[1],@X[1]
1883 vpshufb $t3,@X[0],@X[0]
1884 vinserti128 \$1,32(%r12),@X[2],@X[2]
1885 vpshufb $t3,@X[1],@X[1]
1886 vinserti128 \$1,48(%r12),@X[3],@X[3]
1888 lea $TABLE(%rip),$Tbl
1889 vpshufb $t3,@X[2],@X[2]
1890 vpaddd 0x00($Tbl),@X[0],$t0
1891 vpshufb $t3,@X[3],@X[3]
1892 vpaddd 0x20($Tbl),@X[1],$t1
1893 vpaddd 0x40($Tbl),@X[2],$t2
1894 vpaddd 0x60($Tbl),@X[3],$t3
1895 vmovdqa $t0,0x00(%rsp)
1897 vmovdqa $t1,0x20(%rsp)
1898 lea -$PUSH8(%rsp),%rsp
1900 vmovdqa $t2,0x00(%rsp)
1902 vmovdqa $t3,0x20(%rsp)
1904 sub \$-16*2*$SZ,$Tbl # size optimization
1911 sub AVX2_256_00_47 () {
1915 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1916 my $base = "+2*$PUSH8(%rsp)";
1918 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1919 foreach (Xupdate_256_AVX()) { # 29 instructions
1921 eval(shift(@insns));
1922 eval(shift(@insns));
1923 eval(shift(@insns));
1925 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1926 foreach (@insns) { eval; } # remaining instructions
1927 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1930 for ($i=0,$j=0; $j<4; $j++) {
1931 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1932 push(@X,shift(@X)); # rotate(@X)
1934 &lea ($Tbl,16*2*$SZ."($Tbl)");
1935 &cmpb (($SZ-1)."($Tbl)",0);
1936 &jne (".Lavx2_00_47");
1938 for ($i=0; $i<16; ) {
1939 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1940 foreach(bodyx_00_15()) { eval; }
1943 my @X = map("%ymm$_",(0..7));
1944 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1950 vmovdqu -16*$SZ($inp),%xmm0
1951 vmovdqu -16*$SZ+16($inp),%xmm1
1952 vmovdqu -16*$SZ+32($inp),%xmm2
1953 lea $TABLE+0x80(%rip),$Tbl # size optimization
1954 vmovdqu -16*$SZ+48($inp),%xmm3
1955 vmovdqu -16*$SZ+64($inp),%xmm4
1956 vmovdqu -16*$SZ+80($inp),%xmm5
1957 vmovdqu -16*$SZ+96($inp),%xmm6
1958 vmovdqu -16*$SZ+112($inp),%xmm7
1959 #mov $inp,$_inp # offload $inp
1960 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
1961 vinserti128 \$1,(%r12),@X[0],@X[0]
1962 vinserti128 \$1,16(%r12),@X[1],@X[1]
1963 vpshufb $t2,@X[0],@X[0]
1964 vinserti128 \$1,32(%r12),@X[2],@X[2]
1965 vpshufb $t2,@X[1],@X[1]
1966 vinserti128 \$1,48(%r12),@X[3],@X[3]
1967 vpshufb $t2,@X[2],@X[2]
1968 vinserti128 \$1,64(%r12),@X[4],@X[4]
1969 vpshufb $t2,@X[3],@X[3]
1970 vinserti128 \$1,80(%r12),@X[5],@X[5]
1971 vpshufb $t2,@X[4],@X[4]
1972 vinserti128 \$1,96(%r12),@X[6],@X[6]
1973 vpshufb $t2,@X[5],@X[5]
1974 vinserti128 \$1,112(%r12),@X[7],@X[7]
1976 vpaddq -0x80($Tbl),@X[0],$t0
1977 vpshufb $t2,@X[6],@X[6]
1978 vpaddq -0x60($Tbl),@X[1],$t1
1979 vpshufb $t2,@X[7],@X[7]
1980 vpaddq -0x40($Tbl),@X[2],$t2
1981 vpaddq -0x20($Tbl),@X[3],$t3
1982 vmovdqa $t0,0x00(%rsp)
1983 vpaddq 0x00($Tbl),@X[4],$t0
1984 vmovdqa $t1,0x20(%rsp)
1985 vpaddq 0x20($Tbl),@X[5],$t1
1986 vmovdqa $t2,0x40(%rsp)
1987 vpaddq 0x40($Tbl),@X[6],$t2
1988 vmovdqa $t3,0x60(%rsp)
1989 lea -$PUSH8(%rsp),%rsp
1990 vpaddq 0x60($Tbl),@X[7],$t3
1991 vmovdqa $t0,0x00(%rsp)
1993 vmovdqa $t1,0x20(%rsp)
1995 vmovdqa $t2,0x40(%rsp)
1997 vmovdqa $t3,0x60(%rsp)
2006 sub AVX2_512_00_47 () {
2010 my @insns = (&$body,&$body); # 48 instructions
2011 my $base = "+2*$PUSH8(%rsp)";
2013 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
2014 foreach (Xupdate_512_AVX()) { # 23 instructions
2017 eval(shift(@insns));
2018 eval(shift(@insns));
2019 eval(shift(@insns));
2022 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2023 foreach (@insns) { eval; } # remaining instructions
2024 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2027 for ($i=0,$j=0; $j<8; $j++) {
2028 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2029 push(@X,shift(@X)); # rotate(@X)
2031 &lea ($Tbl,16*2*$SZ."($Tbl)");
2032 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2033 &jne (".Lavx2_00_47");
2035 for ($i=0; $i<16; ) {
2036 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2037 foreach(bodyx_00_15()) { eval; }
2041 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2043 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2044 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2064 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2075 for ($i=0; $i<8; ) {
2076 my $base="+16($Tbl)";
2077 foreach(bodyx_00_15()) { eval; }
2080 lea -$PUSH8($Tbl),$Tbl
2084 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2086 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2087 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2095 lea `2*16*$SZ`($inp),$inp # inp+=2
2102 cmove %rsp,%r12 # next block or stale data
2119 $code.=<<___ if ($win64);
2120 movaps 16*$SZ+32(%rsp),%xmm6
2121 movaps 16*$SZ+48(%rsp),%xmm7
2122 movaps 16*$SZ+64(%rsp),%xmm8
2123 movaps 16*$SZ+80(%rsp),%xmm9
2125 $code.=<<___ if ($win64 && $SZ>4);
2126 movaps 16*$SZ+96(%rsp),%xmm10
2127 movaps 16*$SZ+112(%rsp),%xmm11
2139 .size ${func}_avx2,.-${func}_avx2
2144 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2145 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2153 .extern __imp_RtlVirtualUnwind
2154 .type se_handler,\@abi-omnipotent
2168 mov 120($context),%rax # pull context->Rax
2169 mov 248($context),%rbx # pull context->Rip
2171 mov 8($disp),%rsi # disp->ImageBase
2172	mov	56($disp),%r11		# disp->HandlerData
2174 mov 0(%r11),%r10d # HandlerData[0]
2175 lea (%rsi,%r10),%r10 # prologue label
2176 cmp %r10,%rbx # context->Rip<prologue label
2179 mov 152($context),%rax # pull context->Rsp
2181 mov 4(%r11),%r10d # HandlerData[1]
2182 lea (%rsi,%r10),%r10 # epilogue label
2183 cmp %r10,%rbx # context->Rip>=epilogue label
2186 $code.=<<___ if ($avx>1);
2187 lea .Lavx2_shortcut(%rip),%r10
2188 cmp %r10,%rbx # context->Rip<avx2_shortcut
2192 add \$`2*$SZ*($rounds-8)`,%rax
2196 mov %rax,%rsi # put aside Rsp
2197 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2206 mov %rbx,144($context) # restore context->Rbx
2207 mov %rbp,160($context) # restore context->Rbp
2208 mov %r12,216($context) # restore context->R12
2209 mov %r13,224($context) # restore context->R13
2210 mov %r14,232($context) # restore context->R14
2211 mov %r15,240($context) # restore context->R15
2213 lea .Lepilogue(%rip),%r10
2215 jb .Lin_prologue # non-AVX code
2217 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2218 lea 512($context),%rdi # &context.Xmm6
2219 mov \$`$SZ==4?8:12`,%ecx
2220 .long 0xa548f3fc # cld; rep movsq
2225 mov %rax,152($context) # restore context->Rsp
2226 mov %rsi,168($context) # restore context->Rsi
2227 mov %rdi,176($context) # restore context->Rdi
2229 mov 40($disp),%rdi # disp->ContextRecord
2230 mov $context,%rsi # context
2231 mov \$154,%ecx # sizeof(CONTEXT)
2232 .long 0xa548f3fc # cld; rep movsq
2235 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2236 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2237 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2238 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2239 mov 40(%rsi),%r10 # disp->ContextRecord
2240 lea 56(%rsi),%r11 # &disp->HandlerData
2241 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2242 mov %r10,32(%rsp) # arg5
2243 mov %r11,40(%rsp) # arg6
2244 mov %r12,48(%rsp) # arg7
2245 mov %rcx,56(%rsp) # arg8, (NULL)
2246 call *__imp_RtlVirtualUnwind(%rip)
2248 mov \$1,%eax # ExceptionContinueSearch
2260 .size se_handler,.-se_handler
2262 .type shaext_handler,\@abi-omnipotent
2276 mov 120($context),%rax # pull context->Rax
2277 mov 248($context),%rbx # pull context->Rip
2279 lea .Lprologue_shaext(%rip),%r10
2280 cmp %r10,%rbx # context->Rip<.Lprologue
2283 lea .Lepilogue_shaext(%rip),%r10
2284 cmp %r10,%rbx # context->Rip>=.Lepilogue
2287 lea -8-5*16(%rax),%rsi
2288 lea 512($context),%rdi # &context.Xmm6
2290 .long 0xa548f3fc # cld; rep movsq
2293 .size shaext_handler,.-shaext_handler
2297 .rva .LSEH_begin_$func
2298 .rva .LSEH_end_$func
2299 .rva .LSEH_info_$func
2301 $code.=<<___ if ($SZ==4 && $shaext);
2302 .rva .LSEH_begin_${func}_shaext
2303 .rva .LSEH_end_${func}_shaext
2304 .rva .LSEH_info_${func}_shaext
2306 $code.=<<___ if ($SZ==4);
2307 .rva .LSEH_begin_${func}_ssse3
2308 .rva .LSEH_end_${func}_ssse3
2309 .rva .LSEH_info_${func}_ssse3
2311 $code.=<<___ if ($avx && $SZ==8);
2312 .rva .LSEH_begin_${func}_xop
2313 .rva .LSEH_end_${func}_xop
2314 .rva .LSEH_info_${func}_xop
2316 $code.=<<___ if ($avx);
2317 .rva .LSEH_begin_${func}_avx
2318 .rva .LSEH_end_${func}_avx
2319 .rva .LSEH_info_${func}_avx
2321 $code.=<<___ if ($avx>1);
2322 .rva .LSEH_begin_${func}_avx2
2323 .rva .LSEH_end_${func}_avx2
2324 .rva .LSEH_info_${func}_avx2
2332 .rva .Lprologue,.Lepilogue # HandlerData[]
2334 $code.=<<___ if ($SZ==4);
2335 .LSEH_info_${func}_shaext:
2338 .LSEH_info_${func}_ssse3:
2341 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2343 $code.=<<___ if ($avx && $SZ==8);
2344 .LSEH_info_${func}_xop:
2347 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2349 $code.=<<___ if ($avx);
2350 .LSEH_info_${func}_avx:
2353 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2355 $code.=<<___ if ($avx>1);
2356 .LSEH_info_${func}_avx2:
2359 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2366 "sha256rnds2" => 0xcb,
2367 "sha256msg1" => 0xcc,
2368 "sha256msg2" => 0xcd );
2370 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2371 my @opcode=(0x0f,0x38);
2372 push @opcode,$opcodelet{$instr};
2373 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2374 return ".byte\t".join(',',@opcode);
2376 return $instr."\t".@_[0];
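# For instance, with an assembler that predates the SHA extension,
#	sha256rnds2	%xmm0,%xmm2
# is rewritten by the substitution below as
#	.byte	0x0f,0x38,0xcb,0xd0
# 0xd0 being the ModR/M byte for xmm2 (reg) and xmm0 (rm).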
2380 foreach (split("\n",$code)) {
2381 s/\`([^\`]*)\`/eval $1/geo;
2383 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;