3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
9 # sha256/512_block procedure for x86_64.
11 # 40% improvement over compiler-generated code on Opteron. On EM64T
12 # sha256 was observed to run >80% faster and sha512 - >40%. No magical
13 # tricks, just straight implementation... I really wonder why gcc
# [even armed with inline assembler] fails to generate equally fast code.
# The only thing which is cool about this module is that it's the
# very same instruction sequence that is used for both SHA-256 and
# SHA-512. In the former case the instructions operate on 32-bit
# operands, while in the latter - on 64-bit ones. All I had to do
# was to get one flavor right, and the other one passed the test
# right away:-)
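#
# For reference, both flavors implement the standard FIPS 180-4
# round, with w=32 for SHA-256 and w=64 for SHA-512:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2;
#
# Only the rotation/shift counts in Sigma*/sigma* and the operand
# width differ between the two.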
21 # sha256_block runs in ~1005 cycles on Opteron, which gives you
22 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
24 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which keeps
# X[16] in the register bank[!], sustains close to 4 instructions per
# CPU clock cycle and runs in 1003 cycles, 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in
# memory. So *if* there is a way to improve it, *then* the only way
# would be to try to offload the X[16] updates to the SSE unit, but
# that would require a "deeper" loop unroll, which in turn would
# naturally cause size blow-up, not to mention increased complexity!
# And once again, that is only *if* it's actually possible to
# noticeably improve overall ILP, instruction-level parallelism, on
# the given CPU implementation.
# Special note on Intel EM64T. While the Opteron CPU exhibits a
# perfect performance ratio of 1.5 between the 64- and 32-bit flavors
# [see above], [currently available] EM64T CPUs apparently are far
# from it. On the contrary, the 64-bit version, sha512_block, is ~30%
# *slower* than the 32-bit sha256_block:-( This is presumably because
# 64-bit shifts/rotates are not atomic instructions there, but are
# implemented in microcode.
# Optimization including one of Pavel Semjanov's ideas, the
# alternative Maj, resulted in >=5% improvement on most CPUs, +20%
# for SHA256 and unfortunately -2% for SHA512 on P4 [which nobody
# should care about any more].
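#
# For reference, the alternative Maj is the identity
#
#	Maj(a,b,c) = Ch(a^b,c,b) = ((a^b)&(b^c))^b,
#
# which lets the a^b term computed in one round be reused as b^c in
# the next [the registers rotate], saving one logical operation per
# round; see the "a^b, b^c in next round" comments in the round
# bodies below.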
# Add SIMD code paths, see below for improvement coefficients. An
# SSSE3 code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [Obviously with an
# exclusion for VIA Nano, but it has a SHA512 instruction that is
# faster and should be used instead.] For reference, the
# corresponding estimated upper limit for improvement for SSSE3
# SHA256 is 28%. The fact that higher coefficients are observed on
# VIA Nano and Bulldozer has more to do with specifics of their
# architecture [which is a topic for separate discussion].
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The
# side effect is an increased stack frame, 448 additional bytes in
# SHA256 and 1152 in SHA512, and 1.2KB larger code.
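#
# Schematically, for SHA256 each %ymm register holds
#
#	[ X[i..i+3] of block 2 | X[i..i+3] of block 1 ]
#
# in its high and low 128-bit lanes, so one 256-bit instruction
# advances the message schedule of both blocks at once; the scalar
# rounds then consume the two pre-computed schedules one block at a
# time.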
75 # Add support for Intel SHA Extensions.
77 ######################################################################
78 # Current performance in cycles per processed byte (less is better):
80 # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
82 # AMD K8 14.9 - - 9.57 -
84 # Core 2 15.6 13.8(+13%) - 9.97 -
85 # Westmere 14.8 12.3(+19%) - 9.58 -
86 # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
87 # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
88 # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
89 # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
90 # VIA Nano 23.0 16.5(+39%) - 14.7 -
91 # Atom 23.0 18.9(+22%) - 14.7 -
93 # (*) whichever best applicable;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining
#	integer-only part, body_00_15; reducing the number of SIMD
#	instructions below a certain limit makes no difference/sense;
#	to conserve space the SHA256 XOP code path is therefore
#	omitted;
102 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109 die "can't locate x86_64-xlate.pl";
111 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
113 $avx = ($1>=2.19) + ($1>=2.22);
116 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
117 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
118 $avx = ($1>=2.09) + ($1>=2.10);
121 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
122 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
123 $avx = ($1>=10) + ($1>=11);
126 open OUT,"| \"$^X\" $xlate $flavour $output";
129 if ($output =~ /512/) {
130 $func="sha512_block_data_order";
133 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
134 "%r8", "%r9", "%r10","%r11");
135 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
142 $func="sha256_block_data_order";
145 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
146 "%r8d","%r9d","%r10d","%r11d");
147 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
155 $ctx="%rdi"; # 1st arg, zapped by $a3
156 $inp="%rsi"; # 2nd arg
159 $_ctx="16*$SZ+0*8(%rsp)";
160 $_inp="16*$SZ+1*8(%rsp)";
161 $_end="16*$SZ+2*8(%rsp)";
162 $_rsp="16*$SZ+3*8(%rsp)";
163 $framesz="16*$SZ+4*8";
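# The frame is thus a 16-entry circular buffer for X[] at the bottom
# of the stack, followed by the four saved quadwords defined above.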
167 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
169 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
172 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
176 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
179 mov $T1,`$SZ*($i&0xf)`(%rsp)
183 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
185 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
187 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
189 add $a2,$T1 # T1+=Ch(e,f,g)
192 add ($Tbl),$T1 # T1+=K[round]
195 xor $b,$a2 # a^b, b^c in next round
196 ror \$$Sigma1[0],$a0 # Sigma1(e)
200 ror \$$Sigma0[0],$a1 # Sigma0(a)
201 add $a0,$T1 # T1+=Sigma1(e)
203 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
207 lea $STRIDE($Tbl),$Tbl # round++
209 $code.=<<___ if ($i<15);
210 add $a1,$h # h+=Sigma0(a)
212 ($a2,$a3) = ($a3,$a2);
216 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
219 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
220 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
223 ror \$`$sigma0[1]-$sigma0[0]`,$a0
224 add $a1,$a # modulo-scheduled h+=Sigma0(a)
226 ror \$`$sigma1[1]-$sigma1[0]`,$a2
235 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
236 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
237 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
239 add `$SZ*($i&0xf)`(%rsp),$T1
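	# The above implements the FIPS 180-4 message schedule,
	#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]),
	# with indices reduced mod 16, as only a 16-entry sliding
	# window of X[] lives on the stack.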
250 .extern OPENSSL_ia32cap_P
252 .type $func,\@function,3
256 $code.=<<___ if ($SZ==4 || $avx);
257 lea OPENSSL_ia32cap_P(%rip),%r11
262 $code.=<<___ if ($SZ==4);
263 test \$`1<<29`,%r11d # check for SHA
266 $code.=<<___ if ($avx && $SZ==8);
267 test \$`1<<11`,%r10d # check for XOP
270 $code.=<<___ if ($avx>1);
271 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
272 cmp \$`1<<8|1<<5|1<<3`,%r11d
275 $code.=<<___ if ($avx);
276 and \$`1<<30`,%r9d # mask "Intel CPU" bit
277 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
279 cmp \$`1<<28|1<<9|1<<30`,%r10d
282 $code.=<<___ if ($SZ==4);
293 mov %rsp,%r11 # copy %rsp
294 shl \$4,%rdx # num*16
296 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
297 and \$-64,%rsp # align stack frame
298 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp	# save inp, 2nd arg
300 mov %rdx,$_end # save end pointer, "3rd" arg
301 mov %r11,$_rsp # save copy of %rsp
317 lea $TABLE(%rip),$Tbl
320 for($i=0;$i<16;$i++) {
321 $code.=" mov $SZ*$i($inp),$T1\n";
322 $code.=" mov @ROT[4],$a0\n";
323 $code.=" mov @ROT[0],$a1\n";
324 $code.=" bswap $T1\n";
325 &ROUND_00_15($i,@ROT);
326 unshift(@ROT,pop(@ROT));
334 &ROUND_16_XX($i,@ROT);
335 unshift(@ROT,pop(@ROT));
339 cmpb \$0,`$SZ-1`($Tbl)
343 add $a1,$A # modulo-scheduled h+=Sigma0(a)
344 lea 16*$SZ($inp),$inp
383 .type $TABLE,\@object
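# Note that every 16-byte row of the table is stored twice, so that
# the AVX2 code path can fetch 32-byte rows whose two 128-bit lanes
# hold identical constants, one per interleaved input block.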
385 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
386 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
387 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
388 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
389 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
390 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
391 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
392 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
393 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
394 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
395 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
396 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
397 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
398 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
399 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
400 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
401 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
402 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
403 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
404 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
405 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
406 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
407 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
408 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
409 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
410 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
411 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
412 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
413 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
414 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
415 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
416 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
418 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
419 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
420 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
421 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
422 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
423 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
424 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
429 .type $TABLE,\@object
431 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
432 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
433 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
434 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
435 .quad 0x3956c25bf348b538,0x59f111f1b605d019
436 .quad 0x3956c25bf348b538,0x59f111f1b605d019
437 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
438 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
439 .quad 0xd807aa98a3030242,0x12835b0145706fbe
440 .quad 0xd807aa98a3030242,0x12835b0145706fbe
441 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
442 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
443 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
444 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
445 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
446 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
447 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
448 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
449 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
450 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
451 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
452 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
453 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
454 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
455 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
456 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
457 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
458 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
459 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
460 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
461 .quad 0x06ca6351e003826f,0x142929670a0e6e70
462 .quad 0x06ca6351e003826f,0x142929670a0e6e70
463 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
464 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
465 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
466 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
467 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
468 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
469 .quad 0x81c2c92e47edaee6,0x92722c851482353b
470 .quad 0x81c2c92e47edaee6,0x92722c851482353b
471 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
472 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
473 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
474 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
475 .quad 0xd192e819d6ef5218,0xd69906245565a910
476 .quad 0xd192e819d6ef5218,0xd69906245565a910
477 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
478 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
479 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
480 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
481 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
482 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
483 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
484 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
485 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
486 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
487 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
488 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
489 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
490 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
491 .quad 0x90befffa23631e28,0xa4506cebde82bde9
492 .quad 0x90befffa23631e28,0xa4506cebde82bde9
493 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
494 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
495 .quad 0xca273eceea26619c,0xd186b8c721c0c207
496 .quad 0xca273eceea26619c,0xd186b8c721c0c207
497 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
498 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
499 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
500 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
501 .quad 0x113f9804bef90dae,0x1b710b35131c471b
502 .quad 0x113f9804bef90dae,0x1b710b35131c471b
503 .quad 0x28db77f523047d84,0x32caab7b40c72493
504 .quad 0x28db77f523047d84,0x32caab7b40c72493
505 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
506 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
507 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
508 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
509 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
510 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
512 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
513 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
514 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
518 ######################################################################
522 ######################################################################
523 # Intel SHA Extensions implementation of SHA256 update function.
525 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
527 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
528 my @MSG=map("%xmm$_",(3..6));
531 .type sha256_block_data_order_shaext,\@function,3
533 sha256_block_data_order_shaext:
536 $code.=<<___ if ($win64);
537 lea `-8-5*16`(%rsp),%rsp
538 movaps %xmm6,-8-5*16(%rax)
539 movaps %xmm7,-8-4*16(%rax)
540 movaps %xmm8,-8-3*16(%rax)
541 movaps %xmm9,-8-2*16(%rax)
542 movaps %xmm10,-8-1*16(%rax)
546 lea K256+0x80(%rip),$Tbl
547 movdqu ($ctx),$ABEF # DCBA
548 movdqu 16($ctx),$CDGH # HGFE
549 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
551 pshufd \$0x1b,$ABEF,$Wi # ABCD
552 pshufd \$0xb1,$ABEF,$ABEF # CDAB
553 pshufd \$0x1b,$CDGH,$CDGH # EFGH
554 movdqa $TMP,$BSWAP # offload
555 palignr \$8,$CDGH,$ABEF # ABEF
556 punpcklqdq $Wi,$CDGH # CDGH
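	# sha256rnds2 processes two rounds per instruction and takes
	# the current K[i]+W[i] pair implicitly from %xmm0, which is
	# why Wi above is pinned to %xmm0; the shuffles arrange the
	# state into the ABEF/CDGH packing the instruction expects.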
561 movdqu ($inp),@MSG[0]
562 movdqu 0x10($inp),@MSG[1]
563 movdqu 0x20($inp),@MSG[2]
565 movdqu 0x30($inp),@MSG[3]
567 movdqa 0*32-0x80($Tbl),$Wi
570 movdqa $CDGH,$CDGH_SAVE # offload
571 sha256rnds2 $ABEF,$CDGH # 0-3
572 pshufd \$0x0e,$Wi,$Wi
574 movdqa $ABEF,$ABEF_SAVE # offload
575 sha256rnds2 $CDGH,$ABEF
577 movdqa 1*32-0x80($Tbl),$Wi
580 sha256rnds2 $ABEF,$CDGH # 4-7
581 pshufd \$0x0e,$Wi,$Wi
583 sha256msg1 @MSG[1],@MSG[0]
584 sha256rnds2 $CDGH,$ABEF
586 movdqa 2*32-0x80($Tbl),$Wi
589 sha256rnds2 $ABEF,$CDGH # 8-11
590 pshufd \$0x0e,$Wi,$Wi
592 palignr \$4,@MSG[2],$TMP
595 sha256msg1 @MSG[2],@MSG[1]
596 sha256rnds2 $CDGH,$ABEF
598 movdqa 3*32-0x80($Tbl),$Wi
600 sha256msg2 @MSG[3],@MSG[0]
601 sha256rnds2 $ABEF,$CDGH # 12-15
602 pshufd \$0x0e,$Wi,$Wi
604 palignr \$4,@MSG[3],$TMP
607 sha256msg1 @MSG[3],@MSG[2]
608 sha256rnds2 $CDGH,$ABEF
610 for($i=4;$i<16-3;$i++) {
612 movdqa $i*32-0x80($Tbl),$Wi
614 sha256msg2 @MSG[0],@MSG[1]
615 sha256rnds2 $ABEF,$CDGH # 16-19...
616 pshufd \$0x0e,$Wi,$Wi
618 palignr \$4,@MSG[0],$TMP
621 sha256msg1 @MSG[0],@MSG[3]
622 sha256rnds2 $CDGH,$ABEF
624 push(@MSG,shift(@MSG));
627 movdqa 13*32-0x80($Tbl),$Wi
629 sha256msg2 @MSG[0],@MSG[1]
630 sha256rnds2 $ABEF,$CDGH # 52-55
631 pshufd \$0x0e,$Wi,$Wi
633 palignr \$4,@MSG[0],$TMP
634 sha256rnds2 $CDGH,$ABEF
637 movdqa 14*32-0x80($Tbl),$Wi
639 sha256rnds2 $ABEF,$CDGH # 56-59
640 pshufd \$0x0e,$Wi,$Wi
641 sha256msg2 @MSG[1],@MSG[2]
643 sha256rnds2 $CDGH,$ABEF
645 movdqa 15*32-0x80($Tbl),$Wi
648 sha256rnds2 $ABEF,$CDGH # 60-63
649 pshufd \$0x0e,$Wi,$Wi
652 sha256rnds2 $CDGH,$ABEF
654 paddd $CDGH_SAVE,$CDGH
655 paddd $ABEF_SAVE,$ABEF
658 pshufd \$0xb1,$CDGH,$CDGH # DCHG
659 pshufd \$0x1b,$ABEF,$TMP # FEBA
660 pshufd \$0xb1,$ABEF,$ABEF # BAFE
661 punpckhqdq $CDGH,$ABEF # DCBA
662 palignr \$8,$TMP,$CDGH # HGFE
665 movdqu $CDGH,16($ctx)
667 $code.=<<___ if ($win64);
668 movaps -8-5*16(%rax),%xmm6
669 movaps -8-4*16(%rax),%xmm7
670 movaps -8-3*16(%rax),%xmm8
671 movaps -8-2*16(%rax),%xmm9
672 movaps -8-1*16(%rax),%xmm10
678 .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
684 my ($a,$b,$c,$d,$e,$f,$g,$h);
686 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
687 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
689 $arg = "\$$arg" if ($arg*1 eq $arg);
690 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
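	# e.g. &ror($a0,5) emits "ror \$5,%r13", i.e. the argument
	# list is reversed into AT&T operand order and bare numbers
	# become immediate operands.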
695 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
697 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
701 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
703 '&xor ($a4,$g)', # f^g
705 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
707 '&and ($a4,$e)', # (f^g)&e
710 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
713 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
714 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
715 '&xor ($a2,$b)', # a^b, b^c in next round
717 '&add ($h,$a4)', # h+=Ch(e,f,g)
718 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
719 '&and ($a3,$a2)', # (b^c)&(a^b)
722 '&add ($h,$a0)', # h+=Sigma1(e)
723 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
725 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
726 '&add ($d,$h)', # d+=h
727 '&add ($h,$a3)', # h+=Maj(a,b,c)
730 '&add ($a1,$h);'. # h+=Sigma0(a)
731 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
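# The body is returned as a list of code-generating snippets rather
# than emitted directly, so that SIMD Xupdate instructions can be
# interleaved between the scalar round instructions (see
# SSSE3_256_00_47 and friends below).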
735 ######################################################################
738 if ($SZ==4) { # SHA256 only
739 my @X = map("%xmm$_",(0..3));
740 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
743 .type ${func}_ssse3,\@function,3
753 mov %rsp,%r11 # copy %rsp
754 shl \$4,%rdx # num*16
755 sub \$`$framesz+$win64*16*4`,%rsp
756 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
757 and \$-64,%rsp # align stack frame
758 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp	# save inp, 2nd arg
760 mov %rdx,$_end # save end pointer, "3rd" arg
761 mov %r11,$_rsp # save copy of %rsp
763 $code.=<<___ if ($win64);
764 movaps %xmm6,16*$SZ+32(%rsp)
765 movaps %xmm7,16*$SZ+48(%rsp)
766 movaps %xmm8,16*$SZ+64(%rsp)
767 movaps %xmm9,16*$SZ+80(%rsp)
783 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
784 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
788 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
789 movdqu 0x00($inp),@X[0]
790 movdqu 0x10($inp),@X[1]
791 movdqu 0x20($inp),@X[2]
793 movdqu 0x30($inp),@X[3]
794 lea $TABLE(%rip),$Tbl
796 movdqa 0x00($Tbl),$t0
797 movdqa 0x20($Tbl),$t1
800 movdqa 0x40($Tbl),$t2
802 movdqa 0x60($Tbl),$t3
806 movdqa $t0,0x00(%rsp)
808 movdqa $t1,0x10(%rsp)
810 movdqa $t2,0x20(%rsp)
812 movdqa $t3,0x30(%rsp)
818 sub \$`-16*2*$SZ`,$Tbl # size optimization
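	# (the "sub \$-128" above adds 128 while keeping the immediate
	#  in signed 8-bit range, where "add \$128" would need 32 bits)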
820 sub Xupdate_256_SSSE3 () {
822 '&movdqa ($t0,@X[1]);',
823 '&movdqa ($t3,@X[3])',
824 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
825 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
827 '&movdqa ($t2,$t0);',
828 '&psrld ($t0,$sigma0[2])',
829 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
830 '&psrld ($t2,$sigma0[0])',
831 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
832 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
834 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
836 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
839 '&pxor ($t0,$t1);', # sigma0(X[1..4])
840 '&psrld ($t3,$sigma1[2])',
841 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
842 '&psrlq ($t2,$sigma1[0])',
844 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
846 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
847 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
848 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
849 '&movdqa ($t2,$t3);',
850 '&psrld ($t3,$sigma1[2])',
851 '&psrlq ($t2,$sigma1[0])',
853 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
855 '&movdqa ($t2,16*2*$j."($Tbl)")',
857 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
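	# SSE lacks a vector rotate, so each rotation in sigma0/sigma1
	# above is synthesized from a shift-left/shift-right pair
	# combined with pxor; sigma1 is computed on two lanes at a time
	# with quadword shifts, hence the pshufd gather/scatter moves.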
861 sub SSSE3_256_00_47 () {
865 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
868 foreach (Xupdate_256_SSSE3()) { # 36 instructions
874 } else { # squeeze extra 4% on Westmere and 19% on Atom
875 eval(shift(@insns)); #@
880 eval(shift(@insns)); #@
883 eval(shift(@insns)); #@
885 &palignr ($t0,@X[0],$SZ); # X[1..4]
888 &palignr ($t3,@X[2],$SZ); # X[9..12]
892 eval(shift(@insns)); #@
897 eval(shift(@insns)); #@
899 &psrld ($t0,$sigma0[2]);
903 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
904 eval(shift(@insns)); #@
906 &psrld ($t2,$sigma0[0]);
	&pshufd	($t3,@X[3],0b11111010);	# X[14..15]
911 eval(shift(@insns)); #@
912 &pslld ($t1,8*$SZ-$sigma0[1]);
916 eval(shift(@insns)); #@
919 eval(shift(@insns)); #@
920 &psrld ($t2,$sigma0[1]-$sigma0[0]);
925 &pslld ($t1,$sigma0[1]-$sigma0[0]);
930 eval(shift(@insns)); #@
934 &pxor ($t0,$t1); # sigma0(X[1..4])
935 eval(shift(@insns)); #@
938 &psrld ($t3,$sigma1[2]);
941 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
942 eval(shift(@insns)); #@
944 &psrlq ($t2,$sigma1[0]);
949 eval(shift(@insns)); #@
952 eval(shift(@insns)); #@
953 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
957 eval(shift(@insns)); #@
960 #&pshufb ($t3,$t4); # sigma1(X[14..15])
961 &pshufd ($t3,$t3,0b10000000);
967 eval(shift(@insns)); #@
970 eval(shift(@insns)); #@
971 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
975 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
977 eval(shift(@insns)); #@
982 &psrld ($t3,$sigma1[2]);
984 eval(shift(@insns)); #@
985 &psrlq ($t2,$sigma1[0]);
989 eval(shift(@insns)); #@
992 eval(shift(@insns)); #@
994 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1000 eval(shift(@insns));
1001 eval(shift(@insns)); #@
1003 &pshufd ($t3,$t3,0b00001000);
1004 eval(shift(@insns));
1005 eval(shift(@insns));
1006 &movdqa ($t2,16*2*$j."($Tbl)");
1007 eval(shift(@insns)); #@
1008 eval(shift(@insns));
1010 eval(shift(@insns));
1011 eval(shift(@insns));
1012 eval(shift(@insns));
1013 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1014 eval(shift(@insns)); #@
1015 eval(shift(@insns));
1016 eval(shift(@insns));
1019 foreach (@insns) { eval; } # remaining instructions
1020 &movdqa (16*$j."(%rsp)",$t2);
1023 for ($i=0,$j=0; $j<4; $j++) {
1024 &SSSE3_256_00_47($j,\&body_00_15,@X);
1025 push(@X,shift(@X)); # rotate(@X)
1027 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1028 &jne (".Lssse3_00_47");
1030 for ($i=0; $i<16; ) {
1031 foreach(body_00_15()) { eval; }
1038 lea 16*$SZ($inp),$inp
1061 $code.=<<___ if ($win64);
1062 movaps 16*$SZ+32(%rsp),%xmm6
1063 movaps 16*$SZ+48(%rsp),%xmm7
1064 movaps 16*$SZ+64(%rsp),%xmm8
1065 movaps 16*$SZ+80(%rsp),%xmm9
1077 .size ${func}_ssse3,.-${func}_ssse3
1082 ######################################################################
1085 if ($SZ==8) { # SHA512 only
1087 .type ${func}_xop,\@function,3
1097 mov %rsp,%r11 # copy %rsp
1098 shl \$4,%rdx # num*16
1099 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1100 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1101 and \$-64,%rsp # align stack frame
1102 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp	# save inp, 2nd arg
1104 mov %rdx,$_end # save end pointer, "3rd" arg
1105 mov %r11,$_rsp # save copy of %rsp
1107 $code.=<<___ if ($win64);
1108 movaps %xmm6,16*$SZ+32(%rsp)
1109 movaps %xmm7,16*$SZ+48(%rsp)
1110 movaps %xmm8,16*$SZ+64(%rsp)
1111 movaps %xmm9,16*$SZ+80(%rsp)
1113 $code.=<<___ if ($win64 && $SZ>4);
1114 movaps %xmm10,16*$SZ+96(%rsp)
1115 movaps %xmm11,16*$SZ+112(%rsp)
1131 if ($SZ==4) { # SHA256
1132 my @X = map("%xmm$_",(0..3));
1133 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1138 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1139 vmovdqu 0x00($inp),@X[0]
1140 vmovdqu 0x10($inp),@X[1]
1141 vmovdqu 0x20($inp),@X[2]
1142 vmovdqu 0x30($inp),@X[3]
1143 vpshufb $t3,@X[0],@X[0]
1144 lea $TABLE(%rip),$Tbl
1145 vpshufb $t3,@X[1],@X[1]
1146 vpshufb $t3,@X[2],@X[2]
1147 vpaddd 0x00($Tbl),@X[0],$t0
1148 vpshufb $t3,@X[3],@X[3]
1149 vpaddd 0x20($Tbl),@X[1],$t1
1150 vpaddd 0x40($Tbl),@X[2],$t2
1151 vpaddd 0x60($Tbl),@X[3],$t3
1152 vmovdqa $t0,0x00(%rsp)
1154 vmovdqa $t1,0x10(%rsp)
1156 vmovdqa $t2,0x20(%rsp)
1158 vmovdqa $t3,0x30(%rsp)
1164 sub \$`-16*2*$SZ`,$Tbl # size optimization
1166 sub XOP_256_00_47 () {
1170 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1172 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1173 eval(shift(@insns));
1174 eval(shift(@insns));
1175 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1176 eval(shift(@insns));
1177 eval(shift(@insns));
1178 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1179 eval(shift(@insns));
1180 eval(shift(@insns));
1181 &vpsrld ($t0,$t0,$sigma0[2]);
1182 eval(shift(@insns));
1183 eval(shift(@insns));
1184 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1185 eval(shift(@insns));
1186 eval(shift(@insns));
1187 eval(shift(@insns));
1188 eval(shift(@insns));
1189 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1190 eval(shift(@insns));
1191 eval(shift(@insns));
1192 &vpxor ($t0,$t0,$t1);
1193 eval(shift(@insns));
1194 eval(shift(@insns));
1195 eval(shift(@insns));
1196 eval(shift(@insns));
1197 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1198 eval(shift(@insns));
1199 eval(shift(@insns));
1200 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1201 eval(shift(@insns));
1202 eval(shift(@insns));
1203 &vpsrld ($t2,@X[3],$sigma1[2]);
1204 eval(shift(@insns));
1205 eval(shift(@insns));
1206 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1207 eval(shift(@insns));
1208 eval(shift(@insns));
1209 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1210 eval(shift(@insns));
1211 eval(shift(@insns));
1212 &vpxor ($t3,$t3,$t2);
1213 eval(shift(@insns));
1214 eval(shift(@insns));
1215 eval(shift(@insns));
1216 eval(shift(@insns));
1217 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1218 eval(shift(@insns));
1219 eval(shift(@insns));
1220 eval(shift(@insns));
1221 eval(shift(@insns));
1222 &vpsrldq ($t3,$t3,8);
1223 eval(shift(@insns));
1224 eval(shift(@insns));
1225 eval(shift(@insns));
1226 eval(shift(@insns));
1227 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1228 eval(shift(@insns));
1229 eval(shift(@insns));
1230 eval(shift(@insns));
1231 eval(shift(@insns));
1232 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1233 eval(shift(@insns));
1234 eval(shift(@insns));
1235 &vpsrld ($t2,@X[0],$sigma1[2]);
1236 eval(shift(@insns));
1237 eval(shift(@insns));
1238 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1239 eval(shift(@insns));
1240 eval(shift(@insns));
1241 &vpxor ($t3,$t3,$t2);
1242 eval(shift(@insns));
1243 eval(shift(@insns));
1244 eval(shift(@insns));
1245 eval(shift(@insns));
1246 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1247 eval(shift(@insns));
1248 eval(shift(@insns));
1249 eval(shift(@insns));
1250 eval(shift(@insns));
1251 &vpslldq ($t3,$t3,8); # 22 instructions
1252 eval(shift(@insns));
1253 eval(shift(@insns));
1254 eval(shift(@insns));
1255 eval(shift(@insns));
1256 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1257 eval(shift(@insns));
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 eval(shift(@insns));
1261 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1262 foreach (@insns) { eval; } # remaining instructions
1263 &vmovdqa (16*$j."(%rsp)",$t2);
1266 for ($i=0,$j=0; $j<4; $j++) {
1267 &XOP_256_00_47($j,\&body_00_15,@X);
1268 push(@X,shift(@X)); # rotate(@X)
1270 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1271 &jne (".Lxop_00_47");
1273 for ($i=0; $i<16; ) {
1274 foreach(body_00_15()) { eval; }
1278 my @X = map("%xmm$_",(0..7));
1279 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1284 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1285 vmovdqu 0x00($inp),@X[0]
1286 lea $TABLE+0x80(%rip),$Tbl # size optimization
1287 vmovdqu 0x10($inp),@X[1]
1288 vmovdqu 0x20($inp),@X[2]
1289 vpshufb $t3,@X[0],@X[0]
1290 vmovdqu 0x30($inp),@X[3]
1291 vpshufb $t3,@X[1],@X[1]
1292 vmovdqu 0x40($inp),@X[4]
1293 vpshufb $t3,@X[2],@X[2]
1294 vmovdqu 0x50($inp),@X[5]
1295 vpshufb $t3,@X[3],@X[3]
1296 vmovdqu 0x60($inp),@X[6]
1297 vpshufb $t3,@X[4],@X[4]
1298 vmovdqu 0x70($inp),@X[7]
1299 vpshufb $t3,@X[5],@X[5]
1300 vpaddq -0x80($Tbl),@X[0],$t0
1301 vpshufb $t3,@X[6],@X[6]
1302 vpaddq -0x60($Tbl),@X[1],$t1
1303 vpshufb $t3,@X[7],@X[7]
1304 vpaddq -0x40($Tbl),@X[2],$t2
1305 vpaddq -0x20($Tbl),@X[3],$t3
1306 vmovdqa $t0,0x00(%rsp)
1307 vpaddq 0x00($Tbl),@X[4],$t0
1308 vmovdqa $t1,0x10(%rsp)
1309 vpaddq 0x20($Tbl),@X[5],$t1
1310 vmovdqa $t2,0x20(%rsp)
1311 vpaddq 0x40($Tbl),@X[6],$t2
1312 vmovdqa $t3,0x30(%rsp)
1313 vpaddq 0x60($Tbl),@X[7],$t3
1314 vmovdqa $t0,0x40(%rsp)
1316 vmovdqa $t1,0x50(%rsp)
1318 vmovdqa $t2,0x60(%rsp)
1320 vmovdqa $t3,0x70(%rsp)
1326 add \$`16*2*$SZ`,$Tbl
1328 sub XOP_512_00_47 () {
1332 my @insns = (&$body,&$body); # 52 instructions
1334 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1335 eval(shift(@insns));
1336 eval(shift(@insns));
1337 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1338 eval(shift(@insns));
1339 eval(shift(@insns));
1340 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1341 eval(shift(@insns));
1342 eval(shift(@insns));
1343 &vpsrlq ($t0,$t0,$sigma0[2]);
1344 eval(shift(@insns));
1345 eval(shift(@insns));
1346 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1347 eval(shift(@insns));
1348 eval(shift(@insns));
1349 eval(shift(@insns));
1350 eval(shift(@insns));
1351 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1352 eval(shift(@insns));
1353 eval(shift(@insns));
1354 &vpxor ($t0,$t0,$t1);
1355 eval(shift(@insns));
1356 eval(shift(@insns));
1357 eval(shift(@insns));
1358 eval(shift(@insns));
1359 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1360 eval(shift(@insns));
1361 eval(shift(@insns));
1362 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1363 eval(shift(@insns));
1364 eval(shift(@insns));
1365 &vpsrlq ($t2,@X[7],$sigma1[2]);
1366 eval(shift(@insns));
1367 eval(shift(@insns));
1368 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1369 eval(shift(@insns));
1370 eval(shift(@insns));
1371 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1372 eval(shift(@insns));
1373 eval(shift(@insns));
1374 &vpxor ($t3,$t3,$t2);
1375 eval(shift(@insns));
1376 eval(shift(@insns));
1377 eval(shift(@insns));
1378 eval(shift(@insns));
1379 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1380 eval(shift(@insns));
1381 eval(shift(@insns));
1382 eval(shift(@insns));
1383 eval(shift(@insns));
1384 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1385 eval(shift(@insns));
1386 eval(shift(@insns));
1387 eval(shift(@insns));
1388 eval(shift(@insns));
1389 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1390 foreach (@insns) { eval; } # remaining instructions
1391 &vmovdqa (16*$j."(%rsp)",$t2);
1394 for ($i=0,$j=0; $j<8; $j++) {
1395 &XOP_512_00_47($j,\&body_00_15,@X);
1396 push(@X,shift(@X)); # rotate(@X)
1398 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1399 &jne (".Lxop_00_47");
1401 for ($i=0; $i<16; ) {
1402 foreach(body_00_15()) { eval; }
1410 lea 16*$SZ($inp),$inp
1434 $code.=<<___ if ($win64);
1435 movaps 16*$SZ+32(%rsp),%xmm6
1436 movaps 16*$SZ+48(%rsp),%xmm7
1437 movaps 16*$SZ+64(%rsp),%xmm8
1438 movaps 16*$SZ+80(%rsp),%xmm9
1440 $code.=<<___ if ($win64 && $SZ>4);
1441 movaps 16*$SZ+96(%rsp),%xmm10
1442 movaps 16*$SZ+112(%rsp),%xmm11
1454 .size ${func}_xop,.-${func}_xop
1457 ######################################################################
1458 # AVX+shrd code path
1460 local *ror = sub { &shrd(@_[0],@_) };
1463 .type ${func}_avx,\@function,3
1473 mov %rsp,%r11 # copy %rsp
1474 shl \$4,%rdx # num*16
1475 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1476 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1477 and \$-64,%rsp # align stack frame
1478 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp	# save inp, 2nd arg
1480 mov %rdx,$_end # save end pointer, "3rd" arg
1481 mov %r11,$_rsp # save copy of %rsp
1483 $code.=<<___ if ($win64);
1484 movaps %xmm6,16*$SZ+32(%rsp)
1485 movaps %xmm7,16*$SZ+48(%rsp)
1486 movaps %xmm8,16*$SZ+64(%rsp)
1487 movaps %xmm9,16*$SZ+80(%rsp)
1489 $code.=<<___ if ($win64 && $SZ>4);
1490 movaps %xmm10,16*$SZ+96(%rsp)
1491 movaps %xmm11,16*$SZ+112(%rsp)
1506 if ($SZ==4) { # SHA256
1507 my @X = map("%xmm$_",(0..3));
1508 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1511 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1512 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1516 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1517 vmovdqu 0x00($inp),@X[0]
1518 vmovdqu 0x10($inp),@X[1]
1519 vmovdqu 0x20($inp),@X[2]
1520 vmovdqu 0x30($inp),@X[3]
1521 vpshufb $t3,@X[0],@X[0]
1522 lea $TABLE(%rip),$Tbl
1523 vpshufb $t3,@X[1],@X[1]
1524 vpshufb $t3,@X[2],@X[2]
1525 vpaddd 0x00($Tbl),@X[0],$t0
1526 vpshufb $t3,@X[3],@X[3]
1527 vpaddd 0x20($Tbl),@X[1],$t1
1528 vpaddd 0x40($Tbl),@X[2],$t2
1529 vpaddd 0x60($Tbl),@X[3],$t3
1530 vmovdqa $t0,0x00(%rsp)
1532 vmovdqa $t1,0x10(%rsp)
1534 vmovdqa $t2,0x20(%rsp)
1536 vmovdqa $t3,0x30(%rsp)
1542 sub \$`-16*2*$SZ`,$Tbl # size optimization
1544 sub Xupdate_256_AVX () {
1546 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1547 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1548 '&vpsrld ($t2,$t0,$sigma0[0]);',
1549 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1550 '&vpsrld ($t3,$t0,$sigma0[2])',
1551 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1552 '&vpxor ($t0,$t3,$t2)',
1553 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1554 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1555 '&vpxor ($t0,$t0,$t1)',
1556 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1557 '&vpxor ($t0,$t0,$t2)',
1558 '&vpsrld ($t2,$t3,$sigma1[2]);',
1559 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1560 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1561 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1562 '&vpxor ($t2,$t2,$t3);',
1563 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1564 '&vpxor ($t2,$t2,$t3)',
1565 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1566 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1567 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1568 '&vpsrld ($t2,$t3,$sigma1[2])',
1569 '&vpsrlq ($t3,$t3,$sigma1[0])',
1570 '&vpxor ($t2,$t2,$t3);',
1571 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1572 '&vpxor ($t2,$t2,$t3)',
1573 '&vpshufb ($t2,$t2,$t5)',
1574 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1578 sub AVX_256_00_47 () {
1582 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1584 foreach (Xupdate_256_AVX()) { # 29 instructions
1586 eval(shift(@insns));
1587 eval(shift(@insns));
1588 eval(shift(@insns));
1590 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1591 foreach (@insns) { eval; } # remaining instructions
1592 &vmovdqa (16*$j."(%rsp)",$t2);
1595 for ($i=0,$j=0; $j<4; $j++) {
1596 &AVX_256_00_47($j,\&body_00_15,@X);
1597 push(@X,shift(@X)); # rotate(@X)
1599 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1600 &jne (".Lavx_00_47");
1602 for ($i=0; $i<16; ) {
1603 foreach(body_00_15()) { eval; }
1607 my @X = map("%xmm$_",(0..7));
1608 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1614 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1615 vmovdqu 0x00($inp),@X[0]
1616 lea $TABLE+0x80(%rip),$Tbl # size optimization
1617 vmovdqu 0x10($inp),@X[1]
1618 vmovdqu 0x20($inp),@X[2]
1619 vpshufb $t3,@X[0],@X[0]
1620 vmovdqu 0x30($inp),@X[3]
1621 vpshufb $t3,@X[1],@X[1]
1622 vmovdqu 0x40($inp),@X[4]
1623 vpshufb $t3,@X[2],@X[2]
1624 vmovdqu 0x50($inp),@X[5]
1625 vpshufb $t3,@X[3],@X[3]
1626 vmovdqu 0x60($inp),@X[6]
1627 vpshufb $t3,@X[4],@X[4]
1628 vmovdqu 0x70($inp),@X[7]
1629 vpshufb $t3,@X[5],@X[5]
1630 vpaddq -0x80($Tbl),@X[0],$t0
1631 vpshufb $t3,@X[6],@X[6]
1632 vpaddq -0x60($Tbl),@X[1],$t1
1633 vpshufb $t3,@X[7],@X[7]
1634 vpaddq -0x40($Tbl),@X[2],$t2
1635 vpaddq -0x20($Tbl),@X[3],$t3
1636 vmovdqa $t0,0x00(%rsp)
1637 vpaddq 0x00($Tbl),@X[4],$t0
1638 vmovdqa $t1,0x10(%rsp)
1639 vpaddq 0x20($Tbl),@X[5],$t1
1640 vmovdqa $t2,0x20(%rsp)
1641 vpaddq 0x40($Tbl),@X[6],$t2
1642 vmovdqa $t3,0x30(%rsp)
1643 vpaddq 0x60($Tbl),@X[7],$t3
1644 vmovdqa $t0,0x40(%rsp)
1646 vmovdqa $t1,0x50(%rsp)
1648 vmovdqa $t2,0x60(%rsp)
1650 vmovdqa $t3,0x70(%rsp)
1656 add \$`16*2*$SZ`,$Tbl
1658 sub Xupdate_512_AVX () {
1660 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1661 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1662 '&vpsrlq ($t2,$t0,$sigma0[0])',
1663 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1664 '&vpsrlq ($t3,$t0,$sigma0[2])',
1665 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1666 '&vpxor ($t0,$t3,$t2)',
1667 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1668 '&vpxor ($t0,$t0,$t1)',
1669 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1670 '&vpxor ($t0,$t0,$t2)',
1671 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1672 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1673 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1674 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1675 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1676 '&vpxor ($t3,$t3,$t2)',
1677 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1678 '&vpxor ($t3,$t3,$t1)',
1679 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1680 '&vpxor ($t3,$t3,$t2)',
1681 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1682 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1686 sub AVX_512_00_47 () {
1690 my @insns = (&$body,&$body); # 52 instructions
1692 foreach (Xupdate_512_AVX()) { # 23 instructions
1694 eval(shift(@insns));
1695 eval(shift(@insns));
1697 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1698 foreach (@insns) { eval; } # remaining instructions
1699 &vmovdqa (16*$j."(%rsp)",$t2);
1702 for ($i=0,$j=0; $j<8; $j++) {
1703 &AVX_512_00_47($j,\&body_00_15,@X);
1704 push(@X,shift(@X)); # rotate(@X)
1706 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1707 &jne (".Lavx_00_47");
1709 for ($i=0; $i<16; ) {
1710 foreach(body_00_15()) { eval; }
1718 lea 16*$SZ($inp),$inp
1742 $code.=<<___ if ($win64);
1743 movaps 16*$SZ+32(%rsp),%xmm6
1744 movaps 16*$SZ+48(%rsp),%xmm7
1745 movaps 16*$SZ+64(%rsp),%xmm8
1746 movaps 16*$SZ+80(%rsp),%xmm9
1748 $code.=<<___ if ($win64 && $SZ>4);
1749 movaps 16*$SZ+96(%rsp),%xmm10
1750 movaps 16*$SZ+112(%rsp),%xmm11
1762 .size ${func}_avx,.-${func}_avx
1766 ######################################################################
1767 # AVX2+BMI code path
1769 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1773 sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
1776 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1778 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1779 '&and ($a4,$e)', # f&e
1780 '&rorx ($a0,$e,$Sigma1[2])',
1781 '&rorx ($a2,$e,$Sigma1[1])',
1783 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1784 '&lea ($h,"($h,$a4)")',
1785 '&andn ($a4,$e,$g)', # ~e&g
1788 '&rorx ($a1,$e,$Sigma1[0])',
1789 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1790 '&xor ($a0,$a1)', # Sigma1(e)
1793 '&rorx ($a4,$a,$Sigma0[2])',
1794 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1795 '&xor ($a2,$b)', # a^b, b^c in next round
1796 '&rorx ($a1,$a,$Sigma0[1])',
1798 '&rorx ($a0,$a,$Sigma0[0])',
1799 '&lea ($d,"($d,$h)")', # d+=h
1800 '&and ($a3,$a2)', # (b^c)&(a^b)
1803 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1804 '&xor ($a1,$a0)', # Sigma0(a)
1805 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1806 '&mov ($a4,$e)', # copy of f in future
1808 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1810 # and at the finish one has to $a+=$a1
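# Note that Ch(e,f,g)=(e&f)^(~e&g) can be accumulated with lea/add
# above because its two terms are bitwise disjoint, (e&f)&(~e&g)=0,
# so addition, or and xor all produce the same result.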
1814 .type ${func}_avx2,\@function,3
1824 mov %rsp,%r11 # copy %rsp
1825 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1826 shl \$4,%rdx # num*16
1827 and \$-256*$SZ,%rsp # align stack frame
1828 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1829 add \$`2*$SZ*($rounds-8)`,%rsp
1830 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp	# save inp, 2nd arg
1832 mov %rdx,$_end # save end pointer, "3rd" arg
1833 mov %r11,$_rsp # save copy of %rsp
1835 $code.=<<___ if ($win64);
1836 movaps %xmm6,16*$SZ+32(%rsp)
1837 movaps %xmm7,16*$SZ+48(%rsp)
1838 movaps %xmm8,16*$SZ+64(%rsp)
1839 movaps %xmm9,16*$SZ+80(%rsp)
1841 $code.=<<___ if ($win64 && $SZ>4);
1842 movaps %xmm10,16*$SZ+96(%rsp)
1843 movaps %xmm11,16*$SZ+112(%rsp)
1849 sub \$-16*$SZ,$inp # inp++, size optimization
1851 mov $inp,%r12 # borrow $T1
1853 cmp %rdx,$inp # $_end
1855 cmove %rsp,%r12 # next block or random data
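	# (on the last, odd block %r12 is redirected at the stack, so
	#  the upper %ymm lanes process dummy data whose result is
	#  discarded)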
1862 if ($SZ==4) { # SHA256
1863 my @X = map("%ymm$_",(0..3));
1864 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1867 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1868 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1872 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1873 vmovdqu -16*$SZ+0($inp),%xmm0
1874 vmovdqu -16*$SZ+16($inp),%xmm1
1875 vmovdqu -16*$SZ+32($inp),%xmm2
1876 vmovdqu -16*$SZ+48($inp),%xmm3
1877 #mov $inp,$_inp # offload $inp
1878 vinserti128 \$1,(%r12),@X[0],@X[0]
1879 vinserti128 \$1,16(%r12),@X[1],@X[1]
1880 vpshufb $t3,@X[0],@X[0]
1881 vinserti128 \$1,32(%r12),@X[2],@X[2]
1882 vpshufb $t3,@X[1],@X[1]
1883 vinserti128 \$1,48(%r12),@X[3],@X[3]
1885 lea $TABLE(%rip),$Tbl
1886 vpshufb $t3,@X[2],@X[2]
1887 vpaddd 0x00($Tbl),@X[0],$t0
1888 vpshufb $t3,@X[3],@X[3]
1889 vpaddd 0x20($Tbl),@X[1],$t1
1890 vpaddd 0x40($Tbl),@X[2],$t2
1891 vpaddd 0x60($Tbl),@X[3],$t3
1892 vmovdqa $t0,0x00(%rsp)
1894 vmovdqa $t1,0x20(%rsp)
1895 lea -$PUSH8(%rsp),%rsp
1897 vmovdqa $t2,0x00(%rsp)
1899 vmovdqa $t3,0x20(%rsp)
1901 sub \$-16*2*$SZ,$Tbl # size optimization
1908 sub AVX2_256_00_47 () {
1912 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1913 my $base = "+2*$PUSH8(%rsp)";
1915 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1916 foreach (Xupdate_256_AVX()) { # 29 instructions
1918 eval(shift(@insns));
1919 eval(shift(@insns));
1920 eval(shift(@insns));
1922 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1923 foreach (@insns) { eval; } # remaining instructions
1924 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1927 for ($i=0,$j=0; $j<4; $j++) {
1928 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1929 push(@X,shift(@X)); # rotate(@X)
1931 &lea ($Tbl,16*2*$SZ."($Tbl)");
1932 &cmpb (($SZ-1)."($Tbl)",0);
1933 &jne (".Lavx2_00_47");
1935 for ($i=0; $i<16; ) {
1936 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1937 foreach(bodyx_00_15()) { eval; }
1940 my @X = map("%ymm$_",(0..7));
1941 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1947 vmovdqu -16*$SZ($inp),%xmm0
1948 vmovdqu -16*$SZ+16($inp),%xmm1
1949 vmovdqu -16*$SZ+32($inp),%xmm2
1950 lea $TABLE+0x80(%rip),$Tbl # size optimization
1951 vmovdqu -16*$SZ+48($inp),%xmm3
1952 vmovdqu -16*$SZ+64($inp),%xmm4
1953 vmovdqu -16*$SZ+80($inp),%xmm5
1954 vmovdqu -16*$SZ+96($inp),%xmm6
1955 vmovdqu -16*$SZ+112($inp),%xmm7
1956 #mov $inp,$_inp # offload $inp
1957 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
1958 vinserti128 \$1,(%r12),@X[0],@X[0]
1959 vinserti128 \$1,16(%r12),@X[1],@X[1]
1960 vpshufb $t2,@X[0],@X[0]
1961 vinserti128 \$1,32(%r12),@X[2],@X[2]
1962 vpshufb $t2,@X[1],@X[1]
1963 vinserti128 \$1,48(%r12),@X[3],@X[3]
1964 vpshufb $t2,@X[2],@X[2]
1965 vinserti128 \$1,64(%r12),@X[4],@X[4]
1966 vpshufb $t2,@X[3],@X[3]
1967 vinserti128 \$1,80(%r12),@X[5],@X[5]
1968 vpshufb $t2,@X[4],@X[4]
1969 vinserti128 \$1,96(%r12),@X[6],@X[6]
1970 vpshufb $t2,@X[5],@X[5]
1971 vinserti128 \$1,112(%r12),@X[7],@X[7]
1973 vpaddq -0x80($Tbl),@X[0],$t0
1974 vpshufb $t2,@X[6],@X[6]
1975 vpaddq -0x60($Tbl),@X[1],$t1
1976 vpshufb $t2,@X[7],@X[7]
1977 vpaddq -0x40($Tbl),@X[2],$t2
1978 vpaddq -0x20($Tbl),@X[3],$t3
1979 vmovdqa $t0,0x00(%rsp)
1980 vpaddq 0x00($Tbl),@X[4],$t0
1981 vmovdqa $t1,0x20(%rsp)
1982 vpaddq 0x20($Tbl),@X[5],$t1
1983 vmovdqa $t2,0x40(%rsp)
1984 vpaddq 0x40($Tbl),@X[6],$t2
1985 vmovdqa $t3,0x60(%rsp)
1986 lea -$PUSH8(%rsp),%rsp
1987 vpaddq 0x60($Tbl),@X[7],$t3
1988 vmovdqa $t0,0x00(%rsp)
1990 vmovdqa $t1,0x20(%rsp)
1992 vmovdqa $t2,0x40(%rsp)
1994 vmovdqa $t3,0x60(%rsp)
2003 sub AVX2_512_00_47 () {
2007 my @insns = (&$body,&$body); # 48 instructions
2008 my $base = "+2*$PUSH8(%rsp)";
2010 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
2011 foreach (Xupdate_512_AVX()) { # 23 instructions
2014 eval(shift(@insns));
2015 eval(shift(@insns));
2016 eval(shift(@insns));
2019 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2020 foreach (@insns) { eval; } # remaining instructions
2021 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2024 for ($i=0,$j=0; $j<8; $j++) {
2025 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2026 push(@X,shift(@X)); # rotate(@X)
2028 &lea ($Tbl,16*2*$SZ."($Tbl)");
2029 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2030 &jne (".Lavx2_00_47");
2032 for ($i=0; $i<16; ) {
2033 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2034 foreach(bodyx_00_15()) { eval; }
2038 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2040 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2041 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2061 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2072 for ($i=0; $i<8; ) {
2073 my $base="+16($Tbl)";
2074 foreach(bodyx_00_15()) { eval; }
2077 lea -$PUSH8($Tbl),$Tbl
2081 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2083 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2084 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2092 lea `2*16*$SZ`($inp),$inp # inp+=2
2099 cmove %rsp,%r12 # next block or stale data
2116 $code.=<<___ if ($win64);
2117 movaps 16*$SZ+32(%rsp),%xmm6
2118 movaps 16*$SZ+48(%rsp),%xmm7
2119 movaps 16*$SZ+64(%rsp),%xmm8
2120 movaps 16*$SZ+80(%rsp),%xmm9
2122 $code.=<<___ if ($win64 && $SZ>4);
2123 movaps 16*$SZ+96(%rsp),%xmm10
2124 movaps 16*$SZ+112(%rsp),%xmm11
2136 .size ${func}_avx2,.-${func}_avx2
2141 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2142 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2150 .extern __imp_RtlVirtualUnwind
2151 .type se_handler,\@abi-omnipotent
2165 mov 120($context),%rax # pull context->Rax
2166 mov 248($context),%rbx # pull context->Rip
2168 mov 8($disp),%rsi # disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
2171 mov 0(%r11),%r10d # HandlerData[0]
2172 lea (%rsi,%r10),%r10 # prologue label
2173 cmp %r10,%rbx # context->Rip<prologue label
2176 mov 152($context),%rax # pull context->Rsp
2178 mov 4(%r11),%r10d # HandlerData[1]
2179 lea (%rsi,%r10),%r10 # epilogue label
2180 cmp %r10,%rbx # context->Rip>=epilogue label
2183 $code.=<<___ if ($avx>1);
2184 lea .Lavx2_shortcut(%rip),%r10
2185 cmp %r10,%rbx # context->Rip<avx2_shortcut
2189 add \$`2*$SZ*($rounds-8)`,%rax
2193 mov %rax,%rsi # put aside Rsp
2194 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2203 mov %rbx,144($context) # restore context->Rbx
2204 mov %rbp,160($context) # restore context->Rbp
2205 mov %r12,216($context) # restore context->R12
2206 mov %r13,224($context) # restore context->R13
2207 mov %r14,232($context) # restore context->R14
2208 mov %r15,240($context) # restore context->R15
2210 lea .Lepilogue(%rip),%r10
2212 jb .Lin_prologue # non-AVX code
2214 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2215 lea 512($context),%rdi # &context.Xmm6
2216 mov \$`$SZ==4?8:12`,%ecx
2217 .long 0xa548f3fc # cld; rep movsq
2222 mov %rax,152($context) # restore context->Rsp
2223 mov %rsi,168($context) # restore context->Rsi
2224 mov %rdi,176($context) # restore context->Rdi
2226 mov 40($disp),%rdi # disp->ContextRecord
2227 mov $context,%rsi # context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2229 .long 0xa548f3fc # cld; rep movsq
2232 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2233 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2234 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2235 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2236 mov 40(%rsi),%r10 # disp->ContextRecord
2237 lea 56(%rsi),%r11 # &disp->HandlerData
2238 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2239 mov %r10,32(%rsp) # arg5
2240 mov %r11,40(%rsp) # arg6
2241 mov %r12,48(%rsp) # arg7
2242 mov %rcx,56(%rsp) # arg8, (NULL)
2243 call *__imp_RtlVirtualUnwind(%rip)
2245 mov \$1,%eax # ExceptionContinueSearch
2257 .size se_handler,.-se_handler
2259 .type shaext_handler,\@abi-omnipotent
2273 mov 120($context),%rax # pull context->Rax
2274 mov 248($context),%rbx # pull context->Rip
2276 lea .Lprologue_shaext(%rip),%r10
2277 cmp %r10,%rbx # context->Rip<.Lprologue
2280 lea .Lepilogue_shaext(%rip),%r10
2281 cmp %r10,%rbx # context->Rip>=.Lepilogue
2284 lea -8-5*16(%rax),%rsi
2285 lea 512($context),%rdi # &context.Xmm6
2287 .long 0xa548f3fc # cld; rep movsq
2290 .size shaext_handler,.-shaext_handler
2294 .rva .LSEH_begin_$func
2295 .rva .LSEH_end_$func
2296 .rva .LSEH_info_$func
2298 $code.=<<___ if ($SZ==4);
2299 .rva .LSEH_begin_${func}_shaext
2300 .rva .LSEH_end_${func}_shaext
2301 .rva .LSEH_info_${func}_shaext
2302 .rva .LSEH_begin_${func}_ssse3
2303 .rva .LSEH_end_${func}_ssse3
2304 .rva .LSEH_info_${func}_ssse3
2306 $code.=<<___ if ($avx && $SZ==8);
2307 .rva .LSEH_begin_${func}_xop
2308 .rva .LSEH_end_${func}_xop
2309 .rva .LSEH_info_${func}_xop
2311 $code.=<<___ if ($avx);
2312 .rva .LSEH_begin_${func}_avx
2313 .rva .LSEH_end_${func}_avx
2314 .rva .LSEH_info_${func}_avx
2316 $code.=<<___ if ($avx>1);
2317 .rva .LSEH_begin_${func}_avx2
2318 .rva .LSEH_end_${func}_avx2
2319 .rva .LSEH_info_${func}_avx2
2327 .rva .Lprologue,.Lepilogue # HandlerData[]
2329 $code.=<<___ if ($SZ==4);
2330 .LSEH_info_${func}_shaext:
2333 .LSEH_info_${func}_ssse3:
2336 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2338 $code.=<<___ if ($avx && $SZ==8);
2339 .LSEH_info_${func}_xop:
2342 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2344 $code.=<<___ if ($avx);
2345 .LSEH_info_${func}_avx:
2348 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2350 $code.=<<___ if ($avx>1);
2351 .LSEH_info_${func}_avx2:
2354 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2361 "sha256rnds2" => 0xcb,
2362 "sha256msg1" => 0xcc,
2363 "sha256msg2" => 0xcd );
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2366 my @opcode=(0x0f,0x38);
2367 push @opcode,$opcodelet{$instr};
2368 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2369 return ".byte\t".join(',',@opcode);
    return $instr."\t".$_[0];
2375 foreach (split("\n",$code)) {
2376 s/\`([^\`]*)\`/eval $1/geo;
2378 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;