3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer SHA256 procedure processes n buffers in parallel by
11 # placing buffer data to designated lane of SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
15 # this +aesni(i) sha256 aesni-sha256 gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
18 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
19 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
20 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
21 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
22 # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170%
23 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
25 # (i) multi-block CBC encrypt with 128-bit key;
26 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
27 # because of lower AES-NI instruction throughput, nor is there
28 # AES-NI-SHA256 stitch for these processors;
29 # (iii) "this" is for n=8, when we gather twice as much data, result
30 # for n=4 is 20.3+4.44=24.7;
31 # (iv) presented improvement coefficients are asymptotic limits and
32 # in real-life application are somewhat lower, e.g. for 2KB
33 # fragments they range from 75% to 130% (on Haswell);
# Command-line handling: a lone "file.ext" argument is the output file,
# not a flavour (elf/macosx/mingw64/nasm/masm).
37 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Windows ABI (different calling convention + SEH) is selected for the
# nasm/masm/mingw64 flavours or a ".asm" output name.
39 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script, falling back
# to the canonical ../../perlasm/ location in the OpenSSL tree.
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
44 die "can't locate x86_64-xlate.pl";
# Probe the toolchain to decide how much AVX code may be emitted:
# $avx = 0 (none), 1 (AVX), 2 (AVX2).  Each probe only runs if the
# previous ones failed.
# GNU assembler: 2.19 groks AVX, 2.22 groks AVX2.
48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50 $avx = ($1>=2.19) + ($1>=2.22);
# nasm (Win64): 2.09 groks AVX, 2.10 groks AVX2.
53 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55 $avx = ($1>=2.09) + ($1>=2.10);
# MSVC ml64: version 10 (VS2010) groks AVX, 11 groks AVX2.
58 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60 $avx = ($1>=10) + ($1>=11);
# clang/LLVM: 3.0 groks AVX, anything newer groks AVX2.
63 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
64 $avx = ($2>=3.0) + ($2>3.0);
# Pipe generated code through the perlasm translator; die if the child
# cannot be spawned (an unchecked open here would silently emit nothing).
67 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# C prototype of the generated entry point.  ctx holds 8 interleaved
# SHA256 states (one lane per buffer); inp[] carries per-buffer data
# pointer and block count; num selects 4- or 8-lane operation.
70 # void sha256_multi_block (
71 # struct { unsigned int A[8];
78 # unsigned int H[8]; } *ctx,
79 # struct { void *ptr; int blocks; } inp[8],
80 # int num); /* 1 or 2 */
# Register allocation (System V calling convention).
82 $ctx="%rdi"; # 1st arg
83 $inp="%rsi"; # 2nd arg
84 $num="%edx"; # 3rd arg
# One input pointer per lane.
85 @ptr=map("%r$_",(8..11));
# Working state A..H lives in the high xmm registers; temporaries,
# message words and sigma scratch in the low ones.
88 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
89 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
# Xi_off body (header outside this view): maps round index to the stack
# slot of the saved message word; the schedule is kept in a 16-entry
# ring split across the %rax- and %rbx-based frames.
96 $off %= 16; $off *= $REG_SZ;
97 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
101 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
103 $code.=<<___ if ($i<15);
104 movd `4*$i`(@ptr[0]),$Xi
105 movd `4*$i`(@ptr[1]),$t1
106 movd `4*$i`(@ptr[2]),$t2
107 movd `4*$i`(@ptr[3]),$t3
112 $code.=<<___ if ($i==15);
113 movd `4*$i`(@ptr[0]),$Xi
114 lea `16*4`(@ptr[0]),@ptr[0]
115 movd `4*$i`(@ptr[1]),$t1
116 lea `16*4`(@ptr[1]),@ptr[1]
117 movd `4*$i`(@ptr[2]),$t2
118 lea `16*4`(@ptr[2]),@ptr[2]
119 movd `4*$i`(@ptr[3]),$t3
120 lea `16*4`(@ptr[3]),@ptr[3]
127 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
129 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
133 movdqa $Xi,`&Xi_off($i)`
139 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
144 `"prefetcht0 63(@ptr[0])" if ($i==15)`
146 movdqa $e,$axb # borrow $axb
152 `"prefetcht0 63(@ptr[1])" if ($i==15)`
154 pxor $t3,$sigma # Sigma1(e)
157 paddd $sigma,$Xi # Xi+=Sigma1(e)
158 pxor $axb,$t1 # Ch(e,f,g)
162 pxor $a,$axb # a^b, b^c in next round
164 `"prefetcht0 63(@ptr[2])" if ($i==15)`
167 paddd $t1,$Xi # Xi+=Ch(e,f,g)
172 `"prefetcht0 63(@ptr[3])" if ($i==15)`
178 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
180 pxor $t3,$sigma # Sigma0(a)
183 paddd $sigma,$h # h+=Sigma0(a)
185 $code.=<<___ if (($i%8)==7);
186 lea `32*8`($Tbl),$Tbl
188 ($axb,$bxc)=($bxc,$axb);
195 movdqa `&Xi_off($i+1)`,$Xn
196 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
204 movdqa `&Xi_off($i+14)`,$t1
208 movdqa $t1,$axb # borrow $axb
216 pxor $t3,$sigma # sigma0(X[i+1])
218 paddd $sigma,$Xi # Xi+=sigma0(e)
224 pxor $t2,$t1 # sigma0(X[i+14])
225 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
234 .extern OPENSSL_ia32cap_P
236 .globl sha256_multi_block
237 .type sha256_multi_block,\@function,3
240 mov OPENSSL_ia32cap_P+4(%rip),%rcx
241 bt \$61,%rcx # check SHA bit
244 $code.=<<___ if ($avx);
253 $code.=<<___ if ($win64);
256 movaps %xmm7,0x10(%rsp)
257 movaps %xmm8,0x20(%rsp)
258 movaps %xmm9,0x30(%rsp)
259 movaps %xmm10,-0x78(%rax)
260 movaps %xmm11,-0x68(%rax)
261 movaps %xmm12,-0x58(%rax)
262 movaps %xmm13,-0x48(%rax)
263 movaps %xmm14,-0x38(%rax)
264 movaps %xmm15,-0x28(%rax)
267 sub \$`$REG_SZ*18`, %rsp
269 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
271 lea K256+128(%rip),$Tbl
272 lea `$REG_SZ*16`(%rsp),%rbx
273 lea 0x80($ctx),$ctx # size optimization
276 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
279 for($i=0;$i<4;$i++) {
281 mov `16*$i+0`($inp),@ptr[$i] # input pointer
282 mov `16*$i+8`($inp),%ecx # number of blocks
284 cmovg %ecx,$num # find maximum
286 mov %ecx,`4*$i`(%rbx) # initialize counters
287 cmovle $Tbl,@ptr[$i] # cancel input
294 movdqu 0x00-0x80($ctx),$A # load context
296 movdqu 0x20-0x80($ctx),$B
297 movdqu 0x40-0x80($ctx),$C
298 movdqu 0x60-0x80($ctx),$D
299 movdqu 0x80-0x80($ctx),$E
300 movdqu 0xa0-0x80($ctx),$F
301 movdqu 0xc0-0x80($ctx),$G
302 movdqu 0xe0-0x80($ctx),$H
303 movdqu .Lpbswap(%rip),$Xn
309 pxor $B,$bxc # magic seed
311 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
313 movdqu `&Xi_off($i)`,$Xi
319 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
325 lea K256+128(%rip),$Tbl
327 movdqa (%rbx),$sigma # pull counters
328 cmp 4*0(%rbx),%ecx # examine counters
330 cmovge $Tbl,@ptr[0] # cancel input
335 pcmpgtd $t1,$Xn # mask value
338 paddd $Xn,$sigma # counters--
341 movdqu 0x00-0x80($ctx),$t1
343 movdqu 0x20-0x80($ctx),$t2
345 movdqu 0x40-0x80($ctx),$t3
347 movdqu 0x60-0x80($ctx),$Xi
350 movdqu 0x80-0x80($ctx),$t1
353 movdqu 0xa0-0x80($ctx),$t2
356 movdqu 0xc0-0x80($ctx),$t3
359 movdqu 0xe0-0x80($ctx),$Xi
363 movdqu $A,0x00-0x80($ctx)
365 movdqu $B,0x20-0x80($ctx)
367 movdqu $C,0x40-0x80($ctx)
368 movdqu $D,0x60-0x80($ctx)
369 movdqu $E,0x80-0x80($ctx)
370 movdqu $F,0xa0-0x80($ctx)
371 movdqu $G,0xc0-0x80($ctx)
372 movdqu $H,0xe0-0x80($ctx)
374 movdqa $sigma,(%rbx) # save counters
375 movdqa .Lpbswap(%rip),$Xn
379 mov `$REG_SZ*17+8`(%rsp),$num
380 lea $REG_SZ($ctx),$ctx
381 lea `16*$REG_SZ/4`($inp),$inp
386 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
388 $code.=<<___ if ($win64);
389 movaps -0xb8(%rax),%xmm6
390 movaps -0xa8(%rax),%xmm7
391 movaps -0x98(%rax),%xmm8
392 movaps -0x88(%rax),%xmm9
393 movaps -0x78(%rax),%xmm10
394 movaps -0x68(%rax),%xmm11
395 movaps -0x58(%rax),%xmm12
396 movaps -0x48(%rax),%xmm13
397 movaps -0x38(%rax),%xmm14
398 movaps -0x28(%rax),%xmm15
406 .size sha256_multi_block,.-sha256_multi_block
409 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
410 my @MSG0=map("%xmm$_",(4..7));
411 my @MSG1=map("%xmm$_",(8..11));
414 .type sha256_multi_block_shaext,\@function,3
416 sha256_multi_block_shaext:
422 $code.=<<___ if ($win64);
425 movaps %xmm7,0x10(%rsp)
426 movaps %xmm8,0x20(%rsp)
427 movaps %xmm9,0x30(%rsp)
428 movaps %xmm10,-0x78(%rax)
429 movaps %xmm11,-0x68(%rax)
430 movaps %xmm12,-0x58(%rax)
431 movaps %xmm13,-0x48(%rax)
432 movaps %xmm14,-0x38(%rax)
433 movaps %xmm15,-0x28(%rax)
436 sub \$`$REG_SZ*18`,%rsp
437 shl \$1,$num # we process pair at a time
439 lea 0x80($ctx),$ctx # size optimization
440 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
442 lea `$REG_SZ*16`(%rsp),%rbx
443 lea K256_shaext+0x80(%rip),$Tbl
446 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
449 for($i=0;$i<2;$i++) {
451 mov `16*$i+0`($inp),@ptr[$i] # input pointer
452 mov `16*$i+8`($inp),%ecx # number of blocks
454 cmovg %ecx,$num # find maximum
456 mov %ecx,`4*$i`(%rbx) # initialize counters
457 cmovle %rsp,@ptr[$i] # cancel input
464 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
465 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
466 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
467 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
468 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
469 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
470 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
471 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
473 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
474 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
475 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
476 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
477 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
481 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
482 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
483 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
484 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
486 pshufd \$0b00011011,$ABEF0,$ABEF0
487 pshufd \$0b00011011,$CDGH0,$CDGH0
488 pshufd \$0b00011011,$ABEF1,$ABEF1
489 pshufd \$0b00011011,$CDGH1,$CDGH1
494 movdqu 0x00(@ptr[0]),@MSG0[0]
495 movdqu 0x00(@ptr[1]),@MSG1[0]
496 movdqu 0x10(@ptr[0]),@MSG0[1]
497 movdqu 0x10(@ptr[1]),@MSG1[1]
498 movdqu 0x20(@ptr[0]),@MSG0[2]
499 pshufb $TMPx,@MSG0[0]
500 movdqu 0x20(@ptr[1]),@MSG1[2]
501 pshufb $TMPx,@MSG1[0]
502 movdqu 0x30(@ptr[0]),@MSG0[3]
503 lea 0x40(@ptr[0]),@ptr[0]
504 movdqu 0x30(@ptr[1]),@MSG1[3]
505 lea 0x40(@ptr[1]),@ptr[1]
507 movdqa 0*16-0x80($Tbl),$Wi
508 pshufb $TMPx,@MSG0[1]
510 pxor $ABEF0,@MSG0[0] # black magic
512 movdqa 0*16-0x80($Tbl),$TMP1
513 pshufb $TMPx,@MSG1[1]
515 movdqa $CDGH0,0x50(%rsp) # offload
516 sha256rnds2 $ABEF0,$CDGH0 # 0-3
517 pxor $ABEF1,@MSG1[0] # black magic
519 movdqa $CDGH1,0x70(%rsp)
520 sha256rnds2 $ABEF1,$CDGH1 # 0-3
521 pshufd \$0x0e,$TMP0,$Wi
522 pxor $ABEF0,@MSG0[0] # black magic
523 movdqa $ABEF0,0x40(%rsp) # offload
524 sha256rnds2 $CDGH0,$ABEF0
525 pshufd \$0x0e,$TMP1,$Wi
526 pxor $ABEF1,@MSG1[0] # black magic
527 movdqa $ABEF1,0x60(%rsp)
528 movdqa 1*16-0x80($Tbl),$TMP0
530 pshufb $TMPx,@MSG0[2]
531 sha256rnds2 $CDGH1,$ABEF1
534 movdqa 1*16-0x80($Tbl),$TMP1
536 sha256rnds2 $ABEF0,$CDGH0 # 4-7
538 prefetcht0 127(@ptr[0])
539 pshufb $TMPx,@MSG0[3]
540 pshufb $TMPx,@MSG1[2]
541 prefetcht0 127(@ptr[1])
542 sha256rnds2 $ABEF1,$CDGH1 # 4-7
543 pshufd \$0x0e,$TMP0,$Wi
544 pshufb $TMPx,@MSG1[3]
545 sha256msg1 @MSG0[1],@MSG0[0]
546 sha256rnds2 $CDGH0,$ABEF0
547 pshufd \$0x0e,$TMP1,$Wi
548 movdqa 2*16-0x80($Tbl),$TMP0
550 sha256rnds2 $CDGH1,$ABEF1
553 movdqa 2*16-0x80($Tbl),$TMP1
555 sha256rnds2 $ABEF0,$CDGH0 # 8-11
556 sha256msg1 @MSG1[1],@MSG1[0]
558 movdqa @MSG0[3],$TMPx
559 sha256rnds2 $ABEF1,$CDGH1 # 8-11
560 pshufd \$0x0e,$TMP0,$Wi
561 palignr \$4,@MSG0[2],$TMPx
563 movdqa @MSG1[3],$TMPx
564 palignr \$4,@MSG1[2],$TMPx
565 sha256msg1 @MSG0[2],@MSG0[1]
566 sha256rnds2 $CDGH0,$ABEF0
567 pshufd \$0x0e,$TMP1,$Wi
568 movdqa 3*16-0x80($Tbl),$TMP0
570 sha256rnds2 $CDGH1,$ABEF1
571 sha256msg1 @MSG1[2],@MSG1[1]
574 movdqa 3*16-0x80($Tbl),$TMP1
577 sha256msg2 @MSG0[3],@MSG0[0]
578 sha256rnds2 $ABEF0,$CDGH0 # 12-15
580 movdqa @MSG0[0],$TMPx
581 palignr \$4,@MSG0[3],$TMPx
582 sha256rnds2 $ABEF1,$CDGH1 # 12-15
583 sha256msg2 @MSG1[3],@MSG1[0]
584 pshufd \$0x0e,$TMP0,$Wi
586 movdqa @MSG1[0],$TMPx
587 palignr \$4,@MSG1[3],$TMPx
588 sha256msg1 @MSG0[3],@MSG0[2]
589 sha256rnds2 $CDGH0,$ABEF0
590 pshufd \$0x0e,$TMP1,$Wi
591 movdqa 4*16-0x80($Tbl),$TMP0
593 sha256rnds2 $CDGH1,$ABEF1
594 sha256msg1 @MSG1[3],@MSG1[2]
596 for($i=4;$i<16-3;$i++) {
599 movdqa $i*16-0x80($Tbl),$TMP1
602 sha256msg2 @MSG0[0],@MSG0[1]
603 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
605 movdqa @MSG0[1],$TMPx
606 palignr \$4,@MSG0[0],$TMPx
607 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
608 sha256msg2 @MSG1[0],@MSG1[1]
609 pshufd \$0x0e,$TMP0,$Wi
611 movdqa @MSG1[1],$TMPx
612 palignr \$4,@MSG1[0],$TMPx
613 sha256msg1 @MSG0[0],@MSG0[3]
614 sha256rnds2 $CDGH0,$ABEF0
615 pshufd \$0x0e,$TMP1,$Wi
616 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
618 sha256rnds2 $CDGH1,$ABEF1
619 sha256msg1 @MSG1[0],@MSG1[3]
621 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
625 movdqa 13*16-0x80($Tbl),$TMP1
628 sha256msg2 @MSG0[0],@MSG0[1]
629 sha256rnds2 $ABEF0,$CDGH0 # 52-55
631 movdqa @MSG0[1],$TMPx
632 palignr \$4,@MSG0[0],$TMPx
633 sha256rnds2 $ABEF1,$CDGH1 # 52-55
634 sha256msg2 @MSG1[0],@MSG1[1]
635 pshufd \$0x0e,$TMP0,$Wi
637 movdqa @MSG1[1],$TMPx
638 palignr \$4,@MSG1[0],$TMPx
640 sha256rnds2 $CDGH0,$ABEF0
641 pshufd \$0x0e,$TMP1,$Wi
642 movdqa 14*16-0x80($Tbl),$TMP0
644 sha256rnds2 $CDGH1,$ABEF1
647 movdqa 14*16-0x80($Tbl),$TMP1
650 sha256msg2 @MSG0[1],@MSG0[2]
652 sha256rnds2 $ABEF0,$CDGH0 # 56-59
655 pxor @MSG0[1],@MSG0[1] # zero
656 sha256rnds2 $ABEF1,$CDGH1 # 56-59
657 sha256msg2 @MSG1[1],@MSG1[2]
658 pshufd \$0x0e,$TMP0,$Wi
659 movdqa 15*16-0x80($Tbl),$TMP0
661 movq (%rbx),@MSG0[2] # pull counters
663 sha256rnds2 $CDGH0,$ABEF0
664 pshufd \$0x0e,$TMP1,$Wi
665 movdqa 15*16-0x80($Tbl),$TMP1
667 sha256rnds2 $CDGH1,$ABEF1
670 cmp 4*0(%rbx),%ecx # examine counters
671 cmovge %rsp,@ptr[0] # cancel input
674 pshufd \$0x00,@MSG0[2],@MSG1[0]
675 sha256rnds2 $ABEF0,$CDGH0 # 60-63
677 pshufd \$0x55,@MSG0[2],@MSG1[1]
678 movdqa @MSG0[2],@MSG1[2]
679 sha256rnds2 $ABEF1,$CDGH1 # 60-63
680 pshufd \$0x0e,$TMP0,$Wi
681 pcmpgtd @MSG0[1],@MSG1[0]
682 pcmpgtd @MSG0[1],@MSG1[1]
683 sha256rnds2 $CDGH0,$ABEF0
684 pshufd \$0x0e,$TMP1,$Wi
685 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
686 movdqa K256_shaext-0x10(%rip),$TMPx
687 sha256rnds2 $CDGH1,$ABEF1
693 paddd @MSG0[2],@MSG1[2] # counters--
695 paddd 0x50(%rsp),$CDGH0
696 paddd 0x70(%rsp),$CDGH1
697 paddd 0x40(%rsp),$ABEF0
698 paddd 0x60(%rsp),$ABEF1
700 movq @MSG1[2],(%rbx) # save counters
704 mov `$REG_SZ*17+8`(%rsp),$num
706 pshufd \$0b00011011,$ABEF0,$ABEF0
707 pshufd \$0b00011011,$CDGH0,$CDGH0
708 pshufd \$0b00011011,$ABEF1,$ABEF1
709 pshufd \$0b00011011,$CDGH1,$CDGH1
711 movdqa $ABEF0,@MSG0[0]
712 movdqa $CDGH0,@MSG0[1]
713 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
714 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
715 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
716 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
718 movq $ABEF0,0x00-0x80($ctx) # A1.A0
720 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
722 movq $ABEF0,0x20-0x80($ctx) # B1.B0
723 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
725 movq $CDGH0,0x40-0x80($ctx) # C1.C0
727 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
729 movq $CDGH0,0x60-0x80($ctx) # D1.D0
730 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
732 lea `$REG_SZ/2`($ctx),$ctx
733 lea `16*2`($inp),$inp
735 jnz .Loop_grande_shaext
738 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
740 $code.=<<___ if ($win64);
741 movaps -0xb8(%rax),%xmm6
742 movaps -0xa8(%rax),%xmm7
743 movaps -0x98(%rax),%xmm8
744 movaps -0x88(%rax),%xmm9
745 movaps -0x78(%rax),%xmm10
746 movaps -0x68(%rax),%xmm11
747 movaps -0x58(%rax),%xmm12
748 movaps -0x48(%rax),%xmm13
749 movaps -0x38(%rax),%xmm14
750 movaps -0x28(%rax),%xmm15
758 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
762 sub ROUND_00_15_avx {
763 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
765 $code.=<<___ if ($i<15 && $REG_SZ==16);
766 vmovd `4*$i`(@ptr[0]),$Xi
767 vmovd `4*$i`(@ptr[1]),$t1
768 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
769 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
770 vpunpckldq $t1,$Xi,$Xi
773 $code.=<<___ if ($i==15 && $REG_SZ==16);
774 vmovd `4*$i`(@ptr[0]),$Xi
775 lea `16*4`(@ptr[0]),@ptr[0]
776 vmovd `4*$i`(@ptr[1]),$t1
777 lea `16*4`(@ptr[1]),@ptr[1]
778 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
779 lea `16*4`(@ptr[2]),@ptr[2]
780 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
781 lea `16*4`(@ptr[3]),@ptr[3]
782 vpunpckldq $t1,$Xi,$Xi
785 $code.=<<___ if ($i<15 && $REG_SZ==32);
786 vmovd `4*$i`(@ptr[0]),$Xi
787 vmovd `4*$i`(@ptr[4]),$t1
788 vmovd `4*$i`(@ptr[1]),$t2
789 vmovd `4*$i`(@ptr[5]),$t3
790 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
791 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
792 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
793 vpunpckldq $t2,$Xi,$Xi
794 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
795 vpunpckldq $t3,$t1,$t1
796 vinserti128 $t1,$Xi,$Xi
799 $code.=<<___ if ($i==15 && $REG_SZ==32);
800 vmovd `4*$i`(@ptr[0]),$Xi
801 lea `16*4`(@ptr[0]),@ptr[0]
802 vmovd `4*$i`(@ptr[4]),$t1
803 lea `16*4`(@ptr[4]),@ptr[4]
804 vmovd `4*$i`(@ptr[1]),$t2
805 lea `16*4`(@ptr[1]),@ptr[1]
806 vmovd `4*$i`(@ptr[5]),$t3
807 lea `16*4`(@ptr[5]),@ptr[5]
808 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
809 lea `16*4`(@ptr[2]),@ptr[2]
810 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
811 lea `16*4`(@ptr[6]),@ptr[6]
812 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
813 lea `16*4`(@ptr[3]),@ptr[3]
814 vpunpckldq $t2,$Xi,$Xi
815 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
816 lea `16*4`(@ptr[7]),@ptr[7]
817 vpunpckldq $t3,$t1,$t1
818 vinserti128 $t1,$Xi,$Xi
824 vmovdqu $Xi,`&Xi_off($i)`
825 vpaddd $h,$Xi,$Xi # Xi+=h
828 vpxor $t3,$sigma,$sigma
830 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
831 vpxor $t2,$sigma,$sigma
834 vpxor $t3,$sigma,$sigma
835 `"prefetcht0 63(@ptr[0])" if ($i==15)`
838 vpand $f,$e,$axb # borrow $axb
839 `"prefetcht0 63(@ptr[1])" if ($i==15)`
840 vpxor $t2,$sigma,$sigma
842 vpsrld \$2,$a,$h # borrow $h
843 vpxor $t3,$sigma,$sigma # Sigma1(e)
844 `"prefetcht0 63(@ptr[2])" if ($i==15)`
846 vpxor $axb,$t1,$t1 # Ch(e,f,g)
847 vpxor $a,$b,$axb # a^b, b^c in next round
848 `"prefetcht0 63(@ptr[3])" if ($i==15)`
850 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
853 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
855 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
857 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
861 vpxor $t3,$sigma,$sigma
862 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
864 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
865 vpaddd $Xi,$d,$d # d+=Xi
866 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
867 vpxor $t2,$sigma,$sigma
868 vpxor $t3,$sigma,$sigma # Sigma0(a)
870 vpaddd $Xi,$h,$h # h+=Xi
871 vpaddd $sigma,$h,$h # h+=Sigma0(a)
873 $code.=<<___ if (($i%8)==7);
876 ($axb,$bxc)=($bxc,$axb);
879 sub ROUND_16_XX_avx {
883 vmovdqu `&Xi_off($i+1)`,$Xn
884 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
886 vpsrld \$3,$Xn,$sigma
889 vpxor $t2,$sigma,$sigma
891 vpxor $t3,$sigma,$sigma
893 vmovdqu `&Xi_off($i+14)`,$t1
894 vpsrld \$10,$t1,$axb # borrow $axb
896 vpxor $t2,$sigma,$sigma
898 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
900 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
901 vpxor $t2,$axb,$sigma
903 vpxor $t3,$sigma,$sigma
905 vpxor $t2,$sigma,$sigma
906 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
907 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
909 &ROUND_00_15_avx($i,@_);
914 .type sha256_multi_block_avx,\@function,3
916 sha256_multi_block_avx:
919 $code.=<<___ if ($avx>1);
934 $code.=<<___ if ($win64);
937 movaps %xmm7,0x10(%rsp)
938 movaps %xmm8,0x20(%rsp)
939 movaps %xmm9,0x30(%rsp)
940 movaps %xmm10,-0x78(%rax)
941 movaps %xmm11,-0x68(%rax)
942 movaps %xmm12,-0x58(%rax)
943 movaps %xmm13,-0x48(%rax)
944 movaps %xmm14,-0x38(%rax)
945 movaps %xmm15,-0x28(%rax)
948 sub \$`$REG_SZ*18`, %rsp
950 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
952 lea K256+128(%rip),$Tbl
953 lea `$REG_SZ*16`(%rsp),%rbx
954 lea 0x80($ctx),$ctx # size optimization
957 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
960 for($i=0;$i<4;$i++) {
962 mov `16*$i+0`($inp),@ptr[$i] # input pointer
963 mov `16*$i+8`($inp),%ecx # number of blocks
965 cmovg %ecx,$num # find maximum
967 mov %ecx,`4*$i`(%rbx) # initialize counters
968 cmovle $Tbl,@ptr[$i] # cancel input
975 vmovdqu 0x00-0x80($ctx),$A # load context
977 vmovdqu 0x20-0x80($ctx),$B
978 vmovdqu 0x40-0x80($ctx),$C
979 vmovdqu 0x60-0x80($ctx),$D
980 vmovdqu 0x80-0x80($ctx),$E
981 vmovdqu 0xa0-0x80($ctx),$F
982 vmovdqu 0xc0-0x80($ctx),$G
983 vmovdqu 0xe0-0x80($ctx),$H
984 vmovdqu .Lpbswap(%rip),$Xn
989 vpxor $B,$C,$bxc # magic seed
991 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
993 vmovdqu `&Xi_off($i)`,$Xi
999 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1005 lea K256+128(%rip),$Tbl
1007 for($i=0;$i<4;$i++) {
1009 cmp `4*$i`(%rbx),%ecx # examine counters
1010 cmovge $Tbl,@ptr[$i] # cancel input
1014 vmovdqa (%rbx),$sigma # pull counters
1017 vpcmpgtd $t1,$Xn,$Xn # mask value
1018 vpaddd $Xn,$sigma,$sigma # counters--
1020 vmovdqu 0x00-0x80($ctx),$t1
1022 vmovdqu 0x20-0x80($ctx),$t2
1024 vmovdqu 0x40-0x80($ctx),$t3
1026 vmovdqu 0x60-0x80($ctx),$Xi
1029 vmovdqu 0x80-0x80($ctx),$t1
1032 vmovdqu 0xa0-0x80($ctx),$t2
1035 vmovdqu 0xc0-0x80($ctx),$t3
1038 vmovdqu 0xe0-0x80($ctx),$Xi
1042 vmovdqu $A,0x00-0x80($ctx)
1044 vmovdqu $B,0x20-0x80($ctx)
1046 vmovdqu $C,0x40-0x80($ctx)
1047 vmovdqu $D,0x60-0x80($ctx)
1048 vmovdqu $E,0x80-0x80($ctx)
1049 vmovdqu $F,0xa0-0x80($ctx)
1050 vmovdqu $G,0xc0-0x80($ctx)
1051 vmovdqu $H,0xe0-0x80($ctx)
1053 vmovdqu $sigma,(%rbx) # save counters
1054 vmovdqu .Lpbswap(%rip),$Xn
1058 mov `$REG_SZ*17+8`(%rsp),$num
1059 lea $REG_SZ($ctx),$ctx
1060 lea `16*$REG_SZ/4`($inp),$inp
1062 jnz .Loop_grande_avx
1065 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1068 $code.=<<___ if ($win64);
1069 movaps -0xb8(%rax),%xmm6
1070 movaps -0xa8(%rax),%xmm7
1071 movaps -0x98(%rax),%xmm8
1072 movaps -0x88(%rax),%xmm9
1073 movaps -0x78(%rax),%xmm10
1074 movaps -0x68(%rax),%xmm11
1075 movaps -0x58(%rax),%xmm12
1076 movaps -0x48(%rax),%xmm13
1077 movaps -0x38(%rax),%xmm14
1078 movaps -0x28(%rax),%xmm15
1086 .size sha256_multi_block_avx,.-sha256_multi_block_avx
1089 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1092 @ptr=map("%r$_",(12..15,8..11));
1094 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1095 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1098 .type sha256_multi_block_avx2,\@function,3
1100 sha256_multi_block_avx2:
1110 $code.=<<___ if ($win64);
1111 lea -0xa8(%rsp),%rsp
1113 movaps %xmm7,0x10(%rsp)
1114 movaps %xmm8,0x20(%rsp)
1115 movaps %xmm9,0x30(%rsp)
1116 movaps %xmm10,0x40(%rsp)
1117 movaps %xmm11,0x50(%rsp)
1118 movaps %xmm12,-0x78(%rax)
1119 movaps %xmm13,-0x68(%rax)
1120 movaps %xmm14,-0x58(%rax)
1121 movaps %xmm15,-0x48(%rax)
1124 sub \$`$REG_SZ*18`, %rsp
1126 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1128 lea K256+128(%rip),$Tbl
1129 lea 0x80($ctx),$ctx # size optimization
1132 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1134 lea `$REG_SZ*16`(%rsp),%rbx
1136 for($i=0;$i<8;$i++) {
1138 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1139 mov `16*$i+8`($inp),%ecx # number of blocks
1141 cmovg %ecx,$num # find maximum
1143 mov %ecx,`4*$i`(%rbx) # initialize counters
1144 cmovle $Tbl,@ptr[$i] # cancel input
1148 vmovdqu 0x00-0x80($ctx),$A # load context
1150 vmovdqu 0x20-0x80($ctx),$B
1151 lea 256+128(%rsp),%rbx
1152 vmovdqu 0x40-0x80($ctx),$C
1153 vmovdqu 0x60-0x80($ctx),$D
1154 vmovdqu 0x80-0x80($ctx),$E
1155 vmovdqu 0xa0-0x80($ctx),$F
1156 vmovdqu 0xc0-0x80($ctx),$G
1157 vmovdqu 0xe0-0x80($ctx),$H
1158 vmovdqu .Lpbswap(%rip),$Xn
1163 vpxor $B,$C,$bxc # magic seed
1165 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1167 vmovdqu `&Xi_off($i)`,$Xi
1169 jmp .Loop_16_xx_avx2
1173 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1176 jnz .Loop_16_xx_avx2
1179 lea `$REG_SZ*16`(%rsp),%rbx
1180 lea K256+128(%rip),$Tbl
1182 for($i=0;$i<8;$i++) {
1184 cmp `4*$i`(%rbx),%ecx # examine counters
1185 cmovge $Tbl,@ptr[$i] # cancel input
1189 vmovdqa (%rbx),$sigma # pull counters
1192 vpcmpgtd $t1,$Xn,$Xn # mask value
1193 vpaddd $Xn,$sigma,$sigma # counters--
1195 vmovdqu 0x00-0x80($ctx),$t1
1197 vmovdqu 0x20-0x80($ctx),$t2
1199 vmovdqu 0x40-0x80($ctx),$t3
1201 vmovdqu 0x60-0x80($ctx),$Xi
1204 vmovdqu 0x80-0x80($ctx),$t1
1207 vmovdqu 0xa0-0x80($ctx),$t2
1210 vmovdqu 0xc0-0x80($ctx),$t3
1213 vmovdqu 0xe0-0x80($ctx),$Xi
1217 vmovdqu $A,0x00-0x80($ctx)
1219 vmovdqu $B,0x20-0x80($ctx)
1221 vmovdqu $C,0x40-0x80($ctx)
1222 vmovdqu $D,0x60-0x80($ctx)
1223 vmovdqu $E,0x80-0x80($ctx)
1224 vmovdqu $F,0xa0-0x80($ctx)
1225 vmovdqu $G,0xc0-0x80($ctx)
1226 vmovdqu $H,0xe0-0x80($ctx)
1228 vmovdqu $sigma,(%rbx) # save counters
1229 lea 256+128(%rsp),%rbx
1230 vmovdqu .Lpbswap(%rip),$Xn
1234 #mov `$REG_SZ*17+8`(%rsp),$num
1235 #lea $REG_SZ($ctx),$ctx
1236 #lea `16*$REG_SZ/4`($inp),$inp
1238 #jnz .Loop_grande_avx2
1241 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1244 $code.=<<___ if ($win64);
1245 movaps -0xd8(%rax),%xmm6
1246 movaps -0xc8(%rax),%xmm7
1247 movaps -0xb8(%rax),%xmm8
1248 movaps -0xa8(%rax),%xmm9
1249 movaps -0x98(%rax),%xmm10
1250 movaps -0x88(%rax),%xmm11
1251 movaps -0x78(%rax),%xmm12
1252 movaps -0x68(%rax),%xmm13
1253 movaps -0x58(%rax),%xmm14
1254 movaps -0x48(%rax),%xmm15
1266 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
1281 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1282 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1283 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1284 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1285 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1286 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1287 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1288 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1289 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1290 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1291 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1292 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1293 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1294 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1295 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1296 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1299 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1300 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1302 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1303 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1304 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1305 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1306 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1307 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1308 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1309 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1310 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1311 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1312 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1313 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1314 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1315 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1316 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1317 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1318 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1322 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1323 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1330 .extern __imp_RtlVirtualUnwind
1331 .type se_handler,\@abi-omnipotent
1345 mov 120($context),%rax # pull context->Rax
1346 mov 248($context),%rbx # pull context->Rip
1348 mov 8($disp),%rsi # disp->ImageBase
1349 mov 56($disp),%r11 # disp->HandlerData
1351 mov 0(%r11),%r10d # HandlerData[0]
1352 lea (%rsi,%r10),%r10 # end of prologue label
1353 cmp %r10,%rbx # context->Rip<.Lbody
1356 mov 152($context),%rax # pull context->Rsp
1358 mov 4(%r11),%r10d # HandlerData[1]
1359 lea (%rsi,%r10),%r10 # epilogue label
1360 cmp %r10,%rbx # context->Rip>=.Lepilogue
1363 mov `16*17`(%rax),%rax # pull saved stack pointer
1367 mov %rbx,144($context) # restore context->Rbx
1368 mov %rbp,160($context) # restore context->Rbp
1370 lea -24-10*16(%rax),%rsi
1371 lea 512($context),%rdi # &context.Xmm6
1373 .long 0xa548f3fc # cld; rep movsq
1378 mov %rax,152($context) # restore context->Rsp
1379 mov %rsi,168($context) # restore context->Rsi
1380 mov %rdi,176($context) # restore context->Rdi
1382 mov 40($disp),%rdi # disp->ContextRecord
1383 mov $context,%rsi # context
1384 mov \$154,%ecx # sizeof(CONTEXT)
1385 .long 0xa548f3fc # cld; rep movsq
1388 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1389 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1390 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1391 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1392 mov 40(%rsi),%r10 # disp->ContextRecord
1393 lea 56(%rsi),%r11 # &disp->HandlerData
1394 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1395 mov %r10,32(%rsp) # arg5
1396 mov %r11,40(%rsp) # arg6
1397 mov %r12,48(%rsp) # arg7
1398 mov %rcx,56(%rsp) # arg8, (NULL)
1399 call *__imp_RtlVirtualUnwind(%rip)
1401 mov \$1,%eax # ExceptionContinueSearch
1413 .size se_handler,.-se_handler
1415 $code.=<<___ if ($avx>1);
1416 .type avx2_handler,\@abi-omnipotent
1430 mov 120($context),%rax # pull context->Rax
1431 mov 248($context),%rbx # pull context->Rip
1433 mov 8($disp),%rsi # disp->ImageBase
1434 mov 56($disp),%r11 # disp->HandlerData
1436 mov 0(%r11),%r10d # HandlerData[0]
1437 lea (%rsi,%r10),%r10 # end of prologue label
1438 cmp %r10,%rbx # context->Rip<body label
1441 mov 152($context),%rax # pull context->Rsp
1443 mov 4(%r11),%r10d # HandlerData[1]
1444 lea (%rsi,%r10),%r10 # epilogue label
1445 cmp %r10,%rbx # context->Rip>=epilogue label
1448 mov `32*17`($context),%rax # pull saved stack pointer
1456 mov %rbx,144($context) # restore context->Rbx
1457 mov %rbp,160($context) # restore context->Rbp
1458 mov %r12,216($context) # restore context->R12
1459 mov %r13,224($context) # restore context->R13
1460 mov %r14,232($context) # restore context->R14
1461 mov %r15,240($context) # restore context->R15
1463 lea -56-10*16(%rax),%rsi
1464 lea 512($context),%rdi # &context.Xmm6
1466 .long 0xa548f3fc # cld; rep movsq
1469 .size avx2_handler,.-avx2_handler
1474 .rva .LSEH_begin_sha256_multi_block
1475 .rva .LSEH_end_sha256_multi_block
1476 .rva .LSEH_info_sha256_multi_block
1477 .rva .LSEH_begin_sha256_multi_block_shaext
1478 .rva .LSEH_end_sha256_multi_block_shaext
1479 .rva .LSEH_info_sha256_multi_block_shaext
1481 $code.=<<___ if ($avx);
1482 .rva .LSEH_begin_sha256_multi_block_avx
1483 .rva .LSEH_end_sha256_multi_block_avx
1484 .rva .LSEH_info_sha256_multi_block_avx
1486 $code.=<<___ if ($avx>1);
1487 .rva .LSEH_begin_sha256_multi_block_avx2
1488 .rva .LSEH_end_sha256_multi_block_avx2
1489 .rva .LSEH_info_sha256_multi_block_avx2
1494 .LSEH_info_sha256_multi_block:
1497 .rva .Lbody,.Lepilogue # HandlerData[]
1498 .LSEH_info_sha256_multi_block_shaext:
1501 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1503 $code.=<<___ if ($avx);
1504 .LSEH_info_sha256_multi_block_avx:
1507 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1509 $code.=<<___ if ($avx>1);
1510 .LSEH_info_sha256_multi_block_avx2:
1513 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1516 ####################################################################
# rex() fragment (header outside this view): builds a REX prefix for a
# two-xmm-operand instruction; REX.R for dst>=xmm8, REX.B for src>=xmm8,
# and the 0x40 base is emitted only when a prefix is actually needed.
1519 local *opcode=shift;
1523 $rex|=0x04 if ($dst>=8);
1524 $rex|=0x01 if ($src>=8);
1525 unshift @opcode,$rex|0x40 if ($rex);
# sha256op38: hand-encodes the SHA extension instructions, which older
# assemblers do not know, as raw .byte sequences (0x0f 0x38 <op> modrm).
1531 "sha256rnds2" => 0xcb,
1532 "sha256msg1" => 0xcc,
1533 "sha256msg2" => 0xcd );
# Only the register-register form is encoded; anything else (or an
# unknown mnemonic) is passed through verbatim for the assembler.
1535 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1536 my @opcode=(0x0f,0x38);
1537 rex(\@opcode,$2,$1);
1538 push @opcode,$opcodelet{$instr};
1539 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1540 return ".byte\t".join(',',@opcode);
1542 return $instr."\t".@_[0];
# Final post-processing of the generated text, line by line.
1546 foreach (split("\n",$code)) {
# Expand the remaining `...` arithmetic (offsets computed in Perl).
1547 s/\`([^\`]*)\`/eval($1)/ge;
# Encode SHA-ext mnemonics as .byte sequences for old assemblers.
1549 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
# The AVX round code is written once with %ymm names; for the xmm-wide
# (REG_SZ==16) build the following rewrites narrow the operands that
# architecturally must be %xmm (moves, insert/extract, broadcast).
1551 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1552 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1553 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1554 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
# vinserti128 gains its immediate lane selector here.
1555 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1556 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;