3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer SHA256 procedure processes n buffers in parallel by
11 # placing buffer data to designated lane of SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
15 # this +aesni(i) sha256 aesni-sha256 gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
18 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
19 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
20 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
21 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
22 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
24 # (i) multi-block CBC encrypt with 128-bit key;
25 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26 # because of lower AES-NI instruction throughput, nor is there
27 # AES-NI-SHA256 stitch for these processors;
28 # (iii) "this" is for n=8, when we gather twice as much data, result
29 # for n=4 is 20.3+4.44=24.7;
30 # (iv) presented improvement coefficients are asymptotic limits and
31 # in real-life application are somewhat lower, e.g. for 2KB
32 # fragments they range from 75% to 130% (on Haswell);
36 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
38 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
40 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
41 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
42 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
43 die "can't locate x86_64-xlate.pl";
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49 $avx = ($1>=2.19) + ($1>=2.22);
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.09) + ($1>=2.10);
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59 $avx = ($1>=10) + ($1>=11);
62 open OUT,"| \"$^X\" $xlate $flavour $output";
65 # void sha256_multi_block (
66 # struct { unsigned int A[8];
73 # unsigned int H[8]; } *ctx,
74 # struct { void *ptr; int blocks; } inp[8],
75 # int num); /* 1 or 2 */
77 $ctx="%rdi"; # 1st arg
78 $inp="%rsi"; # 2nd arg
79 $num="%edx"; # 3rd arg
80 @ptr=map("%r$_",(8..11));
83 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
84 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
91 $off %= 16; $off *= $REG_SZ;
92 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
96 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
98 $code.=<<___ if ($i<15);
99 movd `4*$i`(@ptr[0]),$Xi
100 movd `4*$i`(@ptr[1]),$t1
101 movd `4*$i`(@ptr[2]),$t2
102 movd `4*$i`(@ptr[3]),$t3
107 $code.=<<___ if ($i==15);
108 movd `4*$i`(@ptr[0]),$Xi
109 lea `16*4`(@ptr[0]),@ptr[0]
110 movd `4*$i`(@ptr[1]),$t1
111 lea `16*4`(@ptr[1]),@ptr[1]
112 movd `4*$i`(@ptr[2]),$t2
113 lea `16*4`(@ptr[2]),@ptr[2]
114 movd `4*$i`(@ptr[3]),$t3
115 lea `16*4`(@ptr[3]),@ptr[3]
122 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
124 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
128 movdqa $Xi,`&Xi_off($i)`
134 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
139 `"prefetcht0 63(@ptr[0])" if ($i==15)`
141 movdqa $e,$axb # borrow $axb
147 `"prefetcht0 63(@ptr[1])" if ($i==15)`
149 pxor $t3,$sigma # Sigma1(e)
152 paddd $sigma,$Xi # Xi+=Sigma1(e)
153 pxor $axb,$t1 # Ch(e,f,g)
157 pxor $a,$axb # a^b, b^c in next round
159 `"prefetcht0 63(@ptr[2])" if ($i==15)`
162 paddd $t1,$Xi # Xi+=Ch(e,f,g)
167 `"prefetcht0 63(@ptr[3])" if ($i==15)`
173 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
175 pxor $t3,$sigma # Sigma0(a)
178 paddd $sigma,$h # h+=Sigma0(a)
180 $code.=<<___ if (($i%8)==7);
181 lea `32*8`($Tbl),$Tbl
183 ($axb,$bxc)=($bxc,$axb);
190 movdqa `&Xi_off($i+1)`,$Xn
191 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
199 movdqa `&Xi_off($i+14)`,$t1
203 movdqa $t1,$axb # borrow $axb
211 pxor $t3,$sigma # sigma0(X[i+1])
213 paddd $sigma,$Xi # Xi+=sigma0(e)
219 pxor $t2,$t1 # sigma0(X[i+14])
220 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
229 .extern OPENSSL_ia32cap_P
231 .globl sha256_multi_block
232 .type sha256_multi_block,\@function,3
235 mov OPENSSL_ia32cap_P+4(%rip),%rcx
236 bt \$61,%rcx # check SHA bit
239 $code.=<<___ if ($avx);
248 $code.=<<___ if ($win64);
251 movaps %xmm7,0x10(%rsp)
252 movaps %xmm8,0x20(%rsp)
253 movaps %xmm9,0x30(%rsp)
254 movaps %xmm10,-0x78(%rax)
255 movaps %xmm11,-0x68(%rax)
256 movaps %xmm12,-0x58(%rax)
257 movaps %xmm13,-0x48(%rax)
258 movaps %xmm14,-0x38(%rax)
259 movaps %xmm15,-0x28(%rax)
262 sub \$`$REG_SZ*18`, %rsp
264 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
266 lea K256+128(%rip),$Tbl
267 lea `$REG_SZ*16`(%rsp),%rbx
268 lea 0x80($ctx),$ctx # size optimization
271 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
274 for($i=0;$i<4;$i++) {
276 mov `16*$i+0`($inp),@ptr[$i] # input pointer
277 mov `16*$i+8`($inp),%ecx # number of blocks
279 cmovg %ecx,$num # find maximum
281 mov %ecx,`4*$i`(%rbx) # initialize counters
282 cmovle $Tbl,@ptr[$i] # cancel input
289 movdqu 0x00-0x80($ctx),$A # load context
291 movdqu 0x20-0x80($ctx),$B
292 movdqu 0x40-0x80($ctx),$C
293 movdqu 0x60-0x80($ctx),$D
294 movdqu 0x80-0x80($ctx),$E
295 movdqu 0xa0-0x80($ctx),$F
296 movdqu 0xc0-0x80($ctx),$G
297 movdqu 0xe0-0x80($ctx),$H
298 movdqu .Lpbswap(%rip),$Xn
304 pxor $B,$bxc # magic seed
306 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
308 movdqu `&Xi_off($i)`,$Xi
314 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
320 lea K256+128(%rip),$Tbl
322 movdqa (%rbx),$sigma # pull counters
323 cmp 4*0(%rbx),%ecx # examine counters
325 cmovge $Tbl,@ptr[0] # cancel input
330 pcmpgtd $t1,$Xn # mask value
333 paddd $Xn,$sigma # counters--
336 movdqu 0x00-0x80($ctx),$t1
338 movdqu 0x20-0x80($ctx),$t2
340 movdqu 0x40-0x80($ctx),$t3
342 movdqu 0x60-0x80($ctx),$Xi
345 movdqu 0x80-0x80($ctx),$t1
348 movdqu 0xa0-0x80($ctx),$t2
351 movdqu 0xc0-0x80($ctx),$t3
354 movdqu 0xe0-0x80($ctx),$Xi
358 movdqu $A,0x00-0x80($ctx)
360 movdqu $B,0x20-0x80($ctx)
362 movdqu $C,0x40-0x80($ctx)
363 movdqu $D,0x60-0x80($ctx)
364 movdqu $E,0x80-0x80($ctx)
365 movdqu $F,0xa0-0x80($ctx)
366 movdqu $G,0xc0-0x80($ctx)
367 movdqu $H,0xe0-0x80($ctx)
369 movdqa $sigma,(%rbx) # save counters
370 movdqa .Lpbswap(%rip),$Xn
374 mov `$REG_SZ*17+8`(%rsp),$num
375 lea $REG_SZ($ctx),$ctx
376 lea `16*$REG_SZ/4`($inp),$inp
381 	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
383 $code.=<<___ if ($win64);
384 movaps -0xb8(%rax),%xmm6
385 movaps -0xa8(%rax),%xmm7
386 movaps -0x98(%rax),%xmm8
387 movaps -0x88(%rax),%xmm9
388 movaps -0x78(%rax),%xmm10
389 movaps -0x68(%rax),%xmm11
390 movaps -0x58(%rax),%xmm12
391 movaps -0x48(%rax),%xmm13
392 movaps -0x38(%rax),%xmm14
393 movaps -0x28(%rax),%xmm15
401 .size sha256_multi_block,.-sha256_multi_block
404 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
405 my @MSG0=map("%xmm$_",(4..7));
406 my @MSG1=map("%xmm$_",(8..11));
409 .type sha256_multi_block_shaext,\@function,3
411 sha256_multi_block_shaext:
417 $code.=<<___ if ($win64);
420 movaps %xmm7,0x10(%rsp)
421 movaps %xmm8,0x20(%rsp)
422 movaps %xmm9,0x30(%rsp)
423 movaps %xmm10,-0x78(%rax)
424 movaps %xmm11,-0x68(%rax)
425 movaps %xmm12,-0x58(%rax)
426 movaps %xmm13,-0x48(%rax)
427 movaps %xmm14,-0x38(%rax)
428 movaps %xmm15,-0x28(%rax)
431 sub \$`$REG_SZ*18`,%rsp
432 shl \$1,$num # we process pair at a time
434 lea 0x80($ctx),$ctx # size optimization
435 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
437 lea `$REG_SZ*16`(%rsp),%rbx
438 lea K256_shaext+0x80(%rip),$Tbl
441 	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
444 for($i=0;$i<2;$i++) {
446 mov `16*$i+0`($inp),@ptr[$i] # input pointer
447 mov `16*$i+8`($inp),%ecx # number of blocks
449 cmovg %ecx,$num # find maximum
451 mov %ecx,`4*$i`(%rbx) # initialize counters
452 cmovle %rsp,@ptr[$i] # cancel input
459 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
460 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
461 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
462 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
463 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
464 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
465 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
466 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
468 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
469 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
470 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
471 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
472 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
476 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
477 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
478 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
479 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
481 pshufd \$0b00011011,$ABEF0,$ABEF0
482 pshufd \$0b00011011,$CDGH0,$CDGH0
483 pshufd \$0b00011011,$ABEF1,$ABEF1
484 pshufd \$0b00011011,$CDGH1,$CDGH1
489 movdqu 0x00(@ptr[0]),@MSG0[0]
490 movdqu 0x00(@ptr[1]),@MSG1[0]
491 movdqu 0x10(@ptr[0]),@MSG0[1]
492 movdqu 0x10(@ptr[1]),@MSG1[1]
493 movdqu 0x20(@ptr[0]),@MSG0[2]
494 pshufb $TMPx,@MSG0[0]
495 movdqu 0x20(@ptr[1]),@MSG1[2]
496 pshufb $TMPx,@MSG1[0]
497 movdqu 0x30(@ptr[0]),@MSG0[3]
498 lea 0x40(@ptr[0]),@ptr[0]
499 movdqu 0x30(@ptr[1]),@MSG1[3]
500 lea 0x40(@ptr[1]),@ptr[1]
502 movdqa 0*16-0x80($Tbl),$Wi
503 pshufb $TMPx,@MSG0[1]
505 pxor $ABEF0,@MSG0[0] # black magic
507 movdqa 0*16-0x80($Tbl),$TMP1
508 pshufb $TMPx,@MSG1[1]
510 movdqa $CDGH0,0x50(%rsp) # offload
511 sha256rnds2 $ABEF0,$CDGH0 # 0-3
512 pxor $ABEF1,@MSG1[0] # black magic
514 movdqa $CDGH1,0x70(%rsp)
515 sha256rnds2 $ABEF1,$CDGH1 # 0-3
516 pshufd \$0x0e,$TMP0,$Wi
517 pxor $ABEF0,@MSG0[0] # black magic
518 movdqa $ABEF0,0x40(%rsp) # offload
519 sha256rnds2 $CDGH0,$ABEF0
520 pshufd \$0x0e,$TMP1,$Wi
521 pxor $ABEF1,@MSG1[0] # black magic
522 movdqa $ABEF1,0x60(%rsp)
523 movdqa 1*16-0x80($Tbl),$TMP0
525 pshufb $TMPx,@MSG0[2]
526 sha256rnds2 $CDGH1,$ABEF1
529 movdqa 1*16-0x80($Tbl),$TMP1
531 sha256rnds2 $ABEF0,$CDGH0 # 4-7
533 prefetcht0 127(@ptr[0])
534 pshufb $TMPx,@MSG0[3]
535 pshufb $TMPx,@MSG1[2]
536 prefetcht0 127(@ptr[1])
537 sha256rnds2 $ABEF1,$CDGH1 # 4-7
538 pshufd \$0x0e,$TMP0,$Wi
539 pshufb $TMPx,@MSG1[3]
540 sha256msg1 @MSG0[1],@MSG0[0]
541 sha256rnds2 $CDGH0,$ABEF0
542 pshufd \$0x0e,$TMP1,$Wi
543 movdqa 2*16-0x80($Tbl),$TMP0
545 sha256rnds2 $CDGH1,$ABEF1
548 movdqa 2*16-0x80($Tbl),$TMP1
550 sha256rnds2 $ABEF0,$CDGH0 # 8-11
551 sha256msg1 @MSG1[1],@MSG1[0]
553 movdqa @MSG0[3],$TMPx
554 sha256rnds2 $ABEF1,$CDGH1 # 8-11
555 pshufd \$0x0e,$TMP0,$Wi
556 palignr \$4,@MSG0[2],$TMPx
558 movdqa @MSG1[3],$TMPx
559 palignr \$4,@MSG1[2],$TMPx
560 sha256msg1 @MSG0[2],@MSG0[1]
561 sha256rnds2 $CDGH0,$ABEF0
562 pshufd \$0x0e,$TMP1,$Wi
563 movdqa 3*16-0x80($Tbl),$TMP0
565 sha256rnds2 $CDGH1,$ABEF1
566 sha256msg1 @MSG1[2],@MSG1[1]
569 movdqa 3*16-0x80($Tbl),$TMP1
572 sha256msg2 @MSG0[3],@MSG0[0]
573 sha256rnds2 $ABEF0,$CDGH0 # 12-15
575 movdqa @MSG0[0],$TMPx
576 palignr \$4,@MSG0[3],$TMPx
577 sha256rnds2 $ABEF1,$CDGH1 # 12-15
578 sha256msg2 @MSG1[3],@MSG1[0]
579 pshufd \$0x0e,$TMP0,$Wi
581 movdqa @MSG1[0],$TMPx
582 palignr \$4,@MSG1[3],$TMPx
583 sha256msg1 @MSG0[3],@MSG0[2]
584 sha256rnds2 $CDGH0,$ABEF0
585 pshufd \$0x0e,$TMP1,$Wi
586 movdqa 4*16-0x80($Tbl),$TMP0
588 sha256rnds2 $CDGH1,$ABEF1
589 sha256msg1 @MSG1[3],@MSG1[2]
591 for($i=4;$i<16-3;$i++) {
594 movdqa $i*16-0x80($Tbl),$TMP1
597 sha256msg2 @MSG0[0],@MSG0[1]
598 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
600 movdqa @MSG0[1],$TMPx
601 palignr \$4,@MSG0[0],$TMPx
602 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
603 sha256msg2 @MSG1[0],@MSG1[1]
604 pshufd \$0x0e,$TMP0,$Wi
606 movdqa @MSG1[1],$TMPx
607 palignr \$4,@MSG1[0],$TMPx
608 sha256msg1 @MSG0[0],@MSG0[3]
609 sha256rnds2 $CDGH0,$ABEF0
610 pshufd \$0x0e,$TMP1,$Wi
611 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
613 sha256rnds2 $CDGH1,$ABEF1
614 sha256msg1 @MSG1[0],@MSG1[3]
616 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
620 movdqa 13*16-0x80($Tbl),$TMP1
623 sha256msg2 @MSG0[0],@MSG0[1]
624 sha256rnds2 $ABEF0,$CDGH0 # 52-55
626 movdqa @MSG0[1],$TMPx
627 palignr \$4,@MSG0[0],$TMPx
628 sha256rnds2 $ABEF1,$CDGH1 # 52-55
629 sha256msg2 @MSG1[0],@MSG1[1]
630 pshufd \$0x0e,$TMP0,$Wi
632 movdqa @MSG1[1],$TMPx
633 palignr \$4,@MSG1[0],$TMPx
635 sha256rnds2 $CDGH0,$ABEF0
636 pshufd \$0x0e,$TMP1,$Wi
637 movdqa 14*16-0x80($Tbl),$TMP0
639 sha256rnds2 $CDGH1,$ABEF1
642 movdqa 14*16-0x80($Tbl),$TMP1
645 sha256msg2 @MSG0[1],@MSG0[2]
647 sha256rnds2 $ABEF0,$CDGH0 # 56-59
650 pxor @MSG0[1],@MSG0[1] # zero
651 sha256rnds2 $ABEF1,$CDGH1 # 56-59
652 sha256msg2 @MSG1[1],@MSG1[2]
653 pshufd \$0x0e,$TMP0,$Wi
654 movdqa 15*16-0x80($Tbl),$TMP0
656 movq (%rbx),@MSG0[2] # pull counters
658 sha256rnds2 $CDGH0,$ABEF0
659 pshufd \$0x0e,$TMP1,$Wi
660 movdqa 15*16-0x80($Tbl),$TMP1
662 sha256rnds2 $CDGH1,$ABEF1
665 cmp 4*0(%rbx),%ecx # examine counters
666 cmovge %rsp,@ptr[0] # cancel input
669 pshufd \$0x00,@MSG0[2],@MSG1[0]
670 sha256rnds2 $ABEF0,$CDGH0 # 60-63
672 pshufd \$0x55,@MSG0[2],@MSG1[1]
673 movdqa @MSG0[2],@MSG1[2]
674 sha256rnds2 $ABEF1,$CDGH1 # 60-63
675 pshufd \$0x0e,$TMP0,$Wi
676 pcmpgtd @MSG0[1],@MSG1[0]
677 pcmpgtd @MSG0[1],@MSG1[1]
678 sha256rnds2 $CDGH0,$ABEF0
679 pshufd \$0x0e,$TMP1,$Wi
680 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
681 movdqa K256_shaext-0x10(%rip),$TMPx
682 sha256rnds2 $CDGH1,$ABEF1
688 paddd @MSG0[2],@MSG1[2] # counters--
690 paddd 0x50(%rsp),$CDGH0
691 paddd 0x70(%rsp),$CDGH1
692 paddd 0x40(%rsp),$ABEF0
693 paddd 0x60(%rsp),$ABEF1
695 movq @MSG1[2],(%rbx) # save counters
699 mov `$REG_SZ*17+8`(%rsp),$num
701 pshufd \$0b00011011,$ABEF0,$ABEF0
702 pshufd \$0b00011011,$CDGH0,$CDGH0
703 pshufd \$0b00011011,$ABEF1,$ABEF1
704 pshufd \$0b00011011,$CDGH1,$CDGH1
706 movdqa $ABEF0,@MSG0[0]
707 movdqa $CDGH0,@MSG0[1]
708 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
709 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
710 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
711 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
713 movq $ABEF0,0x00-0x80($ctx) # A1.A0
715 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
717 movq $ABEF0,0x20-0x80($ctx) # B1.B0
718 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
720 movq $CDGH0,0x40-0x80($ctx) # C1.C0
722 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
724 movq $CDGH0,0x60-0x80($ctx) # D1.D0
725 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
727 lea `$REG_SZ/2`($ctx),$ctx
728 lea `16*2`($inp),$inp
730 jnz .Loop_grande_shaext
733 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
735 $code.=<<___ if ($win64);
736 movaps -0xb8(%rax),%xmm6
737 movaps -0xa8(%rax),%xmm7
738 movaps -0x98(%rax),%xmm8
739 movaps -0x88(%rax),%xmm9
740 movaps -0x78(%rax),%xmm10
741 movaps -0x68(%rax),%xmm11
742 movaps -0x58(%rax),%xmm12
743 movaps -0x48(%rax),%xmm13
744 movaps -0x38(%rax),%xmm14
745 movaps -0x28(%rax),%xmm15
753 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
757 sub ROUND_00_15_avx {
758 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
760 $code.=<<___ if ($i<15 && $REG_SZ==16);
761 vmovd `4*$i`(@ptr[0]),$Xi
762 vmovd `4*$i`(@ptr[1]),$t1
763 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
764 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
765 vpunpckldq $t1,$Xi,$Xi
768 $code.=<<___ if ($i==15 && $REG_SZ==16);
769 vmovd `4*$i`(@ptr[0]),$Xi
770 lea `16*4`(@ptr[0]),@ptr[0]
771 vmovd `4*$i`(@ptr[1]),$t1
772 lea `16*4`(@ptr[1]),@ptr[1]
773 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
774 lea `16*4`(@ptr[2]),@ptr[2]
775 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
776 lea `16*4`(@ptr[3]),@ptr[3]
777 vpunpckldq $t1,$Xi,$Xi
780 $code.=<<___ if ($i<15 && $REG_SZ==32);
781 vmovd `4*$i`(@ptr[0]),$Xi
782 vmovd `4*$i`(@ptr[4]),$t1
783 vmovd `4*$i`(@ptr[1]),$t2
784 vmovd `4*$i`(@ptr[5]),$t3
785 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
786 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
787 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
788 vpunpckldq $t2,$Xi,$Xi
789 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
790 vpunpckldq $t3,$t1,$t1
791 vinserti128 $t1,$Xi,$Xi
794 $code.=<<___ if ($i==15 && $REG_SZ==32);
795 vmovd `4*$i`(@ptr[0]),$Xi
796 lea `16*4`(@ptr[0]),@ptr[0]
797 vmovd `4*$i`(@ptr[4]),$t1
798 lea `16*4`(@ptr[4]),@ptr[4]
799 vmovd `4*$i`(@ptr[1]),$t2
800 lea `16*4`(@ptr[1]),@ptr[1]
801 vmovd `4*$i`(@ptr[5]),$t3
802 lea `16*4`(@ptr[5]),@ptr[5]
803 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
804 lea `16*4`(@ptr[2]),@ptr[2]
805 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
806 lea `16*4`(@ptr[6]),@ptr[6]
807 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
808 lea `16*4`(@ptr[3]),@ptr[3]
809 vpunpckldq $t2,$Xi,$Xi
810 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
811 lea `16*4`(@ptr[7]),@ptr[7]
812 vpunpckldq $t3,$t1,$t1
813 vinserti128 $t1,$Xi,$Xi
819 vmovdqu $Xi,`&Xi_off($i)`
820 vpaddd $h,$Xi,$Xi # Xi+=h
823 vpxor $t3,$sigma,$sigma
825 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
826 vpxor $t2,$sigma,$sigma
829 vpxor $t3,$sigma,$sigma
830 `"prefetcht0 63(@ptr[0])" if ($i==15)`
833 vpand $f,$e,$axb # borrow $axb
834 `"prefetcht0 63(@ptr[1])" if ($i==15)`
835 vpxor $t2,$sigma,$sigma
837 vpsrld \$2,$a,$h # borrow $h
838 vpxor $t3,$sigma,$sigma # Sigma1(e)
839 `"prefetcht0 63(@ptr[2])" if ($i==15)`
841 vpxor $axb,$t1,$t1 # Ch(e,f,g)
842 vpxor $a,$b,$axb # a^b, b^c in next round
843 `"prefetcht0 63(@ptr[3])" if ($i==15)`
845 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
848 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
850 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
852 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
856 vpxor $t3,$sigma,$sigma
857 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
859 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
860 vpaddd $Xi,$d,$d # d+=Xi
861 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
862 vpxor $t2,$sigma,$sigma
863 vpxor $t3,$sigma,$sigma # Sigma0(a)
865 vpaddd $Xi,$h,$h # h+=Xi
866 vpaddd $sigma,$h,$h # h+=Sigma0(a)
868 $code.=<<___ if (($i%8)==7);
871 ($axb,$bxc)=($bxc,$axb);
874 sub ROUND_16_XX_avx {
878 vmovdqu `&Xi_off($i+1)`,$Xn
879 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
881 vpsrld \$3,$Xn,$sigma
884 vpxor $t2,$sigma,$sigma
886 vpxor $t3,$sigma,$sigma
888 vmovdqu `&Xi_off($i+14)`,$t1
889 vpsrld \$10,$t1,$axb # borrow $axb
891 vpxor $t2,$sigma,$sigma
893 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
895 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
896 vpxor $t2,$axb,$sigma
898 vpxor $t3,$sigma,$sigma
900 vpxor $t2,$sigma,$sigma
901 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
902 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
904 &ROUND_00_15_avx($i,@_);
909 .type sha256_multi_block_avx,\@function,3
911 sha256_multi_block_avx:
914 $code.=<<___ if ($avx>1);
929 $code.=<<___ if ($win64);
932 movaps %xmm7,0x10(%rsp)
933 movaps %xmm8,0x20(%rsp)
934 movaps %xmm9,0x30(%rsp)
935 movaps %xmm10,-0x78(%rax)
936 movaps %xmm11,-0x68(%rax)
937 movaps %xmm12,-0x58(%rax)
938 movaps %xmm13,-0x48(%rax)
939 movaps %xmm14,-0x38(%rax)
940 movaps %xmm15,-0x28(%rax)
943 sub \$`$REG_SZ*18`, %rsp
945 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
947 lea K256+128(%rip),$Tbl
948 lea `$REG_SZ*16`(%rsp),%rbx
949 lea 0x80($ctx),$ctx # size optimization
952 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
955 for($i=0;$i<4;$i++) {
957 mov `16*$i+0`($inp),@ptr[$i] # input pointer
958 mov `16*$i+8`($inp),%ecx # number of blocks
960 cmovg %ecx,$num # find maximum
962 mov %ecx,`4*$i`(%rbx) # initialize counters
963 cmovle $Tbl,@ptr[$i] # cancel input
970 vmovdqu 0x00-0x80($ctx),$A # load context
972 vmovdqu 0x20-0x80($ctx),$B
973 vmovdqu 0x40-0x80($ctx),$C
974 vmovdqu 0x60-0x80($ctx),$D
975 vmovdqu 0x80-0x80($ctx),$E
976 vmovdqu 0xa0-0x80($ctx),$F
977 vmovdqu 0xc0-0x80($ctx),$G
978 vmovdqu 0xe0-0x80($ctx),$H
979 vmovdqu .Lpbswap(%rip),$Xn
984 vpxor $B,$C,$bxc # magic seed
986 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
988 vmovdqu `&Xi_off($i)`,$Xi
994 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1000 lea K256+128(%rip),$Tbl
1002 for($i=0;$i<4;$i++) {
1004 cmp `4*$i`(%rbx),%ecx # examine counters
1005 cmovge $Tbl,@ptr[$i] # cancel input
1009 vmovdqa (%rbx),$sigma # pull counters
1012 vpcmpgtd $t1,$Xn,$Xn # mask value
1013 vpaddd $Xn,$sigma,$sigma # counters--
1015 vmovdqu 0x00-0x80($ctx),$t1
1017 vmovdqu 0x20-0x80($ctx),$t2
1019 vmovdqu 0x40-0x80($ctx),$t3
1021 vmovdqu 0x60-0x80($ctx),$Xi
1024 vmovdqu 0x80-0x80($ctx),$t1
1027 vmovdqu 0xa0-0x80($ctx),$t2
1030 vmovdqu 0xc0-0x80($ctx),$t3
1033 vmovdqu 0xe0-0x80($ctx),$Xi
1037 vmovdqu $A,0x00-0x80($ctx)
1039 vmovdqu $B,0x20-0x80($ctx)
1041 vmovdqu $C,0x40-0x80($ctx)
1042 vmovdqu $D,0x60-0x80($ctx)
1043 vmovdqu $E,0x80-0x80($ctx)
1044 vmovdqu $F,0xa0-0x80($ctx)
1045 vmovdqu $G,0xc0-0x80($ctx)
1046 vmovdqu $H,0xe0-0x80($ctx)
1048 vmovdqu $sigma,(%rbx) # save counters
1049 vmovdqu .Lpbswap(%rip),$Xn
1053 mov `$REG_SZ*17+8`(%rsp),$num
1054 lea $REG_SZ($ctx),$ctx
1055 lea `16*$REG_SZ/4`($inp),$inp
1057 jnz .Loop_grande_avx
1060 	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1063 $code.=<<___ if ($win64);
1064 movaps -0xb8(%rax),%xmm6
1065 movaps -0xa8(%rax),%xmm7
1066 movaps -0x98(%rax),%xmm8
1067 movaps -0x88(%rax),%xmm9
1068 movaps -0x78(%rax),%xmm10
1069 movaps -0x68(%rax),%xmm11
1070 movaps -0x58(%rax),%xmm12
1071 movaps -0x48(%rax),%xmm13
1072 movaps -0x38(%rax),%xmm14
1073 movaps -0x28(%rax),%xmm15
1081 .size sha256_multi_block_avx,.-sha256_multi_block_avx
1084 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1087 @ptr=map("%r$_",(12..15,8..11));
1089 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1090 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1093 .type sha256_multi_block_avx2,\@function,3
1095 sha256_multi_block_avx2:
1105 $code.=<<___ if ($win64);
1106 lea -0xa8(%rsp),%rsp
1108 movaps %xmm7,0x10(%rsp)
1109 movaps %xmm8,0x20(%rsp)
1110 movaps %xmm9,0x30(%rsp)
1111 movaps %xmm10,0x40(%rsp)
1112 movaps %xmm11,0x50(%rsp)
1113 movaps %xmm12,-0x78(%rax)
1114 movaps %xmm13,-0x68(%rax)
1115 movaps %xmm14,-0x58(%rax)
1116 movaps %xmm15,-0x48(%rax)
1119 sub \$`$REG_SZ*18`, %rsp
1121 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1123 lea K256+128(%rip),$Tbl
1124 lea 0x80($ctx),$ctx # size optimization
1127 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1129 lea `$REG_SZ*16`(%rsp),%rbx
1131 for($i=0;$i<8;$i++) {
1133 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1134 mov `16*$i+8`($inp),%ecx # number of blocks
1136 cmovg %ecx,$num # find maximum
1138 mov %ecx,`4*$i`(%rbx) # initialize counters
1139 cmovle $Tbl,@ptr[$i] # cancel input
1143 vmovdqu 0x00-0x80($ctx),$A # load context
1145 vmovdqu 0x20-0x80($ctx),$B
1146 lea 256+128(%rsp),%rbx
1147 vmovdqu 0x40-0x80($ctx),$C
1148 vmovdqu 0x60-0x80($ctx),$D
1149 vmovdqu 0x80-0x80($ctx),$E
1150 vmovdqu 0xa0-0x80($ctx),$F
1151 vmovdqu 0xc0-0x80($ctx),$G
1152 vmovdqu 0xe0-0x80($ctx),$H
1153 vmovdqu .Lpbswap(%rip),$Xn
1158 vpxor $B,$C,$bxc # magic seed
1160 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1162 vmovdqu `&Xi_off($i)`,$Xi
1164 jmp .Loop_16_xx_avx2
1168 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1171 jnz .Loop_16_xx_avx2
1174 lea `$REG_SZ*16`(%rsp),%rbx
1175 lea K256+128(%rip),$Tbl
1177 for($i=0;$i<8;$i++) {
1179 cmp `4*$i`(%rbx),%ecx # examine counters
1180 cmovge $Tbl,@ptr[$i] # cancel input
1184 vmovdqa (%rbx),$sigma # pull counters
1187 vpcmpgtd $t1,$Xn,$Xn # mask value
1188 vpaddd $Xn,$sigma,$sigma # counters--
1190 vmovdqu 0x00-0x80($ctx),$t1
1192 vmovdqu 0x20-0x80($ctx),$t2
1194 vmovdqu 0x40-0x80($ctx),$t3
1196 vmovdqu 0x60-0x80($ctx),$Xi
1199 vmovdqu 0x80-0x80($ctx),$t1
1202 vmovdqu 0xa0-0x80($ctx),$t2
1205 vmovdqu 0xc0-0x80($ctx),$t3
1208 vmovdqu 0xe0-0x80($ctx),$Xi
1212 vmovdqu $A,0x00-0x80($ctx)
1214 vmovdqu $B,0x20-0x80($ctx)
1216 vmovdqu $C,0x40-0x80($ctx)
1217 vmovdqu $D,0x60-0x80($ctx)
1218 vmovdqu $E,0x80-0x80($ctx)
1219 vmovdqu $F,0xa0-0x80($ctx)
1220 vmovdqu $G,0xc0-0x80($ctx)
1221 vmovdqu $H,0xe0-0x80($ctx)
1223 vmovdqu $sigma,(%rbx) # save counters
1224 lea 256+128(%rsp),%rbx
1225 vmovdqu .Lpbswap(%rip),$Xn
1229 #mov `$REG_SZ*17+8`(%rsp),$num
1230 #lea $REG_SZ($ctx),$ctx
1231 #lea `16*$REG_SZ/4`($inp),$inp
1233 #jnz .Loop_grande_avx2
1236 	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1239 $code.=<<___ if ($win64);
1240 movaps -0xd8(%rax),%xmm6
1241 movaps -0xc8(%rax),%xmm7
1242 movaps -0xb8(%rax),%xmm8
1243 movaps -0xa8(%rax),%xmm9
1244 movaps -0x98(%rax),%xmm10
1245 movaps -0x88(%rax),%xmm11
1246 movaps -0x78(%rax),%xmm12
1247 movaps -0x68(%rax),%xmm13
1248 movaps -0x58(%rax),%xmm14
1249 movaps -0x48(%rax),%xmm15
1261 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
1276 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1277 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1278 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1279 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1280 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1281 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1282 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1283 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1284 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1285 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1286 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1287 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1288 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1289 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1290 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1291 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1294 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1295 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1297 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1298 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1299 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1300 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1301 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1302 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1303 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1304 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1305 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1306 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1307 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1308 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1309 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1310 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1311 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1312 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1313 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1317 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1318 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1325 .extern __imp_RtlVirtualUnwind
1326 .type se_handler,\@abi-omnipotent
1340 mov 120($context),%rax # pull context->Rax
1341 mov 248($context),%rbx # pull context->Rip
1343 mov 8($disp),%rsi # disp->ImageBase
1344 mov 56($disp),%r11 # disp->HandlerData
1346 mov 0(%r11),%r10d # HandlerData[0]
1347 lea (%rsi,%r10),%r10 # end of prologue label
1348 cmp %r10,%rbx # context->Rip<.Lbody
1351 mov 152($context),%rax # pull context->Rsp
1353 mov 4(%r11),%r10d # HandlerData[1]
1354 lea (%rsi,%r10),%r10 # epilogue label
1355 cmp %r10,%rbx # context->Rip>=.Lepilogue
1358 mov `16*17`(%rax),%rax # pull saved stack pointer
1362 mov %rbx,144($context) # restore context->Rbx
1363 mov %rbp,160($context) # restore context->Rbp
1365 lea -24-10*16(%rax),%rsi
1366 lea 512($context),%rdi # &context.Xmm6
1368 .long 0xa548f3fc # cld; rep movsq
1373 mov %rax,152($context) # restore context->Rsp
1374 mov %rsi,168($context) # restore context->Rsi
1375 mov %rdi,176($context) # restore context->Rdi
1377 mov 40($disp),%rdi # disp->ContextRecord
1378 mov $context,%rsi # context
1379 mov \$154,%ecx # sizeof(CONTEXT)
1380 .long 0xa548f3fc # cld; rep movsq
1383 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1384 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1385 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1386 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1387 mov 40(%rsi),%r10 # disp->ContextRecord
1388 lea 56(%rsi),%r11 # &disp->HandlerData
1389 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1390 mov %r10,32(%rsp) # arg5
1391 mov %r11,40(%rsp) # arg6
1392 mov %r12,48(%rsp) # arg7
1393 mov %rcx,56(%rsp) # arg8, (NULL)
1394 call *__imp_RtlVirtualUnwind(%rip)
1396 mov \$1,%eax # ExceptionContinueSearch
1408 .size se_handler,.-se_handler
1410 $code.=<<___ if ($avx>1);
1411 .type avx2_handler,\@abi-omnipotent
1425 mov 120($context),%rax # pull context->Rax
1426 mov 248($context),%rbx # pull context->Rip
1428 mov 8($disp),%rsi # disp->ImageBase
1429 mov 56($disp),%r11 # disp->HandlerData
1431 mov 0(%r11),%r10d # HandlerData[0]
1432 lea (%rsi,%r10),%r10 # end of prologue label
1433 cmp %r10,%rbx # context->Rip<body label
1436 mov 152($context),%rax # pull context->Rsp
1438 mov 4(%r11),%r10d # HandlerData[1]
1439 lea (%rsi,%r10),%r10 # epilogue label
1440 cmp %r10,%rbx # context->Rip>=epilogue label
1443 mov `32*17`($context),%rax # pull saved stack pointer
1451 mov %rbx,144($context) # restore context->Rbx
1452 mov %rbp,160($context) # restore context->Rbp
1453 	mov	%r12,216($context)	# restore context->R12
1454 	mov	%r13,224($context)	# restore context->R13
1455 	mov	%r14,232($context)	# restore context->R14
1456 	mov	%r15,240($context)	# restore context->R15
1458 lea -56-10*16(%rax),%rsi
1459 lea 512($context),%rdi # &context.Xmm6
1461 .long 0xa548f3fc # cld; rep movsq
1464 .size avx2_handler,.-avx2_handler
1469 .rva .LSEH_begin_sha256_multi_block
1470 .rva .LSEH_end_sha256_multi_block
1471 .rva .LSEH_info_sha256_multi_block
1472 .rva .LSEH_begin_sha256_multi_block_shaext
1473 .rva .LSEH_end_sha256_multi_block_shaext
1474 .rva .LSEH_info_sha256_multi_block_shaext
1476 $code.=<<___ if ($avx);
1477 .rva .LSEH_begin_sha256_multi_block_avx
1478 .rva .LSEH_end_sha256_multi_block_avx
1479 .rva .LSEH_info_sha256_multi_block_avx
1481 $code.=<<___ if ($avx>1);
1482 .rva .LSEH_begin_sha256_multi_block_avx2
1483 .rva .LSEH_end_sha256_multi_block_avx2
1484 .rva .LSEH_info_sha256_multi_block_avx2
1489 .LSEH_info_sha256_multi_block:
1492 .rva .Lbody,.Lepilogue # HandlerData[]
1493 .LSEH_info_sha256_multi_block_shaext:
1496 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1498 $code.=<<___ if ($avx);
1499 .LSEH_info_sha256_multi_block_avx:
1502 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1504 $code.=<<___ if ($avx>1);
1505 .LSEH_info_sha256_multi_block_avx2:
1508 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1511 ####################################################################
1514 local *opcode=shift;
1518 $rex|=0x04 if ($dst>=8);
1519 $rex|=0x01 if ($src>=8);
1520 unshift @opcode,$rex|0x40 if ($rex);
1526 "sha256rnds2" => 0xcb,
1527 "sha256msg1" => 0xcc,
1528 "sha256msg2" => 0xcd );
1530 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1531 my @opcode=(0x0f,0x38);
1532 rex(\@opcode,$2,$1);
1533 push @opcode,$opcodelet{$instr};
1534 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1535 return ".byte\t".join(',',@opcode);
1537 return $instr."\t".@_[0];
1541 foreach (split("\n",$code)) {
1542 s/\`([^\`]*)\`/eval($1)/ge;
1544 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
1546 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1547 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1548 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1549 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1550 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1551 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;