2 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # Multi-buffer SHA256 procedure processes n buffers in parallel by
18 # placing buffer data to designated lane of SIMD register. n is
19 # naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
22 # this +aesni(i) sha256 aesni-sha256 gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
25 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
26 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
27 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
28 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
29 # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170%
30 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
32 # (i) multi-block CBC encrypt with 128-bit key;
33 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 # because of lower AES-NI instruction throughput, nor is there
35 # AES-NI-SHA256 stitch for these processors;
36 # (iii) "this" is for n=8, when we gather twice as much data, result
37 # for n=4 is 20.3+4.44=24.7;
38 # (iv) presented improvement coefficients are asymptotic limits and
39 # in real-life application are somewhat lower, e.g. for 2KB
40 # fragments they range from 75% to 130% (on Haswell);
42 # $output is the last argument if it looks like a file (it has an extension)
43 # $flavour is the first argument if it doesn't look like a file
44 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
45 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows is inferred either from the perlasm flavour (nasm/masm/mingw64)
# or from an .asm output extension.
47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator relative to this script's own
# directory, trying the local layout first, then ../../perlasm.
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52 die "can't locate x86_64-xlate.pl";
# Make the perlasm support module loadable from either location.
54 push(@INC,"${dir}","${dir}../../perlasm");
55 require "x86_64-support.pl";
# Target pointer size (bytes) is derived from the flavour by the support lib.
57 $ptr_size=&pointer_size($flavour);
# Probe the toolchain to decide the AVX level ($avx: 0=none, 1=AVX, 2=AVX2).
# First preference: GNU as version reported via the compiler driver.
61 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
62 	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
63 	$avx = ($1>=2.19) + ($1>=2.22);
# Fallback probe: nasm on Windows (AVX from 2.09, AVX2 from 2.10).
66 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
67 	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
68 	$avx = ($1>=2.09) + ($1>=2.10);
# Fallback probe: Microsoft ml64 (AVX from version 10, AVX2 from 11).
71 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
72 	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
73 	$avx = ($1>=10) + ($1>=11);
# Fallback probe: clang/LLVM-based compilers (AVX from 3.0).
76 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
77 	$avx = ($2>=3.0) + ($2>3.0);
# Pipe all generated code through the xlate translator into $output.
80 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
81     or die "can't call $xlate: $!";
84 # void sha256_multi_block (
85 # struct { unsigned int A[8];
92 # unsigned int H[8]; } *ctx,
93 # struct { void *ptr; int blocks; } inp[8],
94 # int num); /* 1 or 2 */
# Argument registers per the calling convention used by the generated code,
# plus four scratch registers holding the per-lane input pointers.
96 $ctx="%rdi"; # 1st arg
97 $inp="%rsi"; # 2nd arg
98 $num="%edx"; # 3rd arg
99 @ptr=map("%r$_",(8..11));
# Each inp[] element is {pointer, blocks}, hence two pointer-sized slots.
101 $inp_elm_size=2*$ptr_size;
# SHA-256 working variables A..H live in xmm8..xmm15 (one 32-bit lane per
# buffer); xmm0..xmm7 serve as temporaries and the message-schedule regs.
103 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
104 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
# Xi_off body (sub header not visible in this view): maps a message-schedule
# index to its stack slot; indices wrap modulo 16, with the first 256 bytes
# addressed off %rax and the remainder off %rbx (both biased by -128).
111 $off %= 16; $off *= $REG_SZ;
112 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
# Body of ROUND_00_15 (SSE2/SSSE3 path; sub header not visible in this view).
# One SHA-256 round for rounds 0..15: gathers one 32-bit word from each of
# the four input lanes into $Xi, then performs the round computation.
# NOTE(review): this chunk of the file is a partial view with interleaved
# lines missing; the code below is preserved verbatim.
116 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# Rounds 0..14: plain 4-lane gather of message word i.
118 $code.=<<___ if ($i<15);
119 movd `4*$i`(@ptr[0]),$Xi
120 movd `4*$i`(@ptr[1]),$t1
121 movd `4*$i`(@ptr[2]),$t2
122 movd `4*$i`(@ptr[3]),$t3
# Round 15: same gather, but also advance each lane pointer past the
# just-consumed 64-byte block.
127 $code.=<<___ if ($i==15);
128 movd `4*$i`(@ptr[0]),$Xi
129 lea `16*4`(@ptr[0]),@ptr[0]
130 movd `4*$i`(@ptr[1]),$t1
131 lea `16*4`(@ptr[1]),@ptr[1]
132 movd `4*$i`(@ptr[2]),$t2
133 lea `16*4`(@ptr[2]),@ptr[2]
134 movd `4*$i`(@ptr[3]),$t3
135 lea `16*4`(@ptr[3]),@ptr[3]
142 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
144 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
148 movdqa $Xi,`&Xi_off($i)`
154 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
159 `"prefetcht0 63(@ptr[0])" if ($i==15)`
161 movdqa $e,$axb # borrow $axb
167 `"prefetcht0 63(@ptr[1])" if ($i==15)`
169 pxor $t3,$sigma # Sigma1(e)
172 paddd $sigma,$Xi # Xi+=Sigma1(e)
173 pxor $axb,$t1 # Ch(e,f,g)
177 pxor $a,$axb # a^b, b^c in next round
179 `"prefetcht0 63(@ptr[2])" if ($i==15)`
182 paddd $t1,$Xi # Xi+=Ch(e,f,g)
187 `"prefetcht0 63(@ptr[3])" if ($i==15)`
193 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
195 pxor $t3,$sigma # Sigma0(a)
198 paddd $sigma,$h # h+=Sigma0(a)
# Every 8th round: advance the round-constant table pointer by 8 entries.
200 $code.=<<___ if (($i%8)==7);
201 lea `32*8`($Tbl),$Tbl
# Swap the borrowed a^b / b^c registers for the next round's Maj computation.
203 ($axb,$bxc)=($bxc,$axb);
# --- Start of ROUND_16_XX (partial view): rounds 16+, which expand the
# message schedule (sigma0/sigma1) before falling into the common round. ---
210 movdqa `&Xi_off($i+1)`,$Xn
211 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
219 movdqa `&Xi_off($i+14)`,$t1
223 movdqa $t1,$axb # borrow $axb
231 pxor $t3,$sigma # sigma0(X[i+1])
233 paddd $sigma,$Xi # Xi+=sigma0(e)
239 pxor $t2,$t1 # sigma0(X[i+14])
240 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
# sha256_multi_block: baseline (SSE2/SSSE3) 4-lane entry point. Checks the
# CPUID SHA bit and AVX availability to dispatch to the specialized versions
# (dispatch branches not visible in this partial view of the file).
249 .extern OPENSSL_ia32cap_P
251 .globl sha256_multi_block
252 .type sha256_multi_block,\@function,3
256 mov OPENSSL_ia32cap_P+4(%rip),%rcx
257 bt \$61,%rcx # check SHA bit
260 $code.=<<___ if ($avx);
266 .cfi_def_cfa_register %rax
272 $code.=<<___ if ($win64);
275 movaps %xmm7,0x10(%rsp)
276 movaps %xmm8,0x20(%rsp)
277 movaps %xmm9,0x30(%rsp)
278 movaps %xmm10,-0x78(%rax)
279 movaps %xmm11,-0x68(%rax)
280 movaps %xmm12,-0x58(%rax)
281 movaps %xmm13,-0x48(%rax)
282 movaps %xmm14,-0x38(%rax)
283 movaps %xmm15,-0x28(%rax)
286 sub \$`$REG_SZ*18`, %rsp
288 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
289 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
291 lea K256+128(%rip),$Tbl
292 lea `$REG_SZ*16`(%rsp),%rbx
293 lea 0x80($ctx),$ctx # size optimization
296 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
# Load the 4 lane input pointers and block counts; lanes with no data are
# redirected at the constant table so their loads are harmless.
299 for($i=0;$i<4;$i++) {
300 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
303 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
305 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
307 cmovg %ecx,$num # find maximum
309 mov %ecx,`4*$i`(%rbx) # initialize counters
310 cmovle $Tbl,@ptr[$i] # cancel input
317 movdqu 0x00-0x80($ctx),$A # load context
319 movdqu 0x20-0x80($ctx),$B
320 movdqu 0x40-0x80($ctx),$C
321 movdqu 0x60-0x80($ctx),$D
322 movdqu 0x80-0x80($ctx),$E
323 movdqu 0xa0-0x80($ctx),$F
324 movdqu 0xc0-0x80($ctx),$G
325 movdqu 0xe0-0x80($ctx),$H
326 movdqu .Lpbswap(%rip),$Xn
332 pxor $B,$bxc # magic seed
# Emit 16 gather rounds, then the schedule-expanding rounds (16..31 cover
# one 16-round iteration of the .Loop_16_xx loop body).
334 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
336 movdqu `&Xi_off($i)`,$Xi
342 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
# Per-block epilogue: decrement lane counters, merge new state with the
# saved context under the per-lane mask, and store the result back.
348 lea K256+128(%rip),$Tbl
350 movdqa (%rbx),$sigma # pull counters
351 cmp 4*0(%rbx),%ecx # examine counters
353 cmovge $Tbl,@ptr[0] # cancel input
358 pcmpgtd $t1,$Xn # mask value
361 paddd $Xn,$sigma # counters--
364 movdqu 0x00-0x80($ctx),$t1
366 movdqu 0x20-0x80($ctx),$t2
368 movdqu 0x40-0x80($ctx),$t3
370 movdqu 0x60-0x80($ctx),$Xi
373 movdqu 0x80-0x80($ctx),$t1
376 movdqu 0xa0-0x80($ctx),$t2
379 movdqu 0xc0-0x80($ctx),$t3
382 movdqu 0xe0-0x80($ctx),$Xi
386 movdqu $A,0x00-0x80($ctx)
388 movdqu $B,0x20-0x80($ctx)
390 movdqu $C,0x40-0x80($ctx)
391 movdqu $D,0x60-0x80($ctx)
392 movdqu $E,0x80-0x80($ctx)
393 movdqu $F,0xa0-0x80($ctx)
394 movdqu $G,0xc0-0x80($ctx)
395 movdqu $H,0xe0-0x80($ctx)
397 movdqa $sigma,(%rbx) # save counters
398 movdqa .Lpbswap(%rip),$Xn
402 mov `$REG_SZ*17+8`(%rsp),$num
403 lea $REG_SZ($ctx),$ctx
404 lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
409 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
# Win64 epilogue: restore the non-volatile xmm registers saved above.
412 $code.=<<___ if ($win64);
413 movaps -0xb8(%rax),%xmm6
414 movaps -0xa8(%rax),%xmm7
415 movaps -0x98(%rax),%xmm8
416 movaps -0x88(%rax),%xmm9
417 movaps -0x78(%rax),%xmm10
418 movaps -0x68(%rax),%xmm11
419 movaps -0x58(%rax),%xmm12
420 movaps -0x48(%rax),%xmm13
421 movaps -0x38(%rax),%xmm14
422 movaps -0x28(%rax),%xmm15
430 .cfi_def_cfa_register %rsp
434 .size sha256_multi_block,.-sha256_multi_block
# sha256_multi_block_shaext: SHA-NI (SHA extensions) path. Processes two
# buffers at a time using sha256rnds2/sha256msg1/sha256msg2; the two lanes'
# states are tracked in $ABEF0/$CDGH0 and $ABEF1/$CDGH1.
# NOTE(review): partial view — interleaved lines are missing; code preserved
# verbatim below.
437 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
438 my @MSG0=map("%xmm$_",(4..7));
439 my @MSG1=map("%xmm$_",(8..11));
442 .type sha256_multi_block_shaext,\@function,3
444 sha256_multi_block_shaext:
448 .cfi_def_cfa_register %rax
# Win64: preserve non-volatile xmm6..xmm15 in the allocated frame.
454 $code.=<<___ if ($win64);
457 movaps %xmm7,0x10(%rsp)
458 movaps %xmm8,0x20(%rsp)
459 movaps %xmm9,0x30(%rsp)
460 movaps %xmm10,-0x78(%rax)
461 movaps %xmm11,-0x68(%rax)
462 movaps %xmm12,-0x58(%rax)
463 movaps %xmm13,-0x48(%rax)
464 movaps %xmm14,-0x38(%rax)
465 movaps %xmm15,-0x28(%rax)
468 sub \$`$REG_SZ*18`,%rsp
469 shl \$1,$num # we process pair at a time
471 lea 0x80($ctx),$ctx # size optimization
472 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
474 lea `$REG_SZ*16`(%rsp),%rbx
475 lea K256_shaext+0x80(%rip),$Tbl
478 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
# Load the two lane pointers/counters; inactive lanes read from %rsp.
481 for($i=0;$i<2;$i++) {
482 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
485 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
487 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
489 cmovg %ecx,$num # find maximum
491 mov %ecx,`4*$i`(%rbx) # initialize counters
492 cmovle %rsp,@ptr[$i] # cancel input
# Load both contexts and transpose A..H into the ABEF/CDGH layout that
# sha256rnds2 expects.
499 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
500 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
501 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
502 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
503 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
504 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
505 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
506 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
508 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
509 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
510 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
511 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
512 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
516 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
517 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
518 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
519 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
521 pshufd \$0b00011011,$ABEF0,$ABEF0
522 pshufd \$0b00011011,$CDGH0,$CDGH0
523 pshufd \$0b00011011,$ABEF1,$ABEF1
524 pshufd \$0b00011011,$CDGH1,$CDGH1
529 movdqu 0x00(@ptr[0]),@MSG0[0]
530 movdqu 0x00(@ptr[1]),@MSG1[0]
531 movdqu 0x10(@ptr[0]),@MSG0[1]
532 movdqu 0x10(@ptr[1]),@MSG1[1]
533 movdqu 0x20(@ptr[0]),@MSG0[2]
534 pshufb $TMPx,@MSG0[0]
535 movdqu 0x20(@ptr[1]),@MSG1[2]
536 pshufb $TMPx,@MSG1[0]
537 movdqu 0x30(@ptr[0]),@MSG0[3]
538 lea 0x40(@ptr[0]),@ptr[0]
539 movdqu 0x30(@ptr[1]),@MSG1[3]
540 lea 0x40(@ptr[1]),@ptr[1]
542 movdqa 0*16-0x80($Tbl),$Wi
543 pshufb $TMPx,@MSG0[1]
545 pxor $ABEF0,@MSG0[0] # black magic
547 movdqa 0*16-0x80($Tbl),$TMP1
548 pshufb $TMPx,@MSG1[1]
550 movdqa $CDGH0,0x50(%rsp) # offload
551 sha256rnds2 $ABEF0,$CDGH0 # 0-3
552 pxor $ABEF1,@MSG1[0] # black magic
554 movdqa $CDGH1,0x70(%rsp)
555 sha256rnds2 $ABEF1,$CDGH1 # 0-3
556 pshufd \$0x0e,$TMP0,$Wi
557 pxor $ABEF0,@MSG0[0] # black magic
558 movdqa $ABEF0,0x40(%rsp) # offload
559 sha256rnds2 $CDGH0,$ABEF0
560 pshufd \$0x0e,$TMP1,$Wi
561 pxor $ABEF1,@MSG1[0] # black magic
562 movdqa $ABEF1,0x60(%rsp)
563 movdqa 1*16-0x80($Tbl),$TMP0
565 pshufb $TMPx,@MSG0[2]
566 sha256rnds2 $CDGH1,$ABEF1
569 movdqa 1*16-0x80($Tbl),$TMP1
571 sha256rnds2 $ABEF0,$CDGH0 # 4-7
573 prefetcht0 127(@ptr[0])
574 pshufb $TMPx,@MSG0[3]
575 pshufb $TMPx,@MSG1[2]
576 prefetcht0 127(@ptr[1])
577 sha256rnds2 $ABEF1,$CDGH1 # 4-7
578 pshufd \$0x0e,$TMP0,$Wi
579 pshufb $TMPx,@MSG1[3]
580 sha256msg1 @MSG0[1],@MSG0[0]
581 sha256rnds2 $CDGH0,$ABEF0
582 pshufd \$0x0e,$TMP1,$Wi
583 movdqa 2*16-0x80($Tbl),$TMP0
585 sha256rnds2 $CDGH1,$ABEF1
588 movdqa 2*16-0x80($Tbl),$TMP1
590 sha256rnds2 $ABEF0,$CDGH0 # 8-11
591 sha256msg1 @MSG1[1],@MSG1[0]
593 movdqa @MSG0[3],$TMPx
594 sha256rnds2 $ABEF1,$CDGH1 # 8-11
595 pshufd \$0x0e,$TMP0,$Wi
596 palignr \$4,@MSG0[2],$TMPx
598 movdqa @MSG1[3],$TMPx
599 palignr \$4,@MSG1[2],$TMPx
600 sha256msg1 @MSG0[2],@MSG0[1]
601 sha256rnds2 $CDGH0,$ABEF0
602 pshufd \$0x0e,$TMP1,$Wi
603 movdqa 3*16-0x80($Tbl),$TMP0
605 sha256rnds2 $CDGH1,$ABEF1
606 sha256msg1 @MSG1[2],@MSG1[1]
609 movdqa 3*16-0x80($Tbl),$TMP1
612 sha256msg2 @MSG0[3],@MSG0[0]
613 sha256rnds2 $ABEF0,$CDGH0 # 12-15
615 movdqa @MSG0[0],$TMPx
616 palignr \$4,@MSG0[3],$TMPx
617 sha256rnds2 $ABEF1,$CDGH1 # 12-15
618 sha256msg2 @MSG1[3],@MSG1[0]
619 pshufd \$0x0e,$TMP0,$Wi
621 movdqa @MSG1[0],$TMPx
622 palignr \$4,@MSG1[3],$TMPx
623 sha256msg1 @MSG0[3],@MSG0[2]
624 sha256rnds2 $CDGH0,$ABEF0
625 pshufd \$0x0e,$TMP1,$Wi
626 movdqa 4*16-0x80($Tbl),$TMP0
628 sha256rnds2 $CDGH1,$ABEF1
629 sha256msg1 @MSG1[3],@MSG1[2]
# Middle rounds 16..51: one unrolled 4-round group per iteration, with the
# MSG register arrays rotated at the bottom of the loop.
631 for($i=4;$i<16-3;$i++) {
634 movdqa $i*16-0x80($Tbl),$TMP1
637 sha256msg2 @MSG0[0],@MSG0[1]
638 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
640 movdqa @MSG0[1],$TMPx
641 palignr \$4,@MSG0[0],$TMPx
642 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
643 sha256msg2 @MSG1[0],@MSG1[1]
644 pshufd \$0x0e,$TMP0,$Wi
646 movdqa @MSG1[1],$TMPx
647 palignr \$4,@MSG1[0],$TMPx
648 sha256msg1 @MSG0[0],@MSG0[3]
649 sha256rnds2 $CDGH0,$ABEF0
650 pshufd \$0x0e,$TMP1,$Wi
651 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
653 sha256rnds2 $CDGH1,$ABEF1
654 sha256msg1 @MSG1[0],@MSG1[3]
# Rotate the message-schedule register queues for the next group.
656 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
660 movdqa 13*16-0x80($Tbl),$TMP1
663 sha256msg2 @MSG0[0],@MSG0[1]
664 sha256rnds2 $ABEF0,$CDGH0 # 52-55
666 movdqa @MSG0[1],$TMPx
667 palignr \$4,@MSG0[0],$TMPx
668 sha256rnds2 $ABEF1,$CDGH1 # 52-55
669 sha256msg2 @MSG1[0],@MSG1[1]
670 pshufd \$0x0e,$TMP0,$Wi
672 movdqa @MSG1[1],$TMPx
673 palignr \$4,@MSG1[0],$TMPx
675 sha256rnds2 $CDGH0,$ABEF0
676 pshufd \$0x0e,$TMP1,$Wi
677 movdqa 14*16-0x80($Tbl),$TMP0
679 sha256rnds2 $CDGH1,$ABEF1
682 movdqa 14*16-0x80($Tbl),$TMP1
685 sha256msg2 @MSG0[1],@MSG0[2]
687 sha256rnds2 $ABEF0,$CDGH0 # 56-59
690 pxor @MSG0[1],@MSG0[1] # zero
691 sha256rnds2 $ABEF1,$CDGH1 # 56-59
692 sha256msg2 @MSG1[1],@MSG1[2]
693 pshufd \$0x0e,$TMP0,$Wi
694 movdqa 15*16-0x80($Tbl),$TMP0
696 movq (%rbx),@MSG0[2] # pull counters
698 sha256rnds2 $CDGH0,$ABEF0
699 pshufd \$0x0e,$TMP1,$Wi
700 movdqa 15*16-0x80($Tbl),$TMP1
702 sha256rnds2 $CDGH1,$ABEF1
705 cmp 4*0(%rbx),%ecx # examine counters
706 cmovge %rsp,@ptr[0] # cancel input
709 pshufd \$0x00,@MSG0[2],@MSG1[0]
710 sha256rnds2 $ABEF0,$CDGH0 # 60-63
712 pshufd \$0x55,@MSG0[2],@MSG1[1]
713 movdqa @MSG0[2],@MSG1[2]
714 sha256rnds2 $ABEF1,$CDGH1 # 60-63
715 pshufd \$0x0e,$TMP0,$Wi
716 pcmpgtd @MSG0[1],@MSG1[0]
717 pcmpgtd @MSG0[1],@MSG1[1]
718 sha256rnds2 $CDGH0,$ABEF0
719 pshufd \$0x0e,$TMP1,$Wi
720 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
721 movdqa K256_shaext-0x10(%rip),$TMPx
722 sha256rnds2 $CDGH1,$ABEF1
728 paddd @MSG0[2],@MSG1[2] # counters--
730 paddd 0x50(%rsp),$CDGH0
731 paddd 0x70(%rsp),$CDGH1
732 paddd 0x40(%rsp),$ABEF0
733 paddd 0x60(%rsp),$ABEF1
735 movq @MSG1[2],(%rbx) # save counters
739 mov `$REG_SZ*17+8`(%rsp),$num
# Transpose ABEF/CDGH back into the A..H context layout and store.
741 pshufd \$0b00011011,$ABEF0,$ABEF0
742 pshufd \$0b00011011,$CDGH0,$CDGH0
743 pshufd \$0b00011011,$ABEF1,$ABEF1
744 pshufd \$0b00011011,$CDGH1,$CDGH1
746 movdqa $ABEF0,@MSG0[0]
747 movdqa $CDGH0,@MSG0[1]
748 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
749 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
750 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
751 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
753 movq $ABEF0,0x00-0x80($ctx) # A1.A0
755 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
757 movq $ABEF0,0x20-0x80($ctx) # B1.B0
758 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
760 movq $CDGH0,0x40-0x80($ctx) # C1.C0
762 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
764 movq $CDGH0,0x60-0x80($ctx) # D1.D0
765 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
767 lea `$REG_SZ/2`($ctx),$ctx
768 lea `$inp_elm_size*2`($inp),$inp
770 jnz .Loop_grande_shaext
773 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
# Win64 epilogue: restore the preserved xmm registers.
775 $code.=<<___ if ($win64);
776 movaps -0xb8(%rax),%xmm6
777 movaps -0xa8(%rax),%xmm7
778 movaps -0x98(%rax),%xmm8
779 movaps -0x88(%rax),%xmm9
780 movaps -0x78(%rax),%xmm10
781 movaps -0x68(%rax),%xmm11
782 movaps -0x58(%rax),%xmm12
783 movaps -0x48(%rax),%xmm13
784 movaps -0x38(%rax),%xmm14
785 movaps -0x28(%rax),%xmm15
793 .cfi_def_cfa_register %rsp
797 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
# ROUND_00_15_avx: one SHA-256 round (AVX/AVX2 path). $REG_SZ selects the
# 4-lane xmm variant (16) or the 8-lane ymm variant (32); round 15
# additionally advances all lane pointers and issues prefetches.
# NOTE(review): partial view — interleaved lines are missing; code preserved
# verbatim.
801 sub ROUND_00_15_avx {
802 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# 4-lane gather, rounds 0..14.
804 $code.=<<___ if ($i<15 && $REG_SZ==16);
805 vmovd `4*$i`(@ptr[0]),$Xi
806 vmovd `4*$i`(@ptr[1]),$t1
807 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
808 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
809 vpunpckldq $t1,$Xi,$Xi
# 4-lane gather, round 15: also step each lane past the finished block.
812 $code.=<<___ if ($i==15 && $REG_SZ==16);
813 vmovd `4*$i`(@ptr[0]),$Xi
814 lea `16*4`(@ptr[0]),@ptr[0]
815 vmovd `4*$i`(@ptr[1]),$t1
816 lea `16*4`(@ptr[1]),@ptr[1]
817 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
818 lea `16*4`(@ptr[2]),@ptr[2]
819 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
820 lea `16*4`(@ptr[3]),@ptr[3]
821 vpunpckldq $t1,$Xi,$Xi
# 8-lane (AVX2) gather, rounds 0..14.
824 $code.=<<___ if ($i<15 && $REG_SZ==32);
825 vmovd `4*$i`(@ptr[0]),$Xi
826 vmovd `4*$i`(@ptr[4]),$t1
827 vmovd `4*$i`(@ptr[1]),$t2
828 vmovd `4*$i`(@ptr[5]),$t3
829 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
830 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
831 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
832 vpunpckldq $t2,$Xi,$Xi
833 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
834 vpunpckldq $t3,$t1,$t1
835 vinserti128 $t1,$Xi,$Xi
# 8-lane gather, round 15, with pointer advancement.
838 $code.=<<___ if ($i==15 && $REG_SZ==32);
839 vmovd `4*$i`(@ptr[0]),$Xi
840 lea `16*4`(@ptr[0]),@ptr[0]
841 vmovd `4*$i`(@ptr[4]),$t1
842 lea `16*4`(@ptr[4]),@ptr[4]
843 vmovd `4*$i`(@ptr[1]),$t2
844 lea `16*4`(@ptr[1]),@ptr[1]
845 vmovd `4*$i`(@ptr[5]),$t3
846 lea `16*4`(@ptr[5]),@ptr[5]
847 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
848 lea `16*4`(@ptr[2]),@ptr[2]
849 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
850 lea `16*4`(@ptr[6]),@ptr[6]
851 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
852 lea `16*4`(@ptr[3]),@ptr[3]
853 vpunpckldq $t2,$Xi,$Xi
854 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
855 lea `16*4`(@ptr[7]),@ptr[7]
856 vpunpckldq $t3,$t1,$t1
857 vinserti128 $t1,$Xi,$Xi
863 vmovdqu $Xi,`&Xi_off($i)`
864 vpaddd $h,$Xi,$Xi # Xi+=h
867 vpxor $t3,$sigma,$sigma
869 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
870 vpxor $t2,$sigma,$sigma
873 vpxor $t3,$sigma,$sigma
874 `"prefetcht0 63(@ptr[0])" if ($i==15)`
877 vpand $f,$e,$axb # borrow $axb
878 `"prefetcht0 63(@ptr[1])" if ($i==15)`
879 vpxor $t2,$sigma,$sigma
881 vpsrld \$2,$a,$h # borrow $h
882 vpxor $t3,$sigma,$sigma # Sigma1(e)
883 `"prefetcht0 63(@ptr[2])" if ($i==15)`
885 vpxor $axb,$t1,$t1 # Ch(e,f,g)
886 vpxor $a,$b,$axb # a^b, b^c in next round
887 `"prefetcht0 63(@ptr[3])" if ($i==15)`
889 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
892 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
894 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
896 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
900 vpxor $t3,$sigma,$sigma
901 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
903 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
904 vpaddd $Xi,$d,$d # d+=Xi
905 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
906 vpxor $t2,$sigma,$sigma
907 vpxor $t3,$sigma,$sigma # Sigma0(a)
909 vpaddd $Xi,$h,$h # h+=Xi
910 vpaddd $sigma,$h,$h # h+=Sigma0(a)
# Every 8th round: advance the round-constant table (body partially elided
# in this view).
912 $code.=<<___ if (($i%8)==7);
# Swap a^b / b^c scratch registers for the next round's Maj.
915 ($axb,$bxc)=($bxc,$axb);
# ROUND_16_XX_avx: rounds 16+ for the AVX path — expands the message
# schedule (sigma0 of X[i+1], sigma1 of X[i+14], plus X[i+9]) into $Xi and
# then delegates to ROUND_00_15_avx for the common round computation.
# NOTE(review): partial view — interleaved lines are missing; code preserved
# verbatim.
918 sub ROUND_16_XX_avx {
922 vmovdqu `&Xi_off($i+1)`,$Xn
923 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
925 vpsrld \$3,$Xn,$sigma
928 vpxor $t2,$sigma,$sigma
930 vpxor $t3,$sigma,$sigma
932 vmovdqu `&Xi_off($i+14)`,$t1
933 vpsrld \$10,$t1,$axb # borrow $axb
935 vpxor $t2,$sigma,$sigma
937 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
939 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
940 vpxor $t2,$axb,$sigma
942 vpxor $t3,$sigma,$sigma
944 vpxor $t2,$sigma,$sigma
945 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
946 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
# Finish with the common per-round computation.
948 &ROUND_00_15_avx($i,@_);
# sha256_multi_block_avx: 4-lane AVX entry point (xmm registers,
# $REG_SZ==16). Structure mirrors the SSSE3 version, using the _avx round
# helpers. NOTE(review): partial view — lines missing; code verbatim.
953 .type sha256_multi_block_avx,\@function,3
955 sha256_multi_block_avx:
# When AVX2 is available the build also emits a wide-lane dispatch here.
959 $code.=<<___ if ($avx>1);
971 .cfi_def_cfa_register %rax
# Win64: preserve non-volatile xmm registers.
977 $code.=<<___ if ($win64);
980 movaps %xmm7,0x10(%rsp)
981 movaps %xmm8,0x20(%rsp)
982 movaps %xmm9,0x30(%rsp)
983 movaps %xmm10,-0x78(%rax)
984 movaps %xmm11,-0x68(%rax)
985 movaps %xmm12,-0x58(%rax)
986 movaps %xmm13,-0x48(%rax)
987 movaps %xmm14,-0x38(%rax)
988 movaps %xmm15,-0x28(%rax)
991 sub \$`$REG_SZ*18`, %rsp
993 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
994 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
996 lea K256+128(%rip),$Tbl
997 lea `$REG_SZ*16`(%rsp),%rbx
998 lea 0x80($ctx),$ctx # size optimization
1001 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
# Load 4 lane pointers/counters; empty lanes are pointed at the table.
1004 for($i=0;$i<4;$i++) {
1005 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1008 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
1010 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1012 cmovg %ecx,$num # find maximum
1014 mov %ecx,`4*$i`(%rbx) # initialize counters
1015 cmovle $Tbl,@ptr[$i] # cancel input
1022 vmovdqu 0x00-0x80($ctx),$A # load context
1024 vmovdqu 0x20-0x80($ctx),$B
1025 vmovdqu 0x40-0x80($ctx),$C
1026 vmovdqu 0x60-0x80($ctx),$D
1027 vmovdqu 0x80-0x80($ctx),$E
1028 vmovdqu 0xa0-0x80($ctx),$F
1029 vmovdqu 0xc0-0x80($ctx),$G
1030 vmovdqu 0xe0-0x80($ctx),$H
1031 vmovdqu .Lpbswap(%rip),$Xn
1036 vpxor $B,$C,$bxc # magic seed
# 16 gather rounds, then one unrolled 16-round schedule-expansion pass.
1038 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1040 vmovdqu `&Xi_off($i)`,$Xi
1046 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1052 lea K256+128(%rip),$Tbl
# Per-block bookkeeping: cancel exhausted lanes, decrement counters,
# masked-merge new state with the saved context, store back.
1054 for($i=0;$i<4;$i++) {
1056 cmp `4*$i`(%rbx),%ecx # examine counters
1057 cmovge $Tbl,@ptr[$i] # cancel input
1061 vmovdqa (%rbx),$sigma # pull counters
1064 vpcmpgtd $t1,$Xn,$Xn # mask value
1065 vpaddd $Xn,$sigma,$sigma # counters--
1067 vmovdqu 0x00-0x80($ctx),$t1
1069 vmovdqu 0x20-0x80($ctx),$t2
1071 vmovdqu 0x40-0x80($ctx),$t3
1073 vmovdqu 0x60-0x80($ctx),$Xi
1076 vmovdqu 0x80-0x80($ctx),$t1
1079 vmovdqu 0xa0-0x80($ctx),$t2
1082 vmovdqu 0xc0-0x80($ctx),$t3
1085 vmovdqu 0xe0-0x80($ctx),$Xi
1089 vmovdqu $A,0x00-0x80($ctx)
1091 vmovdqu $B,0x20-0x80($ctx)
1093 vmovdqu $C,0x40-0x80($ctx)
1094 vmovdqu $D,0x60-0x80($ctx)
1095 vmovdqu $E,0x80-0x80($ctx)
1096 vmovdqu $F,0xa0-0x80($ctx)
1097 vmovdqu $G,0xc0-0x80($ctx)
1098 vmovdqu $H,0xe0-0x80($ctx)
1100 vmovdqu $sigma,(%rbx) # save counters
1101 vmovdqu .Lpbswap(%rip),$Xn
1105 mov `$REG_SZ*17+8`(%rsp),$num
1106 lea $REG_SZ($ctx),$ctx
1107 lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
1109 jnz .Loop_grande_avx
1112 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
# Win64 epilogue: restore preserved xmm registers.
1116 $code.=<<___ if ($win64);
1117 movaps -0xb8(%rax),%xmm6
1118 movaps -0xa8(%rax),%xmm7
1119 movaps -0x98(%rax),%xmm8
1120 movaps -0x88(%rax),%xmm9
1121 movaps -0x78(%rax),%xmm10
1122 movaps -0x68(%rax),%xmm11
1123 movaps -0x58(%rax),%xmm12
1124 movaps -0x48(%rax),%xmm13
1125 movaps -0x38(%rax),%xmm14
1126 movaps -0x28(%rax),%xmm15
1134 .cfi_def_cfa_register %rsp
1138 .size sha256_multi_block_avx,.-sha256_multi_block_avx
# Flush the AVX code emitted so far, evaluating `...` interpolations.
1141 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Re-map the register sets for the 8-lane AVX2 variant: eight input
# pointers and ymm registers for state/scratch.
1144 @ptr=map("%r$_",(12..15,8..11));
1146 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1147 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
# sha256_multi_block_avx2: 8-lane AVX2 entry point (ymm registers,
# $REG_SZ==32). Same overall structure as the xmm variants, reusing the
# _avx round helpers with the remapped ymm register set.
# NOTE(review): partial view — lines missing; code verbatim.
1150 .type sha256_multi_block_avx2,\@function,3
1152 sha256_multi_block_avx2:
1156 .cfi_def_cfa_register %rax
# Win64: preserve non-volatile xmm6..xmm15.
1170 $code.=<<___ if ($win64);
1171 lea -0xa8(%rsp),%rsp
1173 movaps %xmm7,0x10(%rsp)
1174 movaps %xmm8,0x20(%rsp)
1175 movaps %xmm9,0x30(%rsp)
1176 movaps %xmm10,0x40(%rsp)
1177 movaps %xmm11,0x50(%rsp)
1178 movaps %xmm12,-0x78(%rax)
1179 movaps %xmm13,-0x68(%rax)
1180 movaps %xmm14,-0x58(%rax)
1181 movaps %xmm15,-0x48(%rax)
1184 sub \$`$REG_SZ*18`, %rsp
1186 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1187 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
1189 lea K256+128(%rip),$Tbl
1190 lea 0x80($ctx),$ctx # size optimization
1193 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1195 lea `$REG_SZ*16`(%rsp),%rbx
# Load all 8 lane pointers/counters; empty lanes point at the table.
1197 for($i=0;$i<8;$i++) {
1198 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1201 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
1203 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1205 cmovg %ecx,$num # find maximum
1207 mov %ecx,`4*$i`(%rbx) # initialize counters
1208 cmovle $Tbl,@ptr[$i] # cancel input
1212 vmovdqu 0x00-0x80($ctx),$A # load context
1214 vmovdqu 0x20-0x80($ctx),$B
1215 lea 256+128(%rsp),%rbx
1216 vmovdqu 0x40-0x80($ctx),$C
1217 vmovdqu 0x60-0x80($ctx),$D
1218 vmovdqu 0x80-0x80($ctx),$E
1219 vmovdqu 0xa0-0x80($ctx),$F
1220 vmovdqu 0xc0-0x80($ctx),$G
1221 vmovdqu 0xe0-0x80($ctx),$H
1222 vmovdqu .Lpbswap(%rip),$Xn
1227 vpxor $B,$C,$bxc # magic seed
# 16 gather rounds, then the looped schedule-expansion rounds.
1229 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1231 vmovdqu `&Xi_off($i)`,$Xi
1233 jmp .Loop_16_xx_avx2
1237 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1240 jnz .Loop_16_xx_avx2
1243 lea `$REG_SZ*16`(%rsp),%rbx
1244 lea K256+128(%rip),$Tbl
# Per-block bookkeeping over all 8 lanes, as in the narrower variants.
1246 for($i=0;$i<8;$i++) {
1248 cmp `4*$i`(%rbx),%ecx # examine counters
1249 cmovge $Tbl,@ptr[$i] # cancel input
1253 vmovdqa (%rbx),$sigma # pull counters
1256 vpcmpgtd $t1,$Xn,$Xn # mask value
1257 vpaddd $Xn,$sigma,$sigma # counters--
1259 vmovdqu 0x00-0x80($ctx),$t1
1261 vmovdqu 0x20-0x80($ctx),$t2
1263 vmovdqu 0x40-0x80($ctx),$t3
1265 vmovdqu 0x60-0x80($ctx),$Xi
1268 vmovdqu 0x80-0x80($ctx),$t1
1271 vmovdqu 0xa0-0x80($ctx),$t2
1274 vmovdqu 0xc0-0x80($ctx),$t3
1277 vmovdqu 0xe0-0x80($ctx),$Xi
1281 vmovdqu $A,0x00-0x80($ctx)
1283 vmovdqu $B,0x20-0x80($ctx)
1285 vmovdqu $C,0x40-0x80($ctx)
1286 vmovdqu $D,0x60-0x80($ctx)
1287 vmovdqu $E,0x80-0x80($ctx)
1288 vmovdqu $F,0xa0-0x80($ctx)
1289 vmovdqu $G,0xc0-0x80($ctx)
1290 vmovdqu $H,0xe0-0x80($ctx)
1292 vmovdqu $sigma,(%rbx) # save counters
1293 lea 256+128(%rsp),%rbx
1294 vmovdqu .Lpbswap(%rip),$Xn
1298 #mov `$REG_SZ*17+8`(%rsp),$num
1299 #lea $REG_SZ($ctx),$ctx
1300 #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
1302 #jnz .Loop_grande_avx2
1305 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
# Win64 epilogue: restore preserved xmm registers.
1309 $code.=<<___ if ($win64);
1310 movaps -0xd8(%rax),%xmm6
1311 movaps -0xc8(%rax),%xmm7
1312 movaps -0xb8(%rax),%xmm8
1313 movaps -0xa8(%rax),%xmm9
1314 movaps -0x98(%rax),%xmm10
1315 movaps -0x88(%rax),%xmm11
1316 movaps -0x78(%rax),%xmm12
1317 movaps -0x68(%rax),%xmm13
1318 movaps -0x58(%rax),%xmm14
1319 movaps -0x48(%rax),%xmm15
1335 .cfi_def_cfa_register %rsp
1339 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
# Constant data: the 64 SHA-256 round constants K (FIPS 180-4), emitted via
# &TABLE in the replicated per-lane layout, followed by the byte-swap
# shuffle mask (.Lpbswap) and a flat copy of K for the shaext path.
1354 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1355 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1356 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1357 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1358 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1359 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1360 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1361 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1362 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1363 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1364 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1365 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1366 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1367 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1368 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1369 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1372 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1373 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1375 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1376 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1377 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1378 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1379 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1380 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1381 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1382 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1383 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1384 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1385 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1386 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1387 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1388 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1389 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1390 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1391 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1395 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1396 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 SEH handler for the non-AVX2 entry points: if the fault lies between
# the .Lbody/.Lepilogue labels it recovers the saved stack pointer, restores
# the callee-saved registers and xmm state into the CONTEXT, then defers to
# RtlVirtualUnwind. NOTE(review): partial view — lines missing; verbatim.
1403 .extern __imp_RtlVirtualUnwind
1404 .type se_handler,\@abi-omnipotent
1418 mov 120($context),%rax # pull context->Rax
1419 mov 248($context),%rbx # pull context->Rip
1421 mov 8($disp),%rsi # disp->ImageBase
1422 mov 56($disp),%r11 # disp->HandlerData
1424 mov 0(%r11),%r10d # HandlerData[0]
1425 lea (%rsi,%r10),%r10 # end of prologue label
1426 cmp %r10,%rbx # context->Rip<.Lbody
1429 mov 152($context),%rax # pull context->Rsp
1431 mov 4(%r11),%r10d # HandlerData[1]
1432 lea (%rsi,%r10),%r10 # epilogue label
1433 cmp %r10,%rbx # context->Rip>=.Lepilogue
1436 mov `16*17`(%rax),%rax # pull saved stack pointer
1440 mov %rbx,144($context) # restore context->Rbx
1441 mov %rbp,160($context) # restore context->Rbp
1443 lea -24-10*16(%rax),%rsi
1444 lea 512($context),%rdi # &context.Xmm6
1446 .long 0xa548f3fc # cld; rep movsq
1451 mov %rax,152($context) # restore context->Rsp
1452 mov %rsi,168($context) # restore context->Rsi
1453 mov %rdi,176($context) # restore context->Rdi
1455 mov 40($disp),%rdi # disp->ContextRecord
1456 mov $context,%rsi # context
1457 mov \$154,%ecx # sizeof(CONTEXT)
1458 .long 0xa548f3fc # cld; rep movsq
1461 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1462 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1463 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1464 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1465 mov 40(%rsi),%r10 # disp->ContextRecord
1466 lea 56(%rsi),%r11 # &disp->HandlerData
1467 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1468 mov %r10,32(%rsp) # arg5
1469 mov %r11,40(%rsp) # arg6
1470 mov %r12,48(%rsp) # arg7
1471 mov %rcx,56(%rsp) # arg8, (NULL)
1472 call *__imp_RtlVirtualUnwind(%rip)
1474 mov \$1,%eax # ExceptionContinueSearch
1486 .size se_handler,.-se_handler
# avx2_handler: SEH handler for the AVX2 entry point — same shape as
# se_handler but with the wider frame (32*17 offset) and extra r12..r15
# restoration. Followed by the .pdata/.xdata SEH tables mapping each entry
# point to its handler data. NOTE(review): partial view; code verbatim.
1488 $code.=<<___ if ($avx>1);
1489 .type avx2_handler,\@abi-omnipotent
1503 mov 120($context),%rax # pull context->Rax
1504 mov 248($context),%rbx # pull context->Rip
1506 mov 8($disp),%rsi # disp->ImageBase
1507 mov 56($disp),%r11 # disp->HandlerData
1509 mov 0(%r11),%r10d # HandlerData[0]
1510 lea (%rsi,%r10),%r10 # end of prologue label
1511 cmp %r10,%rbx # context->Rip<body label
1514 mov 152($context),%rax # pull context->Rsp
1516 mov 4(%r11),%r10d # HandlerData[1]
1517 lea (%rsi,%r10),%r10 # epilogue label
1518 cmp %r10,%rbx # context->Rip>=epilogue label
1521 mov `32*17`($context),%rax # pull saved stack pointer
1529 mov %rbx,144($context) # restore context->Rbx
1530 mov %rbp,160($context) # restore context->Rbp
1531 mov %r12,216($context) # restore context->R12
1532 mov %r13,224($context) # restore context->R13
1533 mov %r14,232($context) # restore context->R14
1534 mov %r15,240($context) # restore context->R15
1536 lea -56-10*16(%rax),%rsi
1537 lea 512($context),%rdi # &context.Xmm6
1539 .long 0xa548f3fc # cld; rep movsq
1542 .size avx2_handler,.-avx2_handler
# .pdata entries: begin/end/handler-info triplets per entry point.
1547 .rva .LSEH_begin_sha256_multi_block
1548 .rva .LSEH_end_sha256_multi_block
1549 .rva .LSEH_info_sha256_multi_block
1550 .rva .LSEH_begin_sha256_multi_block_shaext
1551 .rva .LSEH_end_sha256_multi_block_shaext
1552 .rva .LSEH_info_sha256_multi_block_shaext
# AVX entry gets a table slot only when AVX code was emitted.
1554 $code.=<<___ if ($avx);
1555 .rva .LSEH_begin_sha256_multi_block_avx
1556 .rva .LSEH_end_sha256_multi_block_avx
1557 .rva .LSEH_info_sha256_multi_block_avx
# Likewise the AVX2 entry, only for $avx>1 builds.
1559 $code.=<<___ if ($avx>1);
1560 .rva .LSEH_begin_sha256_multi_block_avx2
1561 .rva .LSEH_end_sha256_multi_block_avx2
1562 .rva .LSEH_info_sha256_multi_block_avx2
1567 .LSEH_info_sha256_multi_block:
1570 .rva .Lbody,.Lepilogue # HandlerData[]
1571 .LSEH_info_sha256_multi_block_shaext:
1574 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
# Per-variant .xdata records referencing body/epilogue label pairs.
1576 $code.=<<___ if ($avx);
1577 .LSEH_info_sha256_multi_block_avx:
1580 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1582 $code.=<<___ if ($avx>1);
1583 .LSEH_info_sha256_multi_block_avx2:
1586 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1589 ####################################################################
# rex body (sub header not visible): computes and prepends a REX prefix
# byte to @opcode when either operand register index is >= 8.
1592 local *opcode=shift;
1596 $rex|=0x04 if ($dst>=8); # REX.R for the destination register
1597 $rex|=0x01 if ($src>=8); # REX.B for the source register
1598 unshift @opcode,$rex|0x40 if ($rex);
# Opcode map for the SHA instruction mnemonics that older assemblers do not
# understand; sha256op38 encodes them as raw .byte sequences.
1604 "sha256rnds2" => 0xcb,
1605 "sha256msg1" => 0xcc,
1606 "sha256msg2" => 0xcd );
# If the mnemonic is known and both operands are xmm registers, emit the
# hand-assembled 0F 38 <op> encoding; otherwise pass the line through.
1608 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1609 my @opcode=(0x0f,0x38);
1610 rex(\@opcode,$2,$1);
1611 push @opcode,$opcodelet{$instr};
1612 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1613 return ".byte\t".join(',',@opcode);
1615 return $instr."\t".@_[0];
# Final output pass: evaluate remaining `...` interpolations, rewrite SHA
# mnemonics via sha256op38, and narrow ymm operands to xmm for instructions
# that only exist in xmm form (the AVX2 code is written uniformly in ymm).
1619 foreach (split("\n",$code)) {
1620 s/\`([^\`]*)\`/eval($1)/ge;
1622 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
1624 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1625 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1626 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1627 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1628 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1629 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
# Flush the translator pipe; a failed close would hide truncated output.
1634 close STDOUT or die "error closing STDOUT: $!";