# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, for better overall performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl
# and the AESNI code is woven into them. As SHA256 dominates execution
# time, stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
#		AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Bulldozer	5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
#	is omitted from the loop; this is because the gain was not
#	estimated to be high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
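#
# Mechanically, the stitch is driven from Perl: the AES-CBC sequence is
# sliced into one-step snippets (@aesni_cbc_block below) and the SHA256
# round generators splice one snippet into each round body via
# $aesni_cbc_idx. A minimal sketch of the idiom (names here are
# illustrative, not the actual variables):
#
#	my @aes = ('&vaesenc ...', '&vaesenc ...');	# AES slices
#	my $idx = 0;
#	sub round { return $aes[$idx++].'&ror ...'; }	# hash + AES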
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}
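
# $avx grades the code generator: 0 emits no AVX code at all, 1 enables
# the XOP/AVX1 paths, 2 additionally enables AVX2. The version gates
# above are the oldest assemblers known to accept the respective
# instruction sets (GNU as 2.19/2.22, NASM 2.09/2.10, MASM 10/11).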
open OUT,"| \"$^X\" $xlate $flavour $output";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
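
# The resulting frame, sketched for SHA256 ($SZ==4):
#
#	(%rsp)+0		16*$SZ bytes of X[i]+K[i] round scratch
#	(%rsp)+16*$SZ+0*8..6*8	inp, out, end, key, ivp, ctx, in0
#	(%rsp)+16*$SZ+7*8	original %rsp, for the SEH unwinder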
.extern	OPENSSL_ia32cap_P

.type	$func,\@abi-omnipotent
$code.=<<___ if ($avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%eax			# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d		# mask AVX+SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d

	cmp	\$0,`$win64?"%rcx":"%rdi"`

.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
.long	0,0,0,0,   0,0,0,0
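# Table layout: every K[i] row is stored twice so the AVX2 path can pull
# a 32-byte copy spanning both 128-bit lanes while XOP/AVX1 read just
# the low 16 bytes; the doubled row that follows is the big-endian
# byte-swap mask, and the trailing zero/all-ones qwords supply
# $mask10/$mask12/$mask14, fetched at an offset derived from the AES
# round count.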
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
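
# Each element of @aesni_cbc_block is a string of perlasm calls moving
# CBC encryption forward by one step; the SHA256 round bodies consume
# them one at a time through $aesni_cbc_idx, which is what interleaves
# the two algorithms at instruction granularity. Note the three
# speculative vaesenclast results: the vpand/vpor against
# $mask10/$mask12/$mask14 keep only the termination matching the actual
# key length, so one loop serves AES-128/-192/-256 alike.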
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
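
# For reference, each fragment list above computes one standard SHA-256
# round, with the rotate amounts staged as differences so each register
# is rotated in place:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	Sigma1(e) = (e>>>6) ^ (e>>>11) ^ (e>>>25)
#	Sigma0(a) = (a>>>2) ^ (a>>>13) ^ (a>>>22)
#
# The final swap of ($a2,$a3) and rotation of @ROT renames registers
# between rounds instead of moving data.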
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame

	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)

	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
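# $inp now points 0x80 bytes into the key schedule, so every round-key
# load above and in @aesni_cbc_block uses a one-byte displacement (the
# "size optimization"); the three masks are fetched at an offset scaled
# by the schedule's round count, leaving exactly one of them all-ones
# for the key length in use.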
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4

sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	 &vpsrld	($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	 &vpxor		($t3,$t3,$t2);
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	 &vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	 &vpsrld	($t2,@X[0],$sigma1[2]);
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	 &vpxor		($t3,$t3,$t2);
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	 &vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
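# Scheduling note: one XOP_256_00_47 call expands four scalar round
# bodies (~104 instructions) and drips the vector message-schedule ops
# into that stream a few at a time, keeping the SIMD and integer pipes
# busy together; the &vpaddd pre-adds the round constants so the scalar
# code only ever reads ready-made X[i]+K[i] words from the stack.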
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++

	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame

	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)

	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey

if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4

sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
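
# Xupdate_256_AVX expands the SHA-256 message schedule four words at a
# time:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#	sigma0(x) = (x>>>7) ^ (x>>>18) ^ (x>>3)
#	sigma1(x) = (x>>>17) ^ (x>>>19) ^ (x>>10)
#
# Lacking XOP's vprotd, each rotate is synthesized from two shifts and
# an xor, and sigma1 of the new words is taken in two passes (the
# vpshufd/vpsrldq/vpslldq dance) because X[16..17] only exist once the
# first half of the update is done.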
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++

	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx,.-${func}_avx

######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	# and at the finish one has to $a+=$a1
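
# In other words, bodyx_00_15 pipelines across rounds: rorx (BMI2)
# computes rotates without touching the flags, andn forms ~e&g
# directly, and the additions are expressed as lea so they can be
# reordered freely; Sigma0(a) is folded in one round late ("from the
# past"), which is why $a1 must start at zero and the caller owes a
# final $a+=$a1.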
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp

	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)

	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
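# Two 64-byte blocks are hashed per iteration, one per 128-bit lane of
# each %ymm register. When fewer than two blocks remain, %r12 is
# cmove'd to %rsp so the high lane loads harmless on-stack bytes whose
# digest is simply never used, avoiding both a branch and an
# out-of-bounds read.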
	vmovdqu	0x00-0x80($inp),$roundkey

if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization

	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
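# General-purpose registers are all spoken for, so $offload (%xmm15)
# doubles as a spill slot: qword 0 carries the input pointer and
# qword 1 the (re-biased) output delta, recovered below with
# vmovq/vpextrq.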
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea	("%r13","16(%r13)");		# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }

	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output

	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end

	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data

	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

	my ($n,$k)=($r/10,$r%10);

	movups	`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]		# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	$r++;	unshift(@rndkey,pop(@rndkey));
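# $aesenc() is a generator: each call emits the next slice of AES-CBC
# work and advances $r. ($n,$k)=($r/10,$r%10) name the 16-byte block
# being encrypted and the slice within a ten-call template; the $k==9
# tail above loads the extra round keys so the same template covers
# 10-, 12- and 14-round schedules, and @rndkey ping-pongs between
# %xmm4/%xmm5 so consecutive aesenc's don't wait on one key register.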
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)

	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
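# sha256rnds2 consumes the state packed as {ABEF,CDGH} rather than the
# little-endian {DCBA,HGFE} order of SHA256_CTX, hence this
# pshufd/palignr/punpcklqdq shuffle; the mirror-image sequence near the
# end of the function converts back before the state is stored.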
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF

for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	&$aesenc()	if ($r==19);
	sha256rnds2	$CDGH,$ABEF
	push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	movups	$iv,48($out,$in0)	# write output

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
.size	${func}_shaext,.-${func}_shaext

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$code.=<<___ if ($avx);
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax

	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler

	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($avx);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]

####################################################################
local *opcode=shift;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);

		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);
    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
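
# sha256op38() keeps the module buildable with assemblers that predate
# the SHA extensions: recognized mnemonics are emitted as raw bytes -
# an optional REX prefix from rex(), the 0x0f 0x38 escape, the opcode
# from %opcodelet and a register-register ModR/M byte - while anything
# unrecognized passes through verbatim.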
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;