# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is the AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows the
# processor's resources to be utilized better, for better overall
# performance. SHA256 instruction sequences(*) are taken from
# sha512-x86_64.pl and AESNI code is woven into them. As SHA256
# dominates execution time, stitch performance does not depend on AES
# key length. Below are performance numbers in cycles per processed
# byte, less is better, for standalone AESNI-CBC encrypt, standalone
# SHA256, and the stitched AESNI-CBC+SHA256 subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	 2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# Ryzen(***)	 2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
# Goldmont(***)	 3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop because the estimated gain was not high enough
#	to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;
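#
# The stitching itself happens at generation time: body_00_15 below
# yields a list of single-instruction Perl snippets for one SHA256
# round, and one element of @aesni_cbc_block is spliced into each
# round. A minimal sketch of the idea (illustrative only; 'stitch' is
# a hypothetical helper, not used by this module):
#
#	sub stitch {
#	    my ($sha, $aes) = @_;		# refs to instruction lists
#	    my @out;
#	    while (@$sha || @$aes) {		# round-robin both streams
#		push @out, shift(@$sha) if @$sha;
#		push @out, shift(@$aes) if @$aes;
#	    }
#	    @out;				# interleaved schedule
#	}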
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
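
# Typical invocations look like (a sketch; the flavours are those
# understood by x86_64-xlate.pl):
#
#	perl aesni-sha256-x86_64.pl elf  aesni-sha256-x86_64.s
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm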
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
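
# Resulting runtime dispatch in $func below (a summary, not normative):
# SHA extensions present and $shaext set -> ${func}_shaext; AVX2, BMI1
# and BMI2 present and $avx>1 -> ${func}_avx2; XOP present ->
# ${func}_xop; AVX present -> ${func}_avx; otherwise the routine
# declines and the caller falls back to separate AES-CBC and SHA256
# implementations.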
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t len,
#			const AES_KEY *key,
#			unsigned char *ivp,
#			const SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
    ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
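
# The first six arguments arrive in registers; the 7th, $in0, is passed
# on the stack and is reloaded in each prologue, see the "load 7th
# parameter" mov from `($win64?56:8)`(%rsp) below.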
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
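
# Frame layout: 16*$SZ bytes of X[] scratch at the bottom, then seven
# saved qwords (inp, out, end, key, ivp, ctx, in0) followed by the
# saved %rsp, addressed through the $_inp..$_rsp strings above.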
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
.long	0,0,0,0,   0,0,0,0
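# Each K256 row above appears twice so that the two 128-bit lanes of a
# 256-bit (AVX2) load see the same constants; the last four rows are
# the byte-swap mask (doubled likewise) and the blend masks picked up
# as $mask14/$mask12/$mask10, indexed by the AES round count, to select
# the aesenclast result matching the actual key length.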
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
my @aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
);
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);	# write output
##	&lea		($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
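# So that, for example (a sketch of the expansion), &ror($a0,2) appends
# "\tror\t\$2,%r13d\n" to $code: the popped argument gets a '$' prefix
# when numeric and the operands are emitted in AT&T order.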
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
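
# The fragments above add up to one SHA256 round,
#	h += X[i] + K[i] + Ch(e,f,g) + Sigma1(e); d += h; h += Maj(a,b,c)
# with Ch(e,f,g) computed as ((f^g)&e)^g, Maj(a,b,c) as ((b^c)&(a^b))^b,
# Sigma0(a) folded in at the top of the following round, and one
# @aesni_cbc_block element spliced into every round.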
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	 &vpsrld	($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	 &vprotd	($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	 &vpsrld	($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	 &vpsrld	($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	 &vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	 &vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
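# Xupdate_256_AVX is the SHA256 message schedule, four elements at once:
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# with sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ (x>>3) and
#      sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ (x>>10); lacking vprotd,
# the rotates are composed from vpslld/vpsrld/vpsrlq pairs.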
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx,.-${func}_avx

######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to add $a1 to $a ($a+=$a1)
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13		# inp++, size optimization
	lea	(%rsi,%r13),%r12	# borrow $a0
	cmp	$len,%r13		# $_end
	cmove	%rsp,%r12		# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea	("%r13","16(%r13)");		# inp++
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
	vpextrq	\$1,$offload,%r12	# $_out, borrow $a4
	vmovq	$offload,%r13		# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)		# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13	# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
	vmovq	$offload,%r13		# borrow $a0
	vpextrq	\$1,$offload,%r15	# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)		# write output
	lea	16(%r13),%r13		# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12		# next block or stale data
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my ($n,$k)=($r/10,$r%10);
	movups	`16*$n`($in0),$in	# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]	# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
$r++;	unshift(@rndkey,pop(@rndkey));
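# $aesenc() hands out one step of the AES-CBC schedule per call: ten
# steps per 16-byte block, four blocks per 64-byte SHA256 block, paced
# by the ($r%10)==0 gating below; leftovers are flushed by the
# "while ($r<40)" loop after the last SHA256 round.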
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask
	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization
	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
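# sha256rnds2 performs two rounds per issue and takes the state as the
# ABEF/CDGH register pair, hence the pshufd/palignr/punpcklqdq dance
# above converting SHA256_CTX's DCBA/HGFE word order into that layout
# (and back again at the end).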
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
&$aesenc()		if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
&$aesenc()		if ($r==19);
	sha256rnds2	$CDGH,$ABEF
push(@MSG,shift(@MSG));
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	movups	$iv,48($out,$in0)	# write output
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE
	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
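# For example (a sketch), sha256op38("sha256rnds2","%xmm0,%xmm2")
# returns ".byte\t0x0f,0x38,0xcb,0xd0" (ModR/M 0xd0: reg=xmm2, rm=xmm0),
# so the output assembles even on pre-SHA-extension assemblers.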
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;