# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows one to
# utilize processor resources better and achieve better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#                AES-128/-192/-256+SHA256   this(**)      gain
# Sandy Bridge   5.05/6.05/7.05+11.6        13.0          +28%/36%/43%
# Ivy Bridge     5.05/6.05/7.05+10.3        11.6          +32%/41%/50%
# Haswell        4.43/5.29/6.19+7.80        8.79          +39%/49%/59%
# Skylake        2.62/3.14/3.62+7.70        8.10          +27%/34%/40%
# Bulldozer      5.77/6.89/8.00+13.7        13.7          +42%/50%/58%
# Ryzen(***)     2.71/-/3.71+2.05           2.74/-/3.73   +74%/-/54%
# Goldmont(***)  3.82/-/5.35+4.16           4.73/-/5.94   +69%/-/60%
#
# (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#       from the loop because the estimated gain was not high enough
#       to justify the effort;
# (**)  these are EVP-free results; results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***) these are SHAEXT results;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22);
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.09) + ($1>=2.10);
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=12);
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
    $avx = ($2>=3.0) + ($2>3.0);
$shaext=$avx; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
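# Net result of the probes above: with $avx==0 the stitched paths are
# not emitted at all, $avx==1 enables the XOP/AVX1 code paths, and
# $avx==2 additionally enables AVX2; $shaext gates the SHA-extension
# path and, per the note above, can be zeroed when targeting 1.0.1.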
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
$func="aesni_cbc_sha256_enc";
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
    "%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#                           void *out,
#                           size_t length,
#                           const AES_KEY *key,
#                           unsigned char *iv,
#                           SHA256_CTX *ctx,
#                           const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
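# Frame layout implied by the offsets above (a sketch, not emitted code):
#
#     0(%rsp)         X[0..15] message schedule, 16*$SZ bytes
#     16*$SZ+0*8      saved $inp
#     16*$SZ+1*8      saved $out
#     16*$SZ+2*8      end of input ($_end)
#     16*$SZ+3*8      saved $key
#     16*$SZ+4*8      saved $ivp
#     16*$SZ+5*8      saved $ctx
#     16*$SZ+6*8      saved $in0
#     16*$SZ+7*8      saved original %rsp ($_rsp)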
.extern OPENSSL_ia32cap_P
.type $func,\@abi-omnipotent
lea OPENSSL_ia32cap_P(%rip),%r11
cmp \$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
bt \$61,%r10 # check for SHA
test \$`1<<11`,%r10d # check for XOP
$code.=<<___ if ($avx>1);
and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
cmp \$`1<<8|1<<5|1<<3`,%r11d
and \$`1<<28`,%r10d # check for AVX
cmp \$0,`$win64?"%rcx":"%rdi"`
.type $TABLE,\@object
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
.long 0,0,0,0, 0,0,0,0
.asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
$mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
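# The three masks implement key-length-agnostic CBC encryption: the
# block sequence below executes vaesenclast speculatively after rounds
# 10, 12 and 14, and $mask10/$mask12/$mask14 (picked from the 0/-1
# table above, indexed by the round count) blend the candidates so that
# only the result matching the actual schedule length lands in $iv.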
## &vmovdqu ($roundkey,"0x00-0x80($inp)");
## &vmovdqu ($inout,($inp));
## &mov ($_inp,$inp);
'&vpxor ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
'&vpxor ($inout,$inout,$iv);',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
'&vaesenclast ($temp,$inout,$roundkey);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
'&vpand ($iv,$temp,$mask10);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
'&vaesenclast ($temp,$inout,$roundkey);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
'&vpand ($temp,$temp,$mask12);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xe0-0x80($inp)");',
'&vpor ($iv,$iv,$temp);'.
' &vaesenclast ($temp,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
## &mov ($inp,$_inp);
## &mov ($out,$_out);
## &vpand ($temp,$temp,$mask14);
## &vpor ($iv,$iv,$temp);
## &vmovdqu ("($out,$inp)",$iv);
## &lea ($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
$arg = "\$$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
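# For example, &ror($a0,14) resolves through this AUTOLOAD and appends
# "\tror\t\$14,%r13d\n" to $code: the opcode comes from the sub's name,
# a numeric last argument gets a '$' prefix, and the arguments are
# emitted in reverse (AT&T) order.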
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
'&xor ($a4,$g)', # f^g
'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
'&and ($a4,$e)', # (f^g)&e
@aesni_cbc_block[$aesni_cbc_idx++].
'&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
'&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
'&xor ($a2,$b)', # a^b, b^c in next round
'&ror ($a0,$Sigma1[0])', # Sigma1(e)
'&add ($h,$a4)', # h+=Ch(e,f,g)
'&and ($a3,$a2)', # (b^c)&(a^b)
'&add ($h,$a0)', # h+=Sigma1(e)
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&add ($d,$h)', # d+=h
'&ror ($a1,$Sigma0[0])', # Sigma0(a)
'&add ($h,$a3)', # h+=Maj(a,b,c)
'&add ($a1,$h);'. # h+=Sigma0(a)
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
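# Net effect of one pass, in standard SHA-256 terms:
#
#     T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#     d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with the Sigma0(a) contribution accumulated in $a1 and folded in by
# the closing '&add ($a1,$h)' as the registers rotate.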
######################################################################
.type ${func}_xop,\@function,6
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
sub $inp,$out # re-bias
add $inp,$len # end of input
#mov $inp,$_inp # saved later
#mov $key,$_key # remains resident in $inp register
.cfi_cfa_expression $_rsp,deref,+8
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps %xmm7,`$framesz+16*1`(%rsp)
movaps %xmm8,`$framesz+16*2`(%rsp)
movaps %xmm9,`$framesz+16*3`(%rsp)
movaps %xmm10,`$framesz+16*4`(%rsp)
movaps %xmm11,`$framesz+16*5`(%rsp)
movaps %xmm12,`$framesz+16*6`(%rsp)
movaps %xmm13,`$framesz+16*7`(%rsp)
movaps %xmm14,`$framesz+16*8`(%rsp)
movaps %xmm15,`$framesz+16*9`(%rsp)
mov $inp,%r12 # borrow $a4
lea 0x80($key),$inp # size optimization, reassign
lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
mov $ctx,%r15 # borrow $a2
mov $in0,%rsi # borrow $a3
vmovdqu ($ivp),$iv # load IV
vmovdqa 0x00(%r13,%r14,8),$mask14
vmovdqa 0x10(%r13,%r14,8),$mask12
vmovdqa 0x20(%r13,%r14,8),$mask10
vmovdqu 0x00-0x80($inp),$roundkey
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00(%rsi,%r12),@X[0]
vmovdqu 0x10(%rsi,%r12),@X[1]
vmovdqu 0x20(%rsi,%r12),@X[2]
vmovdqu 0x30(%rsi,%r12),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x10(%rsp)
vmovdqa $t2,0x20(%rsp)
vmovdqa $t3,0x30(%rsp)
sub \$-16*2*$SZ,$Tbl # size optimization
vmovdqu (%r12),$inout # $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
&vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
&vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
&vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
&vpsrld ($t0,$t0,$sigma0[2]);
&vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
&vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
&vpxor ($t0,$t0,$t1);
&vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
&vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
&vpsrld ($t2,@X[3],$sigma1[2]);
&vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
&vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
&vpxor ($t3,$t3,$t2);
&vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
&vpsrldq ($t3,$t3,8);
&vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
&vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
&vpsrld ($t2,@X[0],$sigma1[2]);
&vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
&vpxor ($t3,$t3,$t2);
&vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
&vpslldq ($t3,$t3,8); # 22 instructions
&vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
for ($i=0,$j=0; $j<4; $j++) {
&XOP_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
&mov ("%r12",$_inp); # borrow $a4
&vpand ($temp,$temp,$mask14);
&mov ("%r15",$_out); # borrow $a2
&vpor ($iv,$iv,$temp);
&vmovdqu ("(%r15,%r12)",$iv); # write output
&lea ("%r12","16(%r12)"); # inp++
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lxop_00_47");
&vmovdqu ($inout,"(%r12)");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
mov $_inp,%r12 # borrow $a4
mov $_out,%r13 # borrow $a0
mov $_ctx,%r15 # borrow $a2
mov $_in0,%rsi # borrow $a3
vpand $mask14,$temp,$temp
vmovdqu $iv,(%r13,%r12) # write output
lea 16(%r12),%r12 # inp++
vmovdqu $iv,($ivp) # output IV
$code.=<<___ if ($win64);
movaps `$framesz+16*0`(%rsp),%xmm6
movaps `$framesz+16*1`(%rsp),%xmm7
movaps `$framesz+16*2`(%rsp),%xmm8
movaps `$framesz+16*3`(%rsp),%xmm9
movaps `$framesz+16*4`(%rsp),%xmm10
movaps `$framesz+16*5`(%rsp),%xmm11
movaps `$framesz+16*6`(%rsp),%xmm12
movaps `$framesz+16*7`(%rsp),%xmm13
movaps `$framesz+16*8`(%rsp),%xmm14
movaps `$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register %rsp
.size ${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };
.type ${func}_avx,\@function,6
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
sub $inp,$out # re-bias
add $inp,$len # end of input
#mov $inp,$_inp # saved later
#mov $key,$_key # remains resident in $inp register
.cfi_cfa_expression $_rsp,deref,+8
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps %xmm7,`$framesz+16*1`(%rsp)
movaps %xmm8,`$framesz+16*2`(%rsp)
movaps %xmm9,`$framesz+16*3`(%rsp)
movaps %xmm10,`$framesz+16*4`(%rsp)
movaps %xmm11,`$framesz+16*5`(%rsp)
movaps %xmm12,`$framesz+16*6`(%rsp)
movaps %xmm13,`$framesz+16*7`(%rsp)
movaps %xmm14,`$framesz+16*8`(%rsp)
movaps %xmm15,`$framesz+16*9`(%rsp)
mov $inp,%r12 # borrow $a4
lea 0x80($key),$inp # size optimization, reassign
lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
mov $ctx,%r15 # borrow $a2
mov $in0,%rsi # borrow $a3
vmovdqu ($ivp),$iv # load IV
vmovdqa 0x00(%r13,%r14,8),$mask14
vmovdqa 0x10(%r13,%r14,8),$mask12
vmovdqa 0x20(%r13,%r14,8),$mask10
vmovdqu 0x00-0x80($inp),$roundkey
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00(%rsi,%r12),@X[0]
vmovdqu 0x10(%rsi,%r12),@X[1]
vmovdqu 0x20(%rsi,%r12),@X[2]
vmovdqu 0x30(%rsi,%r12),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x10(%rsp)
vmovdqa $t2,0x20(%rsp)
vmovdqa $t3,0x30(%rsp)
sub \$-16*2*$SZ,$Tbl # size optimization
vmovdqu (%r12),$inout # $a4
sub Xupdate_256_AVX () {
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
'&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
'&vpsrld ($t2,$t0,$sigma0[0]);',
'&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
'&vpsrld ($t3,$t0,$sigma0[2])',
'&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpshufd ($t3,@X[3],0b11111010)', # X[14..15]
'&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrld ($t2,$t3,$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
'&vpsrlq ($t3,$t3,$sigma1[0]);',
'&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
'&vpshufd ($t2,$t2,0b10000100)',
'&vpsrldq ($t2,$t2,8)',
'&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
'&vpshufd ($t3,@X[0],0b01010000)', # X[16..17]
'&vpsrld ($t2,$t3,$sigma1[2])',
'&vpsrlq ($t3,$t3,$sigma1[0])',
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufd ($t2,$t2,0b11101000)',
'&vpslldq ($t2,$t2,8)',
'&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
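# The sequence above is the standard SHA-256 message schedule,
#
#     sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x)
#     sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
#     X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]),
#
# computed four elements at a time, with rotates synthesized from
# shift/xor pairs because AVX1 lacks a vector rotate (the XOP path
# above uses vprotd instead).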
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
foreach (Xupdate_256_AVX()) { # 29 instructions
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
for ($i=0,$j=0; $j<4; $j++) {
&AVX_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
&mov ("%r12",$_inp); # borrow $a4
&vpand ($temp,$temp,$mask14);
&mov ("%r15",$_out); # borrow $a2
&vpor ($iv,$iv,$temp);
&vmovdqu ("(%r15,%r12)",$iv); # write output
&lea ("%r12","16(%r12)"); # inp++
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lavx_00_47");
&vmovdqu ($inout,"(%r12)");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
mov $_inp,%r12 # borrow $a4
mov $_out,%r13 # borrow $a0
mov $_ctx,%r15 # borrow $a2
mov $_in0,%rsi # borrow $a3
vpand $mask14,$temp,$temp
vmovdqu $iv,(%r13,%r12) # write output
lea 16(%r12),%r12 # inp++
vmovdqu $iv,($ivp) # output IV
$code.=<<___ if ($win64);
movaps `$framesz+16*0`(%rsp),%xmm6
movaps `$framesz+16*1`(%rsp),%xmm7
movaps `$framesz+16*2`(%rsp),%xmm8
movaps `$framesz+16*3`(%rsp),%xmm9
movaps `$framesz+16*4`(%rsp),%xmm10
movaps `$framesz+16*5`(%rsp),%xmm11
movaps `$framesz+16*6`(%rsp),%xmm12
movaps `$framesz+16*7`(%rsp),%xmm13
movaps `$framesz+16*8`(%rsp),%xmm14
movaps `$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register %rsp
.size ${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
'&and ($a4,$e)', # f&e
'&rorx ($a0,$e,$Sigma1[2])',
'&rorx ($a2,$e,$Sigma1[1])',
'&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
'&lea ($h,"($h,$a4)")',
'&andn ($a4,$e,$g)', # ~e&g
'&rorx ($a1,$e,$Sigma1[0])',
'&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
'&xor ($a0,$a1)', # Sigma1(e)
'&rorx ($a4,$a,$Sigma0[2])',
'&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
'&xor ($a2,$b)', # a^b, b^c in next round
'&rorx ($a1,$a,$Sigma0[1])',
'&rorx ($a0,$a,$Sigma0[0])',
'&lea ($d,"($d,$h)")', # d+=h
'&and ($a3,$a2)', # (b^c)&(a^b)
@aesni_cbc_block[$aesni_cbc_idx++].
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&xor ($a1,$a0)', # Sigma0(a)
'&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
'&mov ($a4,$e)', # copy of f in future
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to do $a+=$a1
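# Unlike body_00_15 above, this BMI2 variant computes Ch(e,f,g) as
# (e&f)+(~e&g) -- 'andn' delivers ~e&g in one instruction -- and uses
# flag-preserving 'rorx' for the Sigma rotations, so the rounds
# interleave freely with the lea-based additions.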
.type ${func}_avx2,\@function,6
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
and \$-256*$SZ,%rsp # align stack frame
add \$`2*$SZ*($rounds-8)`,%rsp
sub $inp,$out # re-bias
add $inp,$len # end of input
#mov $inp,$_inp # saved later
#mov $out,$_out # kept in $offload
#mov $key,$_key # remains resident in $inp register
.cfi_cfa_expression $_rsp,deref,+8
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps %xmm7,`$framesz+16*1`(%rsp)
movaps %xmm8,`$framesz+16*2`(%rsp)
movaps %xmm9,`$framesz+16*3`(%rsp)
movaps %xmm10,`$framesz+16*4`(%rsp)
movaps %xmm11,`$framesz+16*5`(%rsp)
movaps %xmm12,`$framesz+16*6`(%rsp)
movaps %xmm13,`$framesz+16*7`(%rsp)
movaps %xmm14,`$framesz+16*8`(%rsp)
movaps %xmm15,`$framesz+16*9`(%rsp)
mov $inp,%r13 # borrow $a0
vpinsrq \$1,$out,$offload,$offload
lea 0x80($key),$inp # size optimization, reassign
lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
mov $ctx,%r15 # borrow $a2
mov $in0,%rsi # borrow $a3
vmovdqu ($ivp),$iv # load IV
vmovdqa 0x00(%r12,%r14,8),$mask14
vmovdqa 0x10(%r12,%r14,8),$mask12
vmovdqa 0x20(%r12,%r14,8),$mask10
sub \$-16*$SZ,%r13 # inp++, size optimization
lea (%rsi,%r13),%r12 # borrow $a0
cmp $len,%r13 # $_end
cmove %rsp,%r12 # next block or random data
vmovdqu 0x00-0x80($inp),$roundkey
if ($SZ==4) { # SHA256
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
vinserti128 \$1,(%r12),@X[0],@X[0]
vinserti128 \$1,16(%r12),@X[1],@X[1]
vpshufb $t3,@X[0],@X[0]
vinserti128 \$1,32(%r12),@X[2],@X[2]
vpshufb $t3,@X[1],@X[1]
vinserti128 \$1,48(%r12),@X[3],@X[3]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[2],@X[2]
lea -16*$SZ(%r13),%r13
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x20(%rsp)
lea -$PUSH8(%rsp),%rsp
vmovdqa $t2,0x00(%rsp)
vmovdqa $t3,0x20(%rsp)
sub \$-16*2*$SZ,$Tbl # size optimization
vmovdqu (%r13),$inout
vpinsrq \$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
my $base = "+2*$PUSH8(%rsp)";
&lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
foreach (Xupdate_256_AVX()) { # 29 instructions
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
for ($i=0,$j=0; $j<4; $j++) {
&AVX2_256_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
&vmovq ("%r13",$offload); # borrow $a0
&vpextrq ("%r15",$offload,1); # borrow $a2
&vpand ($temp,$temp,$mask14);
&vpor ($iv,$iv,$temp);
&vmovdqu ("(%r15,%r13)",$iv); # write output
&lea ("%r13","16(%r13)"); # inp++
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1)."($Tbl)",0);
&jne (".Lavx2_00_47");
&vmovdqu ($inout,"(%r13)");
&vpinsrq ($offload,$offload,"%r13",0);
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
vpextrq \$1,$offload,%r12 # $_out, borrow $a4
vmovq $offload,%r13 # $_inp, borrow $a0
mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
vpand $mask14,$temp,$temp
vmovdqu $iv,(%r12,%r13) # write output
cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
vmovdqu (%r13),$inout
vpinsrq \$0,%r13,$offload,$offload
for ($i=0; $i<16; ) {
my $base="+16($Tbl)";
foreach(bodyx_00_15()) { eval; }
&lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
vmovq $offload,%r13 # borrow $a0
vpextrq \$1,$offload,%r15 # borrow $a2
vpand $mask14,$temp,$temp
lea -$PUSH8($Tbl),$Tbl
vmovdqu $iv,(%r15,%r13) # write output
lea 16(%r13),%r13 # inp++
mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
lea 16*$SZ(%r13),%r13
mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
lea `2*$SZ*($rounds-8)`(%rsp),%rsp
lea (%rsi,%r13),%r12
cmove %rsp,%r12 # next block or stale data
vmovdqu $iv,($ivp) # output IV
$code.=<<___ if ($win64);
movaps `$framesz+16*0`(%rsp),%xmm6
movaps `$framesz+16*1`(%rsp),%xmm7
movaps `$framesz+16*2`(%rsp),%xmm8
movaps `$framesz+16*3`(%rsp),%xmm9
movaps `$framesz+16*4`(%rsp),%xmm10
movaps `$framesz+16*5`(%rsp),%xmm11
movaps `$framesz+16*6`(%rsp),%xmm12
movaps `$framesz+16*7`(%rsp),%xmm13
movaps `$framesz+16*8`(%rsp),%xmm14
movaps `$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register %rsp
.size ${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
my ($rounds,$Tbl)=("%r11d","%rbx");
my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
my ($n,$k)=($r/10,$r%10);
movups `16*$n`($in0),$in # load input
$code.=<<___ if ($n);
movups $iv,`16*($n-1)`($out,$in0) # write output
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
aesenclast $rndkey[0],$iv
movups 16-112($key),$rndkey[1] # forward reference
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
$r++; unshift(@rndkey,pop(@rndkey));
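# Each $aesenc() call doles out one step of the CBC pipeline: $r/10
# selects the 16-byte block and $r%10 the slot within it, so ten calls
# interleaved with the SHA rounds carry one block from load to store
# (the extra movups/aesenc ladders above are the key-length branches).
# A sketch of the $r==0 expansion, assuming a 10-round AES-128 schedule
# and with the conditional output store elided:
#
#     movups  0($in0),$in              # load input
#     xorps   $rndkey0,$in
#     xorps   $in,$iv                  # CBC-chain into the IV
#     movups  32-112($key),$rndkey[1]
#     aesenc  $rndkey[0],$iv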
.type ${func}_shaext,\@function,6
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
$code.=<<___ if ($win64);
lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movaps %xmm9,-8-7*16(%rax)
movaps %xmm10,-8-6*16(%rax)
movaps %xmm11,-8-5*16(%rax)
movaps %xmm12,-8-4*16(%rax)
movaps %xmm13,-8-3*16(%rax)
movaps %xmm14,-8-2*16(%rax)
movaps %xmm15,-8-1*16(%rax)
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
mov 240($key),$rounds
movups ($key),$rndkey0 # $key[0]
movups ($ivp),$iv # load IV
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
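# sha256rnds2 consumes the state as ABEF/CDGH, while SHA256_CTX stores
# it as two little-endian quads (DCBA/HGFE); the pshufd/palignr/
# punpcklqdq triple above performs that permutation, and the epilogue
# applies the inverse before writing the state back.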
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
movdqu 0x30($inp),@MSG[3]
movdqa 0*32-0x80($Tbl),$Wi
movdqa $CDGH,$CDGH_SAVE # offload
movdqa $ABEF,$ABEF_SAVE # offload
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
movdqa 1*32-0x80($Tbl),$Wi
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
movdqa 2*32-0x80($Tbl),$Wi
sha256msg1 @MSG[1],@MSG[0]
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[2],$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 3*32-0x80($Tbl),$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[3],$TMP
sha256rnds2 $CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
&$aesenc() if (($r%10)==0);
movdqa $i*32-0x80($Tbl),$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[0],$TMP
&$aesenc() if ($r==19);
sha256rnds2 $CDGH,$ABEF
push(@MSG,shift(@MSG));
movdqa 13*32-0x80($Tbl),$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[0],$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 14*32-0x80($Tbl),$Wi
sha256msg2 @MSG[1],@MSG[2]
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
movdqa 15*32-0x80($Tbl),$Wi
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
#pxor $CDGH,$rndkey0 # black magic
while ($r<40) { &$aesenc(); } # remaining aesenc's
#xorps $CDGH,$rndkey0 # black magic
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
movups $iv,48($out,$in0) # write output
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movups $iv,($ivp) # write IV
movdqu $CDGH,16($ctx)
$code.=<<___ if ($win64);
movaps 0*16(%rsp),%xmm6
movaps 1*16(%rsp),%xmm7
movaps 2*16(%rsp),%xmm8
movaps 3*16(%rsp),%xmm9
movaps 4*16(%rsp),%xmm10
movaps 5*16(%rsp),%xmm11
movaps 6*16(%rsp),%xmm12
movaps 7*16(%rsp),%xmm13
movaps 8*16(%rsp),%xmm14
movaps 9*16(%rsp),%xmm15
lea 8+10*16(%rsp),%rsp
.size ${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
$code.=<<___ if ($shaext);
lea aesni_cbc_sha256_enc_shaext(%rip),%r10
lea 512($context),%rdi # &context.Xmm6
.long 0xa548f3fc # cld; rep movsq
lea 168(%rax),%rax # adjust stack pointer
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
add \$`2*$SZ*($rounds-8)`,%rax
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
lea 16*$SZ+8*8(%rsi),%rsi # Xmm6-Xmm15 save area
lea 512($context),%rdi # &context.Xmm6
.long 0xa548f3fc # cld; rep movsq
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in qwords
.long 0xa548f3fc # cld; rep movsq
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
.size se_handler,.-se_handler
.rva .LSEH_begin_${func}_xop
.rva .LSEH_end_${func}_xop
.rva .LSEH_info_${func}_xop
.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_${func}_avx2
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
.rva .LSEH_begin_${func}_shaext
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
.rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
.LSEH_info_${func}_avx:
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
.rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
####################################################################
local *opcode=shift;
$rex|=0x04 if($dst>=8);
$rex|=0x01 if($src>=8);
unshift @opcode,$rex|0x40 if($rex);
"sha256rnds2" => 0xcb,
"sha256msg1" => 0xcc,
"sha256msg2" => 0xcd );
if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
my @opcode=(0x0f,0x38);
rex(\@opcode,$2,$1);
push @opcode,$opcodelet{$instr};
push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
return $instr."\t".@_[0];
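# For example, "sha256rnds2 %xmm0,%xmm2" comes out as
# ".byte 0x0f,0x38,0xcb,0xd0": both registers are below %xmm8 so no REX
# prefix is prepended, 0xcb is the opcodelet, and 0xd0 is the ModR/M
# byte 0xc0|(0&7)|((2&7)<<3) (%xmm0 doubles as the implicit operand).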
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;