# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better, for
# standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# AESNI-CBC+SHA256 subroutine:
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	 2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# (*)	there are XOP, AVX1 and AVX2 code paths, which is why Westmere
#	is left out of the loop: the gain there was not estimated to be
#	high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
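# At this point $avx is 0 (no AVX), 1 (XOP/AVX1 code paths available)
# or 2 (AVX2 code path as well), depending on assembler capabilities.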
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
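# Frame layout: the bottom 16*$SZ bytes of the frame hold the 16-entry
# message schedule X[], the seven 8-byte slots above it cache the
# subroutine arguments, and the eighth slot, $_rsp, keeps the caller's
# %rsp for the epilogue and for stack unwinding (se_handler below pulls
# it back from 16*$SZ+7*8).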
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
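# K256 round constants; every 16-byte row is stated twice, so that a
# single 32-byte load in the AVX2 code path presents the same four
# constants to both 128-bit lanes.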
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()			# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
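# E.g. &ror($a0,2) resolves here and appends "\tror\t\$2,%r13d\n" to
# $code: the last argument is popped, prefixed with '$' if numeric, and
# emitted first, translating dst-first perlasm calls to AT&T syntax.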
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
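# body_00_15() yields one SHA256 round as a list of single-instruction
# snippets, with the next @aesni_cbc_block step (selected by
# $aesni_cbc_idx) spliced into the middle; the Xupdate loops below eval
# these snippets one by one between the SIMD instructions, which is
# where the AES/SHA stitching actually happens.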
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
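	# %r14 (10/12/14 AES rounds) indexes the 0/-1 rows that trail
	# K256, so exactly one of $mask14/$mask12/$mask10 is loaded as
	# all-ones and the other two as zero; the vpand/vpor steps in the
	# round fragments then keep only the vaesenclast result that
	# corresponds to the actual key length.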
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
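	# ("sub \$-128" rather than "add \$128": -128 fits in the
	# sign-extended 8-bit immediate form while +128 would take a
	# 32-bit immediate, hence the "size optimization" remarks)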
	vmovdqu	(%r12),$inout		# $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&mov	("%r12",$_inp);		# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);		# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");	# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
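# No round counter is needed: the byte checked by cmpb is the top byte
# of a K256 word, which is never zero, until $Tbl runs into the
# byte-swap mask (0x00010203,...) stored right after the constants,
# whose byte $SZ-1 is zero and thus ends the loop.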
	&vmovdqu	($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_xop,.-${func}_xop

######################################################################
local *ror = sub { &shrd(@_[0],@_) };
.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub Xupdate_256_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&mov	("%r12",$_inp);		# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);		# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");	# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx,.-${func}_avx

######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	# and at the finish one still has to add $a1 to $a ($a+=$a1)
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13		# inp++, size optimization
	lea	(%rsi,%r13),%r12	# borrow $a0
	cmp	$len,%r13		# $_end
	cmove	%rsp,%r12		# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";
	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq ("%r15",$offload,1);		# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r13)",$iv);		# write output
	&lea	("%r13","16(%r13)");		# inp++
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx2,.-${func}_avx2

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
my ($rounds,$Tbl)=("%r11d","%rbx");
my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
  my ($n,$k)=($r/10,$r%10);
	movups	`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]		# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
  $r++;	unshift(@rndkey,pop(@rndkey));
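# Each &$aesenc() call emits one step of AES-CBC: ($n,$k) map the step
# counter $r to input block $n and key-schedule slot $k, with four
# 16-byte blocks of ten scheduling slots per 64-round SHA256 pass (the
# key-length-dependent aesenc tail and aesenclast are folded into the
# later slots).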
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask
	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization
	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
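	# sha256rnds2 expects the state packed as ABEF/CDGH; the
	# pshufd/palignr/punpcklqdq above repack the DCBA/HGFE dwords
	# read from $ctx into that layout, and the tail of the function
	# reverses the shuffle before storing the state back.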
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
	&$aesenc()		if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	&$aesenc()		if ($r==19);
	sha256rnds2	$CDGH,$ABEF
	push(@MSG,shift(@MSG));
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	movups	$iv,48($out,$in0)	# write output
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE
	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6-15 save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
1745 "sha256rnds2" => 0xcb,
1746 "sha256msg1" => 0xcc,
1747 "sha256msg2" => 0xcd );
sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
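# E.g. "sha256rnds2 %xmm0,%xmm1" is emitted as ".byte 0x0f,0x38,0xcb,0xc8"
# (ModR/M 0xc8 = mod 11, reg 001 = destination %xmm1, r/m 000 = source
# %xmm0), so the module assembles even with toolchains that predate the
# SHA extensions.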
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;