aes/asm/aesni-x86[_64].pl update.
[openssl.git] / crypto / aes / asm / aesni-sha256-x86_64.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows processor
16 # resources to be utilized better and hence achieves better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
29 #
30 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
31 #       Westmere is left out; this is because the gain was not
32 #       estimated to be high enough to justify the effort;
33 # (**)  these are EVP-free results; results obtained with 'speed
34 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
35
# Command line is "<flavour> <output>".  If the first argument contains a
# dot it is taken to be the output file name and no flavour is assumed.
36 $flavour = shift;
37 $output  = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
# Win64 is selected by a nasm/masm/mingw64 flavour or an output name
# ending in .asm.
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
# Locate x86_64-xlate.pl either in this script's directory or in
# ../../perlasm relative to it; give up if neither file exists.
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
# Probe the assembler to decide how much SIMD code to emit.  $avx ends up
# as 0, 1 or 2 (sum of two version comparisons): non-zero enables the
# XOP/AVX paths below, a value above 1 additionally enables AVX2+BMI.
# Each later probe runs only if the previous ones left $avx unset/zero.
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49         $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59         $avx = ($1>=10) + ($1>=12);
60 }
61
# $2 is the version number here because $1 is the alternation group.
62 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
63         $avx = ($2>=3.0) + ($2>3.0);
64 }
65
# $shaext gates emission of the SHA-extension dispatch; it follows $avx by
# default.  If it is forced to zero, $avx is capped at 1 on the next line.
66 $shaext=$avx;   ### set to zero if compiling for 1.0.1
67 $avx=1          if (!$shaext && $avx);
68
# Pipe everything subsequently printed to STDOUT into x86_64-xlate.pl, run
# under the current perl interpreter ($^X) with the flavour and output
# arguments passed through.  Abort if the pipe cannot be started rather
# than silently generating nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
*STDOUT=*OUT;
71
# Entry-point name, label of the constant table, and SHA word size in
# bytes ($SZ==4 is what the "SHA256" sections below test for).
72 $func="aesni_cbc_sha256_enc";
73 $TABLE="K256";
74 $SZ=4;
# Working variables a..h live in these 32-bit registers; body_00_15
# rotates @ROT after each round instead of moving register contents.
75 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
76                                 "%r8d","%r9d","%r10d","%r11d");
# Scalar temporaries used by the round body.
77 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
# Rotate/shift amounts: @Sigma0/@Sigma1 are used by the round body,
# @sigma0/@sigma1 by the message-schedule update (third element of the
# lower-case pair is applied with a plain shift, vpsrld).
78 @Sigma0=( 2,13,22);
79 @Sigma1=( 6,11,25);
80 @sigma0=( 7,18, 3);
81 @sigma1=(17,19,10);
# $SZ*2*$rounds is used below as the byte offset just past the K256 table,
# whose rows are each emitted twice.
82 $rounds=64;
83
84 ########################################################################
85 # void aesni_cbc_sha256_enc(const void *inp,
86 #                       void *out,
87 #                       size_t length,
88 #                       const AES_KEY *key,
89 #                       unsigned char *iv,
90 #                       SHA256_CTX *ctx,
91 #                       const void *in0);
# Registers holding the arguments; $in0 (7th parameter) is loaded from
# the stack into %r10 at the top of each code path.
92 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
93 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
94
# Pointer into the constant table while rounds are being processed.
95 $Tbl="%rbp";
96
# Stack frame: the first 16*$SZ bytes at (%rsp) hold the X[i]+K[i] values
# read by the round body; eight 8-byte save slots follow.
97 $_inp="16*$SZ+0*8(%rsp)";
98 $_out="16*$SZ+1*8(%rsp)";
99 $_end="16*$SZ+2*8(%rsp)";
100 $_key="16*$SZ+3*8(%rsp)";
101 $_ivp="16*$SZ+4*8(%rsp)";
102 $_ctx="16*$SZ+5*8(%rsp)";
103 $_in0="16*$SZ+6*8(%rsp)";
104 $_rsp="16*$SZ+7*8(%rsp)";
105 $framesz=16*$SZ+8*8;
106
107 $code=<<___;
108 .text
109
110 .extern OPENSSL_ia32cap_P
111 .globl  $func
112 .type   $func,\@abi-omnipotent
113 .align  16
114 $func:
115 ___
116                                                 if ($avx) {
117 $code.=<<___;
118         lea     OPENSSL_ia32cap_P(%rip),%r11
119         mov     \$1,%eax
120         cmp     \$0,`$win64?"%rcx":"%rdi"`
121         je      .Lprobe
122         mov     0(%r11),%eax
123         mov     4(%r11),%r10
124 ___
125 $code.=<<___ if ($shaext);
126         bt      \$61,%r10                       # check for SHA
127         jc      ${func}_shaext
128 ___
129 $code.=<<___;
130         mov     %r10,%r11
131         shr     \$32,%r11
132
133         test    \$`1<<11`,%r10d                 # check for XOP
134         jnz     ${func}_xop
135 ___
136 $code.=<<___ if ($avx>1);
137         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
138         cmp     \$`1<<8|1<<5|1<<3`,%r11d
139         je      ${func}_avx2
140 ___
141 $code.=<<___;
142         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
143         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
144         or      %eax,%r10d
145         cmp     \$`1<<28|1<<9|1<<30`,%r10d
146         je      ${func}_avx
147         ud2
148 ___
149                                                 }
150 $code.=<<___;
151         xor     %eax,%eax
152         cmp     \$0,`$win64?"%rcx":"%rdi"`
153         je      .Lprobe
154         ud2
155 .Lprobe:
156         ret
157 .size   $func,.-$func
158
159 .align  64
160 .type   $TABLE,\@object
161 $TABLE:
162         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
163         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
164         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
165         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
166         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
167         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
168         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
169         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
170         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
171         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
172         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
173         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
174         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
175         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
176         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
177         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
178         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
179         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
180         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
181         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
183         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
184         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
185         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
186         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
187         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
188         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
189         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
190         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
191         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
192         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
193         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194
195         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
196         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
197         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
198         .long   0,0,0,0,   0,0,0,0
199         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
200 .align  64
201 ___
202
203 ######################################################################
204 # SIMD code paths
205 #
206 {{{
# SIMD registers reserved for the AES-CBC side of the stitch: %xmm8..%xmm15.
207 ($iv,$inout,$roundkey,$temp,
208  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
209
# @aesni_cbc_block holds 16 perlasm code fragments that together encrypt
# one CBC block.  body_00_15 prepends fragment number $aesni_cbc_idx++ to
# one instruction of every SHA round, so one AES block is spread over 16
# rounds; callers reset $aesni_cbc_idx to 0 before each group of 16.
# Three candidate last-round results are produced (after the round keys at
# 0xa0, 0xc0 and 0xe0) and merged into $iv under $mask10/$mask12/$mask14.
# NOTE(review): presumably exactly one mask is all-ones, picked from the
# key's round count when the masks are loaded in the prologue -- confirm.
# The "##" lines sketch the steps done by the surrounding code before the
# first and after the last fragment.
210 $aesni_cbc_idx=0;
211 @aesni_cbc_block = (
212 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
213 ##      &vmovdqu        ($inout,($inp));
214 ##      &mov            ($_inp,$inp);
215
216         '&vpxor         ($inout,$inout,$roundkey);'.
217         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
218
219         '&vpxor         ($inout,$inout,$iv);',
220
221         '&vaesenc       ($inout,$inout,$roundkey);'.
222         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
223
224         '&vaesenc       ($inout,$inout,$roundkey);'.
225         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
226
227         '&vaesenc       ($inout,$inout,$roundkey);'.
228         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
229
230         '&vaesenc       ($inout,$inout,$roundkey);'.
231         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
232
233         '&vaesenc       ($inout,$inout,$roundkey);'.
234         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
235
236         '&vaesenc       ($inout,$inout,$roundkey);'.
237         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
238
239         '&vaesenc       ($inout,$inout,$roundkey);'.
240         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
241
242         '&vaesenc       ($inout,$inout,$roundkey);'.
243         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
244
245         '&vaesenc       ($inout,$inout,$roundkey);'.
246         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
247
248         '&vaesenclast   ($temp,$inout,$roundkey);'.
249         ' &vaesenc      ($inout,$inout,$roundkey);'.
250         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
251
252         '&vpand         ($iv,$temp,$mask10);'.
253         ' &vaesenc      ($inout,$inout,$roundkey);'.
254         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
255
256         '&vaesenclast   ($temp,$inout,$roundkey);'.
257         ' &vaesenc      ($inout,$inout,$roundkey);'.
258         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
259
260         '&vpand         ($temp,$temp,$mask12);'.
261         ' &vaesenc      ($inout,$inout,$roundkey);'.
262          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
263
264         '&vpor          ($iv,$iv,$temp);'.
265         ' &vaesenclast  ($temp,$inout,$roundkey);'.
266         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
267
268 ##      &mov            ($inp,$_inp);
269 ##      &mov            ($out,$_out);
270 ##      &vpand          ($temp,$temp,$mask14);
271 ##      &vpor           ($iv,$iv,$temp);
272 ##      &vmovdqu        ("($out,$inp)",$iv);
273 ##      &lea            ($inp,"16($inp)");
274 );
275
# $a4 aliases $T1; $a..$h are (re)assigned from @ROT by the round body.
276 my $a4=$T1;
277 my ($a,$b,$c,$d,$e,$f,$g,$h);
278
sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
{
    # Any call to an undefined sub, e.g. &vpxor(dst,src1,src2), lands here.
    # The sub's own name (minus its package qualifier) is the mnemonic.
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;

    # perlasm calls list the destination first; the emitted line lists it
    # last, so the final argument leads and the rest follow reversed.
    # A purely numeric final argument is emitted as an immediate.
    my $lead = pop @_;
    $lead = '$'.$lead if ($lead*1 eq $lead);

    my @operands = ($lead, reverse @_);
    $code .= "\t".$mnemonic."\t".join(',', @operands)."\n";
}
285
# Return the scalar code for one SHA256 round as a list of 26 Perl code
# strings.  Nothing is emitted here: callers eval the strings one at a
# time so they can slot SIMD instructions in between.
#
# Evaluated when this sub is called: one fragment of @aesni_cbc_block is
# consumed ($aesni_cbc_idx++) and glued in front of one of the strings.
# Evaluated when the strings are eval'ed: $a..$h are refreshed from @ROT
# by the first string; the last string swaps $a2/$a3, rotates @ROT and
# increments the round counter $i.
#
# Calls are in perlasm "dst,src" order (see AUTOLOAD).  The sum
# h+Sigma0(a) is left in $a1 by the last instruction and copied into the
# new $a by the '&mov ($a,$a1)' of the following round.
# &ror resolves to whatever the current code path has installed for it.
286 sub body_00_15 () {
287         (
288         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
289
290         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
291         '&mov   ($a,$a1)',
292         '&mov   ($a4,$f)',
293
294         '&xor   ($a0,$e)',
295         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
296         '&xor   ($a4,$g)',                      # f^g
297
298         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
299         '&xor   ($a1,$a)',
300         '&and   ($a4,$e)',                      # (f^g)&e
301
302         @aesni_cbc_block[$aesni_cbc_idx++].
303         '&xor   ($a0,$e)',
304         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
305         '&mov   ($a2,$a)',
306
307         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
308         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
309         '&xor   ($a2,$b)',                      # a^b, b^c in next round
310
311         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
312         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
313         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
314
315         '&xor   ($a1,$a)',
316         '&add   ($h,$a0)',                      # h+=Sigma1(e)
317         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
318
319         '&add   ($d,$h)',                       # d+=h
320         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
321         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
322
323         '&mov   ($a0,$d)',
324         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
325         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
326         );
327 }
328
329 if ($avx) {{
330 ######################################################################
331 # XOP code path
332 #
333 $code.=<<___;
334 .type   ${func}_xop,\@function,6
335 .align  64
336 ${func}_xop:
337 .Lxop_shortcut:
338         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
339         push    %rbx
340         push    %rbp
341         push    %r12
342         push    %r13
343         push    %r14
344         push    %r15
345         mov     %rsp,%r11               # copy %rsp
346         sub     \$`$framesz+$win64*16*10`,%rsp
347         and     \$-64,%rsp              # align stack frame
348
349         shl     \$6,$len
350         sub     $inp,$out               # re-bias
351         sub     $inp,$in0
352         add     $inp,$len               # end of input
353
354         #mov    $inp,$_inp              # saved later
355         mov     $out,$_out
356         mov     $len,$_end
357         #mov    $key,$_key              # remains resident in $inp register
358         mov     $ivp,$_ivp
359         mov     $ctx,$_ctx
360         mov     $in0,$_in0
361         mov     %r11,$_rsp
362 ___
363 $code.=<<___ if ($win64);
364         movaps  %xmm6,`$framesz+16*0`(%rsp)
365         movaps  %xmm7,`$framesz+16*1`(%rsp)
366         movaps  %xmm8,`$framesz+16*2`(%rsp)
367         movaps  %xmm9,`$framesz+16*3`(%rsp)
368         movaps  %xmm10,`$framesz+16*4`(%rsp)
369         movaps  %xmm11,`$framesz+16*5`(%rsp)
370         movaps  %xmm12,`$framesz+16*6`(%rsp)
371         movaps  %xmm13,`$framesz+16*7`(%rsp)
372         movaps  %xmm14,`$framesz+16*8`(%rsp)
373         movaps  %xmm15,`$framesz+16*9`(%rsp)
374 ___
375 $code.=<<___;
376 .Lprologue_xop:
377         vzeroall
378
379         mov     $inp,%r12               # borrow $a4
380         lea     0x80($key),$inp         # size optimization, reassign
381         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
382         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
383         mov     $ctx,%r15               # borrow $a2
384         mov     $in0,%rsi               # borrow $a3
385         vmovdqu ($ivp),$iv              # load IV
386         sub     \$9,%r14
387
388         mov     $SZ*0(%r15),$A
389         mov     $SZ*1(%r15),$B
390         mov     $SZ*2(%r15),$C
391         mov     $SZ*3(%r15),$D
392         mov     $SZ*4(%r15),$E
393         mov     $SZ*5(%r15),$F
394         mov     $SZ*6(%r15),$G
395         mov     $SZ*7(%r15),$H
396
397         vmovdqa 0x00(%r13,%r14,8),$mask14
398         vmovdqa 0x10(%r13,%r14,8),$mask12
399         vmovdqa 0x20(%r13,%r14,8),$mask10
400         vmovdqu 0x00-0x80($inp),$roundkey
401         jmp     .Lloop_xop
402 ___
403                                         if ($SZ==4) {   # SHA256
404     my @X = map("%xmm$_",(0..3));
405     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
406
407 $code.=<<___;
408 .align  16
409 .Lloop_xop:
410         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
411         vmovdqu 0x00(%rsi,%r12),@X[0]
412         vmovdqu 0x10(%rsi,%r12),@X[1]
413         vmovdqu 0x20(%rsi,%r12),@X[2]
414         vmovdqu 0x30(%rsi,%r12),@X[3]
415         vpshufb $t3,@X[0],@X[0]
416         lea     $TABLE(%rip),$Tbl
417         vpshufb $t3,@X[1],@X[1]
418         vpshufb $t3,@X[2],@X[2]
419         vpaddd  0x00($Tbl),@X[0],$t0
420         vpshufb $t3,@X[3],@X[3]
421         vpaddd  0x20($Tbl),@X[1],$t1
422         vpaddd  0x40($Tbl),@X[2],$t2
423         vpaddd  0x60($Tbl),@X[3],$t3
424         vmovdqa $t0,0x00(%rsp)
425         mov     $A,$a1
426         vmovdqa $t1,0x10(%rsp)
427         mov     $B,$a3
428         vmovdqa $t2,0x20(%rsp)
429         xor     $C,$a3                  # magic
430         vmovdqa $t3,0x30(%rsp)
431         mov     $E,$a0
432         jmp     .Lxop_00_47
433
434 .align  16
435 .Lxop_00_47:
436         sub     \$-16*2*$SZ,$Tbl        # size optimization
437         vmovdqu (%r12),$inout           # $a4
438         mov     %r12,$_inp              # $a4
439 ___
# Emit four SHA256 rounds interleaved with the XOP message-schedule update
# that produces the next four schedule words in @X[0].
#   $j    - 0..3; selects the 16-byte stack slot (16*$j) written at the
#           end and the table row (16*2*$j) added to the new words
#   $body - code ref returning the per-round instruction strings
#           (called four times up front, 104 strings in total)
#   @X    - names of the four xmm registers holding the schedule window;
#           the caller rotates its own copy after each call
# Each SIMD instruction is followed by a fixed number of scalar round
# instructions taken from the front of @insns; whatever is left is emitted
# just before the final store.  The order of statements below is the
# instruction schedule and must not be rearranged.
440 sub XOP_256_00_47 () {
441 my $j = shift;
442 my $body = shift;
443 my @X = @_;
444 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
445
446         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
447           eval(shift(@insns));
448           eval(shift(@insns));
449          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
450           eval(shift(@insns));
451           eval(shift(@insns));
452         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
453           eval(shift(@insns));
454           eval(shift(@insns));
455         &vpsrld         ($t0,$t0,$sigma0[2]);
456           eval(shift(@insns));
457           eval(shift(@insns));
458          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
459           eval(shift(@insns));
460           eval(shift(@insns));
461           eval(shift(@insns));
462           eval(shift(@insns));
463         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
464           eval(shift(@insns));
465           eval(shift(@insns));
466         &vpxor          ($t0,$t0,$t1);
467           eval(shift(@insns));
468           eval(shift(@insns));
469           eval(shift(@insns));
470           eval(shift(@insns));
471          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
472           eval(shift(@insns));
473           eval(shift(@insns));
474         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
475           eval(shift(@insns));
476           eval(shift(@insns));
477          &vpsrld        ($t2,@X[3],$sigma1[2]);
478           eval(shift(@insns));
479           eval(shift(@insns));
480         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
481           eval(shift(@insns));
482           eval(shift(@insns));
483          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
484           eval(shift(@insns));
485           eval(shift(@insns));
486          &vpxor         ($t3,$t3,$t2);
487           eval(shift(@insns));
488           eval(shift(@insns));
489           eval(shift(@insns));
490           eval(shift(@insns));
491          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
492           eval(shift(@insns));
493           eval(shift(@insns));
494           eval(shift(@insns));
495           eval(shift(@insns));
496         &vpsrldq        ($t3,$t3,8);
497           eval(shift(@insns));
498           eval(shift(@insns));
499           eval(shift(@insns));
500           eval(shift(@insns));
501         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
502           eval(shift(@insns));
503           eval(shift(@insns));
504           eval(shift(@insns));
505           eval(shift(@insns));
506          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
507           eval(shift(@insns));
508           eval(shift(@insns));
509          &vpsrld        ($t2,@X[0],$sigma1[2]);
510           eval(shift(@insns));
511           eval(shift(@insns));
512          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
513           eval(shift(@insns));
514           eval(shift(@insns));
515          &vpxor         ($t3,$t3,$t2);
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519           eval(shift(@insns));
520          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
521           eval(shift(@insns));
522           eval(shift(@insns));
523           eval(shift(@insns));
524           eval(shift(@insns));
525         &vpslldq        ($t3,$t3,8);            # 22 instructions
526           eval(shift(@insns));
527           eval(shift(@insns));
528           eval(shift(@insns));
529           eval(shift(@insns));
530         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
531           eval(shift(@insns));
532           eval(shift(@insns));
533           eval(shift(@insns));
534           eval(shift(@insns));
535         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");  # new X[] + table constants
536           foreach (@insns) { eval; }            # remaining instructions
537         &vmovdqa        (16*$j."(%rsp)",$t2);   # stash for the rounds' "h+=X[i]+K[i]"
538 }
539
# Body of the .Lxop_00_47 loop: 16 rounds with schedule update (four calls
# of four rounds each), consuming all 16 AES fragments, i.e. one CBC block.
540     $aesni_cbc_idx=0;
541     for ($i=0,$j=0; $j<4; $j++) {
542         &XOP_256_00_47($j,\&body_00_15,@X);
543         push(@X,shift(@X));                     # rotate(@X)
544     }
# Finish the CBC block: merge the last candidate result under $mask14,
# store the ciphertext (which is also the next IV) and advance the input.
545         &mov            ("%r12",$_inp);         # borrow $a4
546         &vpand          ($temp,$temp,$mask14);
547         &mov            ("%r15",$_out);         # borrow $a2
548         &vpor           ($iv,$iv,$temp);
549         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
550         &lea            ("%r12","16(%r12)");    # inp++
551
# Repeat while the byte at offset $SZ-1 of the table entry one 16-round
# group ahead is non-zero.
# NOTE(review): this appears to rely on the data that follows the K256
# constants having a zero byte at that position -- confirm against the
# table layout before touching either.
552         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
553         &jne    (".Lxop_00_47");
554
# Load the next input block and remember its position for the final group.
555         &vmovdqu        ($inout,"(%r12)");
556         &mov            ($_inp,"%r12");
557
# Last 16 rounds: no schedule update, just the round bodies, again
# carrying one full set of AES fragments.  $i is advanced by the eval'ed
# strings themselves.
558     $aesni_cbc_idx=0;
559     for ($i=0; $i<16; ) {
560         foreach(body_00_15()) { eval; }
561     }
562                                         }
563 $code.=<<___;
564         mov     $_inp,%r12              # borrow $a4
565         mov     $_out,%r13              # borrow $a0
566         mov     $_ctx,%r15              # borrow $a2
567         mov     $_in0,%rsi              # borrow $a3
568
569         vpand   $mask14,$temp,$temp
570         mov     $a1,$A
571         vpor    $temp,$iv,$iv
572         vmovdqu $iv,(%r13,%r12)         # write output
573         lea     16(%r12),%r12           # inp++
574
575         add     $SZ*0(%r15),$A
576         add     $SZ*1(%r15),$B
577         add     $SZ*2(%r15),$C
578         add     $SZ*3(%r15),$D
579         add     $SZ*4(%r15),$E
580         add     $SZ*5(%r15),$F
581         add     $SZ*6(%r15),$G
582         add     $SZ*7(%r15),$H
583
584         cmp     $_end,%r12
585
586         mov     $A,$SZ*0(%r15)
587         mov     $B,$SZ*1(%r15)
588         mov     $C,$SZ*2(%r15)
589         mov     $D,$SZ*3(%r15)
590         mov     $E,$SZ*4(%r15)
591         mov     $F,$SZ*5(%r15)
592         mov     $G,$SZ*6(%r15)
593         mov     $H,$SZ*7(%r15)
594
595         jb      .Lloop_xop
596
597         mov     $_ivp,$ivp
598         mov     $_rsp,%rsi
599         vmovdqu $iv,($ivp)              # output IV
600         vzeroall
601 ___
602 $code.=<<___ if ($win64);
603         movaps  `$framesz+16*0`(%rsp),%xmm6
604         movaps  `$framesz+16*1`(%rsp),%xmm7
605         movaps  `$framesz+16*2`(%rsp),%xmm8
606         movaps  `$framesz+16*3`(%rsp),%xmm9
607         movaps  `$framesz+16*4`(%rsp),%xmm10
608         movaps  `$framesz+16*5`(%rsp),%xmm11
609         movaps  `$framesz+16*6`(%rsp),%xmm12
610         movaps  `$framesz+16*7`(%rsp),%xmm13
611         movaps  `$framesz+16*8`(%rsp),%xmm14
612         movaps  `$framesz+16*9`(%rsp),%xmm15
613 ___
614 $code.=<<___;
615         mov     (%rsi),%r15
616         mov     8(%rsi),%r14
617         mov     16(%rsi),%r13
618         mov     24(%rsi),%r12
619         mov     32(%rsi),%rbp
620         mov     40(%rsi),%rbx
621         lea     48(%rsi),%rsp
622 .Lepilogue_xop:
623         ret
624 .size   ${func}_xop,.-${func}_xop
625 ___
626 ######################################################################
627 # AVX+shrd code path
628 #
629 local *ror = sub { &shrd(@_[0],@_) };
630
631 $code.=<<___;
632 .type   ${func}_avx,\@function,6
633 .align  64
634 ${func}_avx:
635 .Lavx_shortcut:
636         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
637         push    %rbx
638         push    %rbp
639         push    %r12
640         push    %r13
641         push    %r14
642         push    %r15
643         mov     %rsp,%r11               # copy %rsp
644         sub     \$`$framesz+$win64*16*10`,%rsp
645         and     \$-64,%rsp              # align stack frame
646
647         shl     \$6,$len
648         sub     $inp,$out               # re-bias
649         sub     $inp,$in0
650         add     $inp,$len               # end of input
651
652         #mov    $inp,$_inp              # saved later
653         mov     $out,$_out
654         mov     $len,$_end
655         #mov    $key,$_key              # remains resident in $inp register
656         mov     $ivp,$_ivp
657         mov     $ctx,$_ctx
658         mov     $in0,$_in0
659         mov     %r11,$_rsp
660 ___
661 $code.=<<___ if ($win64);
662         movaps  %xmm6,`$framesz+16*0`(%rsp)
663         movaps  %xmm7,`$framesz+16*1`(%rsp)
664         movaps  %xmm8,`$framesz+16*2`(%rsp)
665         movaps  %xmm9,`$framesz+16*3`(%rsp)
666         movaps  %xmm10,`$framesz+16*4`(%rsp)
667         movaps  %xmm11,`$framesz+16*5`(%rsp)
668         movaps  %xmm12,`$framesz+16*6`(%rsp)
669         movaps  %xmm13,`$framesz+16*7`(%rsp)
670         movaps  %xmm14,`$framesz+16*8`(%rsp)
671         movaps  %xmm15,`$framesz+16*9`(%rsp)
672 ___
673 $code.=<<___;
674 .Lprologue_avx:
675         vzeroall
676
677         mov     $inp,%r12               # borrow $a4
678         lea     0x80($key),$inp         # size optimization, reassign
679         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
680         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
681         mov     $ctx,%r15               # borrow $a2
682         mov     $in0,%rsi               # borrow $a3
683         vmovdqu ($ivp),$iv              # load IV
684         sub     \$9,%r14
685
686         mov     $SZ*0(%r15),$A
687         mov     $SZ*1(%r15),$B
688         mov     $SZ*2(%r15),$C
689         mov     $SZ*3(%r15),$D
690         mov     $SZ*4(%r15),$E
691         mov     $SZ*5(%r15),$F
692         mov     $SZ*6(%r15),$G
693         mov     $SZ*7(%r15),$H
694
695         vmovdqa 0x00(%r13,%r14,8),$mask14
696         vmovdqa 0x10(%r13,%r14,8),$mask12
697         vmovdqa 0x20(%r13,%r14,8),$mask10
698         vmovdqu 0x00-0x80($inp),$roundkey
699 ___
700                                         if ($SZ==4) {   # SHA256
701     my @X = map("%xmm$_",(0..3));
702     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
703
704 $code.=<<___;
705         jmp     .Lloop_avx
706 .align  16
707 .Lloop_avx:
708         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
709         vmovdqu 0x00(%rsi,%r12),@X[0]
710         vmovdqu 0x10(%rsi,%r12),@X[1]
711         vmovdqu 0x20(%rsi,%r12),@X[2]
712         vmovdqu 0x30(%rsi,%r12),@X[3]
713         vpshufb $t3,@X[0],@X[0]
714         lea     $TABLE(%rip),$Tbl
715         vpshufb $t3,@X[1],@X[1]
716         vpshufb $t3,@X[2],@X[2]
717         vpaddd  0x00($Tbl),@X[0],$t0
718         vpshufb $t3,@X[3],@X[3]
719         vpaddd  0x20($Tbl),@X[1],$t1
720         vpaddd  0x40($Tbl),@X[2],$t2
721         vpaddd  0x60($Tbl),@X[3],$t3
722         vmovdqa $t0,0x00(%rsp)
723         mov     $A,$a1
724         vmovdqa $t1,0x10(%rsp)
725         mov     $B,$a3
726         vmovdqa $t2,0x20(%rsp)
727         xor     $C,$a3                  # magic
728         vmovdqa $t3,0x30(%rsp)
729         mov     $E,$a0
730         jmp     .Lavx_00_47
731
732 .align  16
733 .Lavx_00_47:
734         sub     \$-16*2*$SZ,$Tbl        # size optimization
735         vmovdqu (%r12),$inout           # $a4
736         mov     %r12,$_inp              # $a4
737 ___
# Return the AVX message-schedule update for four new words as a list of
# 31 Perl code strings, one SIMD instruction each.  The strings refer to
# @X and $t0..$t3 by name and are meant to be eval'ed by AVX_256_00_47,
# which interleaves them with scalar round instructions; nothing is
# emitted by this sub itself.  The result accumulates in @X[0].
738 sub Xupdate_256_AVX () {
739         (
740         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
741          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
742         '&vpsrld        ($t2,$t0,$sigma0[0]);',
743          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
744         '&vpsrld        ($t3,$t0,$sigma0[2])',
745         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
746         '&vpxor         ($t0,$t3,$t2)',
747          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
748         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
749         '&vpxor         ($t0,$t0,$t1)',
750         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
751         '&vpxor         ($t0,$t0,$t2)',
752          '&vpsrld       ($t2,$t3,$sigma1[2]);',
753         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
754          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
755         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
756          '&vpxor        ($t2,$t2,$t3);',
757          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
758          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
759          '&vpshufd      ($t2,$t2,0b10000100)',
760          '&vpsrldq      ($t2,$t2,8)',
761         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
762          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
763          '&vpsrld       ($t2,$t3,$sigma1[2])',
764          '&vpsrlq       ($t3,$t3,$sigma1[0])',
765          '&vpxor        ($t2,$t2,$t3);',
766          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
767          '&vpxor        ($t2,$t2,$t3)',
768          '&vpshufd      ($t2,$t2,0b11101000)',
769          '&vpslldq      ($t2,$t2,8)',
770         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
771         );
772 }
773
# Emit four SHA256 rounds interleaved with the AVX message-schedule update.
#   $j    - 0..3; selects the stack slot (16*$j) and table row (16*2*$j)
#   $body - code ref returning the per-round instruction strings
#   @X    - names of the four xmm registers holding the schedule window
# The Xupdate strings are eval'ed here, inside this sub, so that the @X
# they mention is this sub's local copy.  Every SIMD instruction is
# followed by three scalar round instructions; the scalar instructions
# left over are emitted before the final store.
774 sub AVX_256_00_47 () {
775 my $j = shift;
776 my $body = shift;
777 my @X = @_;
778 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
779
780         foreach (Xupdate_256_AVX()) {           # 31 instructions
781             eval;
782             eval(shift(@insns));
783             eval(shift(@insns));
784             eval(shift(@insns));
785         }
786         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");  # new X[] + table constants
787           foreach (@insns) { eval; }            # remaining instructions
788         &vmovdqa        (16*$j."(%rsp)",$t2);   # stash for the rounds' "h+=X[i]+K[i]"
789 }
790
# Body of the .Lavx_00_47 loop: 16 rounds with schedule update (four calls
# of four rounds each), consuming all 16 AES fragments, i.e. one CBC block.
791     $aesni_cbc_idx=0;
792     for ($i=0,$j=0; $j<4; $j++) {
793         &AVX_256_00_47($j,\&body_00_15,@X);
794         push(@X,shift(@X));                     # rotate(@X)
795     }
# Finish the CBC block: merge the last candidate result under $mask14,
# store the ciphertext (which is also the next IV) and advance the input.
796         &mov            ("%r12",$_inp);         # borrow $a4
797         &vpand          ($temp,$temp,$mask14);
798         &mov            ("%r15",$_out);         # borrow $a2
799         &vpor           ($iv,$iv,$temp);
800         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
801         &lea            ("%r12","16(%r12)");    # inp++
802
# Repeat while the byte at offset $SZ-1 of the table entry one 16-round
# group ahead is non-zero.
# NOTE(review): this appears to rely on the data that follows the K256
# constants having a zero byte at that position -- confirm against the
# table layout before touching either.
803         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
804         &jne    (".Lavx_00_47");
805
# Load the next input block and remember its position for the final group.
806         &vmovdqu        ($inout,"(%r12)");
807         &mov            ($_inp,"%r12");
808
# Last 16 rounds: no schedule update, just the round bodies, again
# carrying one full set of AES fragments.  $i is advanced by the eval'ed
# strings themselves.
809     $aesni_cbc_idx=0;
810     for ($i=0; $i<16; ) {
811         foreach(body_00_15()) { eval; }
812     }
813
814                                         }
815 $code.=<<___;
816         mov     $_inp,%r12              # borrow $a4
817         mov     $_out,%r13              # borrow $a0
818         mov     $_ctx,%r15              # borrow $a2
819         mov     $_in0,%rsi              # borrow $a3
820
821         vpand   $mask14,$temp,$temp
822         mov     $a1,$A
823         vpor    $temp,$iv,$iv
824         vmovdqu $iv,(%r13,%r12)         # write output
825         lea     16(%r12),%r12           # inp++
826
827         add     $SZ*0(%r15),$A
828         add     $SZ*1(%r15),$B
829         add     $SZ*2(%r15),$C
830         add     $SZ*3(%r15),$D
831         add     $SZ*4(%r15),$E
832         add     $SZ*5(%r15),$F
833         add     $SZ*6(%r15),$G
834         add     $SZ*7(%r15),$H
835
836         cmp     $_end,%r12
837
838         mov     $A,$SZ*0(%r15)
839         mov     $B,$SZ*1(%r15)
840         mov     $C,$SZ*2(%r15)
841         mov     $D,$SZ*3(%r15)
842         mov     $E,$SZ*4(%r15)
843         mov     $F,$SZ*5(%r15)
844         mov     $G,$SZ*6(%r15)
845         mov     $H,$SZ*7(%r15)
846         jb      .Lloop_avx
847
848         mov     $_ivp,$ivp
849         mov     $_rsp,%rsi
850         vmovdqu $iv,($ivp)              # output IV
851         vzeroall
852 ___
853 $code.=<<___ if ($win64);
854         movaps  `$framesz+16*0`(%rsp),%xmm6
855         movaps  `$framesz+16*1`(%rsp),%xmm7
856         movaps  `$framesz+16*2`(%rsp),%xmm8
857         movaps  `$framesz+16*3`(%rsp),%xmm9
858         movaps  `$framesz+16*4`(%rsp),%xmm10
859         movaps  `$framesz+16*5`(%rsp),%xmm11
860         movaps  `$framesz+16*6`(%rsp),%xmm12
861         movaps  `$framesz+16*7`(%rsp),%xmm13
862         movaps  `$framesz+16*8`(%rsp),%xmm14
863         movaps  `$framesz+16*9`(%rsp),%xmm15
864 ___
865 $code.=<<___;
866         mov     (%rsi),%r15
867         mov     8(%rsi),%r14
868         mov     16(%rsi),%r13
869         mov     24(%rsi),%r12
870         mov     32(%rsi),%rbp
871         mov     40(%rsi),%rbx
872         lea     48(%rsi),%rsp
873 .Lepilogue_avx:
874         ret
875 .size   ${func}_avx,.-${func}_avx
876 ___
877
878 if ($avx>1) {{
879 ######################################################################
880 # AVX2+BMI code path
881 #
882 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
883 my $PUSH8=8*2*$SZ;
884 use integer;
885
# Return the list of 24 perlasm snippet strings that make up one round of
# the AVX2+BMI path (rorx/andn based).  Nothing is emitted here: callers
# string-eval each snippet, and the snippets then read @ROT, $i, $base,
# $PUSH8, $SZ, @Sigma0/@Sigma1 and the temporaries $a0..$a4 from the
# evaluating scope.
# One entry of @aesni_cbc_block is spliced into the list per call; note that
# $aesni_cbc_idx advances when this list is built, not when it is eval'ed.
# NOTE(review): the $i/(16/$SZ) arithmetic presumably relies on the
# "use integer" of the enclosing block being in effect where the snippets
# are eval'ed - confirm.
886 sub bodyx_00_15 () {
887         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
888         (
# First snippet: unpack the current rotation of the working variables, then
# add the X[i]+K[i] value read from the slot addressed through $base.
889         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
890
891         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
892         '&and   ($a4,$e)',              # f&e
893         '&rorx  ($a0,$e,$Sigma1[2])',
894         '&rorx  ($a2,$e,$Sigma1[1])',
895
896         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
897         '&lea   ($h,"($h,$a4)")',
898         '&andn  ($a4,$e,$g)',           # ~e&g
899         '&xor   ($a0,$a2)',
900
901         '&rorx  ($a1,$e,$Sigma1[0])',
902         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
903         '&xor   ($a0,$a1)',             # Sigma1(e)
904         '&mov   ($a2,$a)',
905
906         '&rorx  ($a4,$a,$Sigma0[2])',
907         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
908         '&xor   ($a2,$b)',              # a^b, b^c in next round
909         '&rorx  ($a1,$a,$Sigma0[1])',
910
911         '&rorx  ($a0,$a,$Sigma0[0])',
912         '&lea   ($d,"($d,$h)")',        # d+=h
913         '&and   ($a3,$a2)',             # (b^c)&(a^b)
# The next @aesni_cbc_block entry is prepended to this snippet, so it is
# evaluated together with the xor below.
914         @aesni_cbc_block[$aesni_cbc_idx++].
915         '&xor   ($a1,$a4)',
916
917         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
918         '&xor   ($a1,$a0)',             # Sigma0(a)
919         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
920         '&mov   ($a4,$e)',              # copy of f in future
921
# Last snippet does the bookkeeping: swap $a2/$a3, rotate @ROT by one and
# advance the round counter $i.
922         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
923         );
924         # and at the finish one has to $a+=$a1
925 }
926
927 $code.=<<___;
928 .type   ${func}_avx2,\@function,6
929 .align  64
930 ${func}_avx2:
931 .Lavx2_shortcut:
932         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
933         push    %rbx
934         push    %rbp
935         push    %r12
936         push    %r13
937         push    %r14
938         push    %r15
939         mov     %rsp,%r11               # copy %rsp
940         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
941         and     \$-256*$SZ,%rsp         # align stack frame
942         add     \$`2*$SZ*($rounds-8)`,%rsp
943
944         shl     \$6,$len
945         sub     $inp,$out               # re-bias
946         sub     $inp,$in0
947         add     $inp,$len               # end of input
948
949         #mov    $inp,$_inp              # saved later
950         #mov    $out,$_out              # kept in $offload
951         mov     $len,$_end
952         #mov    $key,$_key              # remains resident in $inp register
953         mov     $ivp,$_ivp
954         mov     $ctx,$_ctx
955         mov     $in0,$_in0
956         mov     %r11,$_rsp
957 ___
958 $code.=<<___ if ($win64);
959         movaps  %xmm6,`$framesz+16*0`(%rsp)
960         movaps  %xmm7,`$framesz+16*1`(%rsp)
961         movaps  %xmm8,`$framesz+16*2`(%rsp)
962         movaps  %xmm9,`$framesz+16*3`(%rsp)
963         movaps  %xmm10,`$framesz+16*4`(%rsp)
964         movaps  %xmm11,`$framesz+16*5`(%rsp)
965         movaps  %xmm12,`$framesz+16*6`(%rsp)
966         movaps  %xmm13,`$framesz+16*7`(%rsp)
967         movaps  %xmm14,`$framesz+16*8`(%rsp)
968         movaps  %xmm15,`$framesz+16*9`(%rsp)
969 ___
970 $code.=<<___;
971 .Lprologue_avx2:
972         vzeroall
973
974         mov     $inp,%r13               # borrow $a0
975         vpinsrq \$1,$out,$offload,$offload
976         lea     0x80($key),$inp         # size optimization, reassign
977         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
978         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
979         mov     $ctx,%r15               # borrow $a2
980         mov     $in0,%rsi               # borrow $a3
981         vmovdqu ($ivp),$iv              # load IV
982         lea     -9(%r14),%r14
983
984         vmovdqa 0x00(%r12,%r14,8),$mask14
985         vmovdqa 0x10(%r12,%r14,8),$mask12
986         vmovdqa 0x20(%r12,%r14,8),$mask10
987
988         sub     \$-16*$SZ,%r13          # inp++, size optimization
989         mov     $SZ*0(%r15),$A
990         lea     (%rsi,%r13),%r12        # borrow $a0
991         mov     $SZ*1(%r15),$B
992         cmp     $len,%r13               # $_end
993         mov     $SZ*2(%r15),$C
994         cmove   %rsp,%r12               # next block or random data
995         mov     $SZ*3(%r15),$D
996         mov     $SZ*4(%r15),$E
997         mov     $SZ*5(%r15),$F
998         mov     $SZ*6(%r15),$G
999         mov     $SZ*7(%r15),$H
1000         vmovdqu 0x00-0x80($inp),$roundkey
1001 ___
1002                                         if ($SZ==4) {   # SHA256
1003     my @X = map("%ymm$_",(0..3));
1004     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1005
1006 $code.=<<___;
1007         jmp     .Loop_avx2
1008 .align  16
1009 .Loop_avx2:
1010         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1011         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1012         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1013         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1014         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1015
1016         vinserti128     \$1,(%r12),@X[0],@X[0]
1017         vinserti128     \$1,16(%r12),@X[1],@X[1]
1018          vpshufb        $t3,@X[0],@X[0]
1019         vinserti128     \$1,32(%r12),@X[2],@X[2]
1020          vpshufb        $t3,@X[1],@X[1]
1021         vinserti128     \$1,48(%r12),@X[3],@X[3]
1022
1023         lea     $TABLE(%rip),$Tbl
1024         vpshufb $t3,@X[2],@X[2]
1025         lea     -16*$SZ(%r13),%r13
1026         vpaddd  0x00($Tbl),@X[0],$t0
1027         vpshufb $t3,@X[3],@X[3]
1028         vpaddd  0x20($Tbl),@X[1],$t1
1029         vpaddd  0x40($Tbl),@X[2],$t2
1030         vpaddd  0x60($Tbl),@X[3],$t3
1031         vmovdqa $t0,0x00(%rsp)
1032         xor     $a1,$a1
1033         vmovdqa $t1,0x20(%rsp)
1034         lea     -$PUSH8(%rsp),%rsp
1035         mov     $B,$a3
1036         vmovdqa $t2,0x00(%rsp)
1037         xor     $C,$a3                  # magic
1038         vmovdqa $t3,0x20(%rsp)
1039         mov     $F,$a4
1040         sub     \$-16*2*$SZ,$Tbl        # size optimization
1041         jmp     .Lavx2_00_47
1042
1043 .align  16
1044 .Lavx2_00_47:
1045         vmovdqu (%r13),$inout
1046         vpinsrq \$0,%r13,$offload,$offload
1047 ___
1048
# AVX2 counterpart of the slice generator: interleaves the
# Xupdate_256_AVX() schedule snippets with the round snippets returned by
# four calls of $body.
#   $j    - slice index (0..3 at the call site); selects the table operand
#           16*2*$j($Tbl) and the stack slot (32*$j)%$PUSH8(%rsp)
#   $body - code ref returning a list of snippet strings (bodyx_00_15 at the
#           call site)
#   @X    - register names in their current rotation
# Snippets are string-eval'ed inside this sub, so they pick up the lexicals
# declared here (@X, $base).
# NOTE(review): the empty prototype is not enforced because the caller uses
# the &name(...) call form.
1049 sub AVX2_256_00_47 () {
1050 my $j = shift;
1051 my $body = shift;
1052 my @X = @_;
1053 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
# Address suffix consumed by the bodyx_00_15 snippets when they are eval'ed
# in this scope.
1054 my $base = "+2*$PUSH8(%rsp)";
1055
# On even slices only, move %rsp down by $PUSH8 bytes.
1056         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
# Interleave: one schedule-update snippet, then three round snippets.
1057         foreach (Xupdate_256_AVX()) {           # 29 instructions
1058             eval;
1059             eval(shift(@insns));
1060             eval(shift(@insns));
1061             eval(shift(@insns));
1062         }
1063         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1064           foreach (@insns) { eval; }            # remaining instructions
1065         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1066 }
# Body of the .Lavx2_00_47 loop followed by the last 16 rounds.
# Restart the walk through @aesni_cbc_block, then generate four slices,
# rotating @X between them.
1067     $aesni_cbc_idx=0;
1068     for ($i=0,$j=0; $j<4; $j++) {
1069         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1070         push(@X,shift(@X));                     # rotate(@X)
1071     }
# Recover the two values kept in $offload, merge $temp (masked with
# $mask14) into $iv and store it.
1072         &vmovq          ("%r13",$offload);      # borrow $a0
1073         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1074         &vpand          ($temp,$temp,$mask14);
1075         &vpor           ($iv,$iv,$temp);
1076         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1077         &lea            ("%r13","16(%r13)");    # inp++
1078
# Advance $Tbl and loop back while the tested table byte is non-zero
# (presumably a terminator check on the constant table - confirm against the
# table definition outside this view).
1079         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1080         &cmpb   (($SZ-1)."($Tbl)",0);
1081         &jne    (".Lavx2_00_47");
1082
1083         &vmovdqu        ($inout,"(%r13)");
1084         &vpinsrq        ($offload,$offload,"%r13",0);
1085
# Remaining 16 rounds.  The for header has no increment on purpose: $i is
# advanced by the final snippet of every bodyx_00_15() list when it is
# eval'ed.  $base switches slots once $i reaches 8.
1086     $aesni_cbc_idx=0;
1087     for ($i=0; $i<16; ) {
1088         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1089         foreach(bodyx_00_15()) { eval; }
1090     }
1091                                         }
1092 $code.=<<___;
1093         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1094         vmovq   $offload,%r13                   # $_inp, borrow $a0
1095         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1096         add     $a1,$A
1097         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1098
1099         vpand   $mask14,$temp,$temp
1100         vpor    $temp,$iv,$iv
1101         vmovdqu $iv,(%r12,%r13)                 # write output
1102         lea     16(%r13),%r13
1103
1104         add     $SZ*0(%r15),$A
1105         add     $SZ*1(%r15),$B
1106         add     $SZ*2(%r15),$C
1107         add     $SZ*3(%r15),$D
1108         add     $SZ*4(%r15),$E
1109         add     $SZ*5(%r15),$F
1110         add     $SZ*6(%r15),$G
1111         add     $SZ*7(%r15),$H
1112
1113         mov     $A,$SZ*0(%r15)
1114         mov     $B,$SZ*1(%r15)
1115         mov     $C,$SZ*2(%r15)
1116         mov     $D,$SZ*3(%r15)
1117         mov     $E,$SZ*4(%r15)
1118         mov     $F,$SZ*5(%r15)
1119         mov     $G,$SZ*6(%r15)
1120         mov     $H,$SZ*7(%r15)
1121
1122         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1123         je      .Ldone_avx2
1124
1125         xor     $a1,$a1
1126         mov     $B,$a3
1127         mov     $F,$a4
1128         xor     $C,$a3                  # magic
1129         jmp     .Lower_avx2
1130 .align  16
1131 .Lower_avx2:
1132         vmovdqu (%r13),$inout
1133         vpinsrq \$0,%r13,$offload,$offload
1134 ___
# Generate the 16 rounds that follow the .Lower_avx2 label.  The round
# inputs are addressed relative to $Tbl (+16) instead of %rsp, and $Tbl is
# moved down by $PUSH8 once eight rounds have been generated.  As in the
# other round loops, $i is advanced by the eval'ed snippets, not by the for
# header.
1135     $aesni_cbc_idx=0;
1136     for ($i=0; $i<16; ) {
1137         my $base="+16($Tbl)";
1138         foreach(bodyx_00_15()) { eval; }
1139         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1140     }
1141 $code.=<<___;
1142         vmovq   $offload,%r13                   # borrow $a0
1143         vpextrq \$1,$offload,%r15               # borrow $a2
1144         vpand   $mask14,$temp,$temp
1145         vpor    $temp,$iv,$iv
1146         lea     -$PUSH8($Tbl),$Tbl
1147         vmovdqu $iv,(%r15,%r13)                 # write output
1148         lea     16(%r13),%r13                   # inp++
1149         cmp     %rsp,$Tbl
1150         jae     .Lower_avx2
1151
1152         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1153         lea     16*$SZ(%r13),%r13
1154         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1155         add     $a1,$A
1156         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1157
1158         add     $SZ*0(%r15),$A
1159         add     $SZ*1(%r15),$B
1160         add     $SZ*2(%r15),$C
1161         add     $SZ*3(%r15),$D
1162         add     $SZ*4(%r15),$E
1163         add     $SZ*5(%r15),$F
1164         add     $SZ*6(%r15),$G
1165         lea     (%rsi,%r13),%r12
1166         add     $SZ*7(%r15),$H
1167
1168         cmp     $_end,%r13
1169
1170         mov     $A,$SZ*0(%r15)
1171         cmove   %rsp,%r12               # next block or stale data
1172         mov     $B,$SZ*1(%r15)
1173         mov     $C,$SZ*2(%r15)
1174         mov     $D,$SZ*3(%r15)
1175         mov     $E,$SZ*4(%r15)
1176         mov     $F,$SZ*5(%r15)
1177         mov     $G,$SZ*6(%r15)
1178         mov     $H,$SZ*7(%r15)
1179
1180         jbe     .Loop_avx2
1181         lea     (%rsp),$Tbl
1182
1183 .Ldone_avx2:
1184         lea     ($Tbl),%rsp
1185         mov     $_ivp,$ivp
1186         mov     $_rsp,%rsi
1187         vmovdqu $iv,($ivp)              # output IV
1188         vzeroall
1189 ___
1190 $code.=<<___ if ($win64);
1191         movaps  `$framesz+16*0`(%rsp),%xmm6
1192         movaps  `$framesz+16*1`(%rsp),%xmm7
1193         movaps  `$framesz+16*2`(%rsp),%xmm8
1194         movaps  `$framesz+16*3`(%rsp),%xmm9
1195         movaps  `$framesz+16*4`(%rsp),%xmm10
1196         movaps  `$framesz+16*5`(%rsp),%xmm11
1197         movaps  `$framesz+16*6`(%rsp),%xmm12
1198         movaps  `$framesz+16*7`(%rsp),%xmm13
1199         movaps  `$framesz+16*8`(%rsp),%xmm14
1200         movaps  `$framesz+16*9`(%rsp),%xmm15
1201 ___
1202 $code.=<<___;
1203         mov     (%rsi),%r15
1204         mov     8(%rsi),%r14
1205         mov     16(%rsi),%r13
1206         mov     24(%rsi),%r12
1207         mov     32(%rsi),%rbp
1208         mov     40(%rsi),%rbx
1209         lea     48(%rsi),%rsp
1210 .Lepilogue_avx2:
1211         ret
1212 .size   ${func}_avx2,.-${func}_avx2
1213 ___
1214 }}
1215 }}
1216 {{
1217 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1218
1219 my ($rounds,$Tbl)=("%r11d","%rbx");
1220
1221 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1222 my @rndkey=("%xmm4","%xmm5");
1223 my $r=0;
1224 my $sn=0;
1225
1226 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1227 my @MSG=map("%xmm$_",(10..13));
1228
# Closure that appends the next step of the AES-CBC encryption to $code.
# Each call emits one step and advances the shared counter $r; the SHA code
# below calls it repeatedly so the two instruction streams interleave.
# Steps are grouped in tens: $n = $r/10 is the 16-byte block index and
# $k = $r%10 the step within that block.  The two @rndkey registers are
# swapped after every call, so the key loaded by one step is consumed by the
# next.
1229 my $aesenc=sub {
1230   use integer;
1231   my ($n,$k)=($r/10,$r%10);
# Step 0: load input block $n, xor it with $rndkey0 and fold it into $iv,
# then start the first aesenc.
1232     if ($k==0) {
1233       $code.=<<___;
1234         movups          `16*$n`($in0),$in               # load input
1235         xorps           $rndkey0,$in
1236 ___
# For every block but the first, store the previous block's result first.
1237       $code.=<<___ if ($n);
1238         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1239 ___
1240       $code.=<<___;
1241         xorps           $in,$iv
1242         movups          `32+16*$k-112`($key),$rndkey[1]
1243         aesenc          $rndkey[0],$iv
1244 ___
# Step 9: finish the block.  $rounds is compared with 11 once; the jb/je
# pair then selects 0, 2 or 4 extra aesenc steps before aesenclast.  $sn
# makes the local label unique per block.
1245     } elsif ($k==9) {
1246       $sn++;
1247       $code.=<<___;
1248         cmp             \$11,$rounds
1249         jb              .Laesenclast$sn
1250         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1251         aesenc          $rndkey[0],$iv
1252         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1253         aesenc          $rndkey[1],$iv
1254         je              .Laesenclast$sn
1255         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1256         aesenc          $rndkey[0],$iv
1257         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1258         aesenc          $rndkey[1],$iv
1259 .Laesenclast$sn:
1260         aesenclast      $rndkey[0],$iv
1261         movups          16-112($key),$rndkey[1]         # forward reference
1262         nop
1263 ___
# Steps 1..8: load the next round key and apply the current one.
1264     } else {
1265       $code.=<<___;
1266         movups          `32+16*$k-112`($key),$rndkey[1]
1267         aesenc          $rndkey[0],$iv
1268 ___
1269     }
# Advance the step counter and swap the two round-key registers.
1270     $r++;       unshift(@rndkey,pop(@rndkey));
1271 };
1272
1273 if ($shaext) {
1274 my $Tbl="%rax";
1275
1276 $code.=<<___;
1277 .type   ${func}_shaext,\@function,6
1278 .align  32
1279 ${func}_shaext:
1280         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1281 ___
1282 $code.=<<___ if ($win64);
1283         lea     `-8-10*16`(%rsp),%rsp
1284         movaps  %xmm6,-8-10*16(%rax)
1285         movaps  %xmm7,-8-9*16(%rax)
1286         movaps  %xmm8,-8-8*16(%rax)
1287         movaps  %xmm9,-8-7*16(%rax)
1288         movaps  %xmm10,-8-6*16(%rax)
1289         movaps  %xmm11,-8-5*16(%rax)
1290         movaps  %xmm12,-8-4*16(%rax)
1291         movaps  %xmm13,-8-3*16(%rax)
1292         movaps  %xmm14,-8-2*16(%rax)
1293         movaps  %xmm15,-8-1*16(%rax)
1294 .Lprologue_shaext:
1295 ___
1296 $code.=<<___;
1297         lea             K256+0x80(%rip),$Tbl
1298         movdqu          ($ctx),$ABEF            # DCBA
1299         movdqu          16($ctx),$CDGH          # HGFE
1300         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1301
1302         mov             240($key),$rounds
1303         sub             $in0,$out
1304         movups          ($key),$rndkey0         # $key[0]
1305         movups          16($key),$rndkey[0]     # forward reference
1306         lea             112($key),$key          # size optimization
1307
1308         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1309         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1310         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1311         movdqa          $TMP,$BSWAP             # offload
1312         palignr         \$8,$CDGH,$ABEF         # ABEF
1313         punpcklqdq      $Wi,$CDGH               # CDGH
1314
1315         jmp     .Loop_shaext
1316
1317 .align  16
1318 .Loop_shaext:
1319         movdqu          ($inp),@MSG[0]
1320         movdqu          0x10($inp),@MSG[1]
1321         movdqu          0x20($inp),@MSG[2]
1322         pshufb          $TMP,@MSG[0]
1323         movdqu          0x30($inp),@MSG[3]
1324
1325         movdqa          0*32-0x80($Tbl),$Wi
1326         paddd           @MSG[0],$Wi
1327         pshufb          $TMP,@MSG[1]
1328         movdqa          $CDGH,$CDGH_SAVE        # offload
1329         movdqa          $ABEF,$ABEF_SAVE        # offload
1330 ___
1331         &$aesenc();
1332 $code.=<<___;
1333         sha256rnds2     $ABEF,$CDGH             # 0-3
1334         pshufd          \$0x0e,$Wi,$Wi
1335 ___
1336         &$aesenc();
1337 $code.=<<___;
1338         sha256rnds2     $CDGH,$ABEF
1339
1340         movdqa          1*32-0x80($Tbl),$Wi
1341         paddd           @MSG[1],$Wi
1342         pshufb          $TMP,@MSG[2]
1343         lea             0x40($inp),$inp
1344 ___
1345         &$aesenc();
1346 $code.=<<___;
1347         sha256rnds2     $ABEF,$CDGH             # 4-7
1348         pshufd          \$0x0e,$Wi,$Wi
1349 ___
1350         &$aesenc();
1351 $code.=<<___;
1352         sha256rnds2     $CDGH,$ABEF
1353
1354         movdqa          2*32-0x80($Tbl),$Wi
1355         paddd           @MSG[2],$Wi
1356         pshufb          $TMP,@MSG[3]
1357         sha256msg1      @MSG[1],@MSG[0]
1358 ___
1359         &$aesenc();
1360 $code.=<<___;
1361         sha256rnds2     $ABEF,$CDGH             # 8-11
1362         pshufd          \$0x0e,$Wi,$Wi
1363         movdqa          @MSG[3],$TMP
1364         palignr         \$4,@MSG[2],$TMP
1365         paddd           $TMP,@MSG[0]
1366 ___
1367         &$aesenc();
1368 $code.=<<___;
1369         sha256rnds2     $CDGH,$ABEF
1370
1371         movdqa          3*32-0x80($Tbl),$Wi
1372         paddd           @MSG[3],$Wi
1373         sha256msg2      @MSG[3],@MSG[0]
1374         sha256msg1      @MSG[2],@MSG[1]
1375 ___
1376         &$aesenc();
1377 $code.=<<___;
1378         sha256rnds2     $ABEF,$CDGH             # 12-15
1379         pshufd          \$0x0e,$Wi,$Wi
1380 ___
1381         &$aesenc();
1382 $code.=<<___;
1383         movdqa          @MSG[0],$TMP
1384         palignr         \$4,@MSG[3],$TMP
1385         paddd           $TMP,@MSG[1]
1386         sha256rnds2     $CDGH,$ABEF
1387 ___
1388 for($i=4;$i<16-3;$i++) {
1389         &$aesenc()      if (($r%10)==0);
1390 $code.=<<___;
1391         movdqa          $i*32-0x80($Tbl),$Wi
1392         paddd           @MSG[0],$Wi
1393         sha256msg2      @MSG[0],@MSG[1]
1394         sha256msg1      @MSG[3],@MSG[2]
1395 ___
1396         &$aesenc();
1397 $code.=<<___;
1398         sha256rnds2     $ABEF,$CDGH             # 16-19...
1399         pshufd          \$0x0e,$Wi,$Wi
1400         movdqa          @MSG[1],$TMP
1401         palignr         \$4,@MSG[0],$TMP
1402         paddd           $TMP,@MSG[2]
1403 ___
1404         &$aesenc();
1405         &$aesenc()      if ($r==19);
1406 $code.=<<___;
1407         sha256rnds2     $CDGH,$ABEF
1408 ___
1409         push(@MSG,shift(@MSG));
1410 }
1411 $code.=<<___;
1412         movdqa          13*32-0x80($Tbl),$Wi
1413         paddd           @MSG[0],$Wi
1414         sha256msg2      @MSG[0],@MSG[1]
1415         sha256msg1      @MSG[3],@MSG[2]
1416 ___
1417         &$aesenc();
1418 $code.=<<___;
1419         sha256rnds2     $ABEF,$CDGH             # 52-55
1420         pshufd          \$0x0e,$Wi,$Wi
1421         movdqa          @MSG[1],$TMP
1422         palignr         \$4,@MSG[0],$TMP
1423         paddd           $TMP,@MSG[2]
1424 ___
1425         &$aesenc();
1426         &$aesenc();
1427 $code.=<<___;
1428         sha256rnds2     $CDGH,$ABEF
1429
1430         movdqa          14*32-0x80($Tbl),$Wi
1431         paddd           @MSG[1],$Wi
1432         sha256msg2      @MSG[1],@MSG[2]
1433         movdqa          $BSWAP,$TMP
1434 ___
1435         &$aesenc();
1436 $code.=<<___;
1437         sha256rnds2     $ABEF,$CDGH             # 56-59
1438         pshufd          \$0x0e,$Wi,$Wi
1439 ___
1440         &$aesenc();
1441 $code.=<<___;
1442         sha256rnds2     $CDGH,$ABEF
1443
1444         movdqa          15*32-0x80($Tbl),$Wi
1445         paddd           @MSG[2],$Wi
1446 ___
1447         &$aesenc();
1448         &$aesenc();
1449 $code.=<<___;
1450         sha256rnds2     $ABEF,$CDGH             # 60-63
1451         pshufd          \$0x0e,$Wi,$Wi
1452 ___
1453         &$aesenc();
1454 $code.=<<___;
1455         sha256rnds2     $CDGH,$ABEF
1456         #pxor           $CDGH,$rndkey0          # black magic
1457 ___
1458         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1459 $code.=<<___;
1460         #xorps          $CDGH,$rndkey0          # black magic
1461         paddd           $CDGH_SAVE,$CDGH
1462         paddd           $ABEF_SAVE,$ABEF
1463
1464         dec             $len
1465         movups          $iv,48($out,$in0)       # write output
1466         lea             64($in0),$in0
1467         jnz             .Loop_shaext
1468
1469         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1470         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1471         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1472         punpckhqdq      $CDGH,$ABEF             # DCBA
1473         palignr         \$8,$TMP,$CDGH          # HGFE
1474
1475         movups          $iv,($ivp)              # write IV
1476         movdqu          $ABEF,($ctx)
1477         movdqu          $CDGH,16($ctx)
1478 ___
1479 $code.=<<___ if ($win64);
1480         movaps  0*16(%rsp),%xmm6
1481         movaps  1*16(%rsp),%xmm7
1482         movaps  2*16(%rsp),%xmm8
1483         movaps  3*16(%rsp),%xmm9
1484         movaps  4*16(%rsp),%xmm10
1485         movaps  5*16(%rsp),%xmm11
1486         movaps  6*16(%rsp),%xmm12
1487         movaps  7*16(%rsp),%xmm13
1488         movaps  8*16(%rsp),%xmm14
1489         movaps  9*16(%rsp),%xmm15
1490         lea     8+10*16(%rsp),%rsp
1491 .Lepilogue_shaext:
1492 ___
1493 $code.=<<___;
1494         ret
1495 .size   ${func}_shaext,.-${func}_shaext
1496 ___
1497 }
1498 }}}}}
1499
1500 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1501 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1502 if ($win64) {
1503 $rec="%rcx";
1504 $frame="%rdx";
1505 $context="%r8";
1506 $disp="%r9";
1507
1508 $code.=<<___ if ($avx);
1509 .extern __imp_RtlVirtualUnwind
1510 .type   se_handler,\@abi-omnipotent
1511 .align  16
1512 se_handler:
1513         push    %rsi
1514         push    %rdi
1515         push    %rbx
1516         push    %rbp
1517         push    %r12
1518         push    %r13
1519         push    %r14
1520         push    %r15
1521         pushfq
1522         sub     \$64,%rsp
1523
1524         mov     120($context),%rax      # pull context->Rax
1525         mov     248($context),%rbx      # pull context->Rip
1526
1527         mov     8($disp),%rsi           # disp->ImageBase
1528         mov     56($disp),%r11          # disp->HanderlData
1529
1530         mov     0(%r11),%r10d           # HandlerData[0]
1531         lea     (%rsi,%r10),%r10        # prologue label
1532         cmp     %r10,%rbx               # context->Rip<prologue label
1533         jb      .Lin_prologue
1534
1535         mov     152($context),%rax      # pull context->Rsp
1536
1537         mov     4(%r11),%r10d           # HandlerData[1]
1538         lea     (%rsi,%r10),%r10        # epilogue label
1539         cmp     %r10,%rbx               # context->Rip>=epilogue label
1540         jae     .Lin_prologue
1541 ___
1542 $code.=<<___ if ($shaext);
1543         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1544         cmp     %r10,%rbx
1545         jb      .Lnot_in_shaext
1546
1547         lea     (%rax),%rsi
1548         lea     512($context),%rdi      # &context.Xmm6
1549         mov     \$20,%ecx
1550         .long   0xa548f3fc              # cld; rep movsq
1551         lea     168(%rax),%rax          # adjust stack pointer
1552         jmp     .Lin_prologue
1553 .Lnot_in_shaext:
1554 ___
1555 $code.=<<___ if ($avx>1);
1556         lea     .Lavx2_shortcut(%rip),%r10
1557         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1558         jb      .Lnot_in_avx2
1559
1560         and     \$-256*$SZ,%rax
1561         add     \$`2*$SZ*($rounds-8)`,%rax
1562 .Lnot_in_avx2:
1563 ___
1564 $code.=<<___;
1565         mov     %rax,%rsi               # put aside Rsp
1566         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1567         lea     48(%rax),%rax
1568
1569         mov     -8(%rax),%rbx
1570         mov     -16(%rax),%rbp
1571         mov     -24(%rax),%r12
1572         mov     -32(%rax),%r13
1573         mov     -40(%rax),%r14
1574         mov     -48(%rax),%r15
1575         mov     %rbx,144($context)      # restore context->Rbx
1576         mov     %rbp,160($context)      # restore context->Rbp
1577         mov     %r12,216($context)      # restore context->R12
1578         mov     %r13,224($context)      # restore context->R13
1579         mov     %r14,232($context)      # restore context->R14
1580         mov     %r15,240($context)      # restore context->R15
1581
1582         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1583         lea     512($context),%rdi      # &context.Xmm6
1584         mov     \$20,%ecx
1585         .long   0xa548f3fc              # cld; rep movsq
1586
1587 .Lin_prologue:
1588         mov     8(%rax),%rdi
1589         mov     16(%rax),%rsi
1590         mov     %rax,152($context)      # restore context->Rsp
1591         mov     %rsi,168($context)      # restore context->Rsi
1592         mov     %rdi,176($context)      # restore context->Rdi
1593
1594         mov     40($disp),%rdi          # disp->ContextRecord
1595         mov     $context,%rsi           # context
1596         mov     \$154,%ecx              # sizeof(CONTEXT)
1597         .long   0xa548f3fc              # cld; rep movsq
1598
1599         mov     $disp,%rsi
1600         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1601         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1602         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1603         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1604         mov     40(%rsi),%r10           # disp->ContextRecord
1605         lea     56(%rsi),%r11           # &disp->HandlerData
1606         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1607         mov     %r10,32(%rsp)           # arg5
1608         mov     %r11,40(%rsp)           # arg6
1609         mov     %r12,48(%rsp)           # arg7
1610         mov     %rcx,56(%rsp)           # arg8, (NULL)
1611         call    *__imp_RtlVirtualUnwind(%rip)
1612
1613         mov     \$1,%eax                # ExceptionContinueSearch
1614         add     \$64,%rsp
1615         popfq
1616         pop     %r15
1617         pop     %r14
1618         pop     %r13
1619         pop     %r12
1620         pop     %rbp
1621         pop     %rbx
1622         pop     %rdi
1623         pop     %rsi
1624         ret
1625 .size   se_handler,.-se_handler
1626
1627 .section        .pdata
1628         .rva    .LSEH_begin_${func}_xop
1629         .rva    .LSEH_end_${func}_xop
1630         .rva    .LSEH_info_${func}_xop
1631
1632         .rva    .LSEH_begin_${func}_avx
1633         .rva    .LSEH_end_${func}_avx
1634         .rva    .LSEH_info_${func}_avx
1635 ___
1636 $code.=<<___ if ($avx>1);
1637         .rva    .LSEH_begin_${func}_avx2
1638         .rva    .LSEH_end_${func}_avx2
1639         .rva    .LSEH_info_${func}_avx2
1640 ___
1641 $code.=<<___ if ($shaext);
1642         .rva    .LSEH_begin_${func}_shaext
1643         .rva    .LSEH_end_${func}_shaext
1644         .rva    .LSEH_info_${func}_shaext
1645 ___
1646 $code.=<<___ if ($avx);
1647 .section        .xdata
1648 .align  8
1649 .LSEH_info_${func}_xop:
1650         .byte   9,0,0,0
1651         .rva    se_handler
1652         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1653
1654 .LSEH_info_${func}_avx:
1655         .byte   9,0,0,0
1656         .rva    se_handler
1657         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1658 ___
1659 $code.=<<___ if ($avx>1);
1660 .LSEH_info_${func}_avx2:
1661         .byte   9,0,0,0
1662         .rva    se_handler
1663         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1664 ___
1665 $code.=<<___ if ($shaext);
1666 .LSEH_info_${func}_shaext:
1667         .byte   9,0,0,0
1668         .rva    se_handler
1669         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1670 ___
1671 }
1672
1673 ####################################################################
# Prepend a REX prefix byte to a list of opcode bytes when one is needed.
#   $opcode - reference to the array of opcode bytes; modified in place
#   $dst    - number of the register encoded in the ModR/M reg field
#   $src    - number of the register encoded in the ModR/M r/m field
# Registers numbered 8 and above do not fit in the 3-bit ModR/M fields, so
# their high bit travels in the prefix: 0x04 (REX.R) for $dst, 0x01 (REX.B)
# for $src.  When both registers are below 8 the list is left untouched.
# The array is reached through a plain reference rather than by aliasing a
# package typeglob, so no package variables are involved.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if($dst>=8);
    $rex|=0x01                  if($src>=8);
    unshift @$opcode,$rex|0x40  if($rex);
}
1683
{
  # Third opcode byte (following 0x0f,0x38) of each SHA256 extension
  # instruction that can be hand-encoded.
  my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

  # Hand-encode a SHA256 extension instruction as a ".byte" directive so the
  # output does not depend on assembler support for the mnemonics.
  #   $instr    - instruction mnemonic
  #   $operands - operand text in AT&T order, e.g. "%xmm1,%xmm2"
  # Returns ".byte\t<comma-separated decimal bytes>" when the mnemonic is
  # known and the operands contain a register-to-register "%xmmN,%xmmM"
  # pair; otherwise returns "$instr\t$operands" unchanged.
  sub sha256op38 {
    my ($instr,$operands) = @_;

    if (defined($opcodelet{$instr})
        && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      # Copy the captures right away: $1/$2 are globals and should not be
      # relied upon across the call to rex().
      my ($src,$dst) = ($1,$2);
      my @opcode=(0x0f,0x38);
        rex(\@opcode,$dst,$src);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$operands;
    }
  }
}
1704
# Post-process the accumulated assembly text and write it out.
# 1. Every `...` span is replaced by the value of the Perl expression it
#    contains (used for the computed offsets in the templates above).
# 2. Every sha256* instruction is passed through sha256op38().
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
# Buffered write errors are only reported at close time, so check it:
# otherwise a truncated output file would be produced with exit status 0.
# NOTE(review): if STDOUT is a pipe to a translator script, a failing child
# also makes close return false - confirm against the top of the file.
close STDOUT or die "error closing STDOUT: $!";