1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows processor
16 # resources to be utilized better and overall performance to improve.
17 # The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**) gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
29 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
30 #
31 # (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is left
32 #       out because the gain there was not estimated to be high
33 #       enough to justify the effort;
34 # (**)  these are EVP-free results; results obtained with 'speed
35 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
36
37 $flavour = shift;
38 $output  = shift;
39 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
40
41 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
42
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
46 die "can't locate x86_64-xlate.pl";
47
48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50         $avx = ($1>=2.19) + ($1>=2.22);
51 }
52
53 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55         $avx = ($1>=2.09) + ($1>=2.10);
56 }
57
58 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60         $avx = ($1>=10) + ($1>=12);
61 }
62
63 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
64         $avx = ($2>=3.0) + ($2>3.0);
65 }
66
67 $shaext=$avx;   ### set to zero if compiling for 1.0.1
68 $avx=1          if (!$shaext && $avx);
69
70 open OUT,"| \"$^X\" $xlate $flavour $output";
71 *STDOUT=*OUT;
72
73 $func="aesni_cbc_sha256_enc";
74 $TABLE="K256";
75 $SZ=4;
76 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
77                                 "%r8d","%r9d","%r10d","%r11d");
78 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84
85 ########################################################################
86 # void aesni_cbc_sha256_enc(const void *inp,
87 #                       void *out,
88 #                       size_t length,
89 #                       const AES_KEY *key,
90 #                       unsigned char *iv,
91 #                       SHA256_CTX *ctx,
92 #                       const void *in0);
93 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
94 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
95
96 $Tbl="%rbp";
97
98 $_inp="16*$SZ+0*8(%rsp)";
99 $_out="16*$SZ+1*8(%rsp)";
100 $_end="16*$SZ+2*8(%rsp)";
101 $_key="16*$SZ+3*8(%rsp)";
102 $_ivp="16*$SZ+4*8(%rsp)";
103 $_ctx="16*$SZ+5*8(%rsp)";
104 $_in0="16*$SZ+6*8(%rsp)";
105 $_rsp="16*$SZ+7*8(%rsp)";
106 $framesz=16*$SZ+8*8;
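#
# Editorial sketch of the stack frame implied by the definitions above
# (XOP and AVX1 paths): the first 16*$SZ bytes at %rsp hold the 16-entry
# message schedule X[] with the round constants already folded in (this is
# what the "h+=X[i]+K[i]" step in body_00_15 reads back), followed by the
# seven saved pointers and the caller's %rsp (the 8*8 term in $framesz);
# on WIN64 a further 16*10 bytes above $framesz preserve %xmm6-%xmm15.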
107
108 $code=<<___;
109 .text
110
111 .extern OPENSSL_ia32cap_P
112 .globl  $func
113 .type   $func,\@abi-omnipotent
114 .align  16
115 $func:
116 ___
117                                                 if ($avx) {
118 $code.=<<___;
119         lea     OPENSSL_ia32cap_P(%rip),%r11
120         mov     \$1,%eax
121         cmp     \$0,`$win64?"%rcx":"%rdi"`
122         je      .Lprobe
123         mov     0(%r11),%eax
124         mov     4(%r11),%r10
125 ___
126 $code.=<<___ if ($shaext);
127         bt      \$61,%r10                       # check for SHA
128         jc      ${func}_shaext
129 ___
130 $code.=<<___;
131         mov     %r10,%r11
132         shr     \$32,%r11
133
134         test    \$`1<<11`,%r10d                 # check for XOP
135         jnz     ${func}_xop
136 ___
137 $code.=<<___ if ($avx>1);
138         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
139         cmp     \$`1<<8|1<<5|1<<3`,%r11d
140         je      ${func}_avx2
141 ___
142 $code.=<<___;
143         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
144         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
145         or      %eax,%r10d
146         cmp     \$`1<<28|1<<9|1<<30`,%r10d
147         je      ${func}_avx
148         ud2
149 ___
150                                                 }
151 $code.=<<___;
152         xor     %eax,%eax
153         cmp     \$0,`$win64?"%rcx":"%rdi"`
154         je      .Lprobe
155         ud2
156 .Lprobe:
157         ret
158 .size   $func,.-$func
159
160 .align  64
161 .type   $TABLE,\@object
162 $TABLE:
163         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
164         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
165         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
166         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
167         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
168         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
169         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
170         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
171         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
172         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
173         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
174         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
175         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
176         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
177         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
178         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
179         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
180         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
181         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
183         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
184         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
185         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
186         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
187         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
188         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
189         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
190         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
191         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
192         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
193         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
195
196         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
197         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
198         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
199         .long   0,0,0,0,   0,0,0,0
200         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
201 .align  64
202 ___
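# Editorial note: each 16-byte row of $TABLE is stored twice so that the
# AVX2 path can consume it directly as a 256-bit constant spanning both
# 128-bit lanes; the rows following the round constants are the byte-swap
# mask and the zero/all-ones blend masks that get indexed by the AES round
# count (they become $mask14/$mask12/$mask10 below).
#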
203
204 ######################################################################
205 # SIMD code paths
206 #
207 {{{
208 ($iv,$inout,$roundkey,$temp,
209  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
210
211 $aesni_cbc_idx=0;
212 @aesni_cbc_block = (
213 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
214 ##      &vmovdqu        ($inout,($inp));
215 ##      &mov            ($_inp,$inp);
216
217         '&vpxor         ($inout,$inout,$roundkey);'.
218         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
219
220         '&vpxor         ($inout,$inout,$iv);',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
230
231         '&vaesenc       ($inout,$inout,$roundkey);'.
232         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
233
234         '&vaesenc       ($inout,$inout,$roundkey);'.
235         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
236
237         '&vaesenc       ($inout,$inout,$roundkey);'.
238         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
239
240         '&vaesenc       ($inout,$inout,$roundkey);'.
241         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
242
243         '&vaesenc       ($inout,$inout,$roundkey);'.
244         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
245
246         '&vaesenc       ($inout,$inout,$roundkey);'.
247         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
248
249         '&vaesenclast   ($temp,$inout,$roundkey);'.
250         ' &vaesenc      ($inout,$inout,$roundkey);'.
251         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
252
253         '&vpand         ($iv,$temp,$mask10);'.
254         ' &vaesenc      ($inout,$inout,$roundkey);'.
255         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
256
257         '&vaesenclast   ($temp,$inout,$roundkey);'.
258         ' &vaesenc      ($inout,$inout,$roundkey);'.
259         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
260
261         '&vpand         ($temp,$temp,$mask12);'.
262         ' &vaesenc      ($inout,$inout,$roundkey);'.
263          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
264
265         '&vpor          ($iv,$iv,$temp);'.
266         ' &vaesenclast  ($temp,$inout,$roundkey);'.
267         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
268
269 ##      &mov            ($inp,$_inp);
270 ##      &mov            ($out,$_out);
271 ##      &vpand          ($temp,$temp,$mask14);
272 ##      &vpor           ($iv,$iv,$temp);
273 ##      &vmovdqu        ("($out,$inp)",$iv);
274 ##      &lea            ($inp,"16($inp)");
275 );
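#
# Editorial note on the blend masks: the stream above always executes a
# vaesenclast at the 10-, 12- and 14-round positions while continuing with
# vaesenc; $mask10/$mask12/$mask14 are loaded according to the actual round
# count so that exactly one of them is all-ones, and the three candidate
# results are combined as (r10&mask10)|(r12&mask12)|(r14&mask14).  One
# instruction sequence therefore serves AES-128/-192/-256 without branches.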
276
277 my $a4=$T1;
278 my ($a,$b,$c,$d,$e,$f,$g,$h);
279
280 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
281 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
282   my $arg = pop;
283     $arg = "\$$arg" if ($arg*1 eq $arg);
284     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
285 }
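# For example, &ror($a0,14) falls through AUTOLOAD and appends
# "\tror\t\$14,%r13d\n" to $code (AT&T order, immediate operand first).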
286
287 sub body_00_15 () {
288         (
289         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
290
291         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
292         '&mov   ($a,$a1)',
293         '&mov   ($a4,$f)',
294
295         '&xor   ($a0,$e)',
296         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
297         '&xor   ($a4,$g)',                      # f^g
298
299         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
300         '&xor   ($a1,$a)',
301         '&and   ($a4,$e)',                      # (f^g)&e
302
303         @aesni_cbc_block[$aesni_cbc_idx++].
304         '&xor   ($a0,$e)',
305         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
306         '&mov   ($a2,$a)',
307
308         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
309         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
310         '&xor   ($a2,$b)',                      # a^b, b^c in next round
311
312         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
313         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
314         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
315
316         '&xor   ($a1,$a)',
317         '&add   ($h,$a0)',                      # h+=Sigma1(e)
318         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
319
320         '&add   ($d,$h)',                       # d+=h
321         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
322         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
323
324         '&mov   ($a0,$d)',
325         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
326         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
327         );
328 }
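#
# The fragments above add up to one scalar SHA256 round, using the usual
# branch-free identities Ch(e,f,g)=((f^g)&e)^g and Maj(a,b,c)=((b^c)&(a^b))^b,
# with b^c for the next round pre-computed in $a3 and h+Sigma0(a) accumulated
# in $a1 to become the next round's a.  Below is a minimal, purely
# illustrative reference of the same round in plain Perl; the helper is ours,
# is never called by the generator, and exists only as documentation.
sub _sha256_round_ref {			# editorial reference, not called
  my ($a,$b,$c,$d,$e,$f,$g,$h,$Kt,$Wt) = @_;	# 32-bit state words + K[t], W[t]
  my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
  my $S1  = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);	# Sigma1(e)
  my $Ch  = (($f ^ $g) & $e) ^ $g;				# Ch(e,f,g)
  my $T1  = ($h + $S1 + $Ch + $Kt + $Wt) & 0xffffffff;
  my $S0  = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);	# Sigma0(a)
  my $Maj = (($b ^ $c) & ($a ^ $b)) ^ $b;			# Maj(a,b,c)
  return ((($T1+$S0+$Maj)&0xffffffff), $a,$b,$c, (($d+$T1)&0xffffffff), $e,$f,$g);
}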
329
330 if ($avx) {{
331 ######################################################################
332 # XOP code path
333 #
334 $code.=<<___;
335 .type   ${func}_xop,\@function,6
336 .align  64
337 ${func}_xop:
338 .Lxop_shortcut:
339         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
340         push    %rbx
341         push    %rbp
342         push    %r12
343         push    %r13
344         push    %r14
345         push    %r15
346         mov     %rsp,%r11               # copy %rsp
347         sub     \$`$framesz+$win64*16*10`,%rsp
348         and     \$-64,%rsp              # align stack frame
349
350         shl     \$6,$len
351         sub     $inp,$out               # re-bias
352         sub     $inp,$in0
353         add     $inp,$len               # end of input
354
355         #mov    $inp,$_inp              # saved later
356         mov     $out,$_out
357         mov     $len,$_end
358         #mov    $key,$_key              # remains resident in $inp register
359         mov     $ivp,$_ivp
360         mov     $ctx,$_ctx
361         mov     $in0,$_in0
362         mov     %r11,$_rsp
363 ___
364 $code.=<<___ if ($win64);
365         movaps  %xmm6,`$framesz+16*0`(%rsp)
366         movaps  %xmm7,`$framesz+16*1`(%rsp)
367         movaps  %xmm8,`$framesz+16*2`(%rsp)
368         movaps  %xmm9,`$framesz+16*3`(%rsp)
369         movaps  %xmm10,`$framesz+16*4`(%rsp)
370         movaps  %xmm11,`$framesz+16*5`(%rsp)
371         movaps  %xmm12,`$framesz+16*6`(%rsp)
372         movaps  %xmm13,`$framesz+16*7`(%rsp)
373         movaps  %xmm14,`$framesz+16*8`(%rsp)
374         movaps  %xmm15,`$framesz+16*9`(%rsp)
375 ___
376 $code.=<<___;
377 .Lprologue_xop:
378         vzeroall
379
380         mov     $inp,%r12               # borrow $a4
381         lea     0x80($key),$inp         # size optimization, reassign
382         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
383         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
384         mov     $ctx,%r15               # borrow $a2
385         mov     $in0,%rsi               # borrow $a3
386         vmovdqu ($ivp),$iv              # load IV
387         sub     \$9,%r14
388
389         mov     $SZ*0(%r15),$A
390         mov     $SZ*1(%r15),$B
391         mov     $SZ*2(%r15),$C
392         mov     $SZ*3(%r15),$D
393         mov     $SZ*4(%r15),$E
394         mov     $SZ*5(%r15),$F
395         mov     $SZ*6(%r15),$G
396         mov     $SZ*7(%r15),$H
397
398         vmovdqa 0x00(%r13,%r14,8),$mask14
399         vmovdqa 0x10(%r13,%r14,8),$mask12
400         vmovdqa 0x20(%r13,%r14,8),$mask10
401         vmovdqu 0x00-0x80($inp),$roundkey
402         jmp     .Lloop_xop
403 ___
404                                         if ($SZ==4) {   # SHA256
405     my @X = map("%xmm$_",(0..3));
406     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
407
408 $code.=<<___;
409 .align  16
410 .Lloop_xop:
411         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
412         vmovdqu 0x00(%rsi,%r12),@X[0]
413         vmovdqu 0x10(%rsi,%r12),@X[1]
414         vmovdqu 0x20(%rsi,%r12),@X[2]
415         vmovdqu 0x30(%rsi,%r12),@X[3]
416         vpshufb $t3,@X[0],@X[0]
417         lea     $TABLE(%rip),$Tbl
418         vpshufb $t3,@X[1],@X[1]
419         vpshufb $t3,@X[2],@X[2]
420         vpaddd  0x00($Tbl),@X[0],$t0
421         vpshufb $t3,@X[3],@X[3]
422         vpaddd  0x20($Tbl),@X[1],$t1
423         vpaddd  0x40($Tbl),@X[2],$t2
424         vpaddd  0x60($Tbl),@X[3],$t3
425         vmovdqa $t0,0x00(%rsp)
426         mov     $A,$a1
427         vmovdqa $t1,0x10(%rsp)
428         mov     $B,$a3
429         vmovdqa $t2,0x20(%rsp)
430         xor     $C,$a3                  # magic
431         vmovdqa $t3,0x30(%rsp)
432         mov     $E,$a0
433         jmp     .Lxop_00_47
434
435 .align  16
436 .Lxop_00_47:
437         sub     \$-16*2*$SZ,$Tbl        # size optimization
438         vmovdqu (%r12),$inout           # $a4
439         mov     %r12,$_inp              # $a4
440 ___
441 sub XOP_256_00_47 () {
442 my $j = shift;
443 my $body = shift;
444 my @X = @_;
445 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
446
447         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
448           eval(shift(@insns));
449           eval(shift(@insns));
450          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
451           eval(shift(@insns));
452           eval(shift(@insns));
453         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
454           eval(shift(@insns));
455           eval(shift(@insns));
456         &vpsrld         ($t0,$t0,$sigma0[2]);
457           eval(shift(@insns));
458           eval(shift(@insns));
459          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
460           eval(shift(@insns));
461           eval(shift(@insns));
462           eval(shift(@insns));
463           eval(shift(@insns));
464         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
465           eval(shift(@insns));
466           eval(shift(@insns));
467         &vpxor          ($t0,$t0,$t1);
468           eval(shift(@insns));
469           eval(shift(@insns));
470           eval(shift(@insns));
471           eval(shift(@insns));
472          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
473           eval(shift(@insns));
474           eval(shift(@insns));
475         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
476           eval(shift(@insns));
477           eval(shift(@insns));
478          &vpsrld        ($t2,@X[3],$sigma1[2]);
479           eval(shift(@insns));
480           eval(shift(@insns));
481         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
482           eval(shift(@insns));
483           eval(shift(@insns));
484          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
485           eval(shift(@insns));
486           eval(shift(@insns));
487          &vpxor         ($t3,$t3,$t2);
488           eval(shift(@insns));
489           eval(shift(@insns));
490           eval(shift(@insns));
491           eval(shift(@insns));
492          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
493           eval(shift(@insns));
494           eval(shift(@insns));
495           eval(shift(@insns));
496           eval(shift(@insns));
497         &vpsrldq        ($t3,$t3,8);
498           eval(shift(@insns));
499           eval(shift(@insns));
500           eval(shift(@insns));
501           eval(shift(@insns));
502         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
503           eval(shift(@insns));
504           eval(shift(@insns));
505           eval(shift(@insns));
506           eval(shift(@insns));
507          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
508           eval(shift(@insns));
509           eval(shift(@insns));
510          &vpsrld        ($t2,@X[0],$sigma1[2]);
511           eval(shift(@insns));
512           eval(shift(@insns));
513          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
514           eval(shift(@insns));
515           eval(shift(@insns));
516          &vpxor         ($t3,$t3,$t2);
517           eval(shift(@insns));
518           eval(shift(@insns));
519           eval(shift(@insns));
520           eval(shift(@insns));
521          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
522           eval(shift(@insns));
523           eval(shift(@insns));
524           eval(shift(@insns));
525           eval(shift(@insns));
526         &vpslldq        ($t3,$t3,8);            # 22 instructions
527           eval(shift(@insns));
528           eval(shift(@insns));
529           eval(shift(@insns));
530           eval(shift(@insns));
531         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
532           eval(shift(@insns));
533           eval(shift(@insns));
534           eval(shift(@insns));
535           eval(shift(@insns));
536         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
537           foreach (@insns) { eval; }            # remaining instructions
538         &vmovdqa        (16*$j."(%rsp)",$t2);
539 }
540
541     $aesni_cbc_idx=0;
542     for ($i=0,$j=0; $j<4; $j++) {
543         &XOP_256_00_47($j,\&body_00_15,@X);
544         push(@X,shift(@X));                     # rotate(@X)
545     }
546         &mov            ("%r12",$_inp);         # borrow $a4
547         &vpand          ($temp,$temp,$mask14);
548         &mov            ("%r15",$_out);         # borrow $a2
549         &vpor           ($iv,$iv,$temp);
550         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
551         &lea            ("%r12","16(%r12)");    # inp++
552
553         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
554         &jne    (".Lxop_00_47");
555
556         &vmovdqu        ($inout,"(%r12)");
557         &mov            ($_inp,"%r12");
558
559     $aesni_cbc_idx=0;
560     for ($i=0; $i<16; ) {
561         foreach(body_00_15()) { eval; }
562     }
563                                         }
564 $code.=<<___;
565         mov     $_inp,%r12              # borrow $a4
566         mov     $_out,%r13              # borrow $a0
567         mov     $_ctx,%r15              # borrow $a2
568         mov     $_in0,%rsi              # borrow $a3
569
570         vpand   $mask14,$temp,$temp
571         mov     $a1,$A
572         vpor    $temp,$iv,$iv
573         vmovdqu $iv,(%r13,%r12)         # write output
574         lea     16(%r12),%r12           # inp++
575
576         add     $SZ*0(%r15),$A
577         add     $SZ*1(%r15),$B
578         add     $SZ*2(%r15),$C
579         add     $SZ*3(%r15),$D
580         add     $SZ*4(%r15),$E
581         add     $SZ*5(%r15),$F
582         add     $SZ*6(%r15),$G
583         add     $SZ*7(%r15),$H
584
585         cmp     $_end,%r12
586
587         mov     $A,$SZ*0(%r15)
588         mov     $B,$SZ*1(%r15)
589         mov     $C,$SZ*2(%r15)
590         mov     $D,$SZ*3(%r15)
591         mov     $E,$SZ*4(%r15)
592         mov     $F,$SZ*5(%r15)
593         mov     $G,$SZ*6(%r15)
594         mov     $H,$SZ*7(%r15)
595
596         jb      .Lloop_xop
597
598         mov     $_ivp,$ivp
599         mov     $_rsp,%rsi
600         vmovdqu $iv,($ivp)              # output IV
601         vzeroall
602 ___
603 $code.=<<___ if ($win64);
604         movaps  `$framesz+16*0`(%rsp),%xmm6
605         movaps  `$framesz+16*1`(%rsp),%xmm7
606         movaps  `$framesz+16*2`(%rsp),%xmm8
607         movaps  `$framesz+16*3`(%rsp),%xmm9
608         movaps  `$framesz+16*4`(%rsp),%xmm10
609         movaps  `$framesz+16*5`(%rsp),%xmm11
610         movaps  `$framesz+16*6`(%rsp),%xmm12
611         movaps  `$framesz+16*7`(%rsp),%xmm13
612         movaps  `$framesz+16*8`(%rsp),%xmm14
613         movaps  `$framesz+16*9`(%rsp),%xmm15
614 ___
615 $code.=<<___;
616         mov     (%rsi),%r15
617         mov     8(%rsi),%r14
618         mov     16(%rsi),%r13
619         mov     24(%rsi),%r12
620         mov     32(%rsi),%rbp
621         mov     40(%rsi),%rbx
622         lea     48(%rsi),%rsp
623 .Lepilogue_xop:
624         ret
625 .size   ${func}_xop,.-${func}_xop
626 ___
627 ######################################################################
628 # AVX+shrd code path
629 #
630 local *ror = sub { &shrd(@_[0],@_) };
631
632 $code.=<<___;
633 .type   ${func}_avx,\@function,6
634 .align  64
635 ${func}_avx:
636 .Lavx_shortcut:
637         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
638         push    %rbx
639         push    %rbp
640         push    %r12
641         push    %r13
642         push    %r14
643         push    %r15
644         mov     %rsp,%r11               # copy %rsp
645         sub     \$`$framesz+$win64*16*10`,%rsp
646         and     \$-64,%rsp              # align stack frame
647
648         shl     \$6,$len
649         sub     $inp,$out               # re-bias
650         sub     $inp,$in0
651         add     $inp,$len               # end of input
652
653         #mov    $inp,$_inp              # saved later
654         mov     $out,$_out
655         mov     $len,$_end
656         #mov    $key,$_key              # remains resident in $inp register
657         mov     $ivp,$_ivp
658         mov     $ctx,$_ctx
659         mov     $in0,$_in0
660         mov     %r11,$_rsp
661 ___
662 $code.=<<___ if ($win64);
663         movaps  %xmm6,`$framesz+16*0`(%rsp)
664         movaps  %xmm7,`$framesz+16*1`(%rsp)
665         movaps  %xmm8,`$framesz+16*2`(%rsp)
666         movaps  %xmm9,`$framesz+16*3`(%rsp)
667         movaps  %xmm10,`$framesz+16*4`(%rsp)
668         movaps  %xmm11,`$framesz+16*5`(%rsp)
669         movaps  %xmm12,`$framesz+16*6`(%rsp)
670         movaps  %xmm13,`$framesz+16*7`(%rsp)
671         movaps  %xmm14,`$framesz+16*8`(%rsp)
672         movaps  %xmm15,`$framesz+16*9`(%rsp)
673 ___
674 $code.=<<___;
675 .Lprologue_avx:
676         vzeroall
677
678         mov     $inp,%r12               # borrow $a4
679         lea     0x80($key),$inp         # size optimization, reassign
680         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
681         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
682         mov     $ctx,%r15               # borrow $a2
683         mov     $in0,%rsi               # borrow $a3
684         vmovdqu ($ivp),$iv              # load IV
685         sub     \$9,%r14
686
687         mov     $SZ*0(%r15),$A
688         mov     $SZ*1(%r15),$B
689         mov     $SZ*2(%r15),$C
690         mov     $SZ*3(%r15),$D
691         mov     $SZ*4(%r15),$E
692         mov     $SZ*5(%r15),$F
693         mov     $SZ*6(%r15),$G
694         mov     $SZ*7(%r15),$H
695
696         vmovdqa 0x00(%r13,%r14,8),$mask14
697         vmovdqa 0x10(%r13,%r14,8),$mask12
698         vmovdqa 0x20(%r13,%r14,8),$mask10
699         vmovdqu 0x00-0x80($inp),$roundkey
700 ___
701                                         if ($SZ==4) {   # SHA256
702     my @X = map("%xmm$_",(0..3));
703     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
704
705 $code.=<<___;
706         jmp     .Lloop_avx
707 .align  16
708 .Lloop_avx:
709         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
710         vmovdqu 0x00(%rsi,%r12),@X[0]
711         vmovdqu 0x10(%rsi,%r12),@X[1]
712         vmovdqu 0x20(%rsi,%r12),@X[2]
713         vmovdqu 0x30(%rsi,%r12),@X[3]
714         vpshufb $t3,@X[0],@X[0]
715         lea     $TABLE(%rip),$Tbl
716         vpshufb $t3,@X[1],@X[1]
717         vpshufb $t3,@X[2],@X[2]
718         vpaddd  0x00($Tbl),@X[0],$t0
719         vpshufb $t3,@X[3],@X[3]
720         vpaddd  0x20($Tbl),@X[1],$t1
721         vpaddd  0x40($Tbl),@X[2],$t2
722         vpaddd  0x60($Tbl),@X[3],$t3
723         vmovdqa $t0,0x00(%rsp)
724         mov     $A,$a1
725         vmovdqa $t1,0x10(%rsp)
726         mov     $B,$a3
727         vmovdqa $t2,0x20(%rsp)
728         xor     $C,$a3                  # magic
729         vmovdqa $t3,0x30(%rsp)
730         mov     $E,$a0
731         jmp     .Lavx_00_47
732
733 .align  16
734 .Lavx_00_47:
735         sub     \$-16*2*$SZ,$Tbl        # size optimization
736         vmovdqu (%r12),$inout           # $a4
737         mov     %r12,$_inp              # $a4
738 ___
739 sub Xupdate_256_AVX () {
740         (
741         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
742          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
743         '&vpsrld        ($t2,$t0,$sigma0[0]);',
744          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
745         '&vpsrld        ($t3,$t0,$sigma0[2])',
746         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
747         '&vpxor         ($t0,$t3,$t2)',
748          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
749         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
750         '&vpxor         ($t0,$t0,$t1)',
751         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
752         '&vpxor         ($t0,$t0,$t2)',
753          '&vpsrld       ($t2,$t3,$sigma1[2]);',
754         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
755          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
756         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
757          '&vpxor        ($t2,$t2,$t3);',
758          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
759          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
760          '&vpshufd      ($t2,$t2,0b10000100)',
761          '&vpsrldq      ($t2,$t2,8)',
762         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
763          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
764          '&vpsrld       ($t2,$t3,$sigma1[2])',
765          '&vpsrlq       ($t3,$t3,$sigma1[0])',
766          '&vpxor        ($t2,$t2,$t3);',
767          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
768          '&vpxor        ($t2,$t2,$t3)',
769          '&vpshufd      ($t2,$t2,0b11101000)',
770          '&vpslldq      ($t2,$t2,8)',
771         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
772         );
773 }
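#
# Scalar equivalent of the four-lane message-schedule update above, given as
# an editorial reference only (the helper below is ours and is never called
# by the generator): for t>=16, W[t]=sigma1(W[t-2])+W[t-7]+sigma0(W[t-15])+W[t-16]
# with sigma0(x)=ROTR7(x)^ROTR18(x)^(x>>3) and sigma1(x)=ROTR17(x)^ROTR19(x)^(x>>10).
# The SIMD code produces four consecutive W[t] per call, which is why sigma1 of
# the freshly computed X[16..17] has to be folded in as a second step.
sub _sha256_Wt_ref {			# editorial reference, not called
  my ($W,$t) = @_;			# $W: array ref of 32-bit schedule words
  my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
  my $s0 = $rotr->($W->[$t-15],7)  ^ $rotr->($W->[$t-15],18) ^ ($W->[$t-15] >> 3);
  my $s1 = $rotr->($W->[$t-2],17)  ^ $rotr->($W->[$t-2],19)  ^ ($W->[$t-2] >> 10);
  return ($s1 + $W->[$t-7] + $s0 + $W->[$t-16]) & 0xffffffff;
}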
774
775 sub AVX_256_00_47 () {
776 my $j = shift;
777 my $body = shift;
778 my @X = @_;
779 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
780
781         foreach (Xupdate_256_AVX()) {           # 29 instructions
782             eval;
783             eval(shift(@insns));
784             eval(shift(@insns));
785             eval(shift(@insns));
786         }
787         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
788           foreach (@insns) { eval; }            # remaining instructions
789         &vmovdqa        (16*$j."(%rsp)",$t2);
790 }
791
792     $aesni_cbc_idx=0;
793     for ($i=0,$j=0; $j<4; $j++) {
794         &AVX_256_00_47($j,\&body_00_15,@X);
795         push(@X,shift(@X));                     # rotate(@X)
796     }
797         &mov            ("%r12",$_inp);         # borrow $a4
798         &vpand          ($temp,$temp,$mask14);
799         &mov            ("%r15",$_out);         # borrow $a2
800         &vpor           ($iv,$iv,$temp);
801         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
802         &lea            ("%r12","16(%r12)");    # inp++
803
804         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
805         &jne    (".Lavx_00_47");
806
807         &vmovdqu        ($inout,"(%r12)");
808         &mov            ($_inp,"%r12");
809
810     $aesni_cbc_idx=0;
811     for ($i=0; $i<16; ) {
812         foreach(body_00_15()) { eval; }
813     }
814
815                                         }
816 $code.=<<___;
817         mov     $_inp,%r12              # borrow $a4
818         mov     $_out,%r13              # borrow $a0
819         mov     $_ctx,%r15              # borrow $a2
820         mov     $_in0,%rsi              # borrow $a3
821
822         vpand   $mask14,$temp,$temp
823         mov     $a1,$A
824         vpor    $temp,$iv,$iv
825         vmovdqu $iv,(%r13,%r12)         # write output
826         lea     16(%r12),%r12           # inp++
827
828         add     $SZ*0(%r15),$A
829         add     $SZ*1(%r15),$B
830         add     $SZ*2(%r15),$C
831         add     $SZ*3(%r15),$D
832         add     $SZ*4(%r15),$E
833         add     $SZ*5(%r15),$F
834         add     $SZ*6(%r15),$G
835         add     $SZ*7(%r15),$H
836
837         cmp     $_end,%r12
838
839         mov     $A,$SZ*0(%r15)
840         mov     $B,$SZ*1(%r15)
841         mov     $C,$SZ*2(%r15)
842         mov     $D,$SZ*3(%r15)
843         mov     $E,$SZ*4(%r15)
844         mov     $F,$SZ*5(%r15)
845         mov     $G,$SZ*6(%r15)
846         mov     $H,$SZ*7(%r15)
847         jb      .Lloop_avx
848
849         mov     $_ivp,$ivp
850         mov     $_rsp,%rsi
851         vmovdqu $iv,($ivp)              # output IV
852         vzeroall
853 ___
854 $code.=<<___ if ($win64);
855         movaps  `$framesz+16*0`(%rsp),%xmm6
856         movaps  `$framesz+16*1`(%rsp),%xmm7
857         movaps  `$framesz+16*2`(%rsp),%xmm8
858         movaps  `$framesz+16*3`(%rsp),%xmm9
859         movaps  `$framesz+16*4`(%rsp),%xmm10
860         movaps  `$framesz+16*5`(%rsp),%xmm11
861         movaps  `$framesz+16*6`(%rsp),%xmm12
862         movaps  `$framesz+16*7`(%rsp),%xmm13
863         movaps  `$framesz+16*8`(%rsp),%xmm14
864         movaps  `$framesz+16*9`(%rsp),%xmm15
865 ___
866 $code.=<<___;
867         mov     (%rsi),%r15
868         mov     8(%rsi),%r14
869         mov     16(%rsi),%r13
870         mov     24(%rsi),%r12
871         mov     32(%rsi),%rbp
872         mov     40(%rsi),%rbx
873         lea     48(%rsi),%rsp
874 .Lepilogue_avx:
875         ret
876 .size   ${func}_avx,.-${func}_avx
877 ___
878
879 if ($avx>1) {{
880 ######################################################################
881 # AVX2+BMI code path
882 #
883 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
884 my $PUSH8=8*2*$SZ;
885 use integer;
886
887 sub bodyx_00_15 () {
888         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
889         (
890         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
891
892         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
893         '&and   ($a4,$e)',              # f&e
894         '&rorx  ($a0,$e,$Sigma1[2])',
895         '&rorx  ($a2,$e,$Sigma1[1])',
896
897         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
898         '&lea   ($h,"($h,$a4)")',
899         '&andn  ($a4,$e,$g)',           # ~e&g
900         '&xor   ($a0,$a2)',
901
902         '&rorx  ($a1,$e,$Sigma1[0])',
903         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
904         '&xor   ($a0,$a1)',             # Sigma1(e)
905         '&mov   ($a2,$a)',
906
907         '&rorx  ($a4,$a,$Sigma0[2])',
908         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
909         '&xor   ($a2,$b)',              # a^b, b^c in next round
910         '&rorx  ($a1,$a,$Sigma0[1])',
911
912         '&rorx  ($a0,$a,$Sigma0[0])',
913         '&lea   ($d,"($d,$h)")',        # d+=h
914         '&and   ($a3,$a2)',             # (b^c)&(a^b)
915         @aesni_cbc_block[$aesni_cbc_idx++].
916         '&xor   ($a1,$a4)',
917
918         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
919         '&xor   ($a1,$a0)',             # Sigma0(a)
920         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
921         '&mov   ($a4,$e)',              # copy of f in future
922
923         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
924         );
925         # and at the finish one has to $a+=$a1
926 }
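#
# Editorial note: in this BMI variant Ch(e,f,g) is accumulated as the sum
# (e&f)+(~e&g) via two lea instructions; the two terms never have a set bit
# in common, so the addition is equivalent to the OR/XOR of the canonical
# definition.  Sigma0(a) is likewise carried over in $a1 and only added into
# $a at the start of the next round, which is why a final "$a+=$a1" is needed
# after the last round (see the note above).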
927
928 $code.=<<___;
929 .type   ${func}_avx2,\@function,6
930 .align  64
931 ${func}_avx2:
932 .Lavx2_shortcut:
933         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
934         push    %rbx
935         push    %rbp
936         push    %r12
937         push    %r13
938         push    %r14
939         push    %r15
940         mov     %rsp,%r11               # copy %rsp
941         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
942         and     \$-256*$SZ,%rsp         # align stack frame
943         add     \$`2*$SZ*($rounds-8)`,%rsp
944
945         shl     \$6,$len
946         sub     $inp,$out               # re-bias
947         sub     $inp,$in0
948         add     $inp,$len               # end of input
949
950         #mov    $inp,$_inp              # saved later
951         #mov    $out,$_out              # kept in $offload
952         mov     $len,$_end
953         #mov    $key,$_key              # remains resident in $inp register
954         mov     $ivp,$_ivp
955         mov     $ctx,$_ctx
956         mov     $in0,$_in0
957         mov     %r11,$_rsp
958 ___
959 $code.=<<___ if ($win64);
960         movaps  %xmm6,`$framesz+16*0`(%rsp)
961         movaps  %xmm7,`$framesz+16*1`(%rsp)
962         movaps  %xmm8,`$framesz+16*2`(%rsp)
963         movaps  %xmm9,`$framesz+16*3`(%rsp)
964         movaps  %xmm10,`$framesz+16*4`(%rsp)
965         movaps  %xmm11,`$framesz+16*5`(%rsp)
966         movaps  %xmm12,`$framesz+16*6`(%rsp)
967         movaps  %xmm13,`$framesz+16*7`(%rsp)
968         movaps  %xmm14,`$framesz+16*8`(%rsp)
969         movaps  %xmm15,`$framesz+16*9`(%rsp)
970 ___
971 $code.=<<___;
972 .Lprologue_avx2:
973         vzeroall
974
975         mov     $inp,%r13               # borrow $a0
976         vpinsrq \$1,$out,$offload,$offload
977         lea     0x80($key),$inp         # size optimization, reassign
978         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
979         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
980         mov     $ctx,%r15               # borrow $a2
981         mov     $in0,%rsi               # borrow $a3
982         vmovdqu ($ivp),$iv              # load IV
983         lea     -9(%r14),%r14
984
985         vmovdqa 0x00(%r12,%r14,8),$mask14
986         vmovdqa 0x10(%r12,%r14,8),$mask12
987         vmovdqa 0x20(%r12,%r14,8),$mask10
988
989         sub     \$-16*$SZ,%r13          # inp++, size optimization
990         mov     $SZ*0(%r15),$A
991         lea     (%rsi,%r13),%r12        # borrow $a0
992         mov     $SZ*1(%r15),$B
993         cmp     $len,%r13               # $_end
994         mov     $SZ*2(%r15),$C
995         cmove   %rsp,%r12               # next block or random data
996         mov     $SZ*3(%r15),$D
997         mov     $SZ*4(%r15),$E
998         mov     $SZ*5(%r15),$F
999         mov     $SZ*6(%r15),$G
1000         mov     $SZ*7(%r15),$H
1001         vmovdqu 0x00-0x80($inp),$roundkey
1002 ___
1003                                         if ($SZ==4) {   # SHA256
1004     my @X = map("%ymm$_",(0..3));
1005     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1006
1007 $code.=<<___;
1008         jmp     .Loop_avx2
1009 .align  16
1010 .Loop_avx2:
1011         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1012         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1013         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1014         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1015         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1016
1017         vinserti128     \$1,(%r12),@X[0],@X[0]
1018         vinserti128     \$1,16(%r12),@X[1],@X[1]
1019          vpshufb        $t3,@X[0],@X[0]
1020         vinserti128     \$1,32(%r12),@X[2],@X[2]
1021          vpshufb        $t3,@X[1],@X[1]
1022         vinserti128     \$1,48(%r12),@X[3],@X[3]
1023
1024         lea     $TABLE(%rip),$Tbl
1025         vpshufb $t3,@X[2],@X[2]
1026         lea     -16*$SZ(%r13),%r13
1027         vpaddd  0x00($Tbl),@X[0],$t0
1028         vpshufb $t3,@X[3],@X[3]
1029         vpaddd  0x20($Tbl),@X[1],$t1
1030         vpaddd  0x40($Tbl),@X[2],$t2
1031         vpaddd  0x60($Tbl),@X[3],$t3
1032         vmovdqa $t0,0x00(%rsp)
1033         xor     $a1,$a1
1034         vmovdqa $t1,0x20(%rsp)
1035         lea     -$PUSH8(%rsp),%rsp
1036         mov     $B,$a3
1037         vmovdqa $t2,0x00(%rsp)
1038         xor     $C,$a3                  # magic
1039         vmovdqa $t3,0x20(%rsp)
1040         mov     $F,$a4
1041         sub     \$-16*2*$SZ,$Tbl        # size optimization
1042         jmp     .Lavx2_00_47
1043
1044 .align  16
1045 .Lavx2_00_47:
1046         vmovdqu (%r13),$inout
1047         vpinsrq \$0,%r13,$offload,$offload
1048 ___
1049
1050 sub AVX2_256_00_47 () {
1051 my $j = shift;
1052 my $body = shift;
1053 my @X = @_;
1054 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1055 my $base = "+2*$PUSH8(%rsp)";
1056
1057         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1058         foreach (Xupdate_256_AVX()) {           # 29 instructions
1059             eval;
1060             eval(shift(@insns));
1061             eval(shift(@insns));
1062             eval(shift(@insns));
1063         }
1064         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1065           foreach (@insns) { eval; }            # remaining instructions
1066         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1067 }
1068     $aesni_cbc_idx=0;
1069     for ($i=0,$j=0; $j<4; $j++) {
1070         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1071         push(@X,shift(@X));                     # rotate(@X)
1072     }
1073         &vmovq          ("%r13",$offload);      # borrow $a0
1074         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1075         &vpand          ($temp,$temp,$mask14);
1076         &vpor           ($iv,$iv,$temp);
1077         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1078         &lea            ("%r13","16(%r13)");    # inp++
1079
1080         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1081         &cmpb   (($SZ-1)."($Tbl)",0);
1082         &jne    (".Lavx2_00_47");
1083
1084         &vmovdqu        ($inout,"(%r13)");
1085         &vpinsrq        ($offload,$offload,"%r13",0);
1086
1087     $aesni_cbc_idx=0;
1088     for ($i=0; $i<16; ) {
1089         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1090         foreach(bodyx_00_15()) { eval; }
1091     }
1092                                         }
1093 $code.=<<___;
1094         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1095         vmovq   $offload,%r13                   # $_inp, borrow $a0
1096         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1097         add     $a1,$A
1098         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1099
1100         vpand   $mask14,$temp,$temp
1101         vpor    $temp,$iv,$iv
1102         vmovdqu $iv,(%r12,%r13)                 # write output
1103         lea     16(%r13),%r13
1104
1105         add     $SZ*0(%r15),$A
1106         add     $SZ*1(%r15),$B
1107         add     $SZ*2(%r15),$C
1108         add     $SZ*3(%r15),$D
1109         add     $SZ*4(%r15),$E
1110         add     $SZ*5(%r15),$F
1111         add     $SZ*6(%r15),$G
1112         add     $SZ*7(%r15),$H
1113
1114         mov     $A,$SZ*0(%r15)
1115         mov     $B,$SZ*1(%r15)
1116         mov     $C,$SZ*2(%r15)
1117         mov     $D,$SZ*3(%r15)
1118         mov     $E,$SZ*4(%r15)
1119         mov     $F,$SZ*5(%r15)
1120         mov     $G,$SZ*6(%r15)
1121         mov     $H,$SZ*7(%r15)
1122
1123         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1124         je      .Ldone_avx2
1125
1126         xor     $a1,$a1
1127         mov     $B,$a3
1128         mov     $F,$a4
1129         xor     $C,$a3                  # magic
1130         jmp     .Lower_avx2
1131 .align  16
1132 .Lower_avx2:
1133         vmovdqu (%r13),$inout
1134         vpinsrq \$0,%r13,$offload,$offload
1135 ___
1136     $aesni_cbc_idx=0;
1137     for ($i=0; $i<16; ) {
1138         my $base="+16($Tbl)";
1139         foreach(bodyx_00_15()) { eval; }
1140         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1141     }
1142 $code.=<<___;
1143         vmovq   $offload,%r13                   # borrow $a0
1144         vpextrq \$1,$offload,%r15               # borrow $a2
1145         vpand   $mask14,$temp,$temp
1146         vpor    $temp,$iv,$iv
1147         lea     -$PUSH8($Tbl),$Tbl
1148         vmovdqu $iv,(%r15,%r13)                 # write output
1149         lea     16(%r13),%r13                   # inp++
1150         cmp     %rsp,$Tbl
1151         jae     .Lower_avx2
1152
1153         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1154         lea     16*$SZ(%r13),%r13
1155         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1156         add     $a1,$A
1157         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1158
1159         add     $SZ*0(%r15),$A
1160         add     $SZ*1(%r15),$B
1161         add     $SZ*2(%r15),$C
1162         add     $SZ*3(%r15),$D
1163         add     $SZ*4(%r15),$E
1164         add     $SZ*5(%r15),$F
1165         add     $SZ*6(%r15),$G
1166         lea     (%rsi,%r13),%r12
1167         add     $SZ*7(%r15),$H
1168
1169         cmp     $_end,%r13
1170
1171         mov     $A,$SZ*0(%r15)
1172         cmove   %rsp,%r12               # next block or stale data
1173         mov     $B,$SZ*1(%r15)
1174         mov     $C,$SZ*2(%r15)
1175         mov     $D,$SZ*3(%r15)
1176         mov     $E,$SZ*4(%r15)
1177         mov     $F,$SZ*5(%r15)
1178         mov     $G,$SZ*6(%r15)
1179         mov     $H,$SZ*7(%r15)
1180
1181         jbe     .Loop_avx2
1182         lea     (%rsp),$Tbl
1183
1184 .Ldone_avx2:
1185         lea     ($Tbl),%rsp
1186         mov     $_ivp,$ivp
1187         mov     $_rsp,%rsi
1188         vmovdqu $iv,($ivp)              # output IV
1189         vzeroall
1190 ___
1191 $code.=<<___ if ($win64);
1192         movaps  `$framesz+16*0`(%rsp),%xmm6
1193         movaps  `$framesz+16*1`(%rsp),%xmm7
1194         movaps  `$framesz+16*2`(%rsp),%xmm8
1195         movaps  `$framesz+16*3`(%rsp),%xmm9
1196         movaps  `$framesz+16*4`(%rsp),%xmm10
1197         movaps  `$framesz+16*5`(%rsp),%xmm11
1198         movaps  `$framesz+16*6`(%rsp),%xmm12
1199         movaps  `$framesz+16*7`(%rsp),%xmm13
1200         movaps  `$framesz+16*8`(%rsp),%xmm14
1201         movaps  `$framesz+16*9`(%rsp),%xmm15
1202 ___
1203 $code.=<<___;
1204         mov     (%rsi),%r15
1205         mov     8(%rsi),%r14
1206         mov     16(%rsi),%r13
1207         mov     24(%rsi),%r12
1208         mov     32(%rsi),%rbp
1209         mov     40(%rsi),%rbx
1210         lea     48(%rsi),%rsp
1211 .Lepilogue_avx2:
1212         ret
1213 .size   ${func}_avx2,.-${func}_avx2
1214 ___
1215 }}
1216 }}
1217 {{
1218 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1219
1220 my ($rounds,$Tbl)=("%r11d","%rbx");
1221
1222 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1223 my @rndkey=("%xmm4","%xmm5");
1224 my $r=0;
1225 my $sn=0;
1226
1227 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1228 my @MSG=map("%xmm$_",(10..13));
1229
1230 my $aesenc=sub {
1231   use integer;
1232   my ($n,$k)=($r/10,$r%10);
1233     if ($k==0) {
1234       $code.=<<___;
1235         movups          `16*$n`($in0),$in               # load input
1236         xorps           $rndkey0,$in
1237 ___
1238       $code.=<<___ if ($n);
1239         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1240 ___
1241       $code.=<<___;
1242         xorps           $in,$iv
1243         movups          `32+16*$k-112`($key),$rndkey[1]
1244         aesenc          $rndkey[0],$iv
1245 ___
1246     } elsif ($k==9) {
1247       $sn++;
1248       $code.=<<___;
1249         cmp             \$11,$rounds
1250         jb              .Laesenclast$sn
1251         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1252         aesenc          $rndkey[0],$iv
1253         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1254         aesenc          $rndkey[1],$iv
1255         je              .Laesenclast$sn
1256         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1257         aesenc          $rndkey[0],$iv
1258         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1259         aesenc          $rndkey[1],$iv
1260 .Laesenclast$sn:
1261         aesenclast      $rndkey[0],$iv
1262         movups          16-112($key),$rndkey[1]         # forward reference
1263         nop
1264 ___
1265     } else {
1266       $code.=<<___;
1267         movups          `32+16*$k-112`($key),$rndkey[1]
1268         aesenc          $rndkey[0],$iv
1269 ___
1270     }
1271     $r++;       unshift(@rndkey,pop(@rndkey));
1272 };
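#
# Editorial note: each invocation of this closure emits one step of a single
# AES-CBC encryption interleaved with the SHA256 rounds.  $r/10 selects which
# of the four 16-byte blocks of the current 64-byte chunk is being encrypted,
# and the $k==9 case finishes that block, branching on $rounds so that the
# same instruction stream serves 10-, 12- and 14-round key schedules.  Exactly
# 40 such steps are issued per 64-byte chunk; see the "while ($r<40)" top-up
# further down.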
1273
1274 if ($shaext) {
1275 my $Tbl="%rax";
1276
1277 $code.=<<___;
1278 .type   ${func}_shaext,\@function,6
1279 .align  32
1280 ${func}_shaext:
1281         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1282 ___
1283 $code.=<<___ if ($win64);
1284         lea     `-8-10*16`(%rsp),%rsp
1285         movaps  %xmm6,-8-10*16(%rax)
1286         movaps  %xmm7,-8-9*16(%rax)
1287         movaps  %xmm8,-8-8*16(%rax)
1288         movaps  %xmm9,-8-7*16(%rax)
1289         movaps  %xmm10,-8-6*16(%rax)
1290         movaps  %xmm11,-8-5*16(%rax)
1291         movaps  %xmm12,-8-4*16(%rax)
1292         movaps  %xmm13,-8-3*16(%rax)
1293         movaps  %xmm14,-8-2*16(%rax)
1294         movaps  %xmm15,-8-1*16(%rax)
1295 .Lprologue_shaext:
1296 ___
1297 $code.=<<___;
1298         lea             K256+0x80(%rip),$Tbl
1299         movdqu          ($ctx),$ABEF            # DCBA
1300         movdqu          16($ctx),$CDGH          # HGFE
1301         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1302
1303         mov             240($key),$rounds
1304         sub             $in0,$out
1305         movups          ($key),$rndkey0         # $key[0]
1306         movups          16($key),$rndkey[0]     # forward reference
1307         lea             112($key),$key          # size optimization
1308
1309         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1310         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1311         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1312         movdqa          $TMP,$BSWAP             # offload
1313         palignr         \$8,$CDGH,$ABEF         # ABEF
1314         punpcklqdq      $Wi,$CDGH               # CDGH
1315
1316         jmp     .Loop_shaext
1317
1318 .align  16
1319 .Loop_shaext:
1320         movdqu          ($inp),@MSG[0]
1321         movdqu          0x10($inp),@MSG[1]
1322         movdqu          0x20($inp),@MSG[2]
1323         pshufb          $TMP,@MSG[0]
1324         movdqu          0x30($inp),@MSG[3]
1325
1326         movdqa          0*32-0x80($Tbl),$Wi
1327         paddd           @MSG[0],$Wi
1328         pshufb          $TMP,@MSG[1]
1329         movdqa          $CDGH,$CDGH_SAVE        # offload
1330         movdqa          $ABEF,$ABEF_SAVE        # offload
1331 ___
1332         &$aesenc();
1333 $code.=<<___;
1334         sha256rnds2     $ABEF,$CDGH             # 0-3
1335         pshufd          \$0x0e,$Wi,$Wi
1336 ___
1337         &$aesenc();
1338 $code.=<<___;
1339         sha256rnds2     $CDGH,$ABEF
1340
1341         movdqa          1*32-0x80($Tbl),$Wi
1342         paddd           @MSG[1],$Wi
1343         pshufb          $TMP,@MSG[2]
1344         lea             0x40($inp),$inp
1345 ___
1346         &$aesenc();
1347 $code.=<<___;
1348         sha256rnds2     $ABEF,$CDGH             # 4-7
1349         pshufd          \$0x0e,$Wi,$Wi
1350 ___
1351         &$aesenc();
1352 $code.=<<___;
1353         sha256rnds2     $CDGH,$ABEF
1354
1355         movdqa          2*32-0x80($Tbl),$Wi
1356         paddd           @MSG[2],$Wi
1357         pshufb          $TMP,@MSG[3]
1358         sha256msg1      @MSG[1],@MSG[0]
1359 ___
1360         &$aesenc();
1361 $code.=<<___;
1362         sha256rnds2     $ABEF,$CDGH             # 8-11
1363         pshufd          \$0x0e,$Wi,$Wi
1364         movdqa          @MSG[3],$TMP
1365         palignr         \$4,@MSG[2],$TMP
1366         paddd           $TMP,@MSG[0]
1367 ___
1368         &$aesenc();
1369 $code.=<<___;
1370         sha256rnds2     $CDGH,$ABEF
1371
1372         movdqa          3*32-0x80($Tbl),$Wi
1373         paddd           @MSG[3],$Wi
1374         sha256msg2      @MSG[3],@MSG[0]
1375         sha256msg1      @MSG[2],@MSG[1]
1376 ___
1377         &$aesenc();
1378 $code.=<<___;
1379         sha256rnds2     $ABEF,$CDGH             # 12-15
1380         pshufd          \$0x0e,$Wi,$Wi
1381 ___
1382         &$aesenc();
1383 $code.=<<___;
1384         movdqa          @MSG[0],$TMP
1385         palignr         \$4,@MSG[3],$TMP
1386         paddd           $TMP,@MSG[1]
1387         sha256rnds2     $CDGH,$ABEF
1388 ___
1389 for($i=4;$i<16-3;$i++) {
1390         &$aesenc()      if (($r%10)==0);
1391 $code.=<<___;
1392         movdqa          $i*32-0x80($Tbl),$Wi
1393         paddd           @MSG[0],$Wi
1394         sha256msg2      @MSG[0],@MSG[1]
1395         sha256msg1      @MSG[3],@MSG[2]
1396 ___
1397         &$aesenc();
1398 $code.=<<___;
1399         sha256rnds2     $ABEF,$CDGH             # 16-19...
1400         pshufd          \$0x0e,$Wi,$Wi
1401         movdqa          @MSG[1],$TMP
1402         palignr         \$4,@MSG[0],$TMP
1403         paddd           $TMP,@MSG[2]
1404 ___
1405         &$aesenc();
1406         &$aesenc()      if ($r==19);
1407 $code.=<<___;
1408         sha256rnds2     $CDGH,$ABEF
1409 ___
1410         push(@MSG,shift(@MSG));
1411 }
1412 $code.=<<___;
1413         movdqa          13*32-0x80($Tbl),$Wi
1414         paddd           @MSG[0],$Wi
1415         sha256msg2      @MSG[0],@MSG[1]
1416         sha256msg1      @MSG[3],@MSG[2]
1417 ___
1418         &$aesenc();
1419 $code.=<<___;
1420         sha256rnds2     $ABEF,$CDGH             # 52-55
1421         pshufd          \$0x0e,$Wi,$Wi
1422         movdqa          @MSG[1],$TMP
1423         palignr         \$4,@MSG[0],$TMP
1424         paddd           $TMP,@MSG[2]
1425 ___
1426         &$aesenc();
1427         &$aesenc();
1428 $code.=<<___;
1429         sha256rnds2     $CDGH,$ABEF
1430
1431         movdqa          14*32-0x80($Tbl),$Wi
1432         paddd           @MSG[1],$Wi
1433         sha256msg2      @MSG[1],@MSG[2]
1434         movdqa          $BSWAP,$TMP
1435 ___
1436         &$aesenc();
1437 $code.=<<___;
1438         sha256rnds2     $ABEF,$CDGH             # 56-59
1439         pshufd          \$0x0e,$Wi,$Wi
1440 ___
1441         &$aesenc();
1442 $code.=<<___;
1443         sha256rnds2     $CDGH,$ABEF
1444
1445         movdqa          15*32-0x80($Tbl),$Wi
1446         paddd           @MSG[2],$Wi
1447 ___
1448         &$aesenc();
1449         &$aesenc();
1450 $code.=<<___;
1451         sha256rnds2     $ABEF,$CDGH             # 60-63
1452         pshufd          \$0x0e,$Wi,$Wi
1453 ___
1454         &$aesenc();
1455 $code.=<<___;
1456         sha256rnds2     $CDGH,$ABEF
1457         #pxor           $CDGH,$rndkey0          # black magic
1458 ___
1459         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1460 $code.=<<___;
1461         #xorps          $CDGH,$rndkey0          # black magic
1462         paddd           $CDGH_SAVE,$CDGH
1463         paddd           $ABEF_SAVE,$ABEF
1464
1465         dec             $len
1466         movups          $iv,48($out,$in0)       # write output
1467         lea             64($in0),$in0
1468         jnz             .Loop_shaext
1469
1470         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1471         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1472         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1473         punpckhqdq      $CDGH,$ABEF             # DCBA
1474         palignr         \$8,$TMP,$CDGH          # HGFE
1475
1476         movups          $iv,($ivp)              # write IV
1477         movdqu          $ABEF,($ctx)
1478         movdqu          $CDGH,16($ctx)
1479 ___
1480 $code.=<<___ if ($win64);
1481         movaps  0*16(%rsp),%xmm6
1482         movaps  1*16(%rsp),%xmm7
1483         movaps  2*16(%rsp),%xmm8
1484         movaps  3*16(%rsp),%xmm9
1485         movaps  4*16(%rsp),%xmm10
1486         movaps  5*16(%rsp),%xmm11
1487         movaps  6*16(%rsp),%xmm12
1488         movaps  7*16(%rsp),%xmm13
1489         movaps  8*16(%rsp),%xmm14
1490         movaps  9*16(%rsp),%xmm15
1491         lea     8+10*16(%rsp),%rsp
1492 .Lepilogue_shaext:
1493 ___
1494 $code.=<<___;
1495         ret
1496 .size   ${func}_shaext,.-${func}_shaext
1497 ___
1498 }
1499 }}}}}
1500
1501 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1502 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1503 if ($win64 && $avx) {
1504 $rec="%rcx";
1505 $frame="%rdx";
1506 $context="%r8";
1507 $disp="%r9";
1508
1509 $code.=<<___;
1510 .extern __imp_RtlVirtualUnwind
1511 .type   se_handler,\@abi-omnipotent
1512 .align  16
1513 se_handler:
1514         push    %rsi
1515         push    %rdi
1516         push    %rbx
1517         push    %rbp
1518         push    %r12
1519         push    %r13
1520         push    %r14
1521         push    %r15
1522         pushfq
1523         sub     \$64,%rsp
1524
1525         mov     120($context),%rax      # pull context->Rax
1526         mov     248($context),%rbx      # pull context->Rip
1527
1528         mov     8($disp),%rsi           # disp->ImageBase
1529         mov     56($disp),%r11          # disp->HandlerData
1530
1531         mov     0(%r11),%r10d           # HandlerData[0]
1532         lea     (%rsi,%r10),%r10        # prologue label
1533         cmp     %r10,%rbx               # context->Rip<prologue label
1534         jb      .Lin_prologue
1535
1536         mov     152($context),%rax      # pull context->Rsp
1537
1538         mov     4(%r11),%r10d           # HandlerData[1]
1539         lea     (%rsi,%r10),%r10        # epilogue label
1540         cmp     %r10,%rbx               # context->Rip>=epilogue label
1541         jae     .Lin_prologue
1542 ___
1543 $code.=<<___ if ($shaext);
1544         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1545         cmp     %r10,%rbx
1546         jb      .Lnot_in_shaext
1547
1548         lea     (%rax),%rsi
1549         lea     512($context),%rdi      # &context.Xmm6
1550         mov     \$20,%ecx
1551         .long   0xa548f3fc              # cld; rep movsq
1552         lea     168(%rax),%rax          # adjust stack pointer
1553         jmp     .Lin_prologue
1554 .Lnot_in_shaext:
1555 ___
1556 $code.=<<___ if ($avx>1);
1557         lea     .Lavx2_shortcut(%rip),%r10
1558         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1559         jb      .Lnot_in_avx2
1560
1561         and     \$-256*$SZ,%rax
1562         add     \$`2*$SZ*($rounds-8)`,%rax
1563 .Lnot_in_avx2:
1564 ___
1565 $code.=<<___;
1566         mov     %rax,%rsi               # put aside Rsp
1567         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1568         lea     48(%rax),%rax
1569
1570         mov     -8(%rax),%rbx
1571         mov     -16(%rax),%rbp
1572         mov     -24(%rax),%r12
1573         mov     -32(%rax),%r13
1574         mov     -40(%rax),%r14
1575         mov     -48(%rax),%r15
1576         mov     %rbx,144($context)      # restore context->Rbx
1577         mov     %rbp,160($context)      # restore context->Rbp
1578         mov     %r12,216($context)      # restore context->R12
1579         mov     %r13,224($context)      # restore context->R13
1580         mov     %r14,232($context)      # restore context->R14
1581         mov     %r15,240($context)      # restore context->R15
1582
1583         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1584         lea     512($context),%rdi      # &context.Xmm6
1585         mov     \$20,%ecx
1586         .long   0xa548f3fc              # cld; rep movsq
1587
1588 .Lin_prologue:
1589         mov     8(%rax),%rdi
1590         mov     16(%rax),%rsi
1591         mov     %rax,152($context)      # restore context->Rsp
1592         mov     %rsi,168($context)      # restore context->Rsi
1593         mov     %rdi,176($context)      # restore context->Rdi
1594
1595         mov     40($disp),%rdi          # disp->ContextRecord
1596         mov     $context,%rsi           # context
1597         mov     \$154,%ecx              # sizeof(CONTEXT)
1598         .long   0xa548f3fc              # cld; rep movsq
1599
1600         mov     $disp,%rsi
1601         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1602         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1603         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1604         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1605         mov     40(%rsi),%r10           # disp->ContextRecord
1606         lea     56(%rsi),%r11           # &disp->HandlerData
1607         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1608         mov     %r10,32(%rsp)           # arg5
1609         mov     %r11,40(%rsp)           # arg6
1610         mov     %r12,48(%rsp)           # arg7
1611         mov     %rcx,56(%rsp)           # arg8, (NULL)
1612         call    *__imp_RtlVirtualUnwind(%rip)
1613
1614         mov     \$1,%eax                # ExceptionContinueSearch
1615         add     \$64,%rsp
1616         popfq
1617         pop     %r15
1618         pop     %r14
1619         pop     %r13
1620         pop     %r12
1621         pop     %rbp
1622         pop     %rbx
1623         pop     %rdi
1624         pop     %rsi
1625         ret
1626 .size   se_handler,.-se_handler
1627
1628 .section        .pdata
1629         .rva    .LSEH_begin_${func}_xop
1630         .rva    .LSEH_end_${func}_xop
1631         .rva    .LSEH_info_${func}_xop
1632
1633         .rva    .LSEH_begin_${func}_avx
1634         .rva    .LSEH_end_${func}_avx
1635         .rva    .LSEH_info_${func}_avx
1636 ___
1637 $code.=<<___ if ($avx>1);
1638         .rva    .LSEH_begin_${func}_avx2
1639         .rva    .LSEH_end_${func}_avx2
1640         .rva    .LSEH_info_${func}_avx2
1641 ___
1642 $code.=<<___ if ($shaext);
1643         .rva    .LSEH_begin_${func}_shaext
1644         .rva    .LSEH_end_${func}_shaext
1645         .rva    .LSEH_info_${func}_shaext
1646 ___
1647 $code.=<<___;
1648 .section        .xdata
1649 .align  8
1650 .LSEH_info_${func}_xop:
1651         .byte   9,0,0,0
1652         .rva    se_handler
1653         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1654
1655 .LSEH_info_${func}_avx:
1656         .byte   9,0,0,0
1657         .rva    se_handler
1658         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1659 ___
1660 $code.=<<___ if ($avx>1);
1661 .LSEH_info_${func}_avx2:
1662         .byte   9,0,0,0
1663         .rva    se_handler
1664         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1665 ___
1666 $code.=<<___ if ($shaext);
1667 .LSEH_info_${func}_shaext:
1668         .byte   9,0,0,0
1669         .rva    se_handler
1670         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1671 ___
1672 }
1673
1674 ####################################################################
1675 sub rex {
1676   local *opcode=shift;
1677   my ($dst,$src)=@_;
1678   my $rex=0;
1679
1680     $rex|=0x04                  if($dst>=8);
1681     $rex|=0x01                  if($src>=8);
1682     unshift @opcode,$rex|0x40   if($rex);
1683 }
1684
1685 {
1686   my %opcodelet = (
1687                 "sha256rnds2" => 0xcb,
1688                 "sha256msg1"  => 0xcc,
1689                 "sha256msg2"  => 0xcd   );
1690
1691   sub sha256op38 {
1692     my $instr = shift;
1693
1694     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1695       my @opcode=(0x0f,0x38);
1696         rex(\@opcode,$2,$1);
1697         push @opcode,$opcodelet{$instr};
1698         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1699         return ".byte\t".join(',',@opcode);
1700     } else {
1701         return $instr."\t".@_[0];
1702     }
1703   }
1704 }
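#
# Editorial example: under this fallback, "sha256rnds2 %xmm1,%xmm2" is turned
# into ".byte 0x0f,0x38,0xcb,0xd1" (ModR/M 0xd1 = 0xc0 | src 1 | dst 2<<3), so
# the module still assembles with toolchains that predate the SHA extensions.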
1705
1706 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1707 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1708 print $code;
1709 close STDOUT;