x86_64 assembly pack: tune clang version detection.
crypto/aes/asm/aesni-sha256-x86_64.pl (openssl.git)
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and hence better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into it. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
29 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
30 #
31 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
32 #       Westmere is omitted from the loop; this is because the gain
33 #       was not estimated to be high enough to justify the effort;
34 # (**)  these are EVP-free results; results obtained with 'speed
35 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
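#
# (The "gain" column is simply the ratio of the summed standalone costs
# to the stitched result, e.g. for AES-128 on Haswell:
# (4.43+7.80)/8.79 = 1.39, i.e. +39%.)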
36
37 $flavour = shift;
38 $output  = shift;
39 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
40
41 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
42
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
46 die "can't locate x86_64-xlate.pl";
47
48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50         $avx = ($1>=2.19) + ($1>=2.22);
51 }
52
53 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55         $avx = ($1>=2.09) + ($1>=2.10);
56 }
57
58 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60         $avx = ($1>=10) + ($1>=12);
61 }
62
63 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
64         $avx = ($2>=3.0) + ($2>3.0);
65 }
66
67 $shaext=$avx;   ### set to zero if compiling for 1.0.1
68 $avx=1          if (!$shaext && $avx);
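
# Capability summary (descriptive note): $avx ends up 0, 1 or 2 -- no AVX
# code, XOP/AVX code, or additionally the AVX2+BMI code path -- depending
# on whether the assembler in use is deemed recent enough (GNU as
# 2.19/2.22, nasm 2.09/2.10, ml64 10/12, or a clang-based driver
# reporting version 3.0/newer than 3.0).  $shaext additionally gates the
# SHA-NI code path and its Win64 SEH entries.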
69
70 open OUT,"| \"$^X\" $xlate $flavour $output";
71 *STDOUT=*OUT;
72
73 $func="aesni_cbc_sha256_enc";
74 $TABLE="K256";
75 $SZ=4;
76 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
77                                 "%r8d","%r9d","%r10d","%r11d");
78 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84
85 ########################################################################
86 # void aesni_cbc_sha256_enc(const void *inp,
87 #                       void *out,
88 #                       size_t length,
89 #                       const AES_KEY *key,
90 #                       unsigned char *iv,
91 #                       SHA256_CTX *ctx,
92 #                       const void *in0);
93 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
94 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
95
96 $Tbl="%rbp";
97
98 $_inp="16*$SZ+0*8(%rsp)";
99 $_out="16*$SZ+1*8(%rsp)";
100 $_end="16*$SZ+2*8(%rsp)";
101 $_key="16*$SZ+3*8(%rsp)";
102 $_ivp="16*$SZ+4*8(%rsp)";
103 $_ctx="16*$SZ+5*8(%rsp)";
104 $_in0="16*$SZ+6*8(%rsp)";
105 $_rsp="16*$SZ+7*8(%rsp)";
106 $framesz=16*$SZ+8*8;
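
# Frame layout recap (descriptive comment): the bottom 16*$SZ bytes hold
# the pre-scheduled message words X[i]+K[i], followed by eight 8-byte
# slots for the saved inp/out/end/key/ivp/ctx/in0 pointers and the
# caller's %rsp; the Win64 paths reserve another 10*16 bytes above
# $framesz for the non-volatile %xmm6-%xmm15.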
107
108 $code=<<___;
109 .text
110
111 .extern OPENSSL_ia32cap_P
112 .globl  $func
113 .type   $func,\@abi-omnipotent
114 .align  16
115 $func:
116 ___
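# What follows is a small dispatcher: it consults OPENSSL_ia32cap_P and
# tail-jumps to the SHAEXT, XOP, AVX2 or AVX flavour of the routine.
# Called with a NULL first argument it acts as a probe instead, returning
# non-zero in %eax only if the SIMD code paths were compiled in.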
117                                                 if ($avx) {
118 $code.=<<___;
119         lea     OPENSSL_ia32cap_P(%rip),%r11
120         mov     \$1,%eax
121         cmp     \$0,`$win64?"%rcx":"%rdi"`
122         je      .Lprobe
123         mov     0(%r11),%eax
124         mov     4(%r11),%r10
125 ___
126 $code.=<<___ if ($shaext);
127         bt      \$61,%r10                       # check for SHA
128         jc      ${func}_shaext
129 ___
130 $code.=<<___;
131         mov     %r10,%r11
132         shr     \$32,%r11
133
134         test    \$`1<<11`,%r10d                 # check for XOP
135         jnz     ${func}_xop
136 ___
137 $code.=<<___ if ($avx>1);
138         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
139         cmp     \$`1<<8|1<<5|1<<3`,%r11d
140         je      ${func}_avx2
141 ___
142 $code.=<<___;
143         and     \$`1<<28`,%r10d                 # check for AVX
144         jnz     ${func}_avx
145         ud2
146 ___
147                                                 }
148 $code.=<<___;
149         xor     %eax,%eax
150         cmp     \$0,`$win64?"%rcx":"%rdi"`
151         je      .Lprobe
152         ud2
153 .Lprobe:
154         ret
155 .size   $func,.-$func
156
157 .align  64
158 .type   $TABLE,\@object
159 $TABLE:
160         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
161         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
162         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
163         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
164         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
166         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
167         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
168         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
169         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
170         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
171         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
172         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
173         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
174         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
175         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
176         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
177         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
178         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
179         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
180         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
181         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
182         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
183         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
184         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
186         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
187         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
188         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
189         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
190         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
191         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
192
193         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
194         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
195         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
196         .long   0,0,0,0,   0,0,0,0
197         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
198 .align  64
199 ___
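
# A note on the table above (descriptive): every K256 constant and the
# byte-swap mask are stored twice so that the 256-bit loads in the AVX2
# path see the same value in both 128-bit lanes; the trailing zero/-1
# words are the raw material for the $mask10/$mask12/$mask14 selectors
# loaded in the prologues.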
200
201 ######################################################################
202 # SIMD code paths
203 #
204 {{{
205 ($iv,$inout,$roundkey,$temp,
206  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
207
208 $aesni_cbc_idx=0;
209 @aesni_cbc_block = (
210 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
211 ##      &vmovdqu        ($inout,($inp));
212 ##      &mov            ($_inp,$inp);
213
214         '&vpxor         ($inout,$inout,$roundkey);'.
215         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
216
217         '&vpxor         ($inout,$inout,$iv);',
218
219         '&vaesenc       ($inout,$inout,$roundkey);'.
220         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
230
231         '&vaesenc       ($inout,$inout,$roundkey);'.
232         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
233
234         '&vaesenc       ($inout,$inout,$roundkey);'.
235         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
236
237         '&vaesenc       ($inout,$inout,$roundkey);'.
238         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
239
240         '&vaesenc       ($inout,$inout,$roundkey);'.
241         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
242
243         '&vaesenc       ($inout,$inout,$roundkey);'.
244         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
245
246         '&vaesenclast   ($temp,$inout,$roundkey);'.
247         ' &vaesenc      ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
249
250         '&vpand         ($iv,$temp,$mask10);'.
251         ' &vaesenc      ($inout,$inout,$roundkey);'.
252         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
253
254         '&vaesenclast   ($temp,$inout,$roundkey);'.
255         ' &vaesenc      ($inout,$inout,$roundkey);'.
256         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
257
258         '&vpand         ($temp,$temp,$mask12);'.
259         ' &vaesenc      ($inout,$inout,$roundkey);'.
260          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
261
262         '&vpor          ($iv,$iv,$temp);'.
263         ' &vaesenclast  ($temp,$inout,$roundkey);'.
264         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
265
266 ##      &mov            ($inp,$_inp);
267 ##      &mov            ($out,$_out);
268 ##      &vpand          ($temp,$temp,$mask14);
269 ##      &vpor           ($iv,$iv,$temp);
270 ##      &vmovdqu        (($out,$inp),$iv);
271 ##      &lea            ($inp,16($inp));
272 );
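
# (Descriptive note: the 16 fragments above are doled out one per SHA256
# round via $aesni_cbc_idx and together CBC-encrypt a single 16-byte
# block -- i.e. four AES blocks, 64 bytes, per 64-byte SHA256 block.
# Enough rounds are always executed for a 256-bit key; the
# $mask10/$mask12/$mask14 blend then keeps the aesenclast result that
# matches the actual key length.)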
273
274 my $a4=$T1;
275 my ($a,$b,$c,$d,$e,$f,$g,$h);
276
277 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
278 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
279   my $arg = pop;
280     $arg = "\$$arg" if ($arg*1 eq $arg);
281     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
282 }
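
# (AUTOLOAD turns any undefined &mnemonic(dst,src,...) call into a line
# of assembly appended to $code: the last perlasm argument is moved to
# the front and the rest reversed, which converts the dst-first calling
# convention into AT&T operand order, and bare numbers gain a '$' prefix.)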
283
284 sub body_00_15 () {
285         (
286         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
287
288         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
289         '&mov   ($a,$a1)',
290         '&mov   ($a4,$f)',
291
292         '&xor   ($a0,$e)',
293         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
294         '&xor   ($a4,$g)',                      # f^g
295
296         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
297         '&xor   ($a1,$a)',
298         '&and   ($a4,$e)',                      # (f^g)&e
299
300         @aesni_cbc_block[$aesni_cbc_idx++].
301         '&xor   ($a0,$e)',
302         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
303         '&mov   ($a2,$a)',
304
305         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
306         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
307         '&xor   ($a2,$b)',                      # a^b, b^c in next round
308
309         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
310         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
311         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
312
313         '&xor   ($a1,$a)',
314         '&add   ($h,$a0)',                      # h+=Sigma1(e)
315         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
316
317         '&add   ($d,$h)',                       # d+=h
318         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
319         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
320
321         '&mov   ($a0,$d)',
322         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
323         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
324         );
325 }
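
# (body_00_15 returns one SHA256 round broken into small fragments so the
# SIMD message-schedule code can be interleaved between them; note the
# @aesni_cbc_block[$aesni_cbc_idx++] splice, which threads exactly one
# AES fragment into every round.)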
326
327 if ($avx) {{
328 ######################################################################
329 # XOP code path
330 #
331 $code.=<<___;
332 .type   ${func}_xop,\@function,6
333 .align  64
334 ${func}_xop:
335 .Lxop_shortcut:
336         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
337         push    %rbx
338         push    %rbp
339         push    %r12
340         push    %r13
341         push    %r14
342         push    %r15
343         mov     %rsp,%r11               # copy %rsp
344         sub     \$`$framesz+$win64*16*10`,%rsp
345         and     \$-64,%rsp              # align stack frame
346
347         shl     \$6,$len
348         sub     $inp,$out               # re-bias
349         sub     $inp,$in0
350         add     $inp,$len               # end of input
351
352         #mov    $inp,$_inp              # saved later
353         mov     $out,$_out
354         mov     $len,$_end
355         #mov    $key,$_key              # remains resident in $inp register
356         mov     $ivp,$_ivp
357         mov     $ctx,$_ctx
358         mov     $in0,$_in0
359         mov     %r11,$_rsp
360 ___
361 $code.=<<___ if ($win64);
362         movaps  %xmm6,`$framesz+16*0`(%rsp)
363         movaps  %xmm7,`$framesz+16*1`(%rsp)
364         movaps  %xmm8,`$framesz+16*2`(%rsp)
365         movaps  %xmm9,`$framesz+16*3`(%rsp)
366         movaps  %xmm10,`$framesz+16*4`(%rsp)
367         movaps  %xmm11,`$framesz+16*5`(%rsp)
368         movaps  %xmm12,`$framesz+16*6`(%rsp)
369         movaps  %xmm13,`$framesz+16*7`(%rsp)
370         movaps  %xmm14,`$framesz+16*8`(%rsp)
371         movaps  %xmm15,`$framesz+16*9`(%rsp)
372 ___
373 $code.=<<___;
374 .Lprologue_xop:
375         vzeroall
376
377         mov     $inp,%r12               # borrow $a4
378         lea     0x80($key),$inp         # size optimization, reassign
379         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
380         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
381         mov     $ctx,%r15               # borrow $a2
382         mov     $in0,%rsi               # borrow $a3
383         vmovdqu ($ivp),$iv              # load IV
384         sub     \$9,%r14
385
386         mov     $SZ*0(%r15),$A
387         mov     $SZ*1(%r15),$B
388         mov     $SZ*2(%r15),$C
389         mov     $SZ*3(%r15),$D
390         mov     $SZ*4(%r15),$E
391         mov     $SZ*5(%r15),$F
392         mov     $SZ*6(%r15),$G
393         mov     $SZ*7(%r15),$H
394
395         vmovdqa 0x00(%r13,%r14,8),$mask14
396         vmovdqa 0x10(%r13,%r14,8),$mask12
397         vmovdqa 0x20(%r13,%r14,8),$mask10
398         vmovdqu 0x00-0x80($inp),$roundkey
399         jmp     .Lloop_xop
400 ___
401                                         if ($SZ==4) {   # SHA256
402     my @X = map("%xmm$_",(0..3));
403     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
404
405 $code.=<<___;
406 .align  16
407 .Lloop_xop:
408         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
409         vmovdqu 0x00(%rsi,%r12),@X[0]
410         vmovdqu 0x10(%rsi,%r12),@X[1]
411         vmovdqu 0x20(%rsi,%r12),@X[2]
412         vmovdqu 0x30(%rsi,%r12),@X[3]
413         vpshufb $t3,@X[0],@X[0]
414         lea     $TABLE(%rip),$Tbl
415         vpshufb $t3,@X[1],@X[1]
416         vpshufb $t3,@X[2],@X[2]
417         vpaddd  0x00($Tbl),@X[0],$t0
418         vpshufb $t3,@X[3],@X[3]
419         vpaddd  0x20($Tbl),@X[1],$t1
420         vpaddd  0x40($Tbl),@X[2],$t2
421         vpaddd  0x60($Tbl),@X[3],$t3
422         vmovdqa $t0,0x00(%rsp)
423         mov     $A,$a1
424         vmovdqa $t1,0x10(%rsp)
425         mov     $B,$a3
426         vmovdqa $t2,0x20(%rsp)
427         xor     $C,$a3                  # magic
428         vmovdqa $t3,0x30(%rsp)
429         mov     $E,$a0
430         jmp     .Lxop_00_47
431
432 .align  16
433 .Lxop_00_47:
434         sub     \$-16*2*$SZ,$Tbl        # size optimization
435         vmovdqu (%r12),$inout           # $a4
436         mov     %r12,$_inp              # $a4
437 ___
438 sub XOP_256_00_47 () {
439 my $j = shift;
440 my $body = shift;
441 my @X = @_;
442 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
443
444         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
445           eval(shift(@insns));
446           eval(shift(@insns));
447          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
448           eval(shift(@insns));
449           eval(shift(@insns));
450         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
451           eval(shift(@insns));
452           eval(shift(@insns));
453         &vpsrld         ($t0,$t0,$sigma0[2]);
454           eval(shift(@insns));
455           eval(shift(@insns));
456          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
457           eval(shift(@insns));
458           eval(shift(@insns));
459           eval(shift(@insns));
460           eval(shift(@insns));
461         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
462           eval(shift(@insns));
463           eval(shift(@insns));
464         &vpxor          ($t0,$t0,$t1);
465           eval(shift(@insns));
466           eval(shift(@insns));
467           eval(shift(@insns));
468           eval(shift(@insns));
469          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
470           eval(shift(@insns));
471           eval(shift(@insns));
472         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
473           eval(shift(@insns));
474           eval(shift(@insns));
475          &vpsrld        ($t2,@X[3],$sigma1[2]);
476           eval(shift(@insns));
477           eval(shift(@insns));
478         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
479           eval(shift(@insns));
480           eval(shift(@insns));
481          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
482           eval(shift(@insns));
483           eval(shift(@insns));
484          &vpxor         ($t3,$t3,$t2);
485           eval(shift(@insns));
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
490           eval(shift(@insns));
491           eval(shift(@insns));
492           eval(shift(@insns));
493           eval(shift(@insns));
494         &vpsrldq        ($t3,$t3,8);
495           eval(shift(@insns));
496           eval(shift(@insns));
497           eval(shift(@insns));
498           eval(shift(@insns));
499         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
500           eval(shift(@insns));
501           eval(shift(@insns));
502           eval(shift(@insns));
503           eval(shift(@insns));
504          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
505           eval(shift(@insns));
506           eval(shift(@insns));
507          &vpsrld        ($t2,@X[0],$sigma1[2]);
508           eval(shift(@insns));
509           eval(shift(@insns));
510          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
511           eval(shift(@insns));
512           eval(shift(@insns));
513          &vpxor         ($t3,$t3,$t2);
514           eval(shift(@insns));
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
519           eval(shift(@insns));
520           eval(shift(@insns));
521           eval(shift(@insns));
522           eval(shift(@insns));
523         &vpslldq        ($t3,$t3,8);            # 22 instructions
524           eval(shift(@insns));
525           eval(shift(@insns));
526           eval(shift(@insns));
527           eval(shift(@insns));
528         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
529           eval(shift(@insns));
530           eval(shift(@insns));
531           eval(shift(@insns));
532           eval(shift(@insns));
533         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
534           foreach (@insns) { eval; }            # remaining instructions
535         &vmovdqa        (16*$j."(%rsp)",$t2);
536 }
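# (XOP_256_00_47 emits the XOP message-schedule update -- vprotd in place
# of the shift/xor rotations -- for four X[] words and interleaves it
# with four rounds' worth of body fragments, then adds the round
# constants and parks the W+K values on the stack for the coming rounds.)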
537
538     $aesni_cbc_idx=0;
539     for ($i=0,$j=0; $j<4; $j++) {
540         &XOP_256_00_47($j,\&body_00_15,@X);
541         push(@X,shift(@X));                     # rotate(@X)
542     }
543         &mov            ("%r12",$_inp);         # borrow $a4
544         &vpand          ($temp,$temp,$mask14);
545         &mov            ("%r15",$_out);         # borrow $a2
546         &vpor           ($iv,$iv,$temp);
547         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
548         &lea            ("%r12","16(%r12)");    # inp++
549
550         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
551         &jne    (".Lxop_00_47");
552
553         &vmovdqu        ($inout,"(%r12)");
554         &mov            ($_inp,"%r12");
555
556     $aesni_cbc_idx=0;
557     for ($i=0; $i<16; ) {
558         foreach(body_00_15()) { eval; }
559     }
560                                         }
561 $code.=<<___;
562         mov     $_inp,%r12              # borrow $a4
563         mov     $_out,%r13              # borrow $a0
564         mov     $_ctx,%r15              # borrow $a2
565         mov     $_in0,%rsi              # borrow $a3
566
567         vpand   $mask14,$temp,$temp
568         mov     $a1,$A
569         vpor    $temp,$iv,$iv
570         vmovdqu $iv,(%r13,%r12)         # write output
571         lea     16(%r12),%r12           # inp++
572
573         add     $SZ*0(%r15),$A
574         add     $SZ*1(%r15),$B
575         add     $SZ*2(%r15),$C
576         add     $SZ*3(%r15),$D
577         add     $SZ*4(%r15),$E
578         add     $SZ*5(%r15),$F
579         add     $SZ*6(%r15),$G
580         add     $SZ*7(%r15),$H
581
582         cmp     $_end,%r12
583
584         mov     $A,$SZ*0(%r15)
585         mov     $B,$SZ*1(%r15)
586         mov     $C,$SZ*2(%r15)
587         mov     $D,$SZ*3(%r15)
588         mov     $E,$SZ*4(%r15)
589         mov     $F,$SZ*5(%r15)
590         mov     $G,$SZ*6(%r15)
591         mov     $H,$SZ*7(%r15)
592
593         jb      .Lloop_xop
594
595         mov     $_ivp,$ivp
596         mov     $_rsp,%rsi
597         vmovdqu $iv,($ivp)              # output IV
598         vzeroall
599 ___
600 $code.=<<___ if ($win64);
601         movaps  `$framesz+16*0`(%rsp),%xmm6
602         movaps  `$framesz+16*1`(%rsp),%xmm7
603         movaps  `$framesz+16*2`(%rsp),%xmm8
604         movaps  `$framesz+16*3`(%rsp),%xmm9
605         movaps  `$framesz+16*4`(%rsp),%xmm10
606         movaps  `$framesz+16*5`(%rsp),%xmm11
607         movaps  `$framesz+16*6`(%rsp),%xmm12
608         movaps  `$framesz+16*7`(%rsp),%xmm13
609         movaps  `$framesz+16*8`(%rsp),%xmm14
610         movaps  `$framesz+16*9`(%rsp),%xmm15
611 ___
612 $code.=<<___;
613         mov     (%rsi),%r15
614         mov     8(%rsi),%r14
615         mov     16(%rsi),%r13
616         mov     24(%rsi),%r12
617         mov     32(%rsi),%rbp
618         mov     40(%rsi),%rbx
619         lea     48(%rsi),%rsp
620 .Lepilogue_xop:
621         ret
622 .size   ${func}_xop,.-${func}_xop
623 ___
624 ######################################################################
625 # AVX+shrd code path
626 #
627 local *ror = sub { &shrd(@_[0],@_) };
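
# From here on &ror is rerouted to shrd: &ror($reg,$imm) now emits
# "shrd \$imm,$reg,$reg", and shrd with identical source and destination
# registers is a right rotate.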
628
629 $code.=<<___;
630 .type   ${func}_avx,\@function,6
631 .align  64
632 ${func}_avx:
633 .Lavx_shortcut:
634         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
635         push    %rbx
636         push    %rbp
637         push    %r12
638         push    %r13
639         push    %r14
640         push    %r15
641         mov     %rsp,%r11               # copy %rsp
642         sub     \$`$framesz+$win64*16*10`,%rsp
643         and     \$-64,%rsp              # align stack frame
644
645         shl     \$6,$len
646         sub     $inp,$out               # re-bias
647         sub     $inp,$in0
648         add     $inp,$len               # end of input
649
650         #mov    $inp,$_inp              # saved later
651         mov     $out,$_out
652         mov     $len,$_end
653         #mov    $key,$_key              # remains resident in $inp register
654         mov     $ivp,$_ivp
655         mov     $ctx,$_ctx
656         mov     $in0,$_in0
657         mov     %r11,$_rsp
658 ___
659 $code.=<<___ if ($win64);
660         movaps  %xmm6,`$framesz+16*0`(%rsp)
661         movaps  %xmm7,`$framesz+16*1`(%rsp)
662         movaps  %xmm8,`$framesz+16*2`(%rsp)
663         movaps  %xmm9,`$framesz+16*3`(%rsp)
664         movaps  %xmm10,`$framesz+16*4`(%rsp)
665         movaps  %xmm11,`$framesz+16*5`(%rsp)
666         movaps  %xmm12,`$framesz+16*6`(%rsp)
667         movaps  %xmm13,`$framesz+16*7`(%rsp)
668         movaps  %xmm14,`$framesz+16*8`(%rsp)
669         movaps  %xmm15,`$framesz+16*9`(%rsp)
670 ___
671 $code.=<<___;
672 .Lprologue_avx:
673         vzeroall
674
675         mov     $inp,%r12               # borrow $a4
676         lea     0x80($key),$inp         # size optimization, reassign
677         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
678         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
679         mov     $ctx,%r15               # borrow $a2
680         mov     $in0,%rsi               # borrow $a3
681         vmovdqu ($ivp),$iv              # load IV
682         sub     \$9,%r14
683
684         mov     $SZ*0(%r15),$A
685         mov     $SZ*1(%r15),$B
686         mov     $SZ*2(%r15),$C
687         mov     $SZ*3(%r15),$D
688         mov     $SZ*4(%r15),$E
689         mov     $SZ*5(%r15),$F
690         mov     $SZ*6(%r15),$G
691         mov     $SZ*7(%r15),$H
692
693         vmovdqa 0x00(%r13,%r14,8),$mask14
694         vmovdqa 0x10(%r13,%r14,8),$mask12
695         vmovdqa 0x20(%r13,%r14,8),$mask10
696         vmovdqu 0x00-0x80($inp),$roundkey
697 ___
698                                         if ($SZ==4) {   # SHA256
699     my @X = map("%xmm$_",(0..3));
700     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
701
702 $code.=<<___;
703         jmp     .Lloop_avx
704 .align  16
705 .Lloop_avx:
706         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
707         vmovdqu 0x00(%rsi,%r12),@X[0]
708         vmovdqu 0x10(%rsi,%r12),@X[1]
709         vmovdqu 0x20(%rsi,%r12),@X[2]
710         vmovdqu 0x30(%rsi,%r12),@X[3]
711         vpshufb $t3,@X[0],@X[0]
712         lea     $TABLE(%rip),$Tbl
713         vpshufb $t3,@X[1],@X[1]
714         vpshufb $t3,@X[2],@X[2]
715         vpaddd  0x00($Tbl),@X[0],$t0
716         vpshufb $t3,@X[3],@X[3]
717         vpaddd  0x20($Tbl),@X[1],$t1
718         vpaddd  0x40($Tbl),@X[2],$t2
719         vpaddd  0x60($Tbl),@X[3],$t3
720         vmovdqa $t0,0x00(%rsp)
721         mov     $A,$a1
722         vmovdqa $t1,0x10(%rsp)
723         mov     $B,$a3
724         vmovdqa $t2,0x20(%rsp)
725         xor     $C,$a3                  # magic
726         vmovdqa $t3,0x30(%rsp)
727         mov     $E,$a0
728         jmp     .Lavx_00_47
729
730 .align  16
731 .Lavx_00_47:
732         sub     \$-16*2*$SZ,$Tbl        # size optimization
733         vmovdqu (%r12),$inout           # $a4
734         mov     %r12,$_inp              # $a4
735 ___
736 sub Xupdate_256_AVX () {
737         (
738         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
739          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
740         '&vpsrld        ($t2,$t0,$sigma0[0]);',
741          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
742         '&vpsrld        ($t3,$t0,$sigma0[2])',
743         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
744         '&vpxor         ($t0,$t3,$t2)',
745          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
746         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
747         '&vpxor         ($t0,$t0,$t1)',
748         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
749         '&vpxor         ($t0,$t0,$t2)',
750          '&vpsrld       ($t2,$t3,$sigma1[2]);',
751         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
752          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
753         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
754          '&vpxor        ($t2,$t2,$t3);',
755          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
756          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
757          '&vpshufd      ($t2,$t2,0b10000100)',
758          '&vpsrldq      ($t2,$t2,8)',
759         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
760          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
761          '&vpsrld       ($t2,$t3,$sigma1[2])',
762          '&vpsrlq       ($t3,$t3,$sigma1[0])',
763          '&vpxor        ($t2,$t2,$t3);',
764          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
765          '&vpxor        ($t2,$t2,$t3)',
766          '&vpshufd      ($t2,$t2,0b11101000)',
767          '&vpslldq      ($t2,$t2,8)',
768         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
769         );
770 }
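
# (Xupdate_256_AVX returns the plain-AVX message-schedule update -- the
# vprotd rotations of the XOP path rebuilt from vpslld/vpsrld/vpsrlq and
# vpxor -- as a fragment list; it is reused verbatim by the AVX2 path,
# which simply substitutes %ymm registers for @X and the temporaries.)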
771
772 sub AVX_256_00_47 () {
773 my $j = shift;
774 my $body = shift;
775 my @X = @_;
776 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
777
778         foreach (Xupdate_256_AVX()) {           # 29 instructions
779             eval;
780             eval(shift(@insns));
781             eval(shift(@insns));
782             eval(shift(@insns));
783         }
784         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
785           foreach (@insns) { eval; }            # remaining instructions
786         &vmovdqa        (16*$j."(%rsp)",$t2);
787 }
788
789     $aesni_cbc_idx=0;
790     for ($i=0,$j=0; $j<4; $j++) {
791         &AVX_256_00_47($j,\&body_00_15,@X);
792         push(@X,shift(@X));                     # rotate(@X)
793     }
794         &mov            ("%r12",$_inp);         # borrow $a4
795         &vpand          ($temp,$temp,$mask14);
796         &mov            ("%r15",$_out);         # borrow $a2
797         &vpor           ($iv,$iv,$temp);
798         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
799         &lea            ("%r12","16(%r12)");    # inp++
800
801         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
802         &jne    (".Lavx_00_47");
803
804         &vmovdqu        ($inout,"(%r12)");
805         &mov            ($_inp,"%r12");
806
807     $aesni_cbc_idx=0;
808     for ($i=0; $i<16; ) {
809         foreach(body_00_15()) { eval; }
810     }
811
812                                         }
813 $code.=<<___;
814         mov     $_inp,%r12              # borrow $a4
815         mov     $_out,%r13              # borrow $a0
816         mov     $_ctx,%r15              # borrow $a2
817         mov     $_in0,%rsi              # borrow $a3
818
819         vpand   $mask14,$temp,$temp
820         mov     $a1,$A
821         vpor    $temp,$iv,$iv
822         vmovdqu $iv,(%r13,%r12)         # write output
823         lea     16(%r12),%r12           # inp++
824
825         add     $SZ*0(%r15),$A
826         add     $SZ*1(%r15),$B
827         add     $SZ*2(%r15),$C
828         add     $SZ*3(%r15),$D
829         add     $SZ*4(%r15),$E
830         add     $SZ*5(%r15),$F
831         add     $SZ*6(%r15),$G
832         add     $SZ*7(%r15),$H
833
834         cmp     $_end,%r12
835
836         mov     $A,$SZ*0(%r15)
837         mov     $B,$SZ*1(%r15)
838         mov     $C,$SZ*2(%r15)
839         mov     $D,$SZ*3(%r15)
840         mov     $E,$SZ*4(%r15)
841         mov     $F,$SZ*5(%r15)
842         mov     $G,$SZ*6(%r15)
843         mov     $H,$SZ*7(%r15)
844         jb      .Lloop_avx
845
846         mov     $_ivp,$ivp
847         mov     $_rsp,%rsi
848         vmovdqu $iv,($ivp)              # output IV
849         vzeroall
850 ___
851 $code.=<<___ if ($win64);
852         movaps  `$framesz+16*0`(%rsp),%xmm6
853         movaps  `$framesz+16*1`(%rsp),%xmm7
854         movaps  `$framesz+16*2`(%rsp),%xmm8
855         movaps  `$framesz+16*3`(%rsp),%xmm9
856         movaps  `$framesz+16*4`(%rsp),%xmm10
857         movaps  `$framesz+16*5`(%rsp),%xmm11
858         movaps  `$framesz+16*6`(%rsp),%xmm12
859         movaps  `$framesz+16*7`(%rsp),%xmm13
860         movaps  `$framesz+16*8`(%rsp),%xmm14
861         movaps  `$framesz+16*9`(%rsp),%xmm15
862 ___
863 $code.=<<___;
864         mov     (%rsi),%r15
865         mov     8(%rsi),%r14
866         mov     16(%rsi),%r13
867         mov     24(%rsi),%r12
868         mov     32(%rsi),%rbp
869         mov     40(%rsi),%rbx
870         lea     48(%rsi),%rsp
871 .Lepilogue_avx:
872         ret
873 .size   ${func}_avx,.-${func}_avx
874 ___
875
876 if ($avx>1) {{
877 ######################################################################
878 # AVX2+BMI code path
879 #
880 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
881 my $PUSH8=8*2*$SZ;
882 use integer;
883
884 sub bodyx_00_15 () {
885         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
886         (
887         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
888
889         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
890         '&and   ($a4,$e)',              # f&e
891         '&rorx  ($a0,$e,$Sigma1[2])',
892         '&rorx  ($a2,$e,$Sigma1[1])',
893
894         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
895         '&lea   ($h,"($h,$a4)")',
896         '&andn  ($a4,$e,$g)',           # ~e&g
897         '&xor   ($a0,$a2)',
898
899         '&rorx  ($a1,$e,$Sigma1[0])',
900         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
901         '&xor   ($a0,$a1)',             # Sigma1(e)
902         '&mov   ($a2,$a)',
903
904         '&rorx  ($a4,$a,$Sigma0[2])',
905         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
906         '&xor   ($a2,$b)',              # a^b, b^c in next round
907         '&rorx  ($a1,$a,$Sigma0[1])',
908
909         '&rorx  ($a0,$a,$Sigma0[0])',
910         '&lea   ($d,"($d,$h)")',        # d+=h
911         '&and   ($a3,$a2)',             # (b^c)&(a^b)
912         @aesni_cbc_block[$aesni_cbc_idx++].
913         '&xor   ($a1,$a4)',
914
915         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
916         '&xor   ($a1,$a0)',             # Sigma0(a)
917         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
918         '&mov   ($a4,$e)',              # copy of f in future
919
920         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
921         );
922         # and at the finish one has to $a+=$a1
923 }
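
# (bodyx_00_15 is the BMI2 flavour of the round body: rorx replaces the
# ror/mov pairs, andn computes ~e&g directly, and Sigma0(a) is carried
# over in $a1 and folded in at the start of the following round -- hence
# the "from the past" comment and the final "add $a1,$A" after the loops.)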
924
925 $code.=<<___;
926 .type   ${func}_avx2,\@function,6
927 .align  64
928 ${func}_avx2:
929 .Lavx2_shortcut:
930         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
931         push    %rbx
932         push    %rbp
933         push    %r12
934         push    %r13
935         push    %r14
936         push    %r15
937         mov     %rsp,%r11               # copy %rsp
938         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
939         and     \$-256*$SZ,%rsp         # align stack frame
940         add     \$`2*$SZ*($rounds-8)`,%rsp
941
942         shl     \$6,$len
943         sub     $inp,$out               # re-bias
944         sub     $inp,$in0
945         add     $inp,$len               # end of input
946
947         #mov    $inp,$_inp              # saved later
948         #mov    $out,$_out              # kept in $offload
949         mov     $len,$_end
950         #mov    $key,$_key              # remains resident in $inp register
951         mov     $ivp,$_ivp
952         mov     $ctx,$_ctx
953         mov     $in0,$_in0
954         mov     %r11,$_rsp
955 ___
956 $code.=<<___ if ($win64);
957         movaps  %xmm6,`$framesz+16*0`(%rsp)
958         movaps  %xmm7,`$framesz+16*1`(%rsp)
959         movaps  %xmm8,`$framesz+16*2`(%rsp)
960         movaps  %xmm9,`$framesz+16*3`(%rsp)
961         movaps  %xmm10,`$framesz+16*4`(%rsp)
962         movaps  %xmm11,`$framesz+16*5`(%rsp)
963         movaps  %xmm12,`$framesz+16*6`(%rsp)
964         movaps  %xmm13,`$framesz+16*7`(%rsp)
965         movaps  %xmm14,`$framesz+16*8`(%rsp)
966         movaps  %xmm15,`$framesz+16*9`(%rsp)
967 ___
968 $code.=<<___;
969 .Lprologue_avx2:
970         vzeroall
971
972         mov     $inp,%r13               # borrow $a0
973         vpinsrq \$1,$out,$offload,$offload
974         lea     0x80($key),$inp         # size optimization, reassign
975         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
976         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
977         mov     $ctx,%r15               # borrow $a2
978         mov     $in0,%rsi               # borrow $a3
979         vmovdqu ($ivp),$iv              # load IV
980         lea     -9(%r14),%r14
981
982         vmovdqa 0x00(%r12,%r14,8),$mask14
983         vmovdqa 0x10(%r12,%r14,8),$mask12
984         vmovdqa 0x20(%r12,%r14,8),$mask10
985
986         sub     \$-16*$SZ,%r13          # inp++, size optimization
987         mov     $SZ*0(%r15),$A
988         lea     (%rsi,%r13),%r12        # borrow $a0
989         mov     $SZ*1(%r15),$B
990         cmp     $len,%r13               # $_end
991         mov     $SZ*2(%r15),$C
992         cmove   %rsp,%r12               # next block or random data
993         mov     $SZ*3(%r15),$D
994         mov     $SZ*4(%r15),$E
995         mov     $SZ*5(%r15),$F
996         mov     $SZ*6(%r15),$G
997         mov     $SZ*7(%r15),$H
998         vmovdqu 0x00-0x80($inp),$roundkey
999 ___
1000                                         if ($SZ==4) {   # SHA256
1001     my @X = map("%ymm$_",(0..3));
1002     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1003
1004 $code.=<<___;
1005         jmp     .Loop_avx2
1006 .align  16
1007 .Loop_avx2:
1008         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1009         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1010         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1011         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1012         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1013
1014         vinserti128     \$1,(%r12),@X[0],@X[0]
1015         vinserti128     \$1,16(%r12),@X[1],@X[1]
1016          vpshufb        $t3,@X[0],@X[0]
1017         vinserti128     \$1,32(%r12),@X[2],@X[2]
1018          vpshufb        $t3,@X[1],@X[1]
1019         vinserti128     \$1,48(%r12),@X[3],@X[3]
1020
1021         lea     $TABLE(%rip),$Tbl
1022         vpshufb $t3,@X[2],@X[2]
1023         lea     -16*$SZ(%r13),%r13
1024         vpaddd  0x00($Tbl),@X[0],$t0
1025         vpshufb $t3,@X[3],@X[3]
1026         vpaddd  0x20($Tbl),@X[1],$t1
1027         vpaddd  0x40($Tbl),@X[2],$t2
1028         vpaddd  0x60($Tbl),@X[3],$t3
1029         vmovdqa $t0,0x00(%rsp)
1030         xor     $a1,$a1
1031         vmovdqa $t1,0x20(%rsp)
1032         lea     -$PUSH8(%rsp),%rsp
1033         mov     $B,$a3
1034         vmovdqa $t2,0x00(%rsp)
1035         xor     $C,$a3                  # magic
1036         vmovdqa $t3,0x20(%rsp)
1037         mov     $F,$a4
1038         sub     \$-16*2*$SZ,$Tbl        # size optimization
1039         jmp     .Lavx2_00_47
1040
1041 .align  16
1042 .Lavx2_00_47:
1043         vmovdqu (%r13),$inout
1044         vpinsrq \$0,%r13,$offload,$offload
1045 ___
1046
1047 sub AVX2_256_00_47 () {
1048 my $j = shift;
1049 my $body = shift;
1050 my @X = @_;
1051 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1052 my $base = "+2*$PUSH8(%rsp)";
1053
1054         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1055         foreach (Xupdate_256_AVX()) {           # 29 instructions
1056             eval;
1057             eval(shift(@insns));
1058             eval(shift(@insns));
1059             eval(shift(@insns));
1060         }
1061         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1062           foreach (@insns) { eval; }            # remaining instructions
1063         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1064 }
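# (In the ymm code the low 128-bit lanes of @X carry the current 64-byte
# block and the high lanes the following one, loaded via vinserti128
# above; .Loop_avx2 runs the rounds for the first block against the
# freshly computed schedule, and .Lower_avx2 replays them for the second
# block from the W+K values parked on the stack -- which is why the frame
# holds 2*$SZ*$rounds bytes of schedule.)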
1065     $aesni_cbc_idx=0;
1066     for ($i=0,$j=0; $j<4; $j++) {
1067         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1068         push(@X,shift(@X));                     # rotate(@X)
1069     }
1070         &vmovq          ("%r13",$offload);      # borrow $a0
1071         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1072         &vpand          ($temp,$temp,$mask14);
1073         &vpor           ($iv,$iv,$temp);
1074         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1075         &lea            ("%r13","16(%r13)");    # inp++
1076
1077         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1078         &cmpb   (($SZ-1)."($Tbl)",0);
1079         &jne    (".Lavx2_00_47");
1080
1081         &vmovdqu        ($inout,"(%r13)");
1082         &vpinsrq        ($offload,$offload,"%r13",0);
1083
1084     $aesni_cbc_idx=0;
1085     for ($i=0; $i<16; ) {
1086         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1087         foreach(bodyx_00_15()) { eval; }
1088     }
1089                                         }
1090 $code.=<<___;
1091         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1092         vmovq   $offload,%r13                   # $_inp, borrow $a0
1093         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1094         add     $a1,$A
1095         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1096
1097         vpand   $mask14,$temp,$temp
1098         vpor    $temp,$iv,$iv
1099         vmovdqu $iv,(%r12,%r13)                 # write output
1100         lea     16(%r13),%r13
1101
1102         add     $SZ*0(%r15),$A
1103         add     $SZ*1(%r15),$B
1104         add     $SZ*2(%r15),$C
1105         add     $SZ*3(%r15),$D
1106         add     $SZ*4(%r15),$E
1107         add     $SZ*5(%r15),$F
1108         add     $SZ*6(%r15),$G
1109         add     $SZ*7(%r15),$H
1110
1111         mov     $A,$SZ*0(%r15)
1112         mov     $B,$SZ*1(%r15)
1113         mov     $C,$SZ*2(%r15)
1114         mov     $D,$SZ*3(%r15)
1115         mov     $E,$SZ*4(%r15)
1116         mov     $F,$SZ*5(%r15)
1117         mov     $G,$SZ*6(%r15)
1118         mov     $H,$SZ*7(%r15)
1119
1120         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1121         je      .Ldone_avx2
1122
1123         xor     $a1,$a1
1124         mov     $B,$a3
1125         mov     $F,$a4
1126         xor     $C,$a3                  # magic
1127         jmp     .Lower_avx2
1128 .align  16
1129 .Lower_avx2:
1130         vmovdqu (%r13),$inout
1131         vpinsrq \$0,%r13,$offload,$offload
1132 ___
1133     $aesni_cbc_idx=0;
1134     for ($i=0; $i<16; ) {
1135         my $base="+16($Tbl)";
1136         foreach(bodyx_00_15()) { eval; }
1137         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1138     }
1139 $code.=<<___;
1140         vmovq   $offload,%r13                   # borrow $a0
1141         vpextrq \$1,$offload,%r15               # borrow $a2
1142         vpand   $mask14,$temp,$temp
1143         vpor    $temp,$iv,$iv
1144         lea     -$PUSH8($Tbl),$Tbl
1145         vmovdqu $iv,(%r15,%r13)                 # write output
1146         lea     16(%r13),%r13                   # inp++
1147         cmp     %rsp,$Tbl
1148         jae     .Lower_avx2
1149
1150         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1151         lea     16*$SZ(%r13),%r13
1152         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1153         add     $a1,$A
1154         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1155
1156         add     $SZ*0(%r15),$A
1157         add     $SZ*1(%r15),$B
1158         add     $SZ*2(%r15),$C
1159         add     $SZ*3(%r15),$D
1160         add     $SZ*4(%r15),$E
1161         add     $SZ*5(%r15),$F
1162         add     $SZ*6(%r15),$G
1163         lea     (%rsi,%r13),%r12
1164         add     $SZ*7(%r15),$H
1165
1166         cmp     $_end,%r13
1167
1168         mov     $A,$SZ*0(%r15)
1169         cmove   %rsp,%r12               # next block or stale data
1170         mov     $B,$SZ*1(%r15)
1171         mov     $C,$SZ*2(%r15)
1172         mov     $D,$SZ*3(%r15)
1173         mov     $E,$SZ*4(%r15)
1174         mov     $F,$SZ*5(%r15)
1175         mov     $G,$SZ*6(%r15)
1176         mov     $H,$SZ*7(%r15)
1177
1178         jbe     .Loop_avx2
1179         lea     (%rsp),$Tbl
1180
1181 .Ldone_avx2:
1182         lea     ($Tbl),%rsp
1183         mov     $_ivp,$ivp
1184         mov     $_rsp,%rsi
1185         vmovdqu $iv,($ivp)              # output IV
1186         vzeroall
1187 ___
1188 $code.=<<___ if ($win64);
1189         movaps  `$framesz+16*0`(%rsp),%xmm6
1190         movaps  `$framesz+16*1`(%rsp),%xmm7
1191         movaps  `$framesz+16*2`(%rsp),%xmm8
1192         movaps  `$framesz+16*3`(%rsp),%xmm9
1193         movaps  `$framesz+16*4`(%rsp),%xmm10
1194         movaps  `$framesz+16*5`(%rsp),%xmm11
1195         movaps  `$framesz+16*6`(%rsp),%xmm12
1196         movaps  `$framesz+16*7`(%rsp),%xmm13
1197         movaps  `$framesz+16*8`(%rsp),%xmm14
1198         movaps  `$framesz+16*9`(%rsp),%xmm15
1199 ___
1200 $code.=<<___;
1201         mov     (%rsi),%r15
1202         mov     8(%rsi),%r14
1203         mov     16(%rsi),%r13
1204         mov     24(%rsi),%r12
1205         mov     32(%rsi),%rbp
1206         mov     40(%rsi),%rbx
1207         lea     48(%rsi),%rsp
1208 .Lepilogue_avx2:
1209         ret
1210 .size   ${func}_avx2,.-${func}_avx2
1211 ___
1212 }}
1213 }}
1214 {{
1215 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1216
1217 my ($rounds,$Tbl)=("%r11d","%rbx");
1218
1219 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1220 my @rndkey=("%xmm4","%xmm5");
1221 my $r=0;
1222 my $sn=0;
1223
1224 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1225 my @MSG=map("%xmm$_",(10..13));
1226
1227 my $aesenc=sub {
1228   use integer;
1229   my ($n,$k)=($r/10,$r%10);
1230     if ($k==0) {
1231       $code.=<<___;
1232         movups          `16*$n`($in0),$in               # load input
1233         xorps           $rndkey0,$in
1234 ___
1235       $code.=<<___ if ($n);
1236         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1237 ___
1238       $code.=<<___;
1239         xorps           $in,$iv
1240         movups          `32+16*$k-112`($key),$rndkey[1]
1241         aesenc          $rndkey[0],$iv
1242 ___
1243     } elsif ($k==9) {
1244       $sn++;
1245       $code.=<<___;
1246         cmp             \$11,$rounds
1247         jb              .Laesenclast$sn
1248         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1249         aesenc          $rndkey[0],$iv
1250         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1251         aesenc          $rndkey[1],$iv
1252         je              .Laesenclast$sn
1253         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1254         aesenc          $rndkey[0],$iv
1255         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1256         aesenc          $rndkey[1],$iv
1257 .Laesenclast$sn:
1258         aesenclast      $rndkey[0],$iv
1259         movups          16-112($key),$rndkey[1]         # forward reference
1260         nop
1261 ___
1262     } else {
1263       $code.=<<___;
1264         movups          `32+16*$k-112`($key),$rndkey[1]
1265         aesenc          $rndkey[0],$iv
1266 ___
1267     }
1268     $r++;       unshift(@rndkey,pop(@rndkey));
1269 };
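
# ($aesenc emits one AES-CBC round per invocation for the SHA-NI path:
# $r counts rounds overall, $n=$r/10 is the block and $k=$r%10 the round
# within it.  k==0 loads and whitens the next plaintext block and stores
# the previous ciphertext; k==9 handles the extra rounds of 192/256-bit
# keys via the cmp \$11 ladder before aesenclast.  Ten invocations per
# block, four blocks per 64-byte SHA256 block, hence "while ($r<40)"
# further down.)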
1270
1271 if ($shaext) {
1272 my $Tbl="%rax";
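
# The SHA-NI path keeps the hash state in the ABEF/CDGH layout required
# by sha256rnds2 (see the pshufd/palignr shuffle below and its inverse at
# the end), consumes one 64-byte block per .Loop_shaext iteration, and
# lets the &$aesenc closure above weave four AES-CBC blocks into it.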
1273
1274 $code.=<<___;
1275 .type   ${func}_shaext,\@function,6
1276 .align  32
1277 ${func}_shaext:
1278         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1279 ___
1280 $code.=<<___ if ($win64);
1281         lea     `-8-10*16`(%rsp),%rsp
1282         movaps  %xmm6,-8-10*16(%rax)
1283         movaps  %xmm7,-8-9*16(%rax)
1284         movaps  %xmm8,-8-8*16(%rax)
1285         movaps  %xmm9,-8-7*16(%rax)
1286         movaps  %xmm10,-8-6*16(%rax)
1287         movaps  %xmm11,-8-5*16(%rax)
1288         movaps  %xmm12,-8-4*16(%rax)
1289         movaps  %xmm13,-8-3*16(%rax)
1290         movaps  %xmm14,-8-2*16(%rax)
1291         movaps  %xmm15,-8-1*16(%rax)
1292 .Lprologue_shaext:
1293 ___
1294 $code.=<<___;
1295         lea             K256+0x80(%rip),$Tbl
1296         movdqu          ($ctx),$ABEF            # DCBA
1297         movdqu          16($ctx),$CDGH          # HGFE
1298         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1299
1300         mov             240($key),$rounds
1301         sub             $in0,$out
1302         movups          ($key),$rndkey0         # $key[0]
1303         movups          16($key),$rndkey[0]     # forward reference
1304         lea             112($key),$key          # size optimization
1305
1306         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1307         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1308         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1309         movdqa          $TMP,$BSWAP             # offload
1310         palignr         \$8,$CDGH,$ABEF         # ABEF
1311         punpcklqdq      $Wi,$CDGH               # CDGH
1312
1313         jmp     .Loop_shaext
1314
1315 .align  16
1316 .Loop_shaext:
1317         movdqu          ($inp),@MSG[0]
1318         movdqu          0x10($inp),@MSG[1]
1319         movdqu          0x20($inp),@MSG[2]
1320         pshufb          $TMP,@MSG[0]
1321         movdqu          0x30($inp),@MSG[3]
1322
1323         movdqa          0*32-0x80($Tbl),$Wi
1324         paddd           @MSG[0],$Wi
1325         pshufb          $TMP,@MSG[1]
1326         movdqa          $CDGH,$CDGH_SAVE        # offload
1327         movdqa          $ABEF,$ABEF_SAVE        # offload
1328 ___
1329         &$aesenc();
1330 $code.=<<___;
1331         sha256rnds2     $ABEF,$CDGH             # 0-3
1332         pshufd          \$0x0e,$Wi,$Wi
1333 ___
1334         &$aesenc();
1335 $code.=<<___;
1336         sha256rnds2     $CDGH,$ABEF
1337
1338         movdqa          1*32-0x80($Tbl),$Wi
1339         paddd           @MSG[1],$Wi
1340         pshufb          $TMP,@MSG[2]
1341         lea             0x40($inp),$inp
1342 ___
1343         &$aesenc();
1344 $code.=<<___;
1345         sha256rnds2     $ABEF,$CDGH             # 4-7
1346         pshufd          \$0x0e,$Wi,$Wi
1347 ___
1348         &$aesenc();
1349 $code.=<<___;
1350         sha256rnds2     $CDGH,$ABEF
1351
1352         movdqa          2*32-0x80($Tbl),$Wi
1353         paddd           @MSG[2],$Wi
1354         pshufb          $TMP,@MSG[3]
1355         sha256msg1      @MSG[1],@MSG[0]
1356 ___
1357         &$aesenc();
1358 $code.=<<___;
1359         sha256rnds2     $ABEF,$CDGH             # 8-11
1360         pshufd          \$0x0e,$Wi,$Wi
1361         movdqa          @MSG[3],$TMP
1362         palignr         \$4,@MSG[2],$TMP
1363         paddd           $TMP,@MSG[0]
1364 ___
1365         &$aesenc();
1366 $code.=<<___;
1367         sha256rnds2     $CDGH,$ABEF
1368
1369         movdqa          3*32-0x80($Tbl),$Wi
1370         paddd           @MSG[3],$Wi
1371         sha256msg2      @MSG[3],@MSG[0]
1372         sha256msg1      @MSG[2],@MSG[1]
1373 ___
1374         &$aesenc();
1375 $code.=<<___;
1376         sha256rnds2     $ABEF,$CDGH             # 12-15
1377         pshufd          \$0x0e,$Wi,$Wi
1378 ___
1379         &$aesenc();
1380 $code.=<<___;
1381         movdqa          @MSG[0],$TMP
1382         palignr         \$4,@MSG[3],$TMP
1383         paddd           $TMP,@MSG[1]
1384         sha256rnds2     $CDGH,$ABEF
1385 ___
1386 for($i=4;$i<16-3;$i++) {
1387         &$aesenc()      if (($r%10)==0);
1388 $code.=<<___;
1389         movdqa          $i*32-0x80($Tbl),$Wi
1390         paddd           @MSG[0],$Wi
1391         sha256msg2      @MSG[0],@MSG[1]
1392         sha256msg1      @MSG[3],@MSG[2]
1393 ___
1394         &$aesenc();
1395 $code.=<<___;
1396         sha256rnds2     $ABEF,$CDGH             # 16-19...
1397         pshufd          \$0x0e,$Wi,$Wi
1398         movdqa          @MSG[1],$TMP
1399         palignr         \$4,@MSG[0],$TMP
1400         paddd           $TMP,@MSG[2]
1401 ___
1402         &$aesenc();
1403         &$aesenc()      if ($r==19);
1404 $code.=<<___;
1405         sha256rnds2     $CDGH,$ABEF
1406 ___
1407         push(@MSG,shift(@MSG));
1408 }
1409 $code.=<<___;
1410         movdqa          13*32-0x80($Tbl),$Wi
1411         paddd           @MSG[0],$Wi
1412         sha256msg2      @MSG[0],@MSG[1]
1413         sha256msg1      @MSG[3],@MSG[2]
1414 ___
1415         &$aesenc();
1416 $code.=<<___;
1417         sha256rnds2     $ABEF,$CDGH             # 52-55
1418         pshufd          \$0x0e,$Wi,$Wi
1419         movdqa          @MSG[1],$TMP
1420         palignr         \$4,@MSG[0],$TMP
1421         paddd           $TMP,@MSG[2]
1422 ___
1423         &$aesenc();
1424         &$aesenc();
1425 $code.=<<___;
1426         sha256rnds2     $CDGH,$ABEF
1427
1428         movdqa          14*32-0x80($Tbl),$Wi
1429         paddd           @MSG[1],$Wi
1430         sha256msg2      @MSG[1],@MSG[2]
1431         movdqa          $BSWAP,$TMP
1432 ___
1433         &$aesenc();
1434 $code.=<<___;
1435         sha256rnds2     $ABEF,$CDGH             # 56-59
1436         pshufd          \$0x0e,$Wi,$Wi
1437 ___
1438         &$aesenc();
1439 $code.=<<___;
1440         sha256rnds2     $CDGH,$ABEF
1441
1442         movdqa          15*32-0x80($Tbl),$Wi
1443         paddd           @MSG[2],$Wi
1444 ___
1445         &$aesenc();
1446         &$aesenc();
1447 $code.=<<___;
1448         sha256rnds2     $ABEF,$CDGH             # 60-63
1449         pshufd          \$0x0e,$Wi,$Wi
1450 ___
1451         &$aesenc();
1452 $code.=<<___;
1453         sha256rnds2     $CDGH,$ABEF
1454         #pxor           $CDGH,$rndkey0          # black magic
1455 ___
1456         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1457 $code.=<<___;
1458         #xorps          $CDGH,$rndkey0          # black magic
1459         paddd           $CDGH_SAVE,$CDGH
1460         paddd           $ABEF_SAVE,$ABEF
1461
1462         dec             $len
1463         movups          $iv,48($out,$in0)       # write output
1464         lea             64($in0),$in0
1465         jnz             .Loop_shaext
1466
1467         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1468         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1469         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1470         punpckhqdq      $CDGH,$ABEF             # DCBA
1471         palignr         \$8,$TMP,$CDGH          # HGFE
1472
1473         movups          $iv,($ivp)              # write IV
1474         movdqu          $ABEF,($ctx)
1475         movdqu          $CDGH,16($ctx)
1476 ___
1477 $code.=<<___ if ($win64);
1478         movaps  0*16(%rsp),%xmm6
1479         movaps  1*16(%rsp),%xmm7
1480         movaps  2*16(%rsp),%xmm8
1481         movaps  3*16(%rsp),%xmm9
1482         movaps  4*16(%rsp),%xmm10
1483         movaps  5*16(%rsp),%xmm11
1484         movaps  6*16(%rsp),%xmm12
1485         movaps  7*16(%rsp),%xmm13
1486         movaps  8*16(%rsp),%xmm14
1487         movaps  9*16(%rsp),%xmm15
1488         lea     8+10*16(%rsp),%rsp
1489 .Lepilogue_shaext:
1490 ___
1491 $code.=<<___;
1492         ret
1493 .size   ${func}_shaext,.-${func}_shaext
1494 ___
1495 }
1496 }}}}}
1497
1498 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1499 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1500 if ($win64 && $avx) {
1501 $rec="%rcx";
1502 $frame="%rdx";
1503 $context="%r8";
1504 $disp="%r9";
1505
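# se_handler is the common Win64 SEH handler: guided by the prologue and
# epilogue labels recorded in HandlerData below, it recovers the original
# %rsp (from the $_rsp slot, or by re-deriving the AVX2/SHAEXT frames),
# restores the non-volatile GPRs and %xmm6-%xmm15, and hands the rest of
# the unwind to RtlVirtualUnwind.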
1506 $code.=<<___;
1507 .extern __imp_RtlVirtualUnwind
1508 .type   se_handler,\@abi-omnipotent
1509 .align  16
1510 se_handler:
1511         push    %rsi
1512         push    %rdi
1513         push    %rbx
1514         push    %rbp
1515         push    %r12
1516         push    %r13
1517         push    %r14
1518         push    %r15
1519         pushfq
1520         sub     \$64,%rsp
1521
1522         mov     120($context),%rax      # pull context->Rax
1523         mov     248($context),%rbx      # pull context->Rip
1524
1525         mov     8($disp),%rsi           # disp->ImageBase
1526         mov     56($disp),%r11          # disp->HandlerData
1527
1528         mov     0(%r11),%r10d           # HandlerData[0]
1529         lea     (%rsi,%r10),%r10        # prologue label
1530         cmp     %r10,%rbx               # context->Rip<prologue label
1531         jb      .Lin_prologue
1532
1533         mov     152($context),%rax      # pull context->Rsp
1534
1535         mov     4(%r11),%r10d           # HandlerData[1]
1536         lea     (%rsi,%r10),%r10        # epilogue label
1537         cmp     %r10,%rbx               # context->Rip>=epilogue label
1538         jae     .Lin_prologue
1539 ___
1540 $code.=<<___ if ($shaext);
1541         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1542         cmp     %r10,%rbx
1543         jb      .Lnot_in_shaext
1544
1545         lea     (%rax),%rsi
1546         lea     512($context),%rdi      # &context.Xmm6
1547         mov     \$20,%ecx
1548         .long   0xa548f3fc              # cld; rep movsq
1549         lea     168(%rax),%rax          # adjust stack pointer
1550         jmp     .Lin_prologue
1551 .Lnot_in_shaext:
1552 ___
1553 $code.=<<___ if ($avx>1);
1554         lea     .Lavx2_shortcut(%rip),%r10
1555         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1556         jb      .Lnot_in_avx2
1557
1558         and     \$-256*$SZ,%rax
1559         add     \$`2*$SZ*($rounds-8)`,%rax
1560 .Lnot_in_avx2:
1561 ___
1562 $code.=<<___;
1563         mov     %rax,%rsi               # put aside Rsp
1564         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1565         lea     48(%rax),%rax
1566
1567         mov     -8(%rax),%rbx
1568         mov     -16(%rax),%rbp
1569         mov     -24(%rax),%r12
1570         mov     -32(%rax),%r13
1571         mov     -40(%rax),%r14
1572         mov     -48(%rax),%r15
1573         mov     %rbx,144($context)      # restore context->Rbx
1574         mov     %rbp,160($context)      # restore context->Rbp
1575         mov     %r12,216($context)      # restore context->R12
1576         mov     %r13,224($context)      # restore context->R13
1577         mov     %r14,232($context)      # restore context->R14
1578         mov     %r15,240($context)      # restore context->R15
1579
1580         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1581         lea     512($context),%rdi      # &context.Xmm6
1582         mov     \$20,%ecx
1583         .long   0xa548f3fc              # cld; rep movsq
1584
1585 .Lin_prologue:
1586         mov     8(%rax),%rdi
1587         mov     16(%rax),%rsi
1588         mov     %rax,152($context)      # restore context->Rsp
1589         mov     %rsi,168($context)      # restore context->Rsi
1590         mov     %rdi,176($context)      # restore context->Rdi
1591
1592         mov     40($disp),%rdi          # disp->ContextRecord
1593         mov     $context,%rsi           # context
1594         mov     \$154,%ecx              # sizeof(CONTEXT)
1595         .long   0xa548f3fc              # cld; rep movsq
1596
1597         mov     $disp,%rsi
1598         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1599         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1600         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1601         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1602         mov     40(%rsi),%r10           # disp->ContextRecord
1603         lea     56(%rsi),%r11           # &disp->HandlerData
1604         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1605         mov     %r10,32(%rsp)           # arg5
1606         mov     %r11,40(%rsp)           # arg6
1607         mov     %r12,48(%rsp)           # arg7
1608         mov     %rcx,56(%rsp)           # arg8, (NULL)
1609         call    *__imp_RtlVirtualUnwind(%rip)
1610
1611         mov     \$1,%eax                # ExceptionContinueSearch
1612         add     \$64,%rsp
1613         popfq
1614         pop     %r15
1615         pop     %r14
1616         pop     %r13
1617         pop     %r12
1618         pop     %rbp
1619         pop     %rbx
1620         pop     %rdi
1621         pop     %rsi
1622         ret
1623 .size   se_handler,.-se_handler
1624
1625 .section        .pdata
1626         .rva    .LSEH_begin_${func}_xop
1627         .rva    .LSEH_end_${func}_xop
1628         .rva    .LSEH_info_${func}_xop
1629
1630         .rva    .LSEH_begin_${func}_avx
1631         .rva    .LSEH_end_${func}_avx
1632         .rva    .LSEH_info_${func}_avx
1633 ___
1634 $code.=<<___ if ($avx>1);
1635         .rva    .LSEH_begin_${func}_avx2
1636         .rva    .LSEH_end_${func}_avx2
1637         .rva    .LSEH_info_${func}_avx2
1638 ___
1639 $code.=<<___ if ($shaext);
1640         .rva    .LSEH_begin_${func}_shaext
1641         .rva    .LSEH_end_${func}_shaext
1642         .rva    .LSEH_info_${func}_shaext
1643 ___
1644 $code.=<<___;
1645 .section        .xdata
1646 .align  8
1647 .LSEH_info_${func}_xop:
1648         .byte   9,0,0,0
1649         .rva    se_handler
1650         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1651
1652 .LSEH_info_${func}_avx:
1653         .byte   9,0,0,0
1654         .rva    se_handler
1655         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1656 ___
1657 $code.=<<___ if ($avx>1);
1658 .LSEH_info_${func}_avx2:
1659         .byte   9,0,0,0
1660         .rva    se_handler
1661         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1662 ___
1663 $code.=<<___ if ($shaext);
1664 .LSEH_info_${func}_shaext:
1665         .byte   9,0,0,0
1666         .rva    se_handler
1667         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1668 ___
1669 }
1670
1671 ####################################################################
1672 sub rex {
1673   local *opcode=shift;
1674   my ($dst,$src)=@_;
1675   my $rex=0;
1676
1677     $rex|=0x04                  if($dst>=8);
1678     $rex|=0x01                  if($src>=8);
1679     unshift @opcode,$rex|0x40   if($rex);
1680 }
1681
1682 {
1683   my %opcodelet = (
1684                 "sha256rnds2" => 0xcb,
1685                 "sha256msg1"  => 0xcc,
1686                 "sha256msg2"  => 0xcd   );
1687
1688   sub sha256op38 {
1689     my $instr = shift;
1690
1691     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1692       my @opcode=(0x0f,0x38);
1693         rex(\@opcode,$2,$1);
1694         push @opcode,$opcodelet{$instr};
1695         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1696         return ".byte\t".join(',',@opcode);
1697     } else {
1698         return $instr."\t".@_[0];
1699     }
1700   }
1701 }
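
# (rex() and sha256op38() let the module assemble even with tools that do
# not know the SHA extension: the three sha256* mnemonics are emitted by
# hand as 0x0f,0x38,<opcode> plus a ModR/M byte, with a REX prefix when
# %xmm8-%xmm15 are involved, via the substitution below.)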
1702
1703 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1704 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1705 print $code;
1706 close STDOUT;