1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # January 2013
18 #
19 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21 # that since AESNI-CBC encryption exhibits *very* low instruction-level
22 # parallelism, interleaving it with another algorithm allows processor
23 # resources to be utilized better, achieving better overall performance.
24 # The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25 # the AESNI code is woven into them. As SHA256 dominates execution time,
26 # stitch performance does not depend on AES key length. Below are
27 # performance numbers in cycles per processed byte (less is better)
28 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
29 # subroutine:
30 #
31 #                AES-128/-192/-256+SHA256       this(**)    gain
32 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
33 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
34 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
35 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
36 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
37 #
38 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
39 #       Westmere is omitted from the loop; this is because the gain was
40 #       not estimated to be high enough to justify the effort;
41 # (**)  these are EVP-free results; results obtained with 'speed
42 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
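#
#       The EVP figures mentioned in (**) can be reproduced with, for
#       example:
#
#               openssl speed -evp aes-256-cbc-hmac-sha256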
43
44 $flavour = shift;
45 $output  = shift;
46 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
47
48 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53 die "can't locate x86_64-xlate.pl";
54
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57         $avx = ($1>=2.19) + ($1>=2.22);
58 }
59
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62         $avx = ($1>=2.09) + ($1>=2.10);
63 }
64
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67         $avx = ($1>=10) + ($1>=12);
68 }
69
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71         $avx = ($2>=3.0) + ($2>3.0);
72 }
73
74 $shaext=$avx;   ### set to zero if compiling for 1.0.1
75 $avx=1          if (!$shaext && $avx);
76
77 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
78 *STDOUT=*OUT;
79
80 $func="aesni_cbc_sha256_enc";
81 $TABLE="K256";
82 $SZ=4;
83 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
84                                 "%r8d","%r9d","%r10d","%r11d");
85 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 7,18, 3);
89 @sigma1=(17,19,10);
90 $rounds=64;
91
92 ########################################################################
93 # void aesni_cbc_sha256_enc(const void *inp,
94 #                       void *out,
95 #                       size_t length,
96 #                       const AES_KEY *key,
97 #                       unsigned char *iv,
98 #                       SHA256_CTX *ctx,
99 #                       const void *in0);
100 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
101 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
102
103 $Tbl="%rbp";
104
105 $_inp="16*$SZ+0*8(%rsp)";
106 $_out="16*$SZ+1*8(%rsp)";
107 $_end="16*$SZ+2*8(%rsp)";
108 $_key="16*$SZ+3*8(%rsp)";
109 $_ivp="16*$SZ+4*8(%rsp)";
110 $_ctx="16*$SZ+5*8(%rsp)";
111 $_in0="16*$SZ+6*8(%rsp)";
112 $_rsp="16*$SZ+7*8(%rsp)";
113 $framesz=16*$SZ+8*8;
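# Stack frame layout used by the XOP/AVX code paths: the low 16*$SZ bytes
# at (%rsp) hold the current X[i]+K[i] message schedule, followed by the
# eight 8-byte spill slots defined above ($framesz bytes in total); on
# WIN64 another 10*16 bytes above $framesz preserve %xmm6-%xmm15.  The
# AVX2 path builds its own, larger frame.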
114
115 $code=<<___;
116 .text
117
118 .extern OPENSSL_ia32cap_P
119 .globl  $func
120 .type   $func,\@abi-omnipotent
121 .align  16
122 $func:
123 ___
124                                                 if ($avx) {
125 $code.=<<___;
126         lea     OPENSSL_ia32cap_P(%rip),%r11
127         mov     \$1,%eax
128         cmp     \$0,`$win64?"%rcx":"%rdi"`
129         je      .Lprobe
130         mov     0(%r11),%eax
131         mov     4(%r11),%r10
132 ___
133 $code.=<<___ if ($shaext);
134         bt      \$61,%r10                       # check for SHA
135         jc      ${func}_shaext
136 ___
137 $code.=<<___;
138         mov     %r10,%r11
139         shr     \$32,%r11
140
141         test    \$`1<<11`,%r10d                 # check for XOP
142         jnz     ${func}_xop
143 ___
144 $code.=<<___ if ($avx>1);
145         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
146         cmp     \$`1<<8|1<<5|1<<3`,%r11d
147         je      ${func}_avx2
148 ___
149 $code.=<<___;
150         and     \$`1<<28`,%r10d                 # check for AVX
151         jnz     ${func}_avx
152         ud2
153 ___
154                                                 }
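# Note: when called with a NULL first argument the routine acts as a
# capability probe and returns non-zero iff a SIMD code path was compiled
# in; a real call dispatches on OPENSSL_ia32cap_P to the SHA-extension,
# XOP, AVX2+BMI or AVX variant and executes ud2 if none applies.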
155 $code.=<<___;
156         xor     %eax,%eax
157         cmp     \$0,`$win64?"%rcx":"%rdi"`
158         je      .Lprobe
159         ud2
160 .Lprobe:
161         ret
162 .size   $func,.-$func
163
164 .align  64
165 .type   $TABLE,\@object
166 $TABLE:
167         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
168         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
169         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
170         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
171         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
172         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
173         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
174         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
175         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
176         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
179         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
180         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
181         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
182         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
183         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
184         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
185         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
186         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
187         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
188         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
191         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
192         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
193         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
194         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
197         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
198         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
199
200         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
201         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
202         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
203         .long   0,0,0,0,   0,0,0,0
204         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
205 .align  64
206 ___
207
208 ######################################################################
209 # SIMD code paths
210 #
211 {{{
212 ($iv,$inout,$roundkey,$temp,
213  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
214
215 $aesni_cbc_idx=0;
216 @aesni_cbc_block = (
217 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
218 ##      &vmovdqu        ($inout,($inp));
219 ##      &mov            ($_inp,$inp);
220
221         '&vpxor         ($inout,$inout,$roundkey);'.
222         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
223
224         '&vpxor         ($inout,$inout,$iv);',
225
226         '&vaesenc       ($inout,$inout,$roundkey);'.
227         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
228
229         '&vaesenc       ($inout,$inout,$roundkey);'.
230         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
231
232         '&vaesenc       ($inout,$inout,$roundkey);'.
233         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
234
235         '&vaesenc       ($inout,$inout,$roundkey);'.
236         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
237
238         '&vaesenc       ($inout,$inout,$roundkey);'.
239         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
240
241         '&vaesenc       ($inout,$inout,$roundkey);'.
242         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
243
244         '&vaesenc       ($inout,$inout,$roundkey);'.
245         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
246
247         '&vaesenc       ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
249
250         '&vaesenc       ($inout,$inout,$roundkey);'.
251         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
252
253         '&vaesenclast   ($temp,$inout,$roundkey);'.
254         ' &vaesenc      ($inout,$inout,$roundkey);'.
255         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
256
257         '&vpand         ($iv,$temp,$mask10);'.
258         ' &vaesenc      ($inout,$inout,$roundkey);'.
259         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
260
261         '&vaesenclast   ($temp,$inout,$roundkey);'.
262         ' &vaesenc      ($inout,$inout,$roundkey);'.
263         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
264
265         '&vpand         ($temp,$temp,$mask12);'.
266         ' &vaesenc      ($inout,$inout,$roundkey);'.
267          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
268
269         '&vpor          ($iv,$iv,$temp);'.
270         ' &vaesenclast  ($temp,$inout,$roundkey);'.
271         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
272
273 ##      &mov            ($inp,$_inp);
274 ##      &mov            ($out,$_out);
275 ##      &vpand          ($temp,$temp,$mask14);
276 ##      &vpor           ($iv,$iv,$temp);
277 ##      &vmovdqu        ("($out,$inp)",$iv);
278 ##      &lea            ($inp,"16($inp)");
279 );
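# Each element of @aesni_cbc_block above is one step of CBC-encrypting a
# single 16-byte block: xor with the previous ciphertext/IV, the aesenc
# rounds, and the masked aesenclast results ($mask10/$mask12/$mask14
# select the correct output for 128/192/256-bit keys).  body_00_15()
# below splices one element per SHA256 round via $aesni_cbc_idx, so one
# AES block is spread across sixteen rounds.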
280
281 my $a4=$T1;
282 my ($a,$b,$c,$d,$e,$f,$g,$h);
283
284 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
285 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
286   my $arg = pop;
287     $arg = "\$$arg" if ($arg*1 eq $arg);
288     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
289 }
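# For example, &ror($a0,14) appends "ror $14,%r13d" to $code and
# &jne(".Lxop_00_47") appends "jne .Lxop_00_47"; the last Perl argument
# becomes the first (immediate/source) assembler operand.  In the AVX
# path &ror is later overridden to emit shrd instead.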
290
291 sub body_00_15 () {
292         (
293         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
294
295         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
296         '&mov   ($a,$a1)',
297         '&mov   ($a4,$f)',
298
299         '&xor   ($a0,$e)',
300         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
301         '&xor   ($a4,$g)',                      # f^g
302
303         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
304         '&xor   ($a1,$a)',
305         '&and   ($a4,$e)',                      # (f^g)&e
306
307         @aesni_cbc_block[$aesni_cbc_idx++].
308         '&xor   ($a0,$e)',
309         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
310         '&mov   ($a2,$a)',
311
312         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
313         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
314         '&xor   ($a2,$b)',                      # a^b, b^c in next round
315
316         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
317         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
318         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
319
320         '&xor   ($a1,$a)',
321         '&add   ($h,$a0)',                      # h+=Sigma1(e)
322         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
323
324         '&add   ($d,$h)',                       # d+=h
325         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
326         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
327
328         '&mov   ($a0,$d)',
329         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
330         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
331         );
332 }
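# body_00_15() returns one SHA256 round as a list of code-snippet strings,
# with one @aesni_cbc_block step spliced in; the Xupdate helpers below
# interleave these snippets with the vector message-schedule updates,
# while the final 16 rounds of each block simply eval them back to back.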
333
334 if ($avx) {{
335 ######################################################################
336 # XOP code path
337 #
338 $code.=<<___;
339 .type   ${func}_xop,\@function,6
340 .align  64
341 ${func}_xop:
342 .Lxop_shortcut:
343         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
344         push    %rbx
345         push    %rbp
346         push    %r12
347         push    %r13
348         push    %r14
349         push    %r15
350         mov     %rsp,%r11               # copy %rsp
351         sub     \$`$framesz+$win64*16*10`,%rsp
352         and     \$-64,%rsp              # align stack frame
353
354         shl     \$6,$len
355         sub     $inp,$out               # re-bias
356         sub     $inp,$in0
357         add     $inp,$len               # end of input
358
359         #mov    $inp,$_inp              # saved later
360         mov     $out,$_out
361         mov     $len,$_end
362         #mov    $key,$_key              # remains resident in $inp register
363         mov     $ivp,$_ivp
364         mov     $ctx,$_ctx
365         mov     $in0,$_in0
366         mov     %r11,$_rsp
367 ___
368 $code.=<<___ if ($win64);
369         movaps  %xmm6,`$framesz+16*0`(%rsp)
370         movaps  %xmm7,`$framesz+16*1`(%rsp)
371         movaps  %xmm8,`$framesz+16*2`(%rsp)
372         movaps  %xmm9,`$framesz+16*3`(%rsp)
373         movaps  %xmm10,`$framesz+16*4`(%rsp)
374         movaps  %xmm11,`$framesz+16*5`(%rsp)
375         movaps  %xmm12,`$framesz+16*6`(%rsp)
376         movaps  %xmm13,`$framesz+16*7`(%rsp)
377         movaps  %xmm14,`$framesz+16*8`(%rsp)
378         movaps  %xmm15,`$framesz+16*9`(%rsp)
379 ___
380 $code.=<<___;
381 .Lprologue_xop:
382         vzeroall
383
384         mov     $inp,%r12               # borrow $a4
385         lea     0x80($key),$inp         # size optimization, reassign
386         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
387         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
388         mov     $ctx,%r15               # borrow $a2
389         mov     $in0,%rsi               # borrow $a3
390         vmovdqu ($ivp),$iv              # load IV
391         sub     \$9,%r14
392
393         mov     $SZ*0(%r15),$A
394         mov     $SZ*1(%r15),$B
395         mov     $SZ*2(%r15),$C
396         mov     $SZ*3(%r15),$D
397         mov     $SZ*4(%r15),$E
398         mov     $SZ*5(%r15),$F
399         mov     $SZ*6(%r15),$G
400         mov     $SZ*7(%r15),$H
401
402         vmovdqa 0x00(%r13,%r14,8),$mask14
403         vmovdqa 0x10(%r13,%r14,8),$mask12
404         vmovdqa 0x20(%r13,%r14,8),$mask10
405         vmovdqu 0x00-0x80($inp),$roundkey
406         jmp     .Lloop_xop
407 ___
408                                         if ($SZ==4) {   # SHA256
409     my @X = map("%xmm$_",(0..3));
410     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
411
412 $code.=<<___;
413 .align  16
414 .Lloop_xop:
415         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
416         vmovdqu 0x00(%rsi,%r12),@X[0]
417         vmovdqu 0x10(%rsi,%r12),@X[1]
418         vmovdqu 0x20(%rsi,%r12),@X[2]
419         vmovdqu 0x30(%rsi,%r12),@X[3]
420         vpshufb $t3,@X[0],@X[0]
421         lea     $TABLE(%rip),$Tbl
422         vpshufb $t3,@X[1],@X[1]
423         vpshufb $t3,@X[2],@X[2]
424         vpaddd  0x00($Tbl),@X[0],$t0
425         vpshufb $t3,@X[3],@X[3]
426         vpaddd  0x20($Tbl),@X[1],$t1
427         vpaddd  0x40($Tbl),@X[2],$t2
428         vpaddd  0x60($Tbl),@X[3],$t3
429         vmovdqa $t0,0x00(%rsp)
430         mov     $A,$a1
431         vmovdqa $t1,0x10(%rsp)
432         mov     $B,$a3
433         vmovdqa $t2,0x20(%rsp)
434         xor     $C,$a3                  # magic
435         vmovdqa $t3,0x30(%rsp)
436         mov     $E,$a0
437         jmp     .Lxop_00_47
438
439 .align  16
440 .Lxop_00_47:
441         sub     \$-16*2*$SZ,$Tbl        # size optimization
442         vmovdqu (%r12),$inout           # $a4
443         mov     %r12,$_inp              # $a4
444 ___
445 sub XOP_256_00_47 () {
446 my $j = shift;
447 my $body = shift;
448 my @X = @_;
449 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
450
451         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
452           eval(shift(@insns));
453           eval(shift(@insns));
454          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
455           eval(shift(@insns));
456           eval(shift(@insns));
457         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
458           eval(shift(@insns));
459           eval(shift(@insns));
460         &vpsrld         ($t0,$t0,$sigma0[2]);
461           eval(shift(@insns));
462           eval(shift(@insns));
463          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
464           eval(shift(@insns));
465           eval(shift(@insns));
466           eval(shift(@insns));
467           eval(shift(@insns));
468         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
469           eval(shift(@insns));
470           eval(shift(@insns));
471         &vpxor          ($t0,$t0,$t1);
472           eval(shift(@insns));
473           eval(shift(@insns));
474           eval(shift(@insns));
475           eval(shift(@insns));
476          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
477           eval(shift(@insns));
478           eval(shift(@insns));
479         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
480           eval(shift(@insns));
481           eval(shift(@insns));
482          &vpsrld        ($t2,@X[3],$sigma1[2]);
483           eval(shift(@insns));
484           eval(shift(@insns));
485         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
486           eval(shift(@insns));
487           eval(shift(@insns));
488          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
489           eval(shift(@insns));
490           eval(shift(@insns));
491          &vpxor         ($t3,$t3,$t2);
492           eval(shift(@insns));
493           eval(shift(@insns));
494           eval(shift(@insns));
495           eval(shift(@insns));
496          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
497           eval(shift(@insns));
498           eval(shift(@insns));
499           eval(shift(@insns));
500           eval(shift(@insns));
501         &vpsrldq        ($t3,$t3,8);
502           eval(shift(@insns));
503           eval(shift(@insns));
504           eval(shift(@insns));
505           eval(shift(@insns));
506         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
507           eval(shift(@insns));
508           eval(shift(@insns));
509           eval(shift(@insns));
510           eval(shift(@insns));
511          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
512           eval(shift(@insns));
513           eval(shift(@insns));
514          &vpsrld        ($t2,@X[0],$sigma1[2]);
515           eval(shift(@insns));
516           eval(shift(@insns));
517          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
518           eval(shift(@insns));
519           eval(shift(@insns));
520          &vpxor         ($t3,$t3,$t2);
521           eval(shift(@insns));
522           eval(shift(@insns));
523           eval(shift(@insns));
524           eval(shift(@insns));
525          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
526           eval(shift(@insns));
527           eval(shift(@insns));
528           eval(shift(@insns));
529           eval(shift(@insns));
530         &vpslldq        ($t3,$t3,8);            # 22 instructions
531           eval(shift(@insns));
532           eval(shift(@insns));
533           eval(shift(@insns));
534           eval(shift(@insns));
535         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
536           eval(shift(@insns));
537           eval(shift(@insns));
538           eval(shift(@insns));
539           eval(shift(@insns));
540         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
541           foreach (@insns) { eval; }            # remaining instructions
542         &vmovdqa        (16*$j."(%rsp)",$t2);
543 }
544
545     $aesni_cbc_idx=0;
546     for ($i=0,$j=0; $j<4; $j++) {
547         &XOP_256_00_47($j,\&body_00_15,@X);
548         push(@X,shift(@X));                     # rotate(@X)
549     }
550         &mov            ("%r12",$_inp);         # borrow $a4
551         &vpand          ($temp,$temp,$mask14);
552         &mov            ("%r15",$_out);         # borrow $a2
553         &vpor           ($iv,$iv,$temp);
554         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
555         &lea            ("%r12","16(%r12)");    # inp++
556
557         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
558         &jne    (".Lxop_00_47");
559
560         &vmovdqu        ($inout,"(%r12)");
561         &mov            ($_inp,"%r12");
562
563     $aesni_cbc_idx=0;
564     for ($i=0; $i<16; ) {
565         foreach(body_00_15()) { eval; }
566     }
567                                         }
568 $code.=<<___;
569         mov     $_inp,%r12              # borrow $a4
570         mov     $_out,%r13              # borrow $a0
571         mov     $_ctx,%r15              # borrow $a2
572         mov     $_in0,%rsi              # borrow $a3
573
574         vpand   $mask14,$temp,$temp
575         mov     $a1,$A
576         vpor    $temp,$iv,$iv
577         vmovdqu $iv,(%r13,%r12)         # write output
578         lea     16(%r12),%r12           # inp++
579
580         add     $SZ*0(%r15),$A
581         add     $SZ*1(%r15),$B
582         add     $SZ*2(%r15),$C
583         add     $SZ*3(%r15),$D
584         add     $SZ*4(%r15),$E
585         add     $SZ*5(%r15),$F
586         add     $SZ*6(%r15),$G
587         add     $SZ*7(%r15),$H
588
589         cmp     $_end,%r12
590
591         mov     $A,$SZ*0(%r15)
592         mov     $B,$SZ*1(%r15)
593         mov     $C,$SZ*2(%r15)
594         mov     $D,$SZ*3(%r15)
595         mov     $E,$SZ*4(%r15)
596         mov     $F,$SZ*5(%r15)
597         mov     $G,$SZ*6(%r15)
598         mov     $H,$SZ*7(%r15)
599
600         jb      .Lloop_xop
601
602         mov     $_ivp,$ivp
603         mov     $_rsp,%rsi
604         vmovdqu $iv,($ivp)              # output IV
605         vzeroall
606 ___
607 $code.=<<___ if ($win64);
608         movaps  `$framesz+16*0`(%rsp),%xmm6
609         movaps  `$framesz+16*1`(%rsp),%xmm7
610         movaps  `$framesz+16*2`(%rsp),%xmm8
611         movaps  `$framesz+16*3`(%rsp),%xmm9
612         movaps  `$framesz+16*4`(%rsp),%xmm10
613         movaps  `$framesz+16*5`(%rsp),%xmm11
614         movaps  `$framesz+16*6`(%rsp),%xmm12
615         movaps  `$framesz+16*7`(%rsp),%xmm13
616         movaps  `$framesz+16*8`(%rsp),%xmm14
617         movaps  `$framesz+16*9`(%rsp),%xmm15
618 ___
619 $code.=<<___;
620         mov     (%rsi),%r15
621         mov     8(%rsi),%r14
622         mov     16(%rsi),%r13
623         mov     24(%rsi),%r12
624         mov     32(%rsi),%rbp
625         mov     40(%rsi),%rbx
626         lea     48(%rsi),%rsp
627 .Lepilogue_xop:
628         ret
629 .size   ${func}_xop,.-${func}_xop
630 ___
631 ######################################################################
632 # AVX+shrd code path
633 #
634 local *ror = sub { &shrd(@_[0],@_) };
635
636 $code.=<<___;
637 .type   ${func}_avx,\@function,6
638 .align  64
639 ${func}_avx:
640 .Lavx_shortcut:
641         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
642         push    %rbx
643         push    %rbp
644         push    %r12
645         push    %r13
646         push    %r14
647         push    %r15
648         mov     %rsp,%r11               # copy %rsp
649         sub     \$`$framesz+$win64*16*10`,%rsp
650         and     \$-64,%rsp              # align stack frame
651
652         shl     \$6,$len
653         sub     $inp,$out               # re-bias
654         sub     $inp,$in0
655         add     $inp,$len               # end of input
656
657         #mov    $inp,$_inp              # saved later
658         mov     $out,$_out
659         mov     $len,$_end
660         #mov    $key,$_key              # remains resident in $inp register
661         mov     $ivp,$_ivp
662         mov     $ctx,$_ctx
663         mov     $in0,$_in0
664         mov     %r11,$_rsp
665 ___
666 $code.=<<___ if ($win64);
667         movaps  %xmm6,`$framesz+16*0`(%rsp)
668         movaps  %xmm7,`$framesz+16*1`(%rsp)
669         movaps  %xmm8,`$framesz+16*2`(%rsp)
670         movaps  %xmm9,`$framesz+16*3`(%rsp)
671         movaps  %xmm10,`$framesz+16*4`(%rsp)
672         movaps  %xmm11,`$framesz+16*5`(%rsp)
673         movaps  %xmm12,`$framesz+16*6`(%rsp)
674         movaps  %xmm13,`$framesz+16*7`(%rsp)
675         movaps  %xmm14,`$framesz+16*8`(%rsp)
676         movaps  %xmm15,`$framesz+16*9`(%rsp)
677 ___
678 $code.=<<___;
679 .Lprologue_avx:
680         vzeroall
681
682         mov     $inp,%r12               # borrow $a4
683         lea     0x80($key),$inp         # size optimization, reassign
684         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
685         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
686         mov     $ctx,%r15               # borrow $a2
687         mov     $in0,%rsi               # borrow $a3
688         vmovdqu ($ivp),$iv              # load IV
689         sub     \$9,%r14
690
691         mov     $SZ*0(%r15),$A
692         mov     $SZ*1(%r15),$B
693         mov     $SZ*2(%r15),$C
694         mov     $SZ*3(%r15),$D
695         mov     $SZ*4(%r15),$E
696         mov     $SZ*5(%r15),$F
697         mov     $SZ*6(%r15),$G
698         mov     $SZ*7(%r15),$H
699
700         vmovdqa 0x00(%r13,%r14,8),$mask14
701         vmovdqa 0x10(%r13,%r14,8),$mask12
702         vmovdqa 0x20(%r13,%r14,8),$mask10
703         vmovdqu 0x00-0x80($inp),$roundkey
704 ___
705                                         if ($SZ==4) {   # SHA256
706     my @X = map("%xmm$_",(0..3));
707     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
708
709 $code.=<<___;
710         jmp     .Lloop_avx
711 .align  16
712 .Lloop_avx:
713         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
714         vmovdqu 0x00(%rsi,%r12),@X[0]
715         vmovdqu 0x10(%rsi,%r12),@X[1]
716         vmovdqu 0x20(%rsi,%r12),@X[2]
717         vmovdqu 0x30(%rsi,%r12),@X[3]
718         vpshufb $t3,@X[0],@X[0]
719         lea     $TABLE(%rip),$Tbl
720         vpshufb $t3,@X[1],@X[1]
721         vpshufb $t3,@X[2],@X[2]
722         vpaddd  0x00($Tbl),@X[0],$t0
723         vpshufb $t3,@X[3],@X[3]
724         vpaddd  0x20($Tbl),@X[1],$t1
725         vpaddd  0x40($Tbl),@X[2],$t2
726         vpaddd  0x60($Tbl),@X[3],$t3
727         vmovdqa $t0,0x00(%rsp)
728         mov     $A,$a1
729         vmovdqa $t1,0x10(%rsp)
730         mov     $B,$a3
731         vmovdqa $t2,0x20(%rsp)
732         xor     $C,$a3                  # magic
733         vmovdqa $t3,0x30(%rsp)
734         mov     $E,$a0
735         jmp     .Lavx_00_47
736
737 .align  16
738 .Lavx_00_47:
739         sub     \$-16*2*$SZ,$Tbl        # size optimization
740         vmovdqu (%r12),$inout           # $a4
741         mov     %r12,$_inp              # $a4
742 ___
743 sub Xupdate_256_AVX () {
744         (
745         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
746          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
747         '&vpsrld        ($t2,$t0,$sigma0[0]);',
748          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
749         '&vpsrld        ($t3,$t0,$sigma0[2])',
750         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
751         '&vpxor         ($t0,$t3,$t2)',
752          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
753         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
754         '&vpxor         ($t0,$t0,$t1)',
755         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
756         '&vpxor         ($t0,$t0,$t2)',
757          '&vpsrld       ($t2,$t3,$sigma1[2]);',
758         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
759          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
760         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
761          '&vpxor        ($t2,$t2,$t3);',
762          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
763          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
764          '&vpshufd      ($t2,$t2,0b10000100)',
765          '&vpsrldq      ($t2,$t2,8)',
766         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
767          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
768          '&vpsrld       ($t2,$t3,$sigma1[2])',
769          '&vpsrlq       ($t3,$t3,$sigma1[0])',
770          '&vpxor        ($t2,$t2,$t3);',
771          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
772          '&vpxor        ($t2,$t2,$t3)',
773          '&vpshufd      ($t2,$t2,0b11101000)',
774          '&vpslldq      ($t2,$t2,8)',
775         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
776         );
777 }
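# Xupdate_256_AVX() returns the sigma0/sigma1 message-schedule update for
# four X[] words as a list of snippet strings; AVX_256_00_47() interleaves
# them with four body_00_15() rounds, roughly three scalar round
# instructions per vector instruction.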
778
779 sub AVX_256_00_47 () {
780 my $j = shift;
781 my $body = shift;
782 my @X = @_;
783 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
784
785         foreach (Xupdate_256_AVX()) {           # 29 instructions
786             eval;
787             eval(shift(@insns));
788             eval(shift(@insns));
789             eval(shift(@insns));
790         }
791         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
792           foreach (@insns) { eval; }            # remaining instructions
793         &vmovdqa        (16*$j."(%rsp)",$t2);
794 }
795
796     $aesni_cbc_idx=0;
797     for ($i=0,$j=0; $j<4; $j++) {
798         &AVX_256_00_47($j,\&body_00_15,@X);
799         push(@X,shift(@X));                     # rotate(@X)
800     }
801         &mov            ("%r12",$_inp);         # borrow $a4
802         &vpand          ($temp,$temp,$mask14);
803         &mov            ("%r15",$_out);         # borrow $a2
804         &vpor           ($iv,$iv,$temp);
805         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
806         &lea            ("%r12","16(%r12)");    # inp++
807
808         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
809         &jne    (".Lavx_00_47");
810
811         &vmovdqu        ($inout,"(%r12)");
812         &mov            ($_inp,"%r12");
813
814     $aesni_cbc_idx=0;
815     for ($i=0; $i<16; ) {
816         foreach(body_00_15()) { eval; }
817     }
818
819                                         }
820 $code.=<<___;
821         mov     $_inp,%r12              # borrow $a4
822         mov     $_out,%r13              # borrow $a0
823         mov     $_ctx,%r15              # borrow $a2
824         mov     $_in0,%rsi              # borrow $a3
825
826         vpand   $mask14,$temp,$temp
827         mov     $a1,$A
828         vpor    $temp,$iv,$iv
829         vmovdqu $iv,(%r13,%r12)         # write output
830         lea     16(%r12),%r12           # inp++
831
832         add     $SZ*0(%r15),$A
833         add     $SZ*1(%r15),$B
834         add     $SZ*2(%r15),$C
835         add     $SZ*3(%r15),$D
836         add     $SZ*4(%r15),$E
837         add     $SZ*5(%r15),$F
838         add     $SZ*6(%r15),$G
839         add     $SZ*7(%r15),$H
840
841         cmp     $_end,%r12
842
843         mov     $A,$SZ*0(%r15)
844         mov     $B,$SZ*1(%r15)
845         mov     $C,$SZ*2(%r15)
846         mov     $D,$SZ*3(%r15)
847         mov     $E,$SZ*4(%r15)
848         mov     $F,$SZ*5(%r15)
849         mov     $G,$SZ*6(%r15)
850         mov     $H,$SZ*7(%r15)
851         jb      .Lloop_avx
852
853         mov     $_ivp,$ivp
854         mov     $_rsp,%rsi
855         vmovdqu $iv,($ivp)              # output IV
856         vzeroall
857 ___
858 $code.=<<___ if ($win64);
859         movaps  `$framesz+16*0`(%rsp),%xmm6
860         movaps  `$framesz+16*1`(%rsp),%xmm7
861         movaps  `$framesz+16*2`(%rsp),%xmm8
862         movaps  `$framesz+16*3`(%rsp),%xmm9
863         movaps  `$framesz+16*4`(%rsp),%xmm10
864         movaps  `$framesz+16*5`(%rsp),%xmm11
865         movaps  `$framesz+16*6`(%rsp),%xmm12
866         movaps  `$framesz+16*7`(%rsp),%xmm13
867         movaps  `$framesz+16*8`(%rsp),%xmm14
868         movaps  `$framesz+16*9`(%rsp),%xmm15
869 ___
870 $code.=<<___;
871         mov     (%rsi),%r15
872         mov     8(%rsi),%r14
873         mov     16(%rsi),%r13
874         mov     24(%rsi),%r12
875         mov     32(%rsi),%rbp
876         mov     40(%rsi),%rbx
877         lea     48(%rsi),%rsp
878 .Lepilogue_avx:
879         ret
880 .size   ${func}_avx,.-${func}_avx
881 ___
882
883 if ($avx>1) {{
884 ######################################################################
885 # AVX2+BMI code path
886 #
887 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
888 my $PUSH8=8*2*$SZ;
889 use integer;
890
891 sub bodyx_00_15 () {
892         # at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
893         (
894         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
895
896         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
897         '&and   ($a4,$e)',              # f&e
898         '&rorx  ($a0,$e,$Sigma1[2])',
899         '&rorx  ($a2,$e,$Sigma1[1])',
900
901         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
902         '&lea   ($h,"($h,$a4)")',
903         '&andn  ($a4,$e,$g)',           # ~e&g
904         '&xor   ($a0,$a2)',
905
906         '&rorx  ($a1,$e,$Sigma1[0])',
907         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
908         '&xor   ($a0,$a1)',             # Sigma1(e)
909         '&mov   ($a2,$a)',
910
911         '&rorx  ($a4,$a,$Sigma0[2])',
912         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
913         '&xor   ($a2,$b)',              # a^b, b^c in next round
914         '&rorx  ($a1,$a,$Sigma0[1])',
915
916         '&rorx  ($a0,$a,$Sigma0[0])',
917         '&lea   ($d,"($d,$h)")',        # d+=h
918         '&and   ($a3,$a2)',             # (b^c)&(a^b)
919         @aesni_cbc_block[$aesni_cbc_idx++].
920         '&xor   ($a1,$a4)',
921
922         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
923         '&xor   ($a1,$a0)',             # Sigma0(a)
924         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
925         '&mov   ($a4,$e)',              # copy of f in future
926
927         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
928         );
929         # and at the finish one has to add $a1 to $a, i.e. $a+=$a1
930 }
931
932 $code.=<<___;
933 .type   ${func}_avx2,\@function,6
934 .align  64
935 ${func}_avx2:
936 .Lavx2_shortcut:
937         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
938         push    %rbx
939         push    %rbp
940         push    %r12
941         push    %r13
942         push    %r14
943         push    %r15
944         mov     %rsp,%r11               # copy %rsp
945         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
946         and     \$-256*$SZ,%rsp         # align stack frame
947         add     \$`2*$SZ*($rounds-8)`,%rsp
948
949         shl     \$6,$len
950         sub     $inp,$out               # re-bias
951         sub     $inp,$in0
952         add     $inp,$len               # end of input
953
954         #mov    $inp,$_inp              # saved later
955         #mov    $out,$_out              # kept in $offload
956         mov     $len,$_end
957         #mov    $key,$_key              # remains resident in $inp register
958         mov     $ivp,$_ivp
959         mov     $ctx,$_ctx
960         mov     $in0,$_in0
961         mov     %r11,$_rsp
962 ___
963 $code.=<<___ if ($win64);
964         movaps  %xmm6,`$framesz+16*0`(%rsp)
965         movaps  %xmm7,`$framesz+16*1`(%rsp)
966         movaps  %xmm8,`$framesz+16*2`(%rsp)
967         movaps  %xmm9,`$framesz+16*3`(%rsp)
968         movaps  %xmm10,`$framesz+16*4`(%rsp)
969         movaps  %xmm11,`$framesz+16*5`(%rsp)
970         movaps  %xmm12,`$framesz+16*6`(%rsp)
971         movaps  %xmm13,`$framesz+16*7`(%rsp)
972         movaps  %xmm14,`$framesz+16*8`(%rsp)
973         movaps  %xmm15,`$framesz+16*9`(%rsp)
974 ___
975 $code.=<<___;
976 .Lprologue_avx2:
977         vzeroall
978
979         mov     $inp,%r13               # borrow $a0
980         vpinsrq \$1,$out,$offload,$offload
981         lea     0x80($key),$inp         # size optimization, reassign
982         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
983         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
984         mov     $ctx,%r15               # borrow $a2
985         mov     $in0,%rsi               # borrow $a3
986         vmovdqu ($ivp),$iv              # load IV
987         lea     -9(%r14),%r14
988
989         vmovdqa 0x00(%r12,%r14,8),$mask14
990         vmovdqa 0x10(%r12,%r14,8),$mask12
991         vmovdqa 0x20(%r12,%r14,8),$mask10
992
993         sub     \$-16*$SZ,%r13          # inp++, size optimization
994         mov     $SZ*0(%r15),$A
995         lea     (%rsi,%r13),%r12        # borrow $a0
996         mov     $SZ*1(%r15),$B
997         cmp     $len,%r13               # $_end
998         mov     $SZ*2(%r15),$C
999         cmove   %rsp,%r12               # next block or random data
1000         mov     $SZ*3(%r15),$D
1001         mov     $SZ*4(%r15),$E
1002         mov     $SZ*5(%r15),$F
1003         mov     $SZ*6(%r15),$G
1004         mov     $SZ*7(%r15),$H
1005         vmovdqu 0x00-0x80($inp),$roundkey
1006 ___
1007                                         if ($SZ==4) {   # SHA256
1008     my @X = map("%ymm$_",(0..3));
1009     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1010
1011 $code.=<<___;
1012         jmp     .Loop_avx2
1013 .align  16
1014 .Loop_avx2:
1015         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1016         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1017         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1018         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1019         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1020
1021         vinserti128     \$1,(%r12),@X[0],@X[0]
1022         vinserti128     \$1,16(%r12),@X[1],@X[1]
1023          vpshufb        $t3,@X[0],@X[0]
1024         vinserti128     \$1,32(%r12),@X[2],@X[2]
1025          vpshufb        $t3,@X[1],@X[1]
1026         vinserti128     \$1,48(%r12),@X[3],@X[3]
1027
1028         lea     $TABLE(%rip),$Tbl
1029         vpshufb $t3,@X[2],@X[2]
1030         lea     -16*$SZ(%r13),%r13
1031         vpaddd  0x00($Tbl),@X[0],$t0
1032         vpshufb $t3,@X[3],@X[3]
1033         vpaddd  0x20($Tbl),@X[1],$t1
1034         vpaddd  0x40($Tbl),@X[2],$t2
1035         vpaddd  0x60($Tbl),@X[3],$t3
1036         vmovdqa $t0,0x00(%rsp)
1037         xor     $a1,$a1
1038         vmovdqa $t1,0x20(%rsp)
1039         lea     -$PUSH8(%rsp),%rsp
1040         mov     $B,$a3
1041         vmovdqa $t2,0x00(%rsp)
1042         xor     $C,$a3                  # magic
1043         vmovdqa $t3,0x20(%rsp)
1044         mov     $F,$a4
1045         sub     \$-16*2*$SZ,$Tbl        # size optimization
1046         jmp     .Lavx2_00_47
1047
1048 .align  16
1049 .Lavx2_00_47:
1050         vmovdqu (%r13),$inout
1051         vpinsrq \$0,%r13,$offload,$offload
1052 ___
1053
1054 sub AVX2_256_00_47 () {
1055 my $j = shift;
1056 my $body = shift;
1057 my @X = @_;
1058 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1059 my $base = "+2*$PUSH8(%rsp)";
1060
1061         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1062         foreach (Xupdate_256_AVX()) {           # 29 instructions
1063             eval;
1064             eval(shift(@insns));
1065             eval(shift(@insns));
1066             eval(shift(@insns));
1067         }
1068         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1069           foreach (@insns) { eval; }            # remaining instructions
1070         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1071 }
1072     $aesni_cbc_idx=0;
1073     for ($i=0,$j=0; $j<4; $j++) {
1074         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1075         push(@X,shift(@X));                     # rotate(@X)
1076     }
1077         &vmovq          ("%r13",$offload);      # borrow $a0
1078         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1079         &vpand          ($temp,$temp,$mask14);
1080         &vpor           ($iv,$iv,$temp);
1081         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1082         &lea            ("%r13","16(%r13)");    # inp++
1083
1084         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1085         &cmpb   (($SZ-1)."($Tbl)",0);
1086         &jne    (".Lavx2_00_47");
1087
1088         &vmovdqu        ($inout,"(%r13)");
1089         &vpinsrq        ($offload,$offload,"%r13",0);
1090
1091     $aesni_cbc_idx=0;
1092     for ($i=0; $i<16; ) {
1093         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1094         foreach(bodyx_00_15()) { eval; }
1095     }
1096                                         }
1097 $code.=<<___;
1098         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1099         vmovq   $offload,%r13                   # $_inp, borrow $a0
1100         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1101         add     $a1,$A
1102         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1103
1104         vpand   $mask14,$temp,$temp
1105         vpor    $temp,$iv,$iv
1106         vmovdqu $iv,(%r12,%r13)                 # write output
1107         lea     16(%r13),%r13
1108
1109         add     $SZ*0(%r15),$A
1110         add     $SZ*1(%r15),$B
1111         add     $SZ*2(%r15),$C
1112         add     $SZ*3(%r15),$D
1113         add     $SZ*4(%r15),$E
1114         add     $SZ*5(%r15),$F
1115         add     $SZ*6(%r15),$G
1116         add     $SZ*7(%r15),$H
1117
1118         mov     $A,$SZ*0(%r15)
1119         mov     $B,$SZ*1(%r15)
1120         mov     $C,$SZ*2(%r15)
1121         mov     $D,$SZ*3(%r15)
1122         mov     $E,$SZ*4(%r15)
1123         mov     $F,$SZ*5(%r15)
1124         mov     $G,$SZ*6(%r15)
1125         mov     $H,$SZ*7(%r15)
1126
1127         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1128         je      .Ldone_avx2
1129
1130         xor     $a1,$a1
1131         mov     $B,$a3
1132         mov     $F,$a4
1133         xor     $C,$a3                  # magic
1134         jmp     .Lower_avx2
1135 .align  16
1136 .Lower_avx2:
1137         vmovdqu (%r13),$inout
1138         vpinsrq \$0,%r13,$offload,$offload
1139 ___
1140     $aesni_cbc_idx=0;
1141     for ($i=0; $i<16; ) {
1142         my $base="+16($Tbl)";
1143         foreach(bodyx_00_15()) { eval; }
1144         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1145     }
1146 $code.=<<___;
1147         vmovq   $offload,%r13                   # borrow $a0
1148         vpextrq \$1,$offload,%r15               # borrow $a2
1149         vpand   $mask14,$temp,$temp
1150         vpor    $temp,$iv,$iv
1151         lea     -$PUSH8($Tbl),$Tbl
1152         vmovdqu $iv,(%r15,%r13)                 # write output
1153         lea     16(%r13),%r13                   # inp++
1154         cmp     %rsp,$Tbl
1155         jae     .Lower_avx2
1156
1157         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1158         lea     16*$SZ(%r13),%r13
1159         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1160         add     $a1,$A
1161         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1162
1163         add     $SZ*0(%r15),$A
1164         add     $SZ*1(%r15),$B
1165         add     $SZ*2(%r15),$C
1166         add     $SZ*3(%r15),$D
1167         add     $SZ*4(%r15),$E
1168         add     $SZ*5(%r15),$F
1169         add     $SZ*6(%r15),$G
1170         lea     (%rsi,%r13),%r12
1171         add     $SZ*7(%r15),$H
1172
1173         cmp     $_end,%r13
1174
1175         mov     $A,$SZ*0(%r15)
1176         cmove   %rsp,%r12               # next block or stale data
1177         mov     $B,$SZ*1(%r15)
1178         mov     $C,$SZ*2(%r15)
1179         mov     $D,$SZ*3(%r15)
1180         mov     $E,$SZ*4(%r15)
1181         mov     $F,$SZ*5(%r15)
1182         mov     $G,$SZ*6(%r15)
1183         mov     $H,$SZ*7(%r15)
1184
1185         jbe     .Loop_avx2
1186         lea     (%rsp),$Tbl
1187
1188 .Ldone_avx2:
1189         lea     ($Tbl),%rsp
1190         mov     $_ivp,$ivp
1191         mov     $_rsp,%rsi
1192         vmovdqu $iv,($ivp)              # output IV
1193         vzeroall
1194 ___
1195 $code.=<<___ if ($win64);
1196         movaps  `$framesz+16*0`(%rsp),%xmm6
1197         movaps  `$framesz+16*1`(%rsp),%xmm7
1198         movaps  `$framesz+16*2`(%rsp),%xmm8
1199         movaps  `$framesz+16*3`(%rsp),%xmm9
1200         movaps  `$framesz+16*4`(%rsp),%xmm10
1201         movaps  `$framesz+16*5`(%rsp),%xmm11
1202         movaps  `$framesz+16*6`(%rsp),%xmm12
1203         movaps  `$framesz+16*7`(%rsp),%xmm13
1204         movaps  `$framesz+16*8`(%rsp),%xmm14
1205         movaps  `$framesz+16*9`(%rsp),%xmm15
1206 ___
1207 $code.=<<___;
1208         mov     (%rsi),%r15
1209         mov     8(%rsi),%r14
1210         mov     16(%rsi),%r13
1211         mov     24(%rsi),%r12
1212         mov     32(%rsi),%rbp
1213         mov     40(%rsi),%rbx
1214         lea     48(%rsi),%rsp
1215 .Lepilogue_avx2:
1216         ret
1217 .size   ${func}_avx2,.-${func}_avx2
1218 ___
1219 }}
1220 }}
1221 {{
1222 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1223
1224 my ($rounds,$Tbl)=("%r11d","%rbx");
1225
1226 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1227 my @rndkey=("%xmm4","%xmm5");
1228 my $r=0;
1229 my $sn=0;
1230
1231 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1232 my @MSG=map("%xmm$_",(10..13));
1233
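# &$aesenc() emits one step of CBC-encrypting the current 16-byte block:
# $r counts steps, $k=$r%10 is the position within the block ($k==0 loads
# and whitens the next input block, $k==9 finishes it, taking the extra
# rounds of 192/256-bit keys into account), and the ten steps per block
# are interleaved with the SHA rounds below.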
1234 my $aesenc=sub {
1235   use integer;
1236   my ($n,$k)=($r/10,$r%10);
1237     if ($k==0) {
1238       $code.=<<___;
1239         movups          `16*$n`($in0),$in               # load input
1240         xorps           $rndkey0,$in
1241 ___
1242       $code.=<<___ if ($n);
1243         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1244 ___
1245       $code.=<<___;
1246         xorps           $in,$iv
1247         movups          `32+16*$k-112`($key),$rndkey[1]
1248         aesenc          $rndkey[0],$iv
1249 ___
1250     } elsif ($k==9) {
1251       $sn++;
1252       $code.=<<___;
1253         cmp             \$11,$rounds
1254         jb              .Laesenclast$sn
1255         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1256         aesenc          $rndkey[0],$iv
1257         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1258         aesenc          $rndkey[1],$iv
1259         je              .Laesenclast$sn
1260         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1261         aesenc          $rndkey[0],$iv
1262         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1263         aesenc          $rndkey[1],$iv
1264 .Laesenclast$sn:
1265         aesenclast      $rndkey[0],$iv
1266         movups          16-112($key),$rndkey[1]         # forward reference
1267         nop
1268 ___
1269     } else {
1270       $code.=<<___;
1271         movups          `32+16*$k-112`($key),$rndkey[1]
1272         aesenc          $rndkey[0],$iv
1273 ___
1274     }
1275     $r++;       unshift(@rndkey,pop(@rndkey));
1276 };
1277
1278 if ($shaext) {
1279 my $Tbl="%rax";
1280
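# The SHA Extensions path keeps the hash state in the ABEF/CDGH form
# expected by sha256rnds2 (each sha256rnds2 performs two rounds, with
# pshufd moving the upper half of $Wi down for the next two), expands the
# message with sha256msg1/sha256msg2, and weaves in one &$aesenc() step
# between SHA instructions.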
1281 $code.=<<___;
1282 .type   ${func}_shaext,\@function,6
1283 .align  32
1284 ${func}_shaext:
1285         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1286 ___
1287 $code.=<<___ if ($win64);
1288         lea     `-8-10*16`(%rsp),%rsp
1289         movaps  %xmm6,-8-10*16(%rax)
1290         movaps  %xmm7,-8-9*16(%rax)
1291         movaps  %xmm8,-8-8*16(%rax)
1292         movaps  %xmm9,-8-7*16(%rax)
1293         movaps  %xmm10,-8-6*16(%rax)
1294         movaps  %xmm11,-8-5*16(%rax)
1295         movaps  %xmm12,-8-4*16(%rax)
1296         movaps  %xmm13,-8-3*16(%rax)
1297         movaps  %xmm14,-8-2*16(%rax)
1298         movaps  %xmm15,-8-1*16(%rax)
1299 .Lprologue_shaext:
1300 ___
1301 $code.=<<___;
1302         lea             K256+0x80(%rip),$Tbl
1303         movdqu          ($ctx),$ABEF            # DCBA
1304         movdqu          16($ctx),$CDGH          # HGFE
1305         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1306
1307         mov             240($key),$rounds
1308         sub             $in0,$out
1309         movups          ($key),$rndkey0         # $key[0]
1310         movups          16($key),$rndkey[0]     # forward reference
1311         lea             112($key),$key          # size optimization
1312
1313         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1314         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1315         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1316         movdqa          $TMP,$BSWAP             # offload
1317         palignr         \$8,$CDGH,$ABEF         # ABEF
1318         punpcklqdq      $Wi,$CDGH               # CDGH
1319
1320         jmp     .Loop_shaext
1321
1322 .align  16
1323 .Loop_shaext:
1324         movdqu          ($inp),@MSG[0]
1325         movdqu          0x10($inp),@MSG[1]
1326         movdqu          0x20($inp),@MSG[2]
1327         pshufb          $TMP,@MSG[0]
1328         movdqu          0x30($inp),@MSG[3]
1329
1330         movdqa          0*32-0x80($Tbl),$Wi
1331         paddd           @MSG[0],$Wi
1332         pshufb          $TMP,@MSG[1]
1333         movdqa          $CDGH,$CDGH_SAVE        # offload
1334         movdqa          $ABEF,$ABEF_SAVE        # offload
1335 ___
1336         &$aesenc();
1337 $code.=<<___;
1338         sha256rnds2     $ABEF,$CDGH             # 0-3
1339         pshufd          \$0x0e,$Wi,$Wi
1340 ___
1341         &$aesenc();
1342 $code.=<<___;
1343         sha256rnds2     $CDGH,$ABEF
1344
1345         movdqa          1*32-0x80($Tbl),$Wi
1346         paddd           @MSG[1],$Wi
1347         pshufb          $TMP,@MSG[2]
1348         lea             0x40($inp),$inp
1349 ___
1350         &$aesenc();
1351 $code.=<<___;
1352         sha256rnds2     $ABEF,$CDGH             # 4-7
1353         pshufd          \$0x0e,$Wi,$Wi
1354 ___
1355         &$aesenc();
1356 $code.=<<___;
1357         sha256rnds2     $CDGH,$ABEF
1358
1359         movdqa          2*32-0x80($Tbl),$Wi
1360         paddd           @MSG[2],$Wi
1361         pshufb          $TMP,@MSG[3]
1362         sha256msg1      @MSG[1],@MSG[0]
1363 ___
1364         &$aesenc();
1365 $code.=<<___;
1366         sha256rnds2     $ABEF,$CDGH             # 8-11
1367         pshufd          \$0x0e,$Wi,$Wi
1368         movdqa          @MSG[3],$TMP
1369         palignr         \$4,@MSG[2],$TMP
1370         paddd           $TMP,@MSG[0]
1371 ___
1372         &$aesenc();
1373 $code.=<<___;
1374         sha256rnds2     $CDGH,$ABEF
1375
1376         movdqa          3*32-0x80($Tbl),$Wi
1377         paddd           @MSG[3],$Wi
1378         sha256msg2      @MSG[3],@MSG[0]
1379         sha256msg1      @MSG[2],@MSG[1]
1380 ___
1381         &$aesenc();
1382 $code.=<<___;
1383         sha256rnds2     $ABEF,$CDGH             # 12-15
1384         pshufd          \$0x0e,$Wi,$Wi
1385 ___
1386         &$aesenc();
1387 $code.=<<___;
1388         movdqa          @MSG[0],$TMP
1389         palignr         \$4,@MSG[3],$TMP
1390         paddd           $TMP,@MSG[1]
1391         sha256rnds2     $CDGH,$ABEF
1392 ___
1393 for($i=4;$i<16-3;$i++) {
1394         &$aesenc()      if (($r%10)==0);
1395 $code.=<<___;
1396         movdqa          $i*32-0x80($Tbl),$Wi
1397         paddd           @MSG[0],$Wi
1398         sha256msg2      @MSG[0],@MSG[1]
1399         sha256msg1      @MSG[3],@MSG[2]
1400 ___
1401         &$aesenc();
1402 $code.=<<___;
1403         sha256rnds2     $ABEF,$CDGH             # 16-19...
1404         pshufd          \$0x0e,$Wi,$Wi
1405         movdqa          @MSG[1],$TMP
1406         palignr         \$4,@MSG[0],$TMP
1407         paddd           $TMP,@MSG[2]
1408 ___
1409         &$aesenc();
1410         &$aesenc()      if ($r==19);
1411 $code.=<<___;
1412         sha256rnds2     $CDGH,$ABEF
1413 ___
1414         push(@MSG,shift(@MSG));
1415 }
1416 $code.=<<___;
1417         movdqa          13*32-0x80($Tbl),$Wi
1418         paddd           @MSG[0],$Wi
1419         sha256msg2      @MSG[0],@MSG[1]
1420         sha256msg1      @MSG[3],@MSG[2]
1421 ___
1422         &$aesenc();
1423 $code.=<<___;
1424         sha256rnds2     $ABEF,$CDGH             # 52-55
1425         pshufd          \$0x0e,$Wi,$Wi
1426         movdqa          @MSG[1],$TMP
1427         palignr         \$4,@MSG[0],$TMP
1428         paddd           $TMP,@MSG[2]
1429 ___
1430         &$aesenc();
1431         &$aesenc();
1432 $code.=<<___;
1433         sha256rnds2     $CDGH,$ABEF
1434
1435         movdqa          14*32-0x80($Tbl),$Wi
1436         paddd           @MSG[1],$Wi
1437         sha256msg2      @MSG[1],@MSG[2]
1438         movdqa          $BSWAP,$TMP
1439 ___
1440         &$aesenc();
1441 $code.=<<___;
1442         sha256rnds2     $ABEF,$CDGH             # 56-59
1443         pshufd          \$0x0e,$Wi,$Wi
1444 ___
1445         &$aesenc();
1446 $code.=<<___;
1447         sha256rnds2     $CDGH,$ABEF
1448
1449         movdqa          15*32-0x80($Tbl),$Wi
1450         paddd           @MSG[2],$Wi
1451 ___
1452         &$aesenc();
1453         &$aesenc();
1454 $code.=<<___;
1455         sha256rnds2     $ABEF,$CDGH             # 60-63
1456         pshufd          \$0x0e,$Wi,$Wi
1457 ___
1458         &$aesenc();
1459 $code.=<<___;
1460         sha256rnds2     $CDGH,$ABEF
1461         #pxor           $CDGH,$rndkey0          # black magic
1462 ___
1463         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1464 $code.=<<___;
1465         #xorps          $CDGH,$rndkey0          # black magic
1466         paddd           $CDGH_SAVE,$CDGH
1467         paddd           $ABEF_SAVE,$ABEF
1468
1469         dec             $len
1470         movups          $iv,48($out,$in0)       # write output
1471         lea             64($in0),$in0
1472         jnz             .Loop_shaext
1473
1474         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1475         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1476         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1477         punpckhqdq      $CDGH,$ABEF             # DCBA
1478         palignr         \$8,$TMP,$CDGH          # HGFE
1479
1480         movups          $iv,($ivp)              # write IV
1481         movdqu          $ABEF,($ctx)
1482         movdqu          $CDGH,16($ctx)
1483 ___
1484 $code.=<<___ if ($win64);
1485         movaps  0*16(%rsp),%xmm6
1486         movaps  1*16(%rsp),%xmm7
1487         movaps  2*16(%rsp),%xmm8
1488         movaps  3*16(%rsp),%xmm9
1489         movaps  4*16(%rsp),%xmm10
1490         movaps  5*16(%rsp),%xmm11
1491         movaps  6*16(%rsp),%xmm12
1492         movaps  7*16(%rsp),%xmm13
1493         movaps  8*16(%rsp),%xmm14
1494         movaps  9*16(%rsp),%xmm15
1495         lea     8+10*16(%rsp),%rsp
1496 .Lepilogue_shaext:
1497 ___
1498 $code.=<<___;
1499         ret
1500 .size   ${func}_shaext,.-${func}_shaext
1501 ___
1502 }
1503 }}}}}
1504
1505 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1506 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1507 if ($win64 && $avx) {
1508 $rec="%rcx";
1509 $frame="%rdx";
1510 $context="%r8";
1511 $disp="%r9";
1512
1513 $code.=<<___;
1514 .extern __imp_RtlVirtualUnwind
1515 .type   se_handler,\@abi-omnipotent
1516 .align  16
1517 se_handler:
1518         push    %rsi
1519         push    %rdi
1520         push    %rbx
1521         push    %rbp
1522         push    %r12
1523         push    %r13
1524         push    %r14
1525         push    %r15
1526         pushfq
1527         sub     \$64,%rsp
1528
1529         mov     120($context),%rax      # pull context->Rax
1530         mov     248($context),%rbx      # pull context->Rip
1531
1532         mov     8($disp),%rsi           # disp->ImageBase
1533         mov     56($disp),%r11          # disp->HandlerData
1534
1535         mov     0(%r11),%r10d           # HandlerData[0]
1536         lea     (%rsi,%r10),%r10        # prologue label
1537         cmp     %r10,%rbx               # context->Rip<prologue label
1538         jb      .Lin_prologue
1539
1540         mov     152($context),%rax      # pull context->Rsp
1541
1542         mov     4(%r11),%r10d           # HandlerData[1]
1543         lea     (%rsi,%r10),%r10        # epilogue label
1544         cmp     %r10,%rbx               # context->Rip>=epilogue label
1545         jae     .Lin_prologue
1546 ___
1547 $code.=<<___ if ($shaext);
1548         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1549         cmp     %r10,%rbx
1550         jb      .Lnot_in_shaext
1551
1552         lea     (%rax),%rsi
1553         lea     512($context),%rdi      # &context.Xmm6
1554         mov     \$20,%ecx
1555         .long   0xa548f3fc              # cld; rep movsq
1556         lea     168(%rax),%rax          # adjust stack pointer
1557         jmp     .Lin_prologue
1558 .Lnot_in_shaext:
1559 ___
1560 $code.=<<___ if ($avx>1);
1561         lea     .Lavx2_shortcut(%rip),%r10
1562         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1563         jb      .Lnot_in_avx2
1564
1565         and     \$-256*$SZ,%rax
1566         add     \$`2*$SZ*($rounds-8)`,%rax
1567 .Lnot_in_avx2:
1568 ___
1569 $code.=<<___;
1570         mov     %rax,%rsi               # put aside Rsp
1571         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1572         lea     48(%rax),%rax
1573
1574         mov     -8(%rax),%rbx
1575         mov     -16(%rax),%rbp
1576         mov     -24(%rax),%r12
1577         mov     -32(%rax),%r13
1578         mov     -40(%rax),%r14
1579         mov     -48(%rax),%r15
1580         mov     %rbx,144($context)      # restore context->Rbx
1581         mov     %rbp,160($context)      # restore context->Rbp
1582         mov     %r12,216($context)      # restore context->R12
1583         mov     %r13,224($context)      # restore context->R13
1584         mov     %r14,232($context)      # restore context->R14
1585         mov     %r15,240($context)      # restore context->R15
1586
1587         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6..Xmm15 save area
1588         lea     512($context),%rdi      # &context.Xmm6
1589         mov     \$20,%ecx
1590         .long   0xa548f3fc              # cld; rep movsq
1591
1592 .Lin_prologue:
1593         mov     8(%rax),%rdi
1594         mov     16(%rax),%rsi
1595         mov     %rax,152($context)      # restore context->Rsp
1596         mov     %rsi,168($context)      # restore context->Rsi
1597         mov     %rdi,176($context)      # restore context->Rdi
1598
1599         mov     40($disp),%rdi          # disp->ContextRecord
1600         mov     $context,%rsi           # context
1601         mov     \$154,%ecx              # sizeof(CONTEXT)
1602         .long   0xa548f3fc              # cld; rep movsq
1603
1604         mov     $disp,%rsi
1605         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1606         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1607         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1608         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1609         mov     40(%rsi),%r10           # disp->ContextRecord
1610         lea     56(%rsi),%r11           # &disp->HandlerData
1611         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1612         mov     %r10,32(%rsp)           # arg5
1613         mov     %r11,40(%rsp)           # arg6
1614         mov     %r12,48(%rsp)           # arg7
1615         mov     %rcx,56(%rsp)           # arg8, (NULL)
1616         call    *__imp_RtlVirtualUnwind(%rip)
1617
1618         mov     \$1,%eax                # ExceptionContinueSearch
1619         add     \$64,%rsp
1620         popfq
1621         pop     %r15
1622         pop     %r14
1623         pop     %r13
1624         pop     %r12
1625         pop     %rbp
1626         pop     %rbx
1627         pop     %rdi
1628         pop     %rsi
1629         ret
1630 .size   se_handler,.-se_handler
1631
1632 .section        .pdata
1633         .rva    .LSEH_begin_${func}_xop
1634         .rva    .LSEH_end_${func}_xop
1635         .rva    .LSEH_info_${func}_xop
1636
1637         .rva    .LSEH_begin_${func}_avx
1638         .rva    .LSEH_end_${func}_avx
1639         .rva    .LSEH_info_${func}_avx
1640 ___
1641 $code.=<<___ if ($avx>1);
1642         .rva    .LSEH_begin_${func}_avx2
1643         .rva    .LSEH_end_${func}_avx2
1644         .rva    .LSEH_info_${func}_avx2
1645 ___
1646 $code.=<<___ if ($shaext);
1647         .rva    .LSEH_begin_${func}_shaext
1648         .rva    .LSEH_end_${func}_shaext
1649         .rva    .LSEH_info_${func}_shaext
1650 ___
1651 $code.=<<___;
1652 .section        .xdata
1653 .align  8
1654 .LSEH_info_${func}_xop:
1655         .byte   9,0,0,0
1656         .rva    se_handler
1657         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1658
1659 .LSEH_info_${func}_avx:
1660         .byte   9,0,0,0
1661         .rva    se_handler
1662         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1663 ___
1664 $code.=<<___ if ($avx>1);
1665 .LSEH_info_${func}_avx2:
1666         .byte   9,0,0,0
1667         .rva    se_handler
1668         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1669 ___
1670 $code.=<<___ if ($shaext);
1671 .LSEH_info_${func}_shaext:
1672         .byte   9,0,0,0
1673         .rva    se_handler
1674         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1675 ___
1676 }
1677
1678 ####################################################################
1679 sub rex {
1680   local *opcode=shift;
1681   my ($dst,$src)=@_;
1682   my $rex=0;
1683
1684     $rex|=0x04                  if($dst>=8);
1685     $rex|=0x01                  if($src>=8);
1686     unshift @opcode,$rex|0x40   if($rex);
1687 }
1688
1689 {
1690   my %opcodelet = (
1691                 "sha256rnds2" => 0xcb,
1692                 "sha256msg1"  => 0xcc,
1693                 "sha256msg2"  => 0xcd   );
1694
1695   sub sha256op38 {
1696     my $instr = shift;
1697
1698     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1699       my @opcode=(0x0f,0x38);
1700         rex(\@opcode,$2,$1);
1701         push @opcode,$opcodelet{$instr};
1702         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1703         return ".byte\t".join(',',@opcode);
1704     } else {
1705         return $instr."\t".@_[0];
1706     }
1707   }
1708 }
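# sha256op38() hand-assembles the SHA instructions for assemblers that do
# not know them yet; for example "sha256rnds2 %xmm0,%xmm2" is emitted as
# ".byte 0x0f,0x38,0xcb,0xd0" (ModR/M 0xd0 = reg %xmm2, r/m %xmm0), with
# a REX prefix prepended when %xmm8..%xmm15 are involved.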
1709
1710 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1711 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1712 print $code;
1713 close STDOUT;