ceec9a6e1eb51a66166439dd9b161fc8580ae534
[openssl.git] / crypto / aes / asm / aesni-sha256-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # January 2013
18 #
19 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
22 # parallelism, interleaving it with another algorithm makes it possible
23 # to utilize processor resources better and achieve better performance.
24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25 # AESNI code is woven into them. As SHA256 dominates execution time,
26 # stitch performance does not depend on AES key length. Below are
27 # performance numbers in cycles per processed byte, less is better,
28 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
29 # subroutine:
30 #
31 #                AES-128/-192/-256+SHA256       this(**)gain
32 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
33 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
34 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
35 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
36 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
37 #
38 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
39 #       Westmere is left out; this is because the gain was not
40 #       estimated to be high enough to justify the effort;
41 # (**)  these are EVP-free results, results obtained with 'speed
42 #       -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
43 # Command line: [flavour] output. A lone argument containing a dot is taken to be the output file.
44 $flavour = shift;
45 $output  = shift;
46 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
47 # Win64 conventions apply for nasm/masm/mingw64 flavours or when the output file ends in .asm.
48 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49 # Locate the x86_64-xlate.pl translator: first next to this script, then under ../../perlasm/.
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53 die "can't locate x86_64-xlate.pl";
54 # Probe the toolchain: $avx ends up 0 (no SIMD paths), 1 (AVX/XOP paths) or 2 (AVX2 path as well).
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57         $avx = ($1>=2.19) + ($1>=2.22);
58 }
59 # Win64 only: fall back to the NASM version banner.
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62         $avx = ($1>=2.09) + ($1>=2.10);
63 }
64 # Win64 only: fall back to the MASM (ml64) version banner.
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67         $avx = ($1>=10) + ($1>=12);
68 }
69 # clang/LLVM: the major version may have several digits (10+) and the banner may carry a vendor prefix ("Ubuntu clang version ...").
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
71         $avx = ($2>=3.0) + ($2>3.0);
72 }
73 # The SHA-extension branch is emitted whenever any AVX level was detected; if $shaext is forced to 0, $avx is capped at 1.
74 $shaext=$avx;   ### set to zero if compiling for 1.0.1
75 $avx=1          if (!$shaext && $avx);
76 # Everything printed to STDOUT is piped through the perlasm translator; abort if the pipe cannot be started.
77 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
78 *STDOUT=*OUT;
79 # Names and SHA-256 parameters shared by every code path generated below.
80 $func="aesni_cbc_sha256_enc";
81 $TABLE="K256";
82 $SZ=4;                  # bytes per hash word; the `$SZ==4` tests below select the SHA256 code
83 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
84                                 "%r8d","%r9d","%r10d","%r11d");        # working variables a..h; body_00_15 rotates @ROT every round
85 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");        # scalar temporaries of the round function
86 @Sigma0=( 2,13,22);     # rotate counts used by the round function (Sigma0)
87 @Sigma1=( 6,11,25);     # rotate counts used by the round function (Sigma1)
88 @sigma0=( 7,18, 3);     # rotate/shift counts used by the message schedule (last entry is a plain shift)
89 @sigma1=(17,19,10);     # rotate/shift counts used by the message schedule (last entry is a plain shift)
90 $rounds=64;
91 # Argument registers of the generated routine; $in0 is the 7th argument and is loaded from the stack into %r10 by each prologue.
92 ########################################################################
93 # void aesni_cbc_sha256_enc(const void *inp,
94 #                       void *out,
95 #                       size_t length,    /* shifted left by 6 in the prologues, i.e. counted in 64-byte blocks */
96 #                       const AES_KEY *key,
97 #                       unsigned char *iv,
98 #                       SHA256_CTX *ctx,
99 #                       const void *in0);
100 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
101 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
102 # $Tbl walks the K256 constant table.
103 $Tbl="%rbp";
104 # Stack frame: 16*$SZ bytes of X[i]+K[i] at the bottom, followed by eight 8-byte slots.
105 $_inp="16*$SZ+0*8(%rsp)";
106 $_out="16*$SZ+1*8(%rsp)";
107 $_end="16*$SZ+2*8(%rsp)";
108 $_key="16*$SZ+3*8(%rsp)";
109 $_ivp="16*$SZ+4*8(%rsp)";
110 $_ctx="16*$SZ+5*8(%rsp)";
111 $_in0="16*$SZ+6*8(%rsp)";
112 $_rsp="`16*$SZ+7*8`(%rsp)";     # back-quoted, unlike its siblings; also used inside a .cfi_cfa_expression directive (presumably folded to a number by a later pass not shown here)
113 $framesz=16*$SZ+8*8;
114
115 $code=<<___;    # public entry point: picks a code path from OPENSSL_ia32cap_P; a NULL first argument only probes
116 .text
117
118 .extern OPENSSL_ia32cap_P
119 .globl  $func
120 .type   $func,\@abi-omnipotent
121 .align  16
122 $func:
123 ___
124                                                 if ($avx) {     # SIMD paths available: a probe call returns 1, real calls are dispatched
125 $code.=<<___;
126         lea     OPENSSL_ia32cap_P(%rip),%r11
127         mov     \$1,%eax
128         cmp     \$0,`$win64?"%rcx":"%rdi"`
129         je      .Lprobe
130         mov     0(%r11),%eax
131         mov     4(%r11),%r10
132 ___
133 $code.=<<___ if ($shaext);      # branch to the SHA-extension variant
134         bt      \$61,%r10                       # check for SHA
135         jc      ${func}_shaext
136 ___
137 $code.=<<___;
138         mov     %r10,%r11
139         shr     \$32,%r11
140
141         test    \$`1<<11`,%r10d                 # check for XOP
142         jnz     ${func}_xop
143 ___
144 $code.=<<___ if ($avx>1);       # the AVX2 branch is only emitted for $avx level 2
145         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
146         cmp     \$`1<<8|1<<5|1<<3`,%r11d
147         je      ${func}_avx2
148 ___
149 $code.=<<___;
150         and     \$`1<<28`,%r10d                 # check for AVX
151         jnz     ${func}_avx
152         ud2
153 ___
154                                                 }
155 $code.=<<___;   # without SIMD paths: probe returns 0, real calls trap. Then K256 (every row stored twice), the byte-swap mask and the zero/all-ones rows the prologues load $mask10/12/14 from
156         xor     %eax,%eax
157         cmp     \$0,`$win64?"%rcx":"%rdi"`
158         je      .Lprobe
159         ud2
160 .Lprobe:
161         ret
162 .size   $func,.-$func
163
164 .align  64
165 .type   $TABLE,\@object
166 $TABLE:
167         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
168         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
169         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
170         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
171         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
172         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
173         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
174         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
175         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
176         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
179         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
180         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
181         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
182         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
183         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
184         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
185         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
186         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
187         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
188         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
191         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
192         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
193         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
194         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
197         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
198         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
199
200         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
201         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
202         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
203         .long   0,0,0,0,   0,0,0,0
204         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
205 .align  64
206 ___
207
208 ######################################################################
209 # SIMD code paths
210 #
211 {{{
212 ($iv,$inout,$roundkey,$temp,
213  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));        # %xmm8 .. %xmm15, in this order
214 # Sixteen code fragments that together encrypt one AES-CBC block; body_00_15 splices fragment number $aesni_cbc_idx into each SHA-256 round.
215 $aesni_cbc_idx=0;
216 @aesni_cbc_block = (
217 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
218 ##      &vmovdqu        ($inout,($inp));
219 ##      &mov            ($_inp,$inp);
220 # (the three commented lines above sketch what the caller does before fragment 0) xor in round key 0, fetch round key 1
221         '&vpxor         ($inout,$inout,$roundkey);'.
222         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
223 # CBC chaining: xor in $iv
224         '&vpxor         ($inout,$inout,$iv);',
225 # nine plain rounds, each fetching the next round key ($inp is the key schedule biased by 0x80 in the prologues)
226         '&vaesenc       ($inout,$inout,$roundkey);'.
227         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
228
229         '&vaesenc       ($inout,$inout,$roundkey);'.
230         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
231
232         '&vaesenc       ($inout,$inout,$roundkey);'.
233         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
234
235         '&vaesenc       ($inout,$inout,$roundkey);'.
236         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
237
238         '&vaesenc       ($inout,$inout,$roundkey);'.
239         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
240
241         '&vaesenc       ($inout,$inout,$roundkey);'.
242         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
243
244         '&vaesenc       ($inout,$inout,$roundkey);'.
245         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
246
247         '&vaesenc       ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
249
250         '&vaesenc       ($inout,$inout,$roundkey);'.
251         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
252 # key-length independent tail: finish into $temp as if this were the last round, but also keep encrypting $inout
253         '&vaesenclast   ($temp,$inout,$roundkey);'.
254         ' &vaesenc      ($inout,$inout,$roundkey);'.
255         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
256 # $iv = candidate result masked with $mask10 (the masks are loaded in the prologues, indexed by the key's round count)
257         '&vpand         ($iv,$temp,$mask10);'.
258         ' &vaesenc      ($inout,$inout,$roundkey);'.
259         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
260 # second candidate result, two rounds later
261         '&vaesenclast   ($temp,$inout,$roundkey);'.
262         ' &vaesenc      ($inout,$inout,$roundkey);'.
263         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
264 # mask the second candidate with $mask12
265         '&vpand         ($temp,$temp,$mask12);'.
266         ' &vaesenc      ($inout,$inout,$roundkey);'.
267          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
268 # merge into $iv; third candidate stays in $temp (the caller masks it with $mask14 and ors it in); reload round key 0
269         '&vpor          ($iv,$iv,$temp);'.
270         ' &vaesenclast  ($temp,$inout,$roundkey);'.
271         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
272
273 ##      &mov            ($inp,$_inp);
274 ##      &mov            ($out,$_out);
275 ##      &vpand          ($temp,$temp,$mask14);
276 ##      &vpor           ($iv,$iv,$temp);
277 ##      &vmovdqu        ("($out,$inp)",$iv);
278 ##      &lea            ($inp,"16($inp)");
279 );
280 # $a4 aliases $T1; $a..$h are re-bound from @ROT at the start of every round by body_00_15.
281 my $a4=$T1;
282 my ($a,$b,$c,$d,$e,$f,$g,$h);
283 # Any undefined &mnemonic(dst,...,src) call lands here and appends one instruction line to $code.
284 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
285 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;         # strip the package qualifier, leaving the mnemonic
286   my $arg = pop;                                        # last argument of the call becomes the first operand
287     $arg = "\$$arg" if ($arg*1 eq $arg);                # a plain number is turned into an immediate ($n)
288     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";      # remaining operands follow in reversed order
289 }
290 # Returns the 26 code strings of one SHA-256 round; callers eval them one at a time, interleaved with SIMD code.
291 sub body_00_15 () {
292         (
293         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
294         # the re-binding above is glued to the first instruction so that it runs at the start of every round
295         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
296         '&mov   ($a,$a1)',
297         '&mov   ($a4,$f)',
298
299         '&xor   ($a0,$e)',
300         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
301         '&xor   ($a4,$g)',                      # f^g
302
303         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
304         '&xor   ($a1,$a)',
305         '&and   ($a4,$e)',                      # (f^g)&e
306         # next AES-CBC fragment, glued to the following instruction; the index is reset by the callers every 16 rounds
307         @aesni_cbc_block[$aesni_cbc_idx++].
308         '&xor   ($a0,$e)',
309         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
310         '&mov   ($a2,$a)',
311
312         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
313         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
314         '&xor   ($a2,$b)',                      # a^b, b^c in next round
315
316         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
317         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
318         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
319
320         '&xor   ($a1,$a)',
321         '&add   ($h,$a0)',                      # h+=Sigma1(e)
322         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
323
324         '&add   ($d,$h)',                       # d+=h
325         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
326         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
327         # epilogue: the sum is left in $a1 (the next round moves it into its $a); then swap $a2/$a3, rotate @ROT, advance $i
328         '&mov   ($a0,$d)',
329         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
330         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
331         );
332 }
333
334 if ($avx) {{
335 ######################################################################
336 # XOP code path
337 #
338 $code.=<<___;
339 .type   ${func}_xop,\@function,6
340 .align  64
341 ${func}_xop:
342 .cfi_startproc
343 .Lxop_shortcut:
344         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
345         mov     %rsp,%rax               # copy %rsp
346 .cfi_def_cfa_register   %rax
347         push    %rbx
348 .cfi_push       %rbx
349         push    %rbp
350 .cfi_push       %rbp
351         push    %r12
352 .cfi_push       %r12
353         push    %r13
354 .cfi_push       %r13
355         push    %r14
356 .cfi_push       %r14
357         push    %r15
358 .cfi_push       %r15
359         sub     \$`$framesz+$win64*16*10`,%rsp
360         and     \$-64,%rsp              # align stack frame
361
362         shl     \$6,$len
363         sub     $inp,$out               # re-bias
364         sub     $inp,$in0
365         add     $inp,$len               # end of input
366
367         #mov    $inp,$_inp              # saved later
368         mov     $out,$_out
369         mov     $len,$_end
370         #mov    $key,$_key              # remains resident in $inp register
371         mov     $ivp,$_ivp
372         mov     $ctx,$_ctx
373         mov     $in0,$_in0
374         mov     %rax,$_rsp
375 .cfi_cfa_expression     $_rsp,deref,+8
376 ___
377 $code.=<<___ if ($win64);
378         movaps  %xmm6,`$framesz+16*0`(%rsp)
379         movaps  %xmm7,`$framesz+16*1`(%rsp)
380         movaps  %xmm8,`$framesz+16*2`(%rsp)
381         movaps  %xmm9,`$framesz+16*3`(%rsp)
382         movaps  %xmm10,`$framesz+16*4`(%rsp)
383         movaps  %xmm11,`$framesz+16*5`(%rsp)
384         movaps  %xmm12,`$framesz+16*6`(%rsp)
385         movaps  %xmm13,`$framesz+16*7`(%rsp)
386         movaps  %xmm14,`$framesz+16*8`(%rsp)
387         movaps  %xmm15,`$framesz+16*9`(%rsp)
388 ___
389 $code.=<<___;
390 .Lprologue_xop:
391         vzeroall
392
393         mov     $inp,%r12               # borrow $a4
394         lea     0x80($key),$inp         # size optimization, reassign
395         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
396         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
397         mov     $ctx,%r15               # borrow $a2
398         mov     $in0,%rsi               # borrow $a3
399         vmovdqu ($ivp),$iv              # load IV
400         sub     \$9,%r14
401
402         mov     $SZ*0(%r15),$A
403         mov     $SZ*1(%r15),$B
404         mov     $SZ*2(%r15),$C
405         mov     $SZ*3(%r15),$D
406         mov     $SZ*4(%r15),$E
407         mov     $SZ*5(%r15),$F
408         mov     $SZ*6(%r15),$G
409         mov     $SZ*7(%r15),$H
410
411         vmovdqa 0x00(%r13,%r14,8),$mask14
412         vmovdqa 0x10(%r13,%r14,8),$mask12
413         vmovdqa 0x20(%r13,%r14,8),$mask10
414         vmovdqu 0x00-0x80($inp),$roundkey
415         jmp     .Lloop_xop
416 ___
417                                         if ($SZ==4) {   # SHA256
418     my @X = map("%xmm$_",(0..3));
419     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
420
421 $code.=<<___;
422 .align  16
423 .Lloop_xop:
424         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
425         vmovdqu 0x00(%rsi,%r12),@X[0]
426         vmovdqu 0x10(%rsi,%r12),@X[1]
427         vmovdqu 0x20(%rsi,%r12),@X[2]
428         vmovdqu 0x30(%rsi,%r12),@X[3]
429         vpshufb $t3,@X[0],@X[0]
430         lea     $TABLE(%rip),$Tbl
431         vpshufb $t3,@X[1],@X[1]
432         vpshufb $t3,@X[2],@X[2]
433         vpaddd  0x00($Tbl),@X[0],$t0
434         vpshufb $t3,@X[3],@X[3]
435         vpaddd  0x20($Tbl),@X[1],$t1
436         vpaddd  0x40($Tbl),@X[2],$t2
437         vpaddd  0x60($Tbl),@X[3],$t3
438         vmovdqa $t0,0x00(%rsp)
439         mov     $A,$a1
440         vmovdqa $t1,0x10(%rsp)
441         mov     $B,$a3
442         vmovdqa $t2,0x20(%rsp)
443         xor     $C,$a3                  # magic
444         vmovdqa $t3,0x30(%rsp)
445         mov     $E,$a0
446         jmp     .Lxop_00_47
447
448 .align  16
449 .Lxop_00_47:
450         sub     \$-16*2*$SZ,$Tbl        # size optimization
451         vmovdqu (%r12),$inout           # $a4
452         mov     %r12,$_inp              # $a4
453 ___
454 sub XOP_256_00_47 () { # emit 4 SHA-256 rounds interleaved with the XOP update of four message-schedule words
455 my $j = shift;          # index (0..3) of the 16-byte X+K slot to refresh
456 my $body = shift;       # code ref returning the strings of one scalar round
457 my @X = @_;             # the four schedule registers, current one first
458 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
459 # SIMD instructions are emitted directly; the scalar round strings are eval'ed a few at a time in between.
460         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
461           eval(shift(@insns));
462           eval(shift(@insns));
463          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
464           eval(shift(@insns));
465           eval(shift(@insns));
466         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
467           eval(shift(@insns));
468           eval(shift(@insns));
469         &vpsrld         ($t0,$t0,$sigma0[2]);
470           eval(shift(@insns));
471           eval(shift(@insns));
472          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
473           eval(shift(@insns));
474           eval(shift(@insns));
475           eval(shift(@insns));
476           eval(shift(@insns));
477         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
478           eval(shift(@insns));
479           eval(shift(@insns));
480         &vpxor          ($t0,$t0,$t1);
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
486           eval(shift(@insns));
487           eval(shift(@insns));
488         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
489           eval(shift(@insns));
490           eval(shift(@insns));
491          &vpsrld        ($t2,@X[3],$sigma1[2]);
492           eval(shift(@insns));
493           eval(shift(@insns));
494         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
495           eval(shift(@insns));
496           eval(shift(@insns));
497          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
498           eval(shift(@insns));
499           eval(shift(@insns));
500          &vpxor         ($t3,$t3,$t2);
501           eval(shift(@insns));
502           eval(shift(@insns));
503           eval(shift(@insns));
504           eval(shift(@insns));
505          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
506           eval(shift(@insns));
507           eval(shift(@insns));
508           eval(shift(@insns));
509           eval(shift(@insns));
510         &vpsrldq        ($t3,$t3,8);
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514           eval(shift(@insns));
515         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519           eval(shift(@insns));
520          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
521           eval(shift(@insns));
522           eval(shift(@insns));
523          &vpsrld        ($t2,@X[0],$sigma1[2]);
524           eval(shift(@insns));
525           eval(shift(@insns));
526          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
527           eval(shift(@insns));
528           eval(shift(@insns));
529          &vpxor         ($t3,$t3,$t2);
530           eval(shift(@insns));
531           eval(shift(@insns));
532           eval(shift(@insns));
533           eval(shift(@insns));
534          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
535           eval(shift(@insns));
536           eval(shift(@insns));
537           eval(shift(@insns));
538           eval(shift(@insns));
539         &vpslldq        ($t3,$t3,8);            # 22 instructions
540           eval(shift(@insns));
541           eval(shift(@insns));
542           eval(shift(@insns));
543           eval(shift(@insns));
544         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
545           eval(shift(@insns));
546           eval(shift(@insns));
547           eval(shift(@insns));
548           eval(shift(@insns));
549         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");   # add the round constants found at $Tbl
550           foreach (@insns) { eval; }            # remaining instructions
551         &vmovdqa        (16*$j."(%rsp)",$t2);   # park X+K in the stack slot the scalar rounds read from
552 }
553
554     $aesni_cbc_idx=0;
555     for ($i=0,$j=0; $j<4; $j++) {
556         &XOP_256_00_47($j,\&body_00_15,@X);
557         push(@X,shift(@X));                     # rotate(@X)
558     }
559         &mov            ("%r12",$_inp);         # borrow $a4
560         &vpand          ($temp,$temp,$mask14);
561         &mov            ("%r15",$_out);         # borrow $a2
562         &vpor           ($iv,$iv,$temp);
563         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
564         &lea            ("%r12","16(%r12)");    # inp++
565
566         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
567         &jne    (".Lxop_00_47");
568
569         &vmovdqu        ($inout,"(%r12)");
570         &mov            ($_inp,"%r12");
571
572     $aesni_cbc_idx=0;
573     for ($i=0; $i<16; ) {
574         foreach(body_00_15()) { eval; }
575     }
576                                         }
577 $code.=<<___;
578         mov     $_inp,%r12              # borrow $a4
579         mov     $_out,%r13              # borrow $a0
580         mov     $_ctx,%r15              # borrow $a2
581         mov     $_in0,%rsi              # borrow $a3
582
583         vpand   $mask14,$temp,$temp
584         mov     $a1,$A
585         vpor    $temp,$iv,$iv
586         vmovdqu $iv,(%r13,%r12)         # write output
587         lea     16(%r12),%r12           # inp++
588
589         add     $SZ*0(%r15),$A
590         add     $SZ*1(%r15),$B
591         add     $SZ*2(%r15),$C
592         add     $SZ*3(%r15),$D
593         add     $SZ*4(%r15),$E
594         add     $SZ*5(%r15),$F
595         add     $SZ*6(%r15),$G
596         add     $SZ*7(%r15),$H
597
598         cmp     $_end,%r12
599
600         mov     $A,$SZ*0(%r15)
601         mov     $B,$SZ*1(%r15)
602         mov     $C,$SZ*2(%r15)
603         mov     $D,$SZ*3(%r15)
604         mov     $E,$SZ*4(%r15)
605         mov     $F,$SZ*5(%r15)
606         mov     $G,$SZ*6(%r15)
607         mov     $H,$SZ*7(%r15)
608
609         jb      .Lloop_xop
610
611         mov     $_ivp,$ivp
612         mov     $_rsp,%rsi
613 .cfi_def_cfa    %rsi,8
614         vmovdqu $iv,($ivp)              # output IV
615         vzeroall
616 ___
617 $code.=<<___ if ($win64);
618         movaps  `$framesz+16*0`(%rsp),%xmm6
619         movaps  `$framesz+16*1`(%rsp),%xmm7
620         movaps  `$framesz+16*2`(%rsp),%xmm8
621         movaps  `$framesz+16*3`(%rsp),%xmm9
622         movaps  `$framesz+16*4`(%rsp),%xmm10
623         movaps  `$framesz+16*5`(%rsp),%xmm11
624         movaps  `$framesz+16*6`(%rsp),%xmm12
625         movaps  `$framesz+16*7`(%rsp),%xmm13
626         movaps  `$framesz+16*8`(%rsp),%xmm14
627         movaps  `$framesz+16*9`(%rsp),%xmm15
628 ___
629 $code.=<<___;
630         mov     -48(%rsi),%r15
631 .cfi_restore    %r15
632         mov     -40(%rsi),%r14
633 .cfi_restore    %r14
634         mov     -32(%rsi),%r13
635 .cfi_restore    %r13
636         mov     -24(%rsi),%r12
637 .cfi_restore    %r12
638         mov     -16(%rsi),%rbp
639 .cfi_restore    %rbp
640         mov     -8(%rsi),%rbx
641 .cfi_restore    %rbx
642         lea     (%rsi),%rsp
643 .cfi_def_cfa_register   %rsp
644 .Lepilogue_xop:
645         ret
646 .cfi_endproc
647 .size   ${func}_xop,.-${func}_xop
648 ___
649 ######################################################################
650 # AVX+shrd code path
651 # From here on &ror(reg,n) is emitted as `shrd $n,reg,reg` (a double shift with identical operands, i.e. a rotate right by n).
652 local *ror = sub { &shrd(@_[0],@_) };
653
654 $code.=<<___;
655 .type   ${func}_avx,\@function,6
656 .align  64
657 ${func}_avx:
658 .cfi_startproc
659 .Lavx_shortcut:
660         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
661         mov     %rsp,%rax               # copy %rsp
662 .cfi_def_cfa_register   %rax
663         push    %rbx
664 .cfi_push       %rbx
665         push    %rbp
666 .cfi_push       %rbp
667         push    %r12
668 .cfi_push       %r12
669         push    %r13
670 .cfi_push       %r13
671         push    %r14
672 .cfi_push       %r14
673         push    %r15
674 .cfi_push       %r15
675         sub     \$`$framesz+$win64*16*10`,%rsp
676         and     \$-64,%rsp              # align stack frame
677
678         shl     \$6,$len
679         sub     $inp,$out               # re-bias
680         sub     $inp,$in0
681         add     $inp,$len               # end of input
682
683         #mov    $inp,$_inp              # saved later
684         mov     $out,$_out
685         mov     $len,$_end
686         #mov    $key,$_key              # remains resident in $inp register
687         mov     $ivp,$_ivp
688         mov     $ctx,$_ctx
689         mov     $in0,$_in0
690         mov     %rax,$_rsp
691 .cfi_cfa_expression     $_rsp,deref,+8
692 ___
693 $code.=<<___ if ($win64);
694         movaps  %xmm6,`$framesz+16*0`(%rsp)
695         movaps  %xmm7,`$framesz+16*1`(%rsp)
696         movaps  %xmm8,`$framesz+16*2`(%rsp)
697         movaps  %xmm9,`$framesz+16*3`(%rsp)
698         movaps  %xmm10,`$framesz+16*4`(%rsp)
699         movaps  %xmm11,`$framesz+16*5`(%rsp)
700         movaps  %xmm12,`$framesz+16*6`(%rsp)
701         movaps  %xmm13,`$framesz+16*7`(%rsp)
702         movaps  %xmm14,`$framesz+16*8`(%rsp)
703         movaps  %xmm15,`$framesz+16*9`(%rsp)
704 ___
705 $code.=<<___;
706 .Lprologue_avx:
707         vzeroall
708
709         mov     $inp,%r12               # borrow $a4
710         lea     0x80($key),$inp         # size optimization, reassign
711         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
712         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
713         mov     $ctx,%r15               # borrow $a2
714         mov     $in0,%rsi               # borrow $a3
715         vmovdqu ($ivp),$iv              # load IV
716         sub     \$9,%r14
717
718         mov     $SZ*0(%r15),$A
719         mov     $SZ*1(%r15),$B
720         mov     $SZ*2(%r15),$C
721         mov     $SZ*3(%r15),$D
722         mov     $SZ*4(%r15),$E
723         mov     $SZ*5(%r15),$F
724         mov     $SZ*6(%r15),$G
725         mov     $SZ*7(%r15),$H
726
727         vmovdqa 0x00(%r13,%r14,8),$mask14
728         vmovdqa 0x10(%r13,%r14,8),$mask12
729         vmovdqa 0x20(%r13,%r14,8),$mask10
730         vmovdqu 0x00-0x80($inp),$roundkey
731 ___
732                                         if ($SZ==4) {   # SHA256
733     my @X = map("%xmm$_",(0..3));
734     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
735
736 $code.=<<___;
737         jmp     .Lloop_avx
738 .align  16
739 .Lloop_avx:
740         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
741         vmovdqu 0x00(%rsi,%r12),@X[0]
742         vmovdqu 0x10(%rsi,%r12),@X[1]
743         vmovdqu 0x20(%rsi,%r12),@X[2]
744         vmovdqu 0x30(%rsi,%r12),@X[3]
745         vpshufb $t3,@X[0],@X[0]
746         lea     $TABLE(%rip),$Tbl
747         vpshufb $t3,@X[1],@X[1]
748         vpshufb $t3,@X[2],@X[2]
749         vpaddd  0x00($Tbl),@X[0],$t0
750         vpshufb $t3,@X[3],@X[3]
751         vpaddd  0x20($Tbl),@X[1],$t1
752         vpaddd  0x40($Tbl),@X[2],$t2
753         vpaddd  0x60($Tbl),@X[3],$t3
754         vmovdqa $t0,0x00(%rsp)
755         mov     $A,$a1
756         vmovdqa $t1,0x10(%rsp)
757         mov     $B,$a3
758         vmovdqa $t2,0x20(%rsp)
759         xor     $C,$a3                  # magic
760         vmovdqa $t3,0x30(%rsp)
761         mov     $E,$a0
762         jmp     .Lavx_00_47
763
764 .align  16
765 .Lavx_00_47:
766         sub     \$-16*2*$SZ,$Tbl        # size optimization
767         vmovdqu (%r12),$inout           # $a4
768         mov     %r12,$_inp              # $a4
769 ___
770 sub Xupdate_256_AVX () {        # returns 31 code strings that update four schedule words; $t0..$t3 and @X are resolved when the caller evals them
771         (
772         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
773          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
774         '&vpsrld        ($t2,$t0,$sigma0[0]);', # rotates are built from shift pairs plus xor here
775          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
776         '&vpsrld        ($t3,$t0,$sigma0[2])',
777         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
778         '&vpxor         ($t0,$t3,$t2)',
779          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
780         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
781         '&vpxor         ($t0,$t0,$t1)',
782         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
783         '&vpxor         ($t0,$t0,$t2)',
784          '&vpsrld       ($t2,$t3,$sigma1[2]);',
785         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
786          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
787         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
788          '&vpxor        ($t2,$t2,$t3);',
789          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
790          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
791          '&vpshufd      ($t2,$t2,0b10000100)',
792          '&vpsrldq      ($t2,$t2,8)',
793         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
794          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
795          '&vpsrld       ($t2,$t3,$sigma1[2])',
796          '&vpsrlq       ($t3,$t3,$sigma1[0])',
797          '&vpxor        ($t2,$t2,$t3);',
798          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
799          '&vpxor        ($t2,$t2,$t3)',
800          '&vpshufd      ($t2,$t2,0b11101000)',
801          '&vpslldq      ($t2,$t2,8)',
802         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
803         );
804 }
805 # Emit 4 SHA-256 rounds interleaved with the AVX schedule update: three scalar round strings after every SIMD instruction.
806 sub AVX_256_00_47 () {
807 my $j = shift;          # index (0..3) of the 16-byte X+K slot to refresh
808 my $body = shift;       # code ref returning the strings of one scalar round
809 my @X = @_;             # the four schedule registers, current one first; the eval'ed strings see this copy
810 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
811
812         foreach (Xupdate_256_AVX()) {           # 31 instructions
813             eval;
814             eval(shift(@insns));
815             eval(shift(@insns));
816             eval(shift(@insns));
817         }
818         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");   # add the round constants found at $Tbl
819           foreach (@insns) { eval; }            # remaining instructions
820         &vmovdqa        (16*$j."(%rsp)",$t2);   # park X+K in the stack slot the scalar rounds read from
821 }
822
# Generate the stitched AVX rounds. $aesni_cbc_idx is reset before each
# group of rounds; @X is rotated after every AVX_256_00_47 step.
823     $aesni_cbc_idx=0;
824     for ($i=0,$j=0; $j<4; $j++) {
825         &AVX_256_00_47($j,\&body_00_15,@X);
826         push(@X,shift(@X));                     # rotate(@X)
827     }
# Merge $temp (masked with $mask14) into $iv, write the result as the
# output block and step %r12 to the next 16 bytes.
828         &mov            ("%r12",$_inp);         # borrow $a4
829         &vpand          ($temp,$temp,$mask14);
830         &mov            ("%r15",$_out);         # borrow $a2
831         &vpor           ($iv,$iv,$temp);
832         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
833         &lea            ("%r12","16(%r12)");    # inp++
834
# Branch to .Lavx_00_47 while the byte at $SZ-1+16*2*$SZ($Tbl) is non-zero.
835         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
836         &jne    (".Lavx_00_47");
837
838         &vmovdqu        ($inout,"(%r12)");
839         &mov            ($_inp,"%r12");
840
# Rounds emitted without a message-schedule update: body_00_15 snippets are
# eval'ed until $i reaches 16. The loop header does not advance $i;
# presumably the eval'ed snippets do (bodyx_00_15's last snippet does) --
# confirm against body_00_15.
841     $aesni_cbc_idx=0;
842     for ($i=0; $i<16; ) {
843         foreach(body_00_15()) { eval; }
844     }
845
846                                         }
# Tail of the AVX per-block loop. The emitted code reloads $_inp, $_out,
# $_ctx and $_in0, writes the pending output block, adds the eight words at
# the $_ctx pointer into $A..$H and stores the sums back, and branches to
# .Lloop_avx while %r12 is below $_end. Past the loop it writes $iv through
# the $_ivp pointer and executes vzeroall.
847 $code.=<<___;
848         mov     $_inp,%r12              # borrow $a4
849         mov     $_out,%r13              # borrow $a0
850         mov     $_ctx,%r15              # borrow $a2
851         mov     $_in0,%rsi              # borrow $a3
852
853         vpand   $mask14,$temp,$temp
854         mov     $a1,$A
855         vpor    $temp,$iv,$iv
856         vmovdqu $iv,(%r13,%r12)         # write output
857         lea     16(%r12),%r12           # inp++
858
859         add     $SZ*0(%r15),$A
860         add     $SZ*1(%r15),$B
861         add     $SZ*2(%r15),$C
862         add     $SZ*3(%r15),$D
863         add     $SZ*4(%r15),$E
864         add     $SZ*5(%r15),$F
865         add     $SZ*6(%r15),$G
866         add     $SZ*7(%r15),$H
867
868         cmp     $_end,%r12
869
870         mov     $A,$SZ*0(%r15)
871         mov     $B,$SZ*1(%r15)
872         mov     $C,$SZ*2(%r15)
873         mov     $D,$SZ*3(%r15)
874         mov     $E,$SZ*4(%r15)
875         mov     $F,$SZ*5(%r15)
876         mov     $G,$SZ*6(%r15)
877         mov     $H,$SZ*7(%r15)
878         jb      .Lloop_avx
879
880         mov     $_ivp,$ivp
881         mov     $_rsp,%rsi
882 .cfi_def_cfa    %rsi,8
883         vmovdqu $iv,($ivp)              # output IV
884         vzeroall
885 ___
# Epilogue of ${func}_avx. On Win64 %xmm6-%xmm15 are reloaded from the frame
# at $framesz(%rsp); then %r15, %r14, %r13, %r12, %rbp and %rbx are reloaded
# from the six slots below the address held in %rsi, %rsp is set to that
# address and the function returns.
886 $code.=<<___ if ($win64);
887         movaps  `$framesz+16*0`(%rsp),%xmm6
888         movaps  `$framesz+16*1`(%rsp),%xmm7
889         movaps  `$framesz+16*2`(%rsp),%xmm8
890         movaps  `$framesz+16*3`(%rsp),%xmm9
891         movaps  `$framesz+16*4`(%rsp),%xmm10
892         movaps  `$framesz+16*5`(%rsp),%xmm11
893         movaps  `$framesz+16*6`(%rsp),%xmm12
894         movaps  `$framesz+16*7`(%rsp),%xmm13
895         movaps  `$framesz+16*8`(%rsp),%xmm14
896         movaps  `$framesz+16*9`(%rsp),%xmm15
897 ___
898 $code.=<<___;
899         mov     -48(%rsi),%r15
900 .cfi_restore    %r15
901         mov     -40(%rsi),%r14
902 .cfi_restore    %r14
903         mov     -32(%rsi),%r13
904 .cfi_restore    %r13
905         mov     -24(%rsi),%r12
906 .cfi_restore    %r12
907         mov     -16(%rsi),%rbp
908 .cfi_restore    %rbp
909         mov     -8(%rsi),%rbx
910 .cfi_restore    %rbx
911         lea     (%rsi),%rsp
912 .cfi_def_cfa_register   %rsp
913 .Lepilogue_avx:
914         ret
915 .cfi_endproc
916 .size   ${func}_avx,.-${func}_avx
917 ___
918
919 if ($avx>1) {{
920 ######################################################################
921 # AVX2+BMI code path
922 #
923 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
# $PUSH8 is the byte step by which the AVX2 code moves %rsp/$Tbl, and the
# modulus applied to the X[i]+K[i] offsets in bodyx_00_15 below.
924 my $PUSH8=8*2*$SZ;
925 use integer;
926
# bodyx_00_15()
#
# Returns the list of perl snippets that make up one round; callers eval
# them in order. Pieces joined with '.' form a single snippet, so the @ROT
# unpacking is eval'ed together with the first &add, and the &lea that adds
# Maj(a,b,c) together with the following &mov.
#
# The quoted snippets resolve $i, $base, @ROT and the register variables
# when they are eval'ed, so those names must be visible at the eval site.
# @aesni_cbc_block[$aesni_cbc_idx++] stands outside the quotes: it is
# evaluated when the list is built, so every call consumes one entry and
# prepends it to the '&xor ($a1,$a4)' snippet.
#
# The last snippet swaps $a2/$a3, rotates @ROT and increments $i.
927 sub bodyx_00_15 () {
928         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
929         (
930         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
931
932         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
933         '&and   ($a4,$e)',              # f&e
934         '&rorx  ($a0,$e,$Sigma1[2])',
935         '&rorx  ($a2,$e,$Sigma1[1])',
936
937         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
938         '&lea   ($h,"($h,$a4)")',
939         '&andn  ($a4,$e,$g)',           # ~e&g
940         '&xor   ($a0,$a2)',
941
942         '&rorx  ($a1,$e,$Sigma1[0])',
943         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
944         '&xor   ($a0,$a1)',             # Sigma1(e)
945         '&mov   ($a2,$a)',
946
947         '&rorx  ($a4,$a,$Sigma0[2])',
948         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
949         '&xor   ($a2,$b)',              # a^b, b^c in next round
950         '&rorx  ($a1,$a,$Sigma0[1])',
951
952         '&rorx  ($a0,$a,$Sigma0[0])',
953         '&lea   ($d,"($d,$h)")',        # d+=h
954         '&and   ($a3,$a2)',             # (b^c)&(a^b)
955         @aesni_cbc_block[$aesni_cbc_idx++].
956         '&xor   ($a1,$a4)',
957
958         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
959         '&xor   ($a1,$a0)',             # Sigma0(a)
960         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
961         '&mov   ($a4,$e)',              # copy of f in future
962
963         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
964         );
965         # and at the finish one has to $a+=$a1
966 }
967
# Entry of ${func}_avx2. The emitted code loads the 7th parameter into $in0,
# keeps a copy of the entry %rsp in %rax, pushes %rbx, %rbp and %r12-%r15,
# reserves and aligns the stack frame, scales $len by 64 and adds $inp to
# get the end-of-input address, re-biases $out and $in0 against $inp, and
# stores $_end, $_ivp, $_ctx, $_in0 and the saved %rsp copy ($_rsp).
968 $code.=<<___;
969 .type   ${func}_avx2,\@function,6
970 .align  64
971 ${func}_avx2:
972 .cfi_startproc
973 .Lavx2_shortcut:
974         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
975         mov     %rsp,%rax               # copy %rsp
976 .cfi_def_cfa_register   %rax
977         push    %rbx
978 .cfi_push       %rbx
979         push    %rbp
980 .cfi_push       %rbp
981         push    %r12
982 .cfi_push       %r12
983         push    %r13
984 .cfi_push       %r13
985         push    %r14
986 .cfi_push       %r14
987         push    %r15
988 .cfi_push       %r15
989         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
990         and     \$-256*$SZ,%rsp         # align stack frame
991         add     \$`2*$SZ*($rounds-8)`,%rsp
992
993         shl     \$6,$len
994         sub     $inp,$out               # re-bias
995         sub     $inp,$in0
996         add     $inp,$len               # end of input
997
998         #mov    $inp,$_inp              # saved later
999         #mov    $out,$_out              # kept in $offload
1000         mov     $len,$_end
1001         #mov    $key,$_key              # remains resident in $inp register
1002         mov     $ivp,$_ivp
1003         mov     $ctx,$_ctx
1004         mov     $in0,$_in0
1005         mov     %rax,$_rsp
1006 .cfi_cfa_expression     $_rsp,deref,+8
1007 ___
# Remainder of the AVX2 prologue. On Win64 %xmm6-%xmm15 are saved in the
# frame at $framesz(%rsp). After .Lprologue_avx2 the emitted code inserts
# $out into the upper quadword of $offload, points $inp at $key+0x80, reads
# the round count from 0xf0-0x80($inp), loads the IV and the three masks
# selected by rounds-9, loads the eight state words from the context into
# $A..$H and loads $roundkey from 0x00-0x80($inp). %r12 is set to the next
# input block, or to %rsp when %r13 equals $len (see the cmove comment).
1008 $code.=<<___ if ($win64);
1009         movaps  %xmm6,`$framesz+16*0`(%rsp)
1010         movaps  %xmm7,`$framesz+16*1`(%rsp)
1011         movaps  %xmm8,`$framesz+16*2`(%rsp)
1012         movaps  %xmm9,`$framesz+16*3`(%rsp)
1013         movaps  %xmm10,`$framesz+16*4`(%rsp)
1014         movaps  %xmm11,`$framesz+16*5`(%rsp)
1015         movaps  %xmm12,`$framesz+16*6`(%rsp)
1016         movaps  %xmm13,`$framesz+16*7`(%rsp)
1017         movaps  %xmm14,`$framesz+16*8`(%rsp)
1018         movaps  %xmm15,`$framesz+16*9`(%rsp)
1019 ___
1020 $code.=<<___;
1021 .Lprologue_avx2:
1022         vzeroall
1023
1024         mov     $inp,%r13               # borrow $a0
1025         vpinsrq \$1,$out,$offload,$offload
1026         lea     0x80($key),$inp         # size optimization, reassign
1027         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1028         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1029         mov     $ctx,%r15               # borrow $a2
1030         mov     $in0,%rsi               # borrow $a3
1031         vmovdqu ($ivp),$iv              # load IV
1032         lea     -9(%r14),%r14
1033
1034         vmovdqa 0x00(%r12,%r14,8),$mask14
1035         vmovdqa 0x10(%r12,%r14,8),$mask12
1036         vmovdqa 0x20(%r12,%r14,8),$mask10
1037
1038         sub     \$-16*$SZ,%r13          # inp++, size optimization
1039         mov     $SZ*0(%r15),$A
1040         lea     (%rsi,%r13),%r12        # borrow $a0
1041         mov     $SZ*1(%r15),$B
1042         cmp     $len,%r13               # $_end
1043         mov     $SZ*2(%r15),$C
1044         cmove   %rsp,%r12               # next block or random data
1045         mov     $SZ*3(%r15),$D
1046         mov     $SZ*4(%r15),$E
1047         mov     $SZ*5(%r15),$F
1048         mov     $SZ*6(%r15),$G
1049         mov     $SZ*7(%r15),$H
1050         vmovdqu 0x00-0x80($inp),$roundkey
1051 ___
1052                                         if ($SZ==4) {   # SHA256
# Register assignment for the AVX2 SHA256 path (@X is %ymm0-%ymm3, $t0..$t3
# are %ymm4-%ymm7), followed by the head of .Loop_avx2. The emitted code
# loads 64 input bytes into %xmm0-%xmm3, inserts 64 bytes from (%r12) into
# the upper lanes, shuffles all four vectors with the mask loaded from
# $TABLE+$SZ*2*$rounds, adds the first four 0x20-byte table rows and stores
# the sums on the stack (lowering %rsp by $PUSH8 in between), primes $a1,
# $a3 and $a4, and enters .Lavx2_00_47.
1053     my @X = map("%ymm$_",(0..3));
1054     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1055
1056 $code.=<<___;
1057         jmp     .Loop_avx2
1058 .align  16
1059 .Loop_avx2:
1060         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1061         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1062         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1063         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1064         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1065
1066         vinserti128     \$1,(%r12),@X[0],@X[0]
1067         vinserti128     \$1,16(%r12),@X[1],@X[1]
1068          vpshufb        $t3,@X[0],@X[0]
1069         vinserti128     \$1,32(%r12),@X[2],@X[2]
1070          vpshufb        $t3,@X[1],@X[1]
1071         vinserti128     \$1,48(%r12),@X[3],@X[3]
1072
1073         lea     $TABLE(%rip),$Tbl
1074         vpshufb $t3,@X[2],@X[2]
1075         lea     -16*$SZ(%r13),%r13
1076         vpaddd  0x00($Tbl),@X[0],$t0
1077         vpshufb $t3,@X[3],@X[3]
1078         vpaddd  0x20($Tbl),@X[1],$t1
1079         vpaddd  0x40($Tbl),@X[2],$t2
1080         vpaddd  0x60($Tbl),@X[3],$t3
1081         vmovdqa $t0,0x00(%rsp)
1082         xor     $a1,$a1
1083         vmovdqa $t1,0x20(%rsp)
1084         lea     -$PUSH8(%rsp),%rsp
1085         mov     $B,$a3
1086         vmovdqa $t2,0x00(%rsp)
1087         xor     $C,$a3                  # magic
1088         vmovdqa $t3,0x20(%rsp)
1089         mov     $F,$a4
1090         sub     \$-16*2*$SZ,$Tbl        # size optimization
1091         jmp     .Lavx2_00_47
1092
1093 .align  16
1094 .Lavx2_00_47:
1095         vmovdqu (%r13),$inout
1096         vpinsrq \$0,%r13,$offload,$offload
1097 ___
1098
# AVX2_256_00_47($j, $body, @X)
#
# AVX2 variant of the interleaving helper. For even $j it first lowers %rsp
# by $PUSH8; it then eval's every Xupdate_256_AVX() snippet, each followed
# by the next three snippets from four back-to-back calls of $body, adds
# the table row at 16*2*$j($Tbl) to @X[0] into $t2, eval's the left-over
# $body snippets and stores $t2 at (32*$j)%$PUSH8(%rsp).
#
#   $j    - step number
#   $body - code reference returning a list of snippet strings
#   @X    - remaining arguments; the lexical must keep the name @X because
#           the eval'ed Xupdate snippets refer to it
#
# The lexical $base is not used directly here: the eval'ed bodyx_00_15
# snippets read it as the suffix of their X[i]+K[i] operand.
#
# NOTE(review): the empty prototype does not match the argument list; it is
# harmless only while callers use the &name(...) form.
1099 sub AVX2_256_00_47 () {
1100 my $j = shift;
1101 my $body = shift;
1102 my @X = @_;
1103 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1104 my $base = "+2*$PUSH8(%rsp)";
1105
1106         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1107         foreach (Xupdate_256_AVX()) {           # 29 instructions
1108             eval;
1109             eval(shift(@insns));
1110             eval(shift(@insns));
1111             eval(shift(@insns));
1112         }
1113         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1114           foreach (@insns) { eval; }            # remaining instructions
1115         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1116 }
# Generate the stitched AVX2 rounds: four AVX2_256_00_47 steps, rotating @X
# after each one. $aesni_cbc_idx is reset before each group of rounds.
1117     $aesni_cbc_idx=0;
1118     for ($i=0,$j=0; $j<4; $j++) {
1119         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1120         push(@X,shift(@X));                     # rotate(@X)
1121     }
# Recover the two quadwords kept in $offload, merge $temp (masked with
# $mask14) into $iv, write the output block and step %r13 by 16.
1122         &vmovq          ("%r13",$offload);      # borrow $a0
1123         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1124         &vpand          ($temp,$temp,$mask14);
1125         &vpor           ($iv,$iv,$temp);
1126         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1127         &lea            ("%r13","16(%r13)");    # inp++
1128
# Advance $Tbl by 16*2*$SZ and branch to .Lavx2_00_47 while the byte at
# $SZ-1($Tbl) is non-zero.
1129         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1130         &cmpb   (($SZ-1)."($Tbl)",0);
1131         &jne    (".Lavx2_00_47");
1132
1133         &vmovdqu        ($inout,"(%r13)");
1134         &vpinsrq        ($offload,$offload,"%r13",0);
1135
# Rounds without a message-schedule update: bodyx_00_15 snippets are eval'ed
# until $i reaches 16 ($i is incremented by the last snippet of each round).
# $base selects the $PUSH8(%rsp) area for $i<8 and (%rsp) afterwards.
1136     $aesni_cbc_idx=0;
1137     for ($i=0; $i<16; ) {
1138         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1139         foreach(bodyx_00_15()) { eval; }
1140     }
1141                                         }
# End of the first block's rounds in the AVX2 path. The emitted code
# recovers the two quadwords kept in $offload and the $_ctx pointer, adds
# $a1 into $A, points $Tbl into the stack frame, writes the output block,
# adds the eight context words into $A..$H and stores the sums back. It
# then compares %r13 with the $_end slot and jumps to .Ldone_avx2 when they
# are equal; otherwise it re-primes $a1, $a3 and $a4 and enters
# .Lower_avx2.
1142 $code.=<<___;
1143         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1144         vmovq   $offload,%r13                   # $_inp, borrow $a0
1145         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1146         add     $a1,$A
1147         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1148
1149         vpand   $mask14,$temp,$temp
1150         vpor    $temp,$iv,$iv
1151         vmovdqu $iv,(%r12,%r13)                 # write output
1152         lea     16(%r13),%r13
1153
1154         add     $SZ*0(%r15),$A
1155         add     $SZ*1(%r15),$B
1156         add     $SZ*2(%r15),$C
1157         add     $SZ*3(%r15),$D
1158         add     $SZ*4(%r15),$E
1159         add     $SZ*5(%r15),$F
1160         add     $SZ*6(%r15),$G
1161         add     $SZ*7(%r15),$H
1162
1163         mov     $A,$SZ*0(%r15)
1164         mov     $B,$SZ*1(%r15)
1165         mov     $C,$SZ*2(%r15)
1166         mov     $D,$SZ*3(%r15)
1167         mov     $E,$SZ*4(%r15)
1168         mov     $F,$SZ*5(%r15)
1169         mov     $G,$SZ*6(%r15)
1170         mov     $H,$SZ*7(%r15)
1171
1172         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1173         je      .Ldone_avx2
1174
1175         xor     $a1,$a1
1176         mov     $B,$a3
1177         mov     $F,$a4
1178         xor     $C,$a3                  # magic
1179         jmp     .Lower_avx2
1180 .align  16
1181 .Lower_avx2:
1182         vmovdqu (%r13),$inout
1183         vpinsrq \$0,%r13,$offload,$offload
1184 ___
# Body of the .Lower_avx2 pass: 16 rounds of bodyx_00_15 snippets whose
# X[i]+K[i] operands are addressed relative to 16($Tbl). $Tbl is lowered by
# $PUSH8 once $i has reached 8; $i itself is incremented by the last
# snippet of every round.
1185     $aesni_cbc_idx=0;
1186     for ($i=0; $i<16; ) {
1187         my $base="+16($Tbl)";
1188         foreach(bodyx_00_15()) { eval; }
1189         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1190     }
# End of the .Lower_avx2 pass and epilogue of ${func}_avx2. The emitted code
# writes the output block, lowers $Tbl by $PUSH8 and repeats .Lower_avx2
# while $Tbl is at or above %rsp. It then reloads $_ctx and $_in0, adds $a1
# into $A, resets %rsp, folds the context words into $A..$H and stores them
# back, selects the next input block (or %rsp, see the cmove comment) and
# branches to .Loop_avx2 while %r13 is at or below $_end.
# From .Ldone_avx2 on: %rsp is set from $Tbl, $iv is written through the
# $_ivp pointer, vzeroall is executed, Win64 builds reload %xmm6-%xmm15,
# and the six saved general-purpose registers and %rsp are restored from
# the address loaded from $_rsp before returning.
1191 $code.=<<___;
1192         vmovq   $offload,%r13                   # borrow $a0
1193         vpextrq \$1,$offload,%r15               # borrow $a2
1194         vpand   $mask14,$temp,$temp
1195         vpor    $temp,$iv,$iv
1196         lea     -$PUSH8($Tbl),$Tbl
1197         vmovdqu $iv,(%r15,%r13)                 # write output
1198         lea     16(%r13),%r13                   # inp++
1199         cmp     %rsp,$Tbl
1200         jae     .Lower_avx2
1201
1202         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1203         lea     16*$SZ(%r13),%r13
1204         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1205         add     $a1,$A
1206         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1207
1208         add     $SZ*0(%r15),$A
1209         add     $SZ*1(%r15),$B
1210         add     $SZ*2(%r15),$C
1211         add     $SZ*3(%r15),$D
1212         add     $SZ*4(%r15),$E
1213         add     $SZ*5(%r15),$F
1214         add     $SZ*6(%r15),$G
1215         lea     (%rsi,%r13),%r12
1216         add     $SZ*7(%r15),$H
1217
1218         cmp     $_end,%r13
1219
1220         mov     $A,$SZ*0(%r15)
1221         cmove   %rsp,%r12               # next block or stale data
1222         mov     $B,$SZ*1(%r15)
1223         mov     $C,$SZ*2(%r15)
1224         mov     $D,$SZ*3(%r15)
1225         mov     $E,$SZ*4(%r15)
1226         mov     $F,$SZ*5(%r15)
1227         mov     $G,$SZ*6(%r15)
1228         mov     $H,$SZ*7(%r15)
1229
1230         jbe     .Loop_avx2
1231         lea     (%rsp),$Tbl
1232
1233 .Ldone_avx2:
1234         lea     ($Tbl),%rsp
1235         mov     $_ivp,$ivp
1236         mov     $_rsp,%rsi
1237 .cfi_def_cfa    %rsi,8
1238         vmovdqu $iv,($ivp)              # output IV
1239         vzeroall
1240 ___
1241 $code.=<<___ if ($win64);
1242         movaps  `$framesz+16*0`(%rsp),%xmm6
1243         movaps  `$framesz+16*1`(%rsp),%xmm7
1244         movaps  `$framesz+16*2`(%rsp),%xmm8
1245         movaps  `$framesz+16*3`(%rsp),%xmm9
1246         movaps  `$framesz+16*4`(%rsp),%xmm10
1247         movaps  `$framesz+16*5`(%rsp),%xmm11
1248         movaps  `$framesz+16*6`(%rsp),%xmm12
1249         movaps  `$framesz+16*7`(%rsp),%xmm13
1250         movaps  `$framesz+16*8`(%rsp),%xmm14
1251         movaps  `$framesz+16*9`(%rsp),%xmm15
1252 ___
1253 $code.=<<___;
1254         mov     -48(%rsi),%r15
1255 .cfi_restore    %r15
1256         mov     -40(%rsi),%r14
1257 .cfi_restore    %r14
1258         mov     -32(%rsi),%r13
1259 .cfi_restore    %r13
1260         mov     -24(%rsi),%r12
1261 .cfi_restore    %r12
1262         mov     -16(%rsi),%rbp
1263 .cfi_restore    %rbp
1264         mov     -8(%rsi),%rbx
1265 .cfi_restore    %rbx
1266         lea     (%rsi),%rsp
1267 .cfi_def_cfa_register   %rsp
1268 .Lepilogue_avx2:
1269         ret
1270 .cfi_endproc
1271 .size   ${func}_avx2,.-${func}_avx2
1272 ___
1273 }}
1274 }}
1275 {{
# Register assignment for the SHA-extension code path. These lexicals shadow
# any same-named variables of the enclosing scopes for the rest of the block.
1276 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1277
1278 my ($rounds,$Tbl)=("%r11d","%rbx");
1279
1280 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
# @rndkey: the two registers that alternately receive the next AES round key.
1281 my @rndkey=("%xmm4","%xmm5");
# $r counts the $aesenc calls made so far; $sn numbers the .Laesenclast labels.
1282 my $r=0;
1283 my $sn=0;
1284
1285 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1286 my @MSG=map("%xmm$_",(10..13));
1287
# $aesenc->() appends one AES-CBC step to $code and then increments $r.
# The call counter is split into $n = $r/10 (block index) and $k = $r%10:
#   $k == 0    load input block $n and xor it with $rndkey0; when $n is
#              non-zero, first store $iv as output block $n-1; then xor the
#              input into $iv and issue the first aesenc
#   $k == 9    finish the block: depending on $rounds being below, equal to
#              or above 11, run 0, 2 or 4 further aesenc before aesenclast,
#              then preload the key at 16-112($key) for the next block
#   otherwise  load the next round key and issue one aesenc
# After every call the two @rndkey registers swap places, so the key loaded
# by one step is the one consumed by the next.
1288 my $aesenc=sub {
1289   use integer;
1290   my ($n,$k)=($r/10,$r%10);
1291     if ($k==0) {
1292       $code.=<<___;
1293         movups          `16*$n`($in0),$in               # load input
1294         xorps           $rndkey0,$in
1295 ___
1296       $code.=<<___ if ($n);
1297         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1298 ___
1299       $code.=<<___;
1300         xorps           $in,$iv
1301         movups          `32+16*$k-112`($key),$rndkey[1]
1302         aesenc          $rndkey[0],$iv
1303 ___
1304     } elsif ($k==9) {
1305       $sn++;
1306       $code.=<<___;
1307         cmp             \$11,$rounds
1308         jb              .Laesenclast$sn
1309         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1310         aesenc          $rndkey[0],$iv
1311         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1312         aesenc          $rndkey[1],$iv
1313         je              .Laesenclast$sn
1314         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1315         aesenc          $rndkey[0],$iv
1316         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1317         aesenc          $rndkey[1],$iv
1318 .Laesenclast$sn:
1319         aesenclast      $rndkey[0],$iv
1320         movups          16-112($key),$rndkey[1]         # forward reference
1321         nop
1322 ___
1323     } else {
1324       $code.=<<___;
1325         movups          `32+16*$k-112`($key),$rndkey[1]
1326         aesenc          $rndkey[0],$iv
1327 ___
1328     }
1329     $r++;       unshift(@rndkey,pop(@rndkey));
1330 };
1331
1332 if ($shaext) {
# Start of ${func}_shaext. $Tbl is re-declared as %rax for this code path.
# The emitted code loads the 7th argument into $inp, saves %xmm6-%xmm15 on
# Win64, points $Tbl at K256+0x80, loads the hash state, the byte-swap mask
# and the AES round count and first keys, rearranges the state into the
# $ABEF/$CDGH layout, and opens .Loop_shaext by loading four 16-byte message
# vectors and saving $ABEF/$CDGH for the final addition.
# NOTE(review): the Win64 saves are addressed relative to %rax, which this
# block never sets before using it -- presumably it still holds the entry
# stack pointer at that point; confirm against the generated prologue.
1333 my $Tbl="%rax";
1334
1335 $code.=<<___;
1336 .type   ${func}_shaext,\@function,6
1337 .align  32
1338 ${func}_shaext:
1339         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1340 ___
1341 $code.=<<___ if ($win64);
1342         lea     `-8-10*16`(%rsp),%rsp
1343         movaps  %xmm6,-8-10*16(%rax)
1344         movaps  %xmm7,-8-9*16(%rax)
1345         movaps  %xmm8,-8-8*16(%rax)
1346         movaps  %xmm9,-8-7*16(%rax)
1347         movaps  %xmm10,-8-6*16(%rax)
1348         movaps  %xmm11,-8-5*16(%rax)
1349         movaps  %xmm12,-8-4*16(%rax)
1350         movaps  %xmm13,-8-3*16(%rax)
1351         movaps  %xmm14,-8-2*16(%rax)
1352         movaps  %xmm15,-8-1*16(%rax)
1353 .Lprologue_shaext:
1354 ___
1355 $code.=<<___;
1356         lea             K256+0x80(%rip),$Tbl
1357         movdqu          ($ctx),$ABEF            # DCBA
1358         movdqu          16($ctx),$CDGH          # HGFE
1359         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1360
1361         mov             240($key),$rounds
1362         sub             $in0,$out
1363         movups          ($key),$rndkey0         # $key[0]
1364         movups          16($key),$rndkey[0]     # forward reference
1365         lea             112($key),$key          # size optimization
1366
1367         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1368         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1369         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1370         movdqa          $TMP,$BSWAP             # offload
1371         palignr         \$8,$CDGH,$ABEF         # ABEF
1372         punpcklqdq      $Wi,$CDGH               # CDGH
1373
1374         jmp     .Loop_shaext
1375
1376 .align  16
1377 .Loop_shaext:
1378         movdqu          ($inp),@MSG[0]
1379         movdqu          0x10($inp),@MSG[1]
1380         movdqu          0x20($inp),@MSG[2]
1381         pshufb          $TMP,@MSG[0]
1382         movdqu          0x30($inp),@MSG[3]
1383
1384         movdqa          0*32-0x80($Tbl),$Wi
1385         paddd           @MSG[0],$Wi
1386         pshufb          $TMP,@MSG[1]
1387         movdqa          $CDGH,$CDGH_SAVE        # offload
1388         movdqa          $ABEF,$ABEF_SAVE        # offload
1389 ___
# SHA rounds 0-15 of .Loop_shaext. One &$aesenc() step is stitched in ahead
# of each of the eight sha256rnds2 instructions; in between, the next table
# row is added to the next message vector, $inp is advanced by 0x40, and the
# first sha256msg1/sha256msg2 schedule updates are issued.
1390         &$aesenc();
1391 $code.=<<___;
1392         sha256rnds2     $ABEF,$CDGH             # 0-3
1393         pshufd          \$0x0e,$Wi,$Wi
1394 ___
1395         &$aesenc();
1396 $code.=<<___;
1397         sha256rnds2     $CDGH,$ABEF
1398
1399         movdqa          1*32-0x80($Tbl),$Wi
1400         paddd           @MSG[1],$Wi
1401         pshufb          $TMP,@MSG[2]
1402         lea             0x40($inp),$inp
1403 ___
1404         &$aesenc();
1405 $code.=<<___;
1406         sha256rnds2     $ABEF,$CDGH             # 4-7
1407         pshufd          \$0x0e,$Wi,$Wi
1408 ___
1409         &$aesenc();
1410 $code.=<<___;
1411         sha256rnds2     $CDGH,$ABEF
1412
1413         movdqa          2*32-0x80($Tbl),$Wi
1414         paddd           @MSG[2],$Wi
1415         pshufb          $TMP,@MSG[3]
1416         sha256msg1      @MSG[1],@MSG[0]
1417 ___
1418         &$aesenc();
1419 $code.=<<___;
1420         sha256rnds2     $ABEF,$CDGH             # 8-11
1421         pshufd          \$0x0e,$Wi,$Wi
1422         movdqa          @MSG[3],$TMP
1423         palignr         \$4,@MSG[2],$TMP
1424         paddd           $TMP,@MSG[0]
1425 ___
1426         &$aesenc();
1427 $code.=<<___;
1428         sha256rnds2     $CDGH,$ABEF
1429
1430         movdqa          3*32-0x80($Tbl),$Wi
1431         paddd           @MSG[3],$Wi
1432         sha256msg2      @MSG[3],@MSG[0]
1433         sha256msg1      @MSG[2],@MSG[1]
1434 ___
1435         &$aesenc();
1436 $code.=<<___;
1437         sha256rnds2     $ABEF,$CDGH             # 12-15
1438         pshufd          \$0x0e,$Wi,$Wi
1439 ___
1440         &$aesenc();
1441 $code.=<<___;
1442         movdqa          @MSG[0],$TMP
1443         palignr         \$4,@MSG[3],$TMP
1444         paddd           $TMP,@MSG[1]
1445         sha256rnds2     $CDGH,$ABEF
1446 ___
# Middle of the SHA schedule: one iteration per table row 4..12, each
# emitting two sha256rnds2 plus the sha256msg1/sha256msg2 updates, and
# rotating @MSG at the end. An extra &$aesenc() is issued at the top of an
# iteration whenever $r is a multiple of 10, and another one before the
# second sha256rnds2 when $r has reached 19.
1447 for($i=4;$i<16-3;$i++) {
1448         &$aesenc()      if (($r%10)==0);
1449 $code.=<<___;
1450         movdqa          $i*32-0x80($Tbl),$Wi
1451         paddd           @MSG[0],$Wi
1452         sha256msg2      @MSG[0],@MSG[1]
1453         sha256msg1      @MSG[3],@MSG[2]
1454 ___
1455         &$aesenc();
1456 $code.=<<___;
1457         sha256rnds2     $ABEF,$CDGH             # 16-19...
1458         pshufd          \$0x0e,$Wi,$Wi
1459         movdqa          @MSG[1],$TMP
1460         palignr         \$4,@MSG[0],$TMP
1461         paddd           $TMP,@MSG[2]
1462 ___
1463         &$aesenc();
1464         &$aesenc()      if ($r==19);
1465 $code.=<<___;
1466         sha256rnds2     $CDGH,$ABEF
1467 ___
1468         push(@MSG,shift(@MSG));
1469 }
# SHA rounds 52-63 (table rows 13-15) with the remaining schedule updates;
# $TMP is reloaded with the byte-swap mask from $BSWAP for the next loop
# iteration. The closing while loop keeps calling &$aesenc() until $r
# reaches 40.
1470 $code.=<<___;
1471         movdqa          13*32-0x80($Tbl),$Wi
1472         paddd           @MSG[0],$Wi
1473         sha256msg2      @MSG[0],@MSG[1]
1474         sha256msg1      @MSG[3],@MSG[2]
1475 ___
1476         &$aesenc();
1477 $code.=<<___;
1478         sha256rnds2     $ABEF,$CDGH             # 52-55
1479         pshufd          \$0x0e,$Wi,$Wi
1480         movdqa          @MSG[1],$TMP
1481         palignr         \$4,@MSG[0],$TMP
1482         paddd           $TMP,@MSG[2]
1483 ___
1484         &$aesenc();
1485         &$aesenc();
1486 $code.=<<___;
1487         sha256rnds2     $CDGH,$ABEF
1488
1489         movdqa          14*32-0x80($Tbl),$Wi
1490         paddd           @MSG[1],$Wi
1491         sha256msg2      @MSG[1],@MSG[2]
1492         movdqa          $BSWAP,$TMP
1493 ___
1494         &$aesenc();
1495 $code.=<<___;
1496         sha256rnds2     $ABEF,$CDGH             # 56-59
1497         pshufd          \$0x0e,$Wi,$Wi
1498 ___
1499         &$aesenc();
1500 $code.=<<___;
1501         sha256rnds2     $CDGH,$ABEF
1502
1503         movdqa          15*32-0x80($Tbl),$Wi
1504         paddd           @MSG[2],$Wi
1505 ___
1506         &$aesenc();
1507         &$aesenc();
1508 $code.=<<___;
1509         sha256rnds2     $ABEF,$CDGH             # 60-63
1510         pshufd          \$0x0e,$Wi,$Wi
1511 ___
1512         &$aesenc();
1513 $code.=<<___;
1514         sha256rnds2     $CDGH,$ABEF
1515         #pxor           $CDGH,$rndkey0          # black magic
1516 ___
1517         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
# End of .Loop_shaext and function epilogue. The emitted code adds the saved
# state back into $CDGH/$ABEF, decrements $len, writes $iv to
# 48($out,$in0), advances $in0 by 64 and loops while $len is non-zero.
# Afterwards it shuffles the state back into the DCBA/HGFE order, writes
# $iv through $ivp and the state to $ctx, restores %xmm6-%xmm15 and %rsp on
# Win64, and returns.
1518 $code.=<<___;
1519         #xorps          $CDGH,$rndkey0          # black magic
1520         paddd           $CDGH_SAVE,$CDGH
1521         paddd           $ABEF_SAVE,$ABEF
1522
1523         dec             $len
1524         movups          $iv,48($out,$in0)       # write output
1525         lea             64($in0),$in0
1526         jnz             .Loop_shaext
1527
1528         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1529         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1530         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1531         punpckhqdq      $CDGH,$ABEF             # DCBA
1532         palignr         \$8,$TMP,$CDGH          # HGFE
1533
1534         movups          $iv,($ivp)              # write IV
1535         movdqu          $ABEF,($ctx)
1536         movdqu          $CDGH,16($ctx)
1537 ___
1538 $code.=<<___ if ($win64);
1539         movaps  0*16(%rsp),%xmm6
1540         movaps  1*16(%rsp),%xmm7
1541         movaps  2*16(%rsp),%xmm8
1542         movaps  3*16(%rsp),%xmm9
1543         movaps  4*16(%rsp),%xmm10
1544         movaps  5*16(%rsp),%xmm11
1545         movaps  6*16(%rsp),%xmm12
1546         movaps  7*16(%rsp),%xmm13
1547         movaps  8*16(%rsp),%xmm14
1548         movaps  9*16(%rsp),%xmm15
1549         lea     8+10*16(%rsp),%rsp
1550 .Lepilogue_shaext:
1551 ___
1552 $code.=<<___;
1553         ret
1554 .size   ${func}_shaext,.-${func}_shaext
1555 ___
1556 }
1557 }}}}}
1558
1559 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1560 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1561 if ($win64 && $avx) {
# Register names of the four handler arguments, followed by the head of
# se_handler. The emitted code saves the non-volatile registers and flags,
# pulls context->Rax and context->Rip, and jumps to .Lin_prologue when Rip
# is below the prologue label (HandlerData[0]) or at/after the epilogue
# label (HandlerData[1]); between the two checks %rax is replaced with
# context->Rsp.
1562 $rec="%rcx";
1563 $frame="%rdx";
1564 $context="%r8";
1565 $disp="%r9";
1566
1567 $code.=<<___;
1568 .extern __imp_RtlVirtualUnwind
1569 .type   se_handler,\@abi-omnipotent
1570 .align  16
1571 se_handler:
1572         push    %rsi
1573         push    %rdi
1574         push    %rbx
1575         push    %rbp
1576         push    %r12
1577         push    %r13
1578         push    %r14
1579         push    %r15
1580         pushfq
1581         sub     \$64,%rsp
1582
1583         mov     120($context),%rax      # pull context->Rax
1584         mov     248($context),%rbx      # pull context->Rip
1585
1586         mov     8($disp),%rsi           # disp->ImageBase
1587         mov     56($disp),%r11          # disp->HanderlData
1588
1589         mov     0(%r11),%r10d           # HandlerData[0]
1590         lea     (%rsi,%r10),%r10        # prologue label
1591         cmp     %r10,%rbx               # context->Rip<prologue label
1592         jb      .Lin_prologue
1593
1594         mov     152($context),%rax      # pull context->Rsp
1595
1596         mov     4(%r11),%r10d           # HandlerData[1]
1597         lea     (%rsi,%r10),%r10        # epilogue label
1598         cmp     %r10,%rbx               # context->Rip>=epilogue label
1599         jae     .Lin_prologue
1600 ___
# Code-path specific fix-ups, each emitted only when that path exists.
# With $shaext: when Rip is at or above the _shaext entry, copy 20 quadwords
# from the stack into context.Xmm6 onwards, advance %rax by 168 (the
# 8+10*16 bytes reserved by that prologue) and go to .Lin_prologue.
# With $avx>1: when Rip is at or above .Lavx2_shortcut, recompute the frame
# base by repeating on %rax the and/add alignment done in the AVX2 prologue.
# NOTE(review): the _shaext entry is spelled out literally below, whereas
# the rest of the file uses ${func}_shaext -- confirm $func cannot differ.
1601 $code.=<<___ if ($shaext);
1602         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1603         cmp     %r10,%rbx
1604         jb      .Lnot_in_shaext
1605
1606         lea     (%rax),%rsi
1607         lea     512($context),%rdi      # &context.Xmm6
1608         mov     \$20,%ecx
1609         .long   0xa548f3fc              # cld; rep movsq
1610         lea     168(%rax),%rax          # adjust stack pointer
1611         jmp     .Lin_prologue
1612 .Lnot_in_shaext:
1613 ___
1614 $code.=<<___ if ($avx>1);
1615         lea     .Lavx2_shortcut(%rip),%r10
1616         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1617         jb      .Lnot_in_avx2
1618
1619         and     \$-256*$SZ,%rax
1620         add     \$`2*$SZ*($rounds-8)`,%rax
1621 .Lnot_in_avx2:
1622 ___
# Common tail of se_handler plus the first .pdata entries. The emitted code
# pulls $_rsp from the frame, copies the saved %rbx, %rbp and %r12-%r15 into
# the CONTEXT record and copies the 20-quadword Xmm6.. save area. From
# .Lin_prologue on it restores context->Rsp/Rsi/Rdi, copies the CONTEXT
# (154 quadwords) to disp->ContextRecord, calls RtlVirtualUnwind, and
# returns ExceptionContinueSearch after popping the saved registers.
# The .pdata section then lists begin/end/info for the _xop and _avx
# functions.
1623 $code.=<<___;
1624         mov     %rax,%rsi               # put aside Rsp
1625         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1626
1627         mov     -8(%rax),%rbx
1628         mov     -16(%rax),%rbp
1629         mov     -24(%rax),%r12
1630         mov     -32(%rax),%r13
1631         mov     -40(%rax),%r14
1632         mov     -48(%rax),%r15
1633         mov     %rbx,144($context)      # restore context->Rbx
1634         mov     %rbp,160($context)      # restore context->Rbp
1635         mov     %r12,216($context)      # restore context->R12
1636         mov     %r13,224($context)      # restore context->R13
1637         mov     %r14,232($context)      # restore context->R14
1638         mov     %r15,240($context)      # restore context->R15
1639
1640         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1641         lea     512($context),%rdi      # &context.Xmm6
1642         mov     \$20,%ecx
1643         .long   0xa548f3fc              # cld; rep movsq
1644
1645 .Lin_prologue:
1646         mov     8(%rax),%rdi
1647         mov     16(%rax),%rsi
1648         mov     %rax,152($context)      # restore context->Rsp
1649         mov     %rsi,168($context)      # restore context->Rsi
1650         mov     %rdi,176($context)      # restore context->Rdi
1651
1652         mov     40($disp),%rdi          # disp->ContextRecord
1653         mov     $context,%rsi           # context
1654         mov     \$154,%ecx              # sizeof(CONTEXT)
1655         .long   0xa548f3fc              # cld; rep movsq
1656
1657         mov     $disp,%rsi
1658         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1659         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1660         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1661         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1662         mov     40(%rsi),%r10           # disp->ContextRecord
1663         lea     56(%rsi),%r11           # &disp->HandlerData
1664         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1665         mov     %r10,32(%rsp)           # arg5
1666         mov     %r11,40(%rsp)           # arg6
1667         mov     %r12,48(%rsp)           # arg7
1668         mov     %rcx,56(%rsp)           # arg8, (NULL)
1669         call    *__imp_RtlVirtualUnwind(%rip)
1670
1671         mov     \$1,%eax                # ExceptionContinueSearch
1672         add     \$64,%rsp
1673         popfq
1674         pop     %r15
1675         pop     %r14
1676         pop     %r13
1677         pop     %r12
1678         pop     %rbp
1679         pop     %rbx
1680         pop     %rdi
1681         pop     %rsi
1682         ret
1683 .size   se_handler,.-se_handler
1684
1685 .section        .pdata
1686         .rva    .LSEH_begin_${func}_xop
1687         .rva    .LSEH_end_${func}_xop
1688         .rva    .LSEH_info_${func}_xop
1689
1690         .rva    .LSEH_begin_${func}_avx
1691         .rva    .LSEH_end_${func}_avx
1692         .rva    .LSEH_info_${func}_avx
1693 ___
# Remaining unwind tables. The .pdata entries for the _avx2 and _shaext
# functions are added only when those code paths are generated. The .xdata
# section then carries one .LSEH_info_* record per function; every record
# names se_handler and passes that function's prologue and epilogue labels
# as HandlerData[].
1694 $code.=<<___ if ($avx>1);
1695         .rva    .LSEH_begin_${func}_avx2
1696         .rva    .LSEH_end_${func}_avx2
1697         .rva    .LSEH_info_${func}_avx2
1698 ___
1699 $code.=<<___ if ($shaext);
1700         .rva    .LSEH_begin_${func}_shaext
1701         .rva    .LSEH_end_${func}_shaext
1702         .rva    .LSEH_info_${func}_shaext
1703 ___
1704 $code.=<<___;
1705 .section        .xdata
1706 .align  8
1707 .LSEH_info_${func}_xop:
1708         .byte   9,0,0,0
1709         .rva    se_handler
1710         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1711
1712 .LSEH_info_${func}_avx:
1713         .byte   9,0,0,0
1714         .rva    se_handler
1715         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1716 ___
1717 $code.=<<___ if ($avx>1);
1718 .LSEH_info_${func}_avx2:
1719         .byte   9,0,0,0
1720         .rva    se_handler
1721         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1722 ___
1723 $code.=<<___ if ($shaext);
1724 .LSEH_info_${func}_shaext:
1725         .byte   9,0,0,0
1726         .rva    se_handler
1727         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1728 ___
1729 }
1730
1731 ####################################################################
# rex(\@opcode, $dst, $src)
#
# Prepends a REX prefix byte to the referenced opcode array when either
# register number is 8 or higher: bit 0x04 is set for $dst, bit 0x01 for
# $src, and the result is combined with the 0x40 base. When both numbers
# are below 8 the array is left untouched.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    unshift @{$bytes}, $prefix | 0x40 if $prefix;
}
1741
{
  # Third opcode byte of each SHA-256 extension instruction; the bytes
  # emitted by sha256op38 are 0x0f,0x38,<this byte>,ModR/M, with an
  # optional REX prefix in front.
  my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

  # sha256op38($instr, $operands)
  #
  # Returns a ".byte" directive that hand-encodes $instr when it is one of
  # the mnemonics in %opcodelet and $operands contains a register pair of
  # the form "%xmmN,%xmmM"; in every other case the mnemonic and operands
  # are returned unchanged, separated by a tab.
  #
  # The first register of the pair goes into the r/m field of the ModR/M
  # byte and the second into the reg field; rex() adds a prefix when
  # either number is 8 or higher.
  sub sha256op38 {
    my ($instr, $operands) = @_;

    if (defined($opcodelet{$instr})
        && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      # Name the captures right away instead of carrying $1/$2 around.
      my ($src, $dst) = ($1, $2);
      my @opcode = (0x0f, 0x38);

      rex(\@opcode, $dst, $src);
      push @opcode, $opcodelet{$instr};
      push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);     # ModR/M
      return ".byte\t" . join(',', @opcode);
    }

    return $instr . "\t" . $operands;
  }
}
1762
# Post-process and emit the generated text:
#  1. replace every `...` span by the result of eval'ing the enclosed
#     expression;
#  2. pass every "sha256*" instruction line through sha256op38(), which
#     hand-encodes register-to-register forms as .byte directives;
#  3. print the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
# Buffered write errors only surface when the handle is closed, so a failed
# close must abort the run instead of leaving truncated output behind.
close STDOUT or die "error closing STDOUT: $!";