#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes it possible
# to utilize processor resources better and achieve better performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#                AES-128/-192/-256+SHA256   this(**)    gain
# Sandy Bridge      5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
# Ivy Bridge        5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
# Haswell           4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
# Skylake           2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
# Bulldozer         5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
# Ryzen(***)        2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
# Goldmont(***)     3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
#
# (*)   there are XOP, AVX1 and AVX2 code paths; SSE-only Westmere is
#       omitted from the loop because the estimated gain was not high
#       enough to justify the effort;
# (**)  these are EVP-free results; results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***) these are SHAEXT results;
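#
# To read the table: serial cost is the sum of the first two columns,
# e.g. Sandy Bridge AES-128 costs 5.05+11.6=16.65 cpb standalone versus
# 13.0 cpb stitched, i.e. 16.65/13.0-1 = +28% as listed under "gain".
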
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;   ### set to zero if compiling for 1.0.1
$avx=1          if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
                                "%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#                       void *out,
#                       size_t length,
#                       const AES_KEY *key,
#                       unsigned char *iv,
#                       SHA256_CTX *ctx,
#                       const void *in0);
($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
$framesz=16*$SZ+8*8;
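
# Stack frame layout implied by the slots above (after the prologue):
#   0 .. 16*$SZ-1           copy of the message schedule X[0..15]
#   16*$SZ+0*8 .. +6*8      saved inp/out/end/key/ivp/ctx/in0
#   16*$SZ+7*8              caller's %rsp
# (on WIN64 a 16*10-byte xmm6-xmm15 save area follows)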

$code=<<___;
.text

.extern OPENSSL_ia32cap_P
.globl  $func
.type   $func,\@abi-omnipotent
.align  16
$func:
___
                                                if ($avx) {
$code.=<<___;
        lea     OPENSSL_ia32cap_P(%rip),%r11
        mov     \$1,%eax
        cmp     \$0,`$win64?"%rcx":"%rdi"`
        je      .Lprobe
        mov     0(%r11),%eax
        mov     4(%r11),%r10
___
$code.=<<___ if ($shaext);
        bt      \$61,%r10                       # check for SHA
        jc      ${func}_shaext
___
$code.=<<___;
        mov     %r10,%r11
        shr     \$32,%r11

        test    \$`1<<11`,%r10d                 # check for XOP
        jnz     ${func}_xop
___
$code.=<<___ if ($avx>1);
        and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
        cmp     \$`1<<8|1<<5|1<<3`,%r11d
        je      ${func}_avx2
___
$code.=<<___;
        and     \$`1<<28`,%r10d                 # check for AVX
        jnz     ${func}_avx
        ud2
___
                                                }
$code.=<<___;
        xor     %eax,%eax
        cmp     \$0,`$win64?"%rcx":"%rdi"`
        je      .Lprobe
        ud2
.Lprobe:
        ret
.size   $func,.-$func

.align  64
.type   $TABLE,\@object
$TABLE:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
        .long   0,0,0,0,   0,0,0,0
        .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
##      &vmovdqu        ($inout,($inp));
##      &mov            ($_inp,$inp);

        '&vpxor         ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',

        '&vpxor         ($inout,$inout,$iv);',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',

        '&vaesenclast   ($temp,$inout,$roundkey);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',

        '&vpand         ($iv,$temp,$mask10);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',

        '&vaesenclast   ($temp,$inout,$roundkey);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',

        '&vpand         ($temp,$temp,$mask12);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xe0-0x80($inp)");',

        '&vpor          ($iv,$iv,$temp);'.
        ' &vaesenclast  ($temp,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'

##      &mov            ($inp,$_inp);
##      &mov            ($out,$_out);
##      &vpand          ($temp,$temp,$mask14);
##      &vpor           ($iv,$iv,$temp);
##      &vmovdqu        ($iv,"($out,$inp)");
##      &lea            ($inp,"16($inp)");
);
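
# One full pass over @aesni_cbc_block amounts to a single 16-byte
# AES-CBC encryption step; a hedged C equivalent, with AES_encrypt from
# <openssl/aes.h> standing in for the AES-NI rounds, would be:
#
#       for (i = 0; i < 16; i++) blk[i] = in[i] ^ iv[i]; /* CBC chain */
#       AES_encrypt(blk, out, key);                      /* one block */
#       memcpy(iv, out, 16);             /* ciphertext is the next IV */
#
# The three vaesenclast results masked with $mask10/$mask12/$mask14
# select the output after round 10, 12 or 14, so the one instruction
# sequence serves AES-128, -192 and -256 alike.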

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

        '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
        '&mov   ($a,$a1)',
        '&mov   ($a4,$f)',

        '&xor   ($a0,$e)',
        '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
        '&xor   ($a4,$g)',                      # f^g

        '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
        '&xor   ($a1,$a)',
        '&and   ($a4,$e)',                      # (f^g)&e

        @aesni_cbc_block[$aesni_cbc_idx++].
        '&xor   ($a0,$e)',
        '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
        '&mov   ($a2,$a)',

        '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
        '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
        '&xor   ($a2,$b)',                      # a^b, b^c in next round

        '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
        '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
        '&and   ($a3,$a2)',                     # (b^c)&(a^b)

        '&xor   ($a1,$a)',
        '&add   ($h,$a0)',                      # h+=Sigma1(e)
        '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)

        '&add   ($d,$h)',                       # d+=h
        '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
        '&add   ($h,$a3)',                      # h+=Maj(a,b,c)

        '&mov   ($a0,$d)',
        '&add   ($a1,$h);'.                     # h+=Sigma0(a)
        '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
        );
}
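
# Pure-Perl cross-check of the round formula body_00_15 emits above,
# using the same reduced forms Ch(e,f,g)=((f^g)&e)^g and
# Maj(a,b,c)=((b^c)&(a^b))^b. Illustrative only; never called by this
# module.
sub ref_round_00_15 {
  my ($S,$Xi,$Ki)=@_;                   # $S = [a,b,c,d,e,f,g,h]
  my ($a,$b,$c,$d,$e,$f,$g,$h)=@$S;
  my $ror=sub { (($_[0]>>$_[1])|($_[0]<<(32-$_[1])))&0xffffffff };
  my $T1=($h+($ror->($e,6)^$ror->($e,11)^$ror->($e,25))        # Sigma1(e)
            +((($f^$g)&$e)^$g)+$Xi+$Ki)&0xffffffff;            # +Ch+X+K
  my $T2=(($ror->($a,2)^$ror->($a,13)^$ror->($a,22))           # Sigma0(a)
            +((($b^$c)&($a^$b))^$b))&0xffffffff;               # +Maj
  @$S=(($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}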

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type   ${func}_xop,\@function,6
.align  64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        mov     %rsp,%rax               # copy %rsp
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
        sub     \$`$framesz+$win64*16*10`,%rsp
        and     \$-64,%rsp              # align stack frame

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        mov     $out,$_out
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %rax,$_rsp
.cfi_cfa_expression     $_rsp,deref,+8
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
        vzeroall

        mov     $inp,%r12               # borrow $a4
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        sub     \$9,%r14

        mov     $SZ*0(%r15),$A
        mov     $SZ*1(%r15),$B
        mov     $SZ*2(%r15),$C
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H

        vmovdqa 0x00(%r13,%r14,8),$mask14
        vmovdqa 0x10(%r13,%r14,8),$mask12
        vmovdqa 0x20(%r13,%r14,8),$mask10
        vmovdqu 0x00-0x80($inp),$roundkey
        jmp     .Lloop_xop
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align  16
.Lloop_xop:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu 0x00(%rsi,%r12),@X[0]
        vmovdqu 0x10(%rsi,%r12),@X[1]
        vmovdqu 0x20(%rsi,%r12),@X[2]
        vmovdqu 0x30(%rsi,%r12),@X[3]
        vpshufb $t3,@X[0],@X[0]
        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[1],@X[1]
        vpshufb $t3,@X[2],@X[2]
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        mov     $A,$a1
        vmovdqa $t1,0x10(%rsp)
        mov     $B,$a3
        vmovdqa $t2,0x20(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x30(%rsp)
        mov     $E,$a0
        jmp     .Lxop_00_47

.align  16
.Lxop_00_47:
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        vmovdqu (%r12),$inout           # $a4
        mov     %r12,$_inp              # $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions

        &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
          eval(shift(@insns));
          eval(shift(@insns));
         &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
          eval(shift(@insns));
          eval(shift(@insns));
        &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpsrld         ($t0,$t0,$sigma0[2]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpxor          ($t0,$t0,$t1);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
          eval(shift(@insns));
          eval(shift(@insns));
         &vpsrld        ($t2,@X[3],$sigma1[2]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t2);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpsrldq        ($t3,$t3,8);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpsrld        ($t2,@X[0],$sigma1[2]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t2);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpslldq        ($t3,$t3,8);            # 22 instructions
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &XOP_256_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &mov            ("%r12",$_inp);         # borrow $a4
        &vpand          ($temp,$temp,$mask14);
        &mov            ("%r15",$_out);         # borrow $a2
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r12)",$iv);    # write output
        &lea            ("%r12","16(%r12)");    # inp++

        &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
        &jne    (".Lxop_00_47");

        &vmovdqu        ($inout,"(%r12)");
        &mov            ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }
                                        }
$code.=<<___;
        mov     $_inp,%r12              # borrow $a4
        mov     $_out,%r13              # borrow $a0
        mov     $_ctx,%r15              # borrow $a2
        mov     $_in0,%rsi              # borrow $a3

        vpand   $mask14,$temp,$temp
        mov     $a1,$A
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r13,%r12)         # write output
        lea     16(%r12),%r12           # inp++

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        cmp     $_end,%r12

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        jb      .Lloop_xop

        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
.cfi_def_cfa    %rsi,8
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     -48(%rsi),%r15
.cfi_restore    %r15
        mov     -40(%rsi),%r14
.cfi_restore    %r14
        mov     -32(%rsi),%r13
.cfi_restore    %r13
        mov     -24(%rsi),%r12
.cfi_restore    %r12
        mov     -16(%rsi),%rbp
.cfi_restore    %rbp
        mov     -8(%rsi),%rbx
.cfi_restore    %rbx
        lea     (%rsi),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue_xop:
        ret
.cfi_endproc
.size   ${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };

$code.=<<___;
.type   ${func}_avx,\@function,6
.align  64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        mov     %rsp,%rax               # copy %rsp
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
        sub     \$`$framesz+$win64*16*10`,%rsp
        and     \$-64,%rsp              # align stack frame

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        mov     $out,$_out
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %rax,$_rsp
.cfi_cfa_expression     $_rsp,deref,+8
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
        vzeroall

        mov     $inp,%r12               # borrow $a4
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        sub     \$9,%r14

        mov     $SZ*0(%r15),$A
        mov     $SZ*1(%r15),$B
        mov     $SZ*2(%r15),$C
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H

        vmovdqa 0x00(%r13,%r14,8),$mask14
        vmovdqa 0x10(%r13,%r14,8),$mask12
        vmovdqa 0x20(%r13,%r14,8),$mask10
        vmovdqu 0x00-0x80($inp),$roundkey
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
        jmp     .Lloop_avx
.align  16
.Lloop_avx:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu 0x00(%rsi,%r12),@X[0]
        vmovdqu 0x10(%rsi,%r12),@X[1]
        vmovdqu 0x20(%rsi,%r12),@X[2]
        vmovdqu 0x30(%rsi,%r12),@X[3]
        vpshufb $t3,@X[0],@X[0]
        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[1],@X[1]
        vpshufb $t3,@X[2],@X[2]
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        mov     $A,$a1
        vmovdqa $t1,0x10(%rsp)
        mov     $B,$a3
        vmovdqa $t2,0x20(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x30(%rsp)
        mov     $E,$a0
        jmp     .Lavx_00_47

.align  16
.Lavx_00_47:
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        vmovdqu (%r12),$inout           # $a4
        mov     %r12,$_inp              # $a4
___
sub Xupdate_256_AVX () {
        (
        '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
         '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
        '&vpsrld        ($t2,$t0,$sigma0[0]);',
         '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
        '&vpsrld        ($t3,$t0,$sigma0[2])',
        '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
        '&vpxor         ($t0,$t3,$t2)',
         '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
        '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
        '&vpxor         ($t0,$t0,$t1)',
        '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
        '&vpxor         ($t0,$t0,$t2)',
         '&vpsrld       ($t2,$t3,$sigma1[2]);',
        '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
         '&vpsrlq       ($t3,$t3,$sigma1[0]);',
        '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
         '&vpxor        ($t2,$t2,$t3);',
         '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
         '&vpshufd      ($t2,$t2,0b10000100)',
         '&vpsrldq      ($t2,$t2,8)',
        '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
         '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
         '&vpsrld       ($t2,$t3,$sigma1[2])',
         '&vpsrlq       ($t3,$t3,$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3);',
         '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3)',
         '&vpshufd      ($t2,$t2,0b11101000)',
         '&vpslldq      ($t2,$t2,8)',
        '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
        );
}
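
# Scalar reference for the vectorized message-schedule update above;
# each Xupdate step produces four new W[t], with sigma0/sigma1 as set
# by @sigma0=(7,18,3) and @sigma1=(17,19,10). Illustrative only; never
# called by this module.
sub ref_Xupdate {
  my ($W,$t)=@_;                        # $W = [W[0]..], 16 <= $t < 64
  my $ror=sub { (($_[0]>>$_[1])|($_[0]<<(32-$_[1])))&0xffffffff };
  my $s0=$ror->($W->[$t-15],7)^$ror->($W->[$t-15],18)^($W->[$t-15]>>3);
  my $s1=$ror->($W->[$t-2],17)^$ror->($W->[$t-2],19)^($W->[$t-2]>>10);
  $W->[$t]=($W->[$t-16]+$s0+$W->[$t-7]+$s1)&0xffffffff;
}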

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions

        foreach (Xupdate_256_AVX()) {           # 29 instructions
            eval;
            eval(shift(@insns));
            eval(shift(@insns));
            eval(shift(@insns));
        }
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX_256_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &mov            ("%r12",$_inp);         # borrow $a4
        &vpand          ($temp,$temp,$mask14);
        &mov            ("%r15",$_out);         # borrow $a2
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r12)",$iv);    # write output
        &lea            ("%r12","16(%r12)");    # inp++

        &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
        &jne    (".Lavx_00_47");

        &vmovdqu        ($inout,"(%r12)");
        &mov            ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }

                                        }
$code.=<<___;
        mov     $_inp,%r12              # borrow $a4
        mov     $_out,%r13              # borrow $a0
        mov     $_ctx,%r15              # borrow $a2
        mov     $_in0,%rsi              # borrow $a3

        vpand   $mask14,$temp,$temp
        mov     $a1,$A
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r13,%r12)         # write output
        lea     16(%r12),%r12           # inp++

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        cmp     $_end,%r12

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)
        jb      .Lloop_avx

        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
.cfi_def_cfa    %rsi,8
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     -48(%rsi),%r15
.cfi_restore    %r15
        mov     -40(%rsi),%r14
.cfi_restore    %r14
        mov     -32(%rsi),%r13
.cfi_restore    %r13
        mov     -24(%rsi),%r12
.cfi_restore    %r12
        mov     -16(%rsi),%rbp
.cfi_restore    %rbp
        mov     -8(%rsi),%rbx
.cfi_restore    %rbx
        lea     (%rsi),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue_avx:
        ret
.cfi_endproc
.size   ${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
        # at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

        '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
        '&and   ($a4,$e)',              # f&e
        '&rorx  ($a0,$e,$Sigma1[2])',
        '&rorx  ($a2,$e,$Sigma1[1])',

        '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
        '&lea   ($h,"($h,$a4)")',
        '&andn  ($a4,$e,$g)',           # ~e&g
        '&xor   ($a0,$a2)',

        '&rorx  ($a1,$e,$Sigma1[0])',
        '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
        '&xor   ($a0,$a1)',             # Sigma1(e)
        '&mov   ($a2,$a)',

        '&rorx  ($a4,$a,$Sigma0[2])',
        '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
        '&xor   ($a2,$b)',              # a^b, b^c in next round
        '&rorx  ($a1,$a,$Sigma0[1])',

        '&rorx  ($a0,$a,$Sigma0[0])',
        '&lea   ($d,"($d,$h)")',        # d+=h
        '&and   ($a3,$a2)',             # (b^c)&(a^b)
        @aesni_cbc_block[$aesni_cbc_idx++].
        '&xor   ($a1,$a4)',

        '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
        '&xor   ($a1,$a0)',             # Sigma0(a)
        '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
        '&mov   ($a4,$e)',              # copy of f in future

        '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
        );
        # and at the finish one has to do $a+=$a1
}
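
# Note: the two halves of Ch are bit-disjoint -- (e&f) is masked by e,
# (~e&g) by ~e -- so Ch(e,f,g)=(e&f)+(~e&g), and the two 'lea'
# additions above accumulate it without extra XORs; andn supplies ~e&g
# in a single instruction.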

$code.=<<___;
.type   ${func}_avx2,\@function,6
.align  64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        mov     %rsp,%rax               # copy %rsp
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
        sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
        and     \$-256*$SZ,%rsp         # align stack frame
        add     \$`2*$SZ*($rounds-8)`,%rsp

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        #mov    $out,$_out              # kept in $offload
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %rax,$_rsp
.cfi_cfa_expression     $_rsp,deref,+8
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
        vzeroall

        mov     $inp,%r13               # borrow $a0
        vpinsrq \$1,$out,$offload,$offload
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        lea     -9(%r14),%r14

        vmovdqa 0x00(%r12,%r14,8),$mask14
        vmovdqa 0x10(%r12,%r14,8),$mask12
        vmovdqa 0x20(%r12,%r14,8),$mask10

        sub     \$-16*$SZ,%r13          # inp++, size optimization
        mov     $SZ*0(%r15),$A
        lea     (%rsi,%r13),%r12        # borrow $a0
        mov     $SZ*1(%r15),$B
        cmp     $len,%r13               # $_end
        mov     $SZ*2(%r15),$C
        cmove   %rsp,%r12               # next block or random data
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H
        vmovdqu 0x00-0x80($inp),$roundkey
___
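# The AVX2 path hashes two 64-byte blocks per iteration: the low xmm
# lanes are loaded from the current block and vinserti128 fills the
# high lanes from the following one (or from %rsp scratch on the last
# iteration, hence the cmove above).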
                                        if ($SZ==4) {   # SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
        jmp     .Loop_avx2
.align  16
.Loop_avx2:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
        vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
        vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
        vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3

        vinserti128     \$1,(%r12),@X[0],@X[0]
        vinserti128     \$1,16(%r12),@X[1],@X[1]
         vpshufb        $t3,@X[0],@X[0]
        vinserti128     \$1,32(%r12),@X[2],@X[2]
         vpshufb        $t3,@X[1],@X[1]
        vinserti128     \$1,48(%r12),@X[3],@X[3]

        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[2],@X[2]
        lea     -16*$SZ(%r13),%r13
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        xor     $a1,$a1
        vmovdqa $t1,0x20(%rsp)
        lea     -$PUSH8(%rsp),%rsp
        mov     $B,$a3
        vmovdqa $t2,0x00(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x20(%rsp)
        mov     $F,$a4
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        jmp     .Lavx2_00_47

.align  16
.Lavx2_00_47:
        vmovdqu (%r13),$inout
        vpinsrq \$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
my $base = "+2*$PUSH8(%rsp)";

        &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
        foreach (Xupdate_256_AVX()) {           # 29 instructions
            eval;
            eval(shift(@insns));
            eval(shift(@insns));
            eval(shift(@insns));
        }
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX2_256_00_47($j,\&bodyx_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &vmovq          ("%r13",$offload);      # borrow $a0
        &vpextrq        ("%r15",$offload,1);    # borrow $a2
        &vpand          ($temp,$temp,$mask14);
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r13)",$iv);    # write output
        &lea            ("%r13","16(%r13)");    # inp++

        &lea    ($Tbl,16*2*$SZ."($Tbl)");
        &cmpb   (($SZ-1)."($Tbl)",0);
        &jne    (".Lavx2_00_47");

        &vmovdqu        ($inout,"(%r13)");
        &vpinsrq        ($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
        foreach(bodyx_00_15()) { eval; }
    }
                                        }
$code.=<<___;
        vpextrq \$1,$offload,%r12               # $_out, borrow $a4
        vmovq   $offload,%r13                   # $_inp, borrow $a0
        mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
        add     $a1,$A
        lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl

        vpand   $mask14,$temp,$temp
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r12,%r13)                 # write output
        lea     16(%r13),%r13

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
        je      .Ldone_avx2

        xor     $a1,$a1
        mov     $B,$a3
        mov     $F,$a4
        xor     $C,$a3                  # magic
        jmp     .Lower_avx2
.align  16
.Lower_avx2:
        vmovdqu (%r13),$inout
        vpinsrq \$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        my $base="+16($Tbl)";
        foreach(bodyx_00_15()) { eval; }
        &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
    }
$code.=<<___;
        vmovq   $offload,%r13                   # borrow $a0
        vpextrq \$1,$offload,%r15               # borrow $a2
        vpand   $mask14,$temp,$temp
        vpor    $temp,$iv,$iv
        lea     -$PUSH8($Tbl),$Tbl
        vmovdqu $iv,(%r15,%r13)                 # write output
        lea     16(%r13),%r13                   # inp++
        cmp     %rsp,$Tbl
        jae     .Lower_avx2

        mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
        lea     16*$SZ(%r13),%r13
        mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
        add     $a1,$A
        lea     `2*$SZ*($rounds-8)`(%rsp),%rsp

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        lea     (%rsi,%r13),%r12
        add     $SZ*7(%r15),$H

        cmp     $_end,%r13

        mov     $A,$SZ*0(%r15)
        cmove   %rsp,%r12               # next block or stale data
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        jbe     .Loop_avx2
        lea     (%rsp),$Tbl

.Ldone_avx2:
        lea     ($Tbl),%rsp
        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
.cfi_def_cfa    %rsi,8
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     -48(%rsi),%r15
.cfi_restore    %r15
        mov     -40(%rsi),%r14
.cfi_restore    %r14
        mov     -32(%rsi),%r13
.cfi_restore    %r13
        mov     -24(%rsi),%r12
.cfi_restore    %r12
        mov     -16(%rsi),%rbp
.cfi_restore    %rbp
        mov     -8(%rsi),%rbx
.cfi_restore    %rbx
        lea     (%rsi),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue_avx2:
        ret
.cfi_endproc
.size   ${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
        movups          `16*$n`($in0),$in               # load input
        xorps           $rndkey0,$in
___
      $code.=<<___ if ($n);
        movups          $iv,`16*($n-1)`($out,$in0)      # write output
___
      $code.=<<___;
        xorps           $in,$iv
        movups          `32+16*$k-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
        cmp             \$11,$rounds
        jb              .Laesenclast$sn
        movups          `32+16*($k+0)-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
        movups          `32+16*($k+1)-112`($key),$rndkey[0]
        aesenc          $rndkey[1],$iv
        je              .Laesenclast$sn
        movups          `32+16*($k+2)-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
        movups          `32+16*($k+3)-112`($key),$rndkey[0]
        aesenc          $rndkey[1],$iv
.Laesenclast$sn:
        aesenclast      $rndkey[0],$iv
        movups          16-112($key),$rndkey[1]         # forward reference
        nop
___
    } else {
      $code.=<<___;
        movups          `32+16*$k-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
___
    }
    $r++;       unshift(@rndkey,pop(@rndkey));
};
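
# The closure above doles out one aesenc per invocation, i.e. one AES
# round interleaved with every two SHA-256 rounds. An intrinsics sketch
# of the variable-round ladder it emits at $k==9 (hedged; assumes round
# keys expanded into rk[0..rounds], rounds being 10/12/14):
#
#       b = _mm_xor_si128(_mm_xor_si128(in, iv), rk[0]);
#       for (i = 1; i < rounds; i++)
#               b = _mm_aesenc_si128(b, rk[i]);
#       iv = _mm_aesenclast_si128(b, rk[rounds]); /* also ciphertext */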

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type   ${func}_shaext,\@function,6
.align  32
${func}_shaext:
        mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
___
$code.=<<___ if ($win64);
        lea     `-8-10*16`(%rsp),%rsp
        movaps  %xmm6,-8-10*16(%rax)
        movaps  %xmm7,-8-9*16(%rax)
        movaps  %xmm8,-8-8*16(%rax)
        movaps  %xmm9,-8-7*16(%rax)
        movaps  %xmm10,-8-6*16(%rax)
        movaps  %xmm11,-8-5*16(%rax)
        movaps  %xmm12,-8-4*16(%rax)
        movaps  %xmm13,-8-3*16(%rax)
        movaps  %xmm14,-8-2*16(%rax)
        movaps  %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
        lea             K256+0x80(%rip),$Tbl
        movdqu          ($ctx),$ABEF            # DCBA
        movdqu          16($ctx),$CDGH          # HGFE
        movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask

        mov             240($key),$rounds
        sub             $in0,$out
        movups          ($key),$rndkey0         # $key[0]
        movups          ($ivp),$iv              # load IV
        movups          16($key),$rndkey[0]     # forward reference
        lea             112($key),$key          # size optimization

        pshufd          \$0x1b,$ABEF,$Wi        # ABCD
        pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
        pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
        movdqa          $TMP,$BSWAP             # offload
        palignr         \$8,$CDGH,$ABEF         # ABEF
        punpcklqdq      $Wi,$CDGH               # CDGH

        jmp     .Loop_shaext

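        # Each sha256rnds2 below retires two rounds: the implicit XMM0
        # ($Wi) supplies two W+K dwords in its low qword, the
        # pshufd \$0x0e between each pair moves the upper qword down
        # for the next two rounds, and the operands alternate because
        # the instruction returns the updated ABEF half of the state.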
.align  16
.Loop_shaext:
        movdqu          ($inp),@MSG[0]
        movdqu          0x10($inp),@MSG[1]
        movdqu          0x20($inp),@MSG[2]
        pshufb          $TMP,@MSG[0]
        movdqu          0x30($inp),@MSG[3]

        movdqa          0*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        pshufb          $TMP,@MSG[1]
        movdqa          $CDGH,$CDGH_SAVE        # offload
        movdqa          $ABEF,$ABEF_SAVE        # offload
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 0-3
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          1*32-0x80($Tbl),$Wi
        paddd           @MSG[1],$Wi
        pshufb          $TMP,@MSG[2]
        lea             0x40($inp),$inp
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 4-7
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          2*32-0x80($Tbl),$Wi
        paddd           @MSG[2],$Wi
        pshufb          $TMP,@MSG[3]
        sha256msg1      @MSG[1],@MSG[0]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 8-11
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[3],$TMP
        palignr         \$4,@MSG[2],$TMP
        paddd           $TMP,@MSG[0]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          3*32-0x80($Tbl),$Wi
        paddd           @MSG[3],$Wi
        sha256msg2      @MSG[3],@MSG[0]
        sha256msg1      @MSG[2],@MSG[1]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 12-15
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        movdqa          @MSG[0],$TMP
        palignr         \$4,@MSG[3],$TMP
        paddd           $TMP,@MSG[1]
        sha256rnds2     $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
        &$aesenc()      if (($r%10)==0);
$code.=<<___;
        movdqa          $i*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        sha256msg2      @MSG[0],@MSG[1]
        sha256msg1      @MSG[3],@MSG[2]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 16-19...
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[1],$TMP
        palignr         \$4,@MSG[0],$TMP
        paddd           $TMP,@MSG[2]
___
        &$aesenc();
        &$aesenc()      if ($r==19);
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF
___
        push(@MSG,shift(@MSG));
}
1474 $code.=<<___;
1475         movdqa          13*32-0x80($Tbl),$Wi
1476         paddd           @MSG[0],$Wi
1477         sha256msg2      @MSG[0],@MSG[1]
1478         sha256msg1      @MSG[3],@MSG[2]
1479 ___
1480         &$aesenc();
1481 $code.=<<___;
1482         sha256rnds2     $ABEF,$CDGH             # 52-55
1483         pshufd          \$0x0e,$Wi,$Wi
1484         movdqa          @MSG[1],$TMP
1485         palignr         \$4,@MSG[0],$TMP
1486         paddd           $TMP,@MSG[2]
1487 ___
1488         &$aesenc();
1489         &$aesenc();
1490 $code.=<<___;
1491         sha256rnds2     $CDGH,$ABEF
1492
1493         movdqa          14*32-0x80($Tbl),$Wi
1494         paddd           @MSG[1],$Wi
1495         sha256msg2      @MSG[1],@MSG[2]
1496         movdqa          $BSWAP,$TMP
1497 ___
1498         &$aesenc();
1499 $code.=<<___;
1500         sha256rnds2     $ABEF,$CDGH             # 56-59
1501         pshufd          \$0x0e,$Wi,$Wi
1502 ___
1503         &$aesenc();
1504 $code.=<<___;
1505         sha256rnds2     $CDGH,$ABEF
1506
1507         movdqa          15*32-0x80($Tbl),$Wi
1508         paddd           @MSG[2],$Wi
1509 ___
1510         &$aesenc();
1511         &$aesenc();
1512 $code.=<<___;
1513         sha256rnds2     $ABEF,$CDGH             # 60-63
1514         pshufd          \$0x0e,$Wi,$Wi
1515 ___
1516         &$aesenc();
1517 $code.=<<___;
1518         sha256rnds2     $CDGH,$ABEF
1519         #pxor           $CDGH,$rndkey0          # black magic
1520 ___
        while ($r<40)   { &$aesenc(); }         # remaining aesenc's
$code.=<<___;
        #xorps          $CDGH,$rndkey0          # black magic
        paddd           $CDGH_SAVE,$CDGH
        paddd           $ABEF_SAVE,$ABEF

        dec             $len
        movups          $iv,48($out,$in0)       # write output
        lea             64($in0),$in0
        jnz             .Loop_shaext

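        # The SHA-NI convention keeps the state as {ABEF,CDGH}; shuffle
        # it back into the canonical {ABCD,EFGH} layout before storing
        # to the hash context.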
        pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
        pshufd          \$0x1b,$ABEF,$TMP       # FEBA
        pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
        punpckhqdq      $CDGH,$ABEF             # DCBA
        palignr         \$8,$TMP,$CDGH          # HGFE

        movups          $iv,($ivp)              # write IV
        movdqu          $ABEF,($ctx)
        movdqu          $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
        movaps  0*16(%rsp),%xmm6
        movaps  1*16(%rsp),%xmm7
        movaps  2*16(%rsp),%xmm8
        movaps  3*16(%rsp),%xmm9
        movaps  4*16(%rsp),%xmm10
        movaps  5*16(%rsp),%xmm11
        movaps  6*16(%rsp),%xmm12
        movaps  7*16(%rsp),%xmm13
        movaps  8*16(%rsp),%xmm14
        movaps  9*16(%rsp),%xmm15
        lea     8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
        ret
.size   ${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
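# The handler classifies context->Rip against the [prologue,epilogue)
# window recorded in HandlerData: outside that window the frame is
# still (or already) trivial and needs no fix-up; inside it the saved
# GPRs, the xmm save area and the stashed %rsp are recovered from the
# function's own frame before RtlVirtualUnwind is invoked.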
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # prologue label
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue
___
$code.=<<___ if ($shaext);
        lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
        cmp     %r10,%rbx
        jb      .Lnot_in_shaext

        lea     (%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq
        lea     168(%rax),%rax          # adjust stack pointer
        jmp     .Lin_prologue
.Lnot_in_shaext:
___
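# The shaext frame is fixed-size: the 20 copied qwords restore the ten
# saved xmm registers into the CONTEXT, and 8+10*16==168 bytes undo the
# prologue's stack allocation.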
$code.=<<___ if ($avx>1);
        lea     .Lavx2_shortcut(%rip),%r10
        cmp     %r10,%rbx               # context->Rip<avx2_shortcut
        jb      .Lnot_in_avx2

        and     \$-256*$SZ,%rax
        add     \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
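# In the AVX2 path the saved %rsp is not at a fixed offset from the
# current stack pointer, so %rax is first rounded down to the 256*$SZ
# alignment imposed by that path's prologue and then advanced by
# 2*$SZ*($rounds-8) to land on the frame where $_rsp (pulled just
# below) lives.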
$code.=<<___;
        mov     %rax,%rsi               # put aside Rsp
        mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6-Xmm15 save area
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

.Lin_prologue:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT)/8, qwords to copy
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
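        # Win64 passes the fifth and later arguments on the stack,
        # hence the four stores at 32..56(%rsp) before the call.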
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler

.section        .pdata
        .rva    .LSEH_begin_${func}_xop
        .rva    .LSEH_end_${func}_xop
        .rva    .LSEH_info_${func}_xop

        .rva    .LSEH_begin_${func}_avx
        .rva    .LSEH_end_${func}_avx
        .rva    .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
        .rva    .LSEH_begin_${func}_avx2
        .rva    .LSEH_end_${func}_avx2
        .rva    .LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
        .rva    .LSEH_begin_${func}_shaext
        .rva    .LSEH_end_${func}_shaext
        .rva    .LSEH_info_${func}_shaext
___
$code.=<<___;
.section        .xdata
.align  8
.LSEH_info_${func}_xop:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]

.LSEH_info_${func}_avx:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
___
}

####################################################################
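# What follows hand-assembles the SHA-NI instructions so the module
# still builds with assemblers that predate the mnemonics: rex()
# prepends a REX prefix when either operand is %xmm8..%xmm15, and
# sha256op38() emits the 0x0f,0x38-escaped opcode plus a ModR/M byte.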
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if($dst>=8);
    $rex|=0x01                  if($src>=8);
    unshift @opcode,$rex|0x40   if($rex);
}
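# 0x04 is REX.R (extends ModR/M.reg, the destination here) and 0x01 is
# REX.B (extends ModR/M.rm, the source); 0x40 is the mandatory REX
# marker.  A high destination alone yields prefix 0x44, a high source
# 0x41, both together 0x45.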
{
  my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
        rex(\@opcode,$2,$1);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$_[0];
    }
  }
}
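# A quick sanity check of the encoding (an illustrative example, not
# emitted by the module itself): in AT&T order "sha256rnds2 %xmm0,%xmm1"
# has source %xmm0 and destination %xmm1, so sha256op38() returns
# ".byte 0x0f,0x38,0xcb,0xc8" -- ModR/M 0xc8 == 0xc0|0|(1<<3).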
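# Final pass over the generated text: evaluate the `...` arithmetic
# embedded in the templates, then rewrite every sha256* mnemonic
# through sha256op38() into its raw byte encoding.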
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";