1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and better overall performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into it. Since SHA256 dominates execution time,
19 # the stitch's performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)        gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
28 #
29 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
30 #       Westmere is omitted from the loop, because the estimated gain
31 #       was not high enough to justify the effort;
32 # (**)  these are EVP-free results; results obtained with 'speed
33 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
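#
#       The interleave granularity, as implemented below: one 16-byte
#       AES-CBC block is pushed through its round sequence (see
#       @aesni_cbc_block) while 16 SHA256 rounds are computed, i.e. four
#       cipher blocks per 64-byte message block, so both input streams
#       advance at the same rate.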
34
35 $flavour = shift;
36 $output  = shift;
37 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
38
39 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
40
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
44 die "can't locate x86_64-xlate.pl";
45
46 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
47                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
48         $avx = ($1>=2.19) + ($1>=2.22);
49 }
50
51 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
52            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
53         $avx = ($1>=2.09) + ($1>=2.10);
54 }
55
56 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
57            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
58         $avx = ($1>=10) + ($1>=11);
59 }
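
# $avx selects the widest code path the assembler can handle: 0 - no AVX
# path is emitted at all, 1 - XOP and AVX1, 2 - additionally AVX2+BMI;
# the version thresholds above correspond to assemblers that understand
# the respective instruction sets.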
60
61 open OUT,"| \"$^X\" $xlate $flavour $output";
62 *STDOUT=*OUT;
63
64 $func="aesni_cbc_sha256_enc";
65 $TABLE="K256";
66 $SZ=4;
67 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
68                                 "%r8d","%r9d","%r10d","%r11d");
69 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
70 @Sigma0=( 2,13,22);
71 @Sigma1=( 6,11,25);
72 @sigma0=( 7,18, 3);
73 @sigma1=(17,19,10);
74 $rounds=64;
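
# The four arrays above hold the rotate/shift counts of the SHA256 Sigma
# and sigma functions (FIPS 180-4); the third element of @sigma0/@sigma1
# is a plain right shift.  For reference, a scalar C rendition of what
# the counts encode (illustration only, not used by this module):
#
#       #include <stdint.h>
#       static uint32_t rotr(uint32_t x, int n)
#       {       return (x >> n) | (x << (32 - n));      }
#       #define Sigma0(x) (rotr(x, 2) ^ rotr(x,13) ^ rotr(x,22))
#       #define Sigma1(x) (rotr(x, 6) ^ rotr(x,11) ^ rotr(x,25))
#       #define sigma0(x) (rotr(x, 7) ^ rotr(x,18) ^ ((x) >>  3))
#       #define sigma1(x) (rotr(x,17) ^ rotr(x,19) ^ ((x) >> 10))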
75
76 ########################################################################
77 # void aesni_cbc_sha256_enc(const void *inp,
78 #                       void *out,
79 #                       size_t length,
80 #                       const AES_KEY *key,
81 #                       unsigned char *iv,
82 #                       SHA256_CTX *ctx,
83 #                       const void *in0);
84 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
85 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
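
# Hypothetical caller sketch, for illustration only.  Two assumptions are
# made: that length counts 64-byte blocks (suggested by the 'shl $6,$len'
# in the prologues below), and that the routine may be declared with an
# int return so that the NULL-input probe at the top of $func, which
# reports in %eax whether any SIMD path was compiled in, can be used.
# The caller is still expected to check CPU capabilities itself, since an
# unsupported CPU reaches ud2.  raw_key, ptxt, ctxt, to_hash and blocks
# are placeholders.
#
#       #include <openssl/aes.h>
#       #include <openssl/sha.h>
#
#       extern int aesni_cbc_sha256_enc(const void *inp, void *out,
#                                       size_t blocks, const AES_KEY *key,
#                                       unsigned char *iv, SHA256_CTX *ctx,
#                                       const void *in0);
#
#       AES_KEY ks; SHA256_CTX md; unsigned char iv[16] = {0};
#       AES_set_encrypt_key(raw_key, 128, &ks);
#       SHA256_Init(&md);
#       if (aesni_cbc_sha256_enc(NULL, NULL, 0, NULL, NULL, NULL, NULL))
#               /* encrypt inp->out and hash in0, 'blocks' blocks of each */
#               aesni_cbc_sha256_enc(ptxt, ctxt, blocks, &ks, iv, &md, to_hash);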
86
87 $Tbl="%rbp";
88
89 $_inp="16*$SZ+0*8(%rsp)";
90 $_out="16*$SZ+1*8(%rsp)";
91 $_end="16*$SZ+2*8(%rsp)";
92 $_key="16*$SZ+3*8(%rsp)";
93 $_ivp="16*$SZ+4*8(%rsp)";
94 $_ctx="16*$SZ+5*8(%rsp)";
95 $_in0="16*$SZ+6*8(%rsp)";
96 $_rsp="16*$SZ+7*8(%rsp)";
97 $framesz=16*$SZ+8*8;
98
99 $code=<<___;
100 .text
101
102 .extern OPENSSL_ia32cap_P
103 .globl  $func
104 .type   $func,\@abi-omnipotent
105 .align  16
106 $func:
107 ___
108 $code.=<<___ if ($avx);
109         lea     OPENSSL_ia32cap_P(%rip),%r11
110         mov     \$1,%eax
111         cmp     \$0,`$win64?"%rcx":"%rdi"`
112         je      .Lprobe
113         mov     0(%r11),%eax
114         mov     4(%r11),%r10d
115         mov     8(%r11),%r11d
116
117         test    \$`1<<11`,%r10d                 # check for XOP
118         jnz     ${func}_xop
119 ___
120 $code.=<<___ if ($avx>1);
121         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
122         cmp     \$`1<<8|1<<5|1<<3`,%r11d
123         je      ${func}_avx2
124 ___
125 $code.=<<___ if ($avx);
126         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
127         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
128         or      %eax,%r10d
129         cmp     \$`1<<28|1<<9|1<<30`,%r10d
130         je      ${func}_avx
131         ud2
132 ___
133 $code.=<<___;
134         xor     %eax,%eax
135         cmp     \$0,`$win64?"%rcx":"%rdi"`
136         je      .Lprobe
137         ud2
138 .Lprobe:
139         ret
140 .size   $func,.-$func
141
142 .align  64
143 .type   $TABLE,\@object
144 $TABLE:
145         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
146         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
147         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
148         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
149         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
150         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
151         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
152         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
153         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
154         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
155         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
156         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
158         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
159         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
160         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
161         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
162         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
163         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
164         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
165         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
166         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
167         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
168         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
169         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
170         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
171         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
172         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
173         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
174         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
175         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
176         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
177
178         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
179         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
180         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
181         .long   0,0,0,0,   0,0,0,0
182         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
183 .align  64
184 ___
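
# Note that every 16-byte row of $TABLE is stored twice: the AVX2 path
# adds the same four constants to both 128-bit lanes of a %ymm register
# with a single vpaddd, while the xmm paths simply step over the
# duplicates.  The trailing rows are the big-endian byte-swap mask (also
# duplicated) and the zero/all-ones words from which $mask14, $mask12 and
# $mask10 are loaded according to the AES round count.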
185
186 ######################################################################
187 # SIMD code paths
188 #
189 {{{
190 ($iv,$inout,$roundkey,$temp,
191  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
192
193 $aesni_cbc_idx=0;
194 @aesni_cbc_block = (
195 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
196 ##      &vmovdqu        ($inout,($inp));
197 ##      &mov            ($_inp,$inp);
198
199         '&vpxor         ($inout,$inout,$roundkey);'.
200         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
201
202         '&vpxor         ($inout,$inout,$iv);',
203
204         '&vaesenc       ($inout,$inout,$roundkey);'.
205         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
206
207         '&vaesenc       ($inout,$inout,$roundkey);'.
208         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
209
210         '&vaesenc       ($inout,$inout,$roundkey);'.
211         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
212
213         '&vaesenc       ($inout,$inout,$roundkey);'.
214         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
215
216         '&vaesenc       ($inout,$inout,$roundkey);'.
217         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
218
219         '&vaesenc       ($inout,$inout,$roundkey);'.
220         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
230
231         '&vaesenclast   ($temp,$inout,$roundkey);'.
232         ' &vaesenc      ($inout,$inout,$roundkey);'.
233         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
234
235         '&vpand         ($iv,$temp,$mask10);'.
236         ' &vaesenc      ($inout,$inout,$roundkey);'.
237         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
238
239         '&vaesenclast   ($temp,$inout,$roundkey);'.
240         ' &vaesenc      ($inout,$inout,$roundkey);'.
241         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
242
243         '&vpand         ($temp,$temp,$mask12);'.
244         ' &vaesenc      ($inout,$inout,$roundkey);'.
245         ' &vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
246
247         '&vpor          ($iv,$iv,$temp);'.
248         ' &vaesenclast  ($temp,$inout,$roundkey);'.
249         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
250
251 ##      &mov            ($inp,$_inp);
252 ##      &mov            ($out,$_out);
253 ##      &vpand          ($temp,$temp,$mask14);
254 ##      &vpor           ($iv,$iv,$temp);
255 ##      &vmovdqu        ("($out,$inp)",$iv);
256 ##      &lea            ($inp,"16($inp)");
257 );
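
# One fragment of the list above is issued per SHA256 round (indexed by
# $aesni_cbc_idx in body_00_15), so a full CBC encryption of one 16-byte
# block is spread over 16 rounds.  vaesenclast is executed speculatively
# with the final round keys of AES-128, AES-192 and AES-256 (key schedule
# offsets 0xa0, 0xc0 and 0xe0), and $mask10/$mask12/$mask14 blend the
# correct result into $iv, conceptually
#
#       iv = (last10 & mask10) | (last12 & mask12) | (last14 & mask14);
#
# which is how one instruction schedule serves all AES key lengths.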
258
259 my $a4=$T1;
260 my ($a,$b,$c,$d,$e,$f,$g,$h);
261
262 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
263 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
264   my $arg = pop;
265     $arg = "\$$arg" if ($arg*1 eq $arg);
266     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
267 }
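
# Any '&opcode (dst,src)' call without an explicit handler falls through
# to AUTOLOAD above, which appends "\topcode\tsrc,dst" to $code, turning
# numeric arguments into immediates, e.g. '&ror ($a0,14)' emits
# "ror $14,%r13d".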
268
269 sub body_00_15 () {
270         (
271         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
272
273         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
274         '&mov   ($a,$a1)',
275         '&mov   ($a4,$f)',
276
277         '&xor   ($a0,$e)',
278         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
279         '&xor   ($a4,$g)',                      # f^g
280
281         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
282         '&xor   ($a1,$a)',
283         '&and   ($a4,$e)',                      # (f^g)&e
284
285         @aesni_cbc_block[$aesni_cbc_idx++].
286         '&xor   ($a0,$e)',
287         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
288         '&mov   ($a2,$a)',
289
290         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
291         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
292         '&xor   ($a2,$b)',                      # a^b, b^c in next round
293
294         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
295         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
296         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
297
298         '&xor   ($a1,$a)',
299         '&add   ($h,$a0)',                      # h+=Sigma1(e)
300         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
301
302         '&add   ($d,$h)',                       # d+=h
303         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
304         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
305
306         '&mov   ($a0,$d)',
307         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
308         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
309         );
310 }
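
# For reference, one round of the SHA256 compression function that the
# sequence above implements (scalar pseudo-C, illustration only):
#
#       T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i];
#       T2 = Sigma0(a) + Maj(a,b,c);
#       h = g; g = f; f = e; e = d + T1;
#       d = c; c = b; b = a; a = T1 + T2;
#
# body_00_15 avoids the variable shuffle by rotating @ROT instead, keeps
# W[i]+K[i] pre-added on the stack, computes Ch as ((f^g)&e)^g and Maj as
# ((b^c)&(a^b))^b, so that this round's a^b can serve as the next round's
# b^c.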
311
312 if ($avx) {{
313 ######################################################################
314 # XOP code path
315 #
316 $code.=<<___;
317 .type   ${func}_xop,\@function,6
318 .align  64
319 ${func}_xop:
320 .Lxop_shortcut:
321         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
322         push    %rbx
323         push    %rbp
324         push    %r12
325         push    %r13
326         push    %r14
327         push    %r15
328         mov     %rsp,%r11               # copy %rsp
329         sub     \$`$framesz+$win64*16*10`,%rsp
330         and     \$-64,%rsp              # align stack frame
331
332         shl     \$6,$len
333         sub     $inp,$out               # re-bias
334         sub     $inp,$in0
335         add     $inp,$len               # end of input
336
337         #mov    $inp,$_inp              # saved later
338         mov     $out,$_out
339         mov     $len,$_end
340         #mov    $key,$_key              # remains resident in $inp register
341         mov     $ivp,$_ivp
342         mov     $ctx,$_ctx
343         mov     $in0,$_in0
344         mov     %r11,$_rsp
345 ___
346 $code.=<<___ if ($win64);
347         movaps  %xmm6,`$framesz+16*0`(%rsp)
348         movaps  %xmm7,`$framesz+16*1`(%rsp)
349         movaps  %xmm8,`$framesz+16*2`(%rsp)
350         movaps  %xmm9,`$framesz+16*3`(%rsp)
351         movaps  %xmm10,`$framesz+16*4`(%rsp)
352         movaps  %xmm11,`$framesz+16*5`(%rsp)
353         movaps  %xmm12,`$framesz+16*6`(%rsp)
354         movaps  %xmm13,`$framesz+16*7`(%rsp)
355         movaps  %xmm14,`$framesz+16*8`(%rsp)
356         movaps  %xmm15,`$framesz+16*9`(%rsp)
357 ___
358 $code.=<<___;
359 .Lprologue_xop:
360         vzeroall
361
362         mov     $inp,%r12               # borrow $a4
363         lea     0x80($key),$inp         # size optimization, reassign
364         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
365         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
366         mov     $ctx,%r15               # borrow $a2
367         mov     $in0,%rsi               # borrow $a3
368         vmovdqu ($ivp),$iv              # load IV
369         sub     \$9,%r14
370
371         mov     $SZ*0(%r15),$A
372         mov     $SZ*1(%r15),$B
373         mov     $SZ*2(%r15),$C
374         mov     $SZ*3(%r15),$D
375         mov     $SZ*4(%r15),$E
376         mov     $SZ*5(%r15),$F
377         mov     $SZ*6(%r15),$G
378         mov     $SZ*7(%r15),$H
379
380         vmovdqa 0x00(%r13,%r14,8),$mask14
381         vmovdqa 0x10(%r13,%r14,8),$mask12
382         vmovdqa 0x20(%r13,%r14,8),$mask10
383         vmovdqu 0x00-0x80($inp),$roundkey
384         jmp     .Lloop_xop
385 ___
386                                         if ($SZ==4) {   # SHA256
387     my @X = map("%xmm$_",(0..3));
388     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
389
390 $code.=<<___;
391 .align  16
392 .Lloop_xop:
393         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
394         vmovdqu 0x00(%rsi,%r12),@X[0]
395         vmovdqu 0x10(%rsi,%r12),@X[1]
396         vmovdqu 0x20(%rsi,%r12),@X[2]
397         vmovdqu 0x30(%rsi,%r12),@X[3]
398         vpshufb $t3,@X[0],@X[0]
399         lea     $TABLE(%rip),$Tbl
400         vpshufb $t3,@X[1],@X[1]
401         vpshufb $t3,@X[2],@X[2]
402         vpaddd  0x00($Tbl),@X[0],$t0
403         vpshufb $t3,@X[3],@X[3]
404         vpaddd  0x20($Tbl),@X[1],$t1
405         vpaddd  0x40($Tbl),@X[2],$t2
406         vpaddd  0x60($Tbl),@X[3],$t3
407         vmovdqa $t0,0x00(%rsp)
408         mov     $A,$a1
409         vmovdqa $t1,0x10(%rsp)
410         mov     $B,$a3
411         vmovdqa $t2,0x20(%rsp)
412         xor     $C,$a3                  # magic
413         vmovdqa $t3,0x30(%rsp)
414         mov     $E,$a0
415         jmp     .Lxop_00_47
416
417 .align  16
418 .Lxop_00_47:
419         sub     \$-16*2*$SZ,$Tbl        # size optimization
420         vmovdqu (%r12),$inout           # $a4
421         mov     %r12,$_inp              # $a4
422 ___
423 sub XOP_256_00_47 () {
424 my $j = shift;
425 my $body = shift;
426 my @X = @_;
427 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
428
429         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
430           eval(shift(@insns));
431           eval(shift(@insns));
432          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
433           eval(shift(@insns));
434           eval(shift(@insns));
435         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
436           eval(shift(@insns));
437           eval(shift(@insns));
438         &vpsrld         ($t0,$t0,$sigma0[2]);
439           eval(shift(@insns));
440           eval(shift(@insns));
441          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
442           eval(shift(@insns));
443           eval(shift(@insns));
444           eval(shift(@insns));
445           eval(shift(@insns));
446         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
447           eval(shift(@insns));
448           eval(shift(@insns));
449         &vpxor          ($t0,$t0,$t1);
450           eval(shift(@insns));
451           eval(shift(@insns));
452           eval(shift(@insns));
453           eval(shift(@insns));
454          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
455           eval(shift(@insns));
456           eval(shift(@insns));
457         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
458           eval(shift(@insns));
459           eval(shift(@insns));
460          &vpsrld        ($t2,@X[3],$sigma1[2]);
461           eval(shift(@insns));
462           eval(shift(@insns));
463         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
464           eval(shift(@insns));
465           eval(shift(@insns));
466          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
467           eval(shift(@insns));
468           eval(shift(@insns));
469          &vpxor         ($t3,$t3,$t2);
470           eval(shift(@insns));
471           eval(shift(@insns));
472           eval(shift(@insns));
473           eval(shift(@insns));
474          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
475           eval(shift(@insns));
476           eval(shift(@insns));
477           eval(shift(@insns));
478           eval(shift(@insns));
479         &vpsrldq        ($t3,$t3,8);
480           eval(shift(@insns));
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
485           eval(shift(@insns));
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
490           eval(shift(@insns));
491           eval(shift(@insns));
492          &vpsrld        ($t2,@X[0],$sigma1[2]);
493           eval(shift(@insns));
494           eval(shift(@insns));
495          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
496           eval(shift(@insns));
497           eval(shift(@insns));
498          &vpxor         ($t3,$t3,$t2);
499           eval(shift(@insns));
500           eval(shift(@insns));
501           eval(shift(@insns));
502           eval(shift(@insns));
503          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
504           eval(shift(@insns));
505           eval(shift(@insns));
506           eval(shift(@insns));
507           eval(shift(@insns));
508         &vpslldq        ($t3,$t3,8);            # 22 instructions
509           eval(shift(@insns));
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
514           eval(shift(@insns));
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
519           foreach (@insns) { eval; }            # remaining instructions
520         &vmovdqa        (16*$j."(%rsp)",$t2);
521 }
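
# Same message-schedule recurrence as Xupdate_256_AVX further below; the
# difference is that XOP's vprotd (packed rotate) replaces the
# vpslld/vpsrld/vpxor emulation of rotates.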
522
523     $aesni_cbc_idx=0;
524     for ($i=0,$j=0; $j<4; $j++) {
525         &XOP_256_00_47($j,\&body_00_15,@X);
526         push(@X,shift(@X));                     # rotate(@X)
527     }
528         &mov            ("%r12",$_inp);         # borrow $a4
529         &vpand          ($temp,$temp,$mask14);
530         &mov            ("%r15",$_out);         # borrow $a2
531         &vpor           ($iv,$iv,$temp);
532         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
533         &lea            ("%r12","16(%r12)");    # inp++
534
535         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
536         &jne    (".Lxop_00_47");
537
538         &vmovdqu        ($inout,"(%r12)");
539         &mov            ($_inp,"%r12");
540
541     $aesni_cbc_idx=0;
542     for ($i=0; $i<16; ) {
543         foreach(body_00_15()) { eval; }
544     }
545                                         }
546 $code.=<<___;
547         mov     $_inp,%r12              # borrow $a4
548         mov     $_out,%r13              # borrow $a0
549         mov     $_ctx,%r15              # borrow $a2
550         mov     $_in0,%rsi              # borrow $a3
551
552         vpand   $mask14,$temp,$temp
553         mov     $a1,$A
554         vpor    $temp,$iv,$iv
555         vmovdqu $iv,(%r13,%r12)         # write output
556         lea     16(%r12),%r12           # inp++
557
558         add     $SZ*0(%r15),$A
559         add     $SZ*1(%r15),$B
560         add     $SZ*2(%r15),$C
561         add     $SZ*3(%r15),$D
562         add     $SZ*4(%r15),$E
563         add     $SZ*5(%r15),$F
564         add     $SZ*6(%r15),$G
565         add     $SZ*7(%r15),$H
566
567         cmp     $_end,%r12
568
569         mov     $A,$SZ*0(%r15)
570         mov     $B,$SZ*1(%r15)
571         mov     $C,$SZ*2(%r15)
572         mov     $D,$SZ*3(%r15)
573         mov     $E,$SZ*4(%r15)
574         mov     $F,$SZ*5(%r15)
575         mov     $G,$SZ*6(%r15)
576         mov     $H,$SZ*7(%r15)
577
578         jb      .Lloop_xop
579
580         mov     $_ivp,$ivp
581         mov     $_rsp,%rsi
582         vmovdqu $iv,($ivp)              # output IV
583         vzeroall
584 ___
585 $code.=<<___ if ($win64);
586         movaps  `$framesz+16*0`(%rsp),%xmm6
587         movaps  `$framesz+16*1`(%rsp),%xmm7
588         movaps  `$framesz+16*2`(%rsp),%xmm8
589         movaps  `$framesz+16*3`(%rsp),%xmm9
590         movaps  `$framesz+16*4`(%rsp),%xmm10
591         movaps  `$framesz+16*5`(%rsp),%xmm11
592         movaps  `$framesz+16*6`(%rsp),%xmm12
593         movaps  `$framesz+16*7`(%rsp),%xmm13
594         movaps  `$framesz+16*8`(%rsp),%xmm14
595         movaps  `$framesz+16*9`(%rsp),%xmm15
596 ___
597 $code.=<<___;
598         mov     (%rsi),%r15
599         mov     8(%rsi),%r14
600         mov     16(%rsi),%r13
601         mov     24(%rsi),%r12
602         mov     32(%rsi),%rbp
603         mov     40(%rsi),%rbx
604         lea     48(%rsi),%rsp
605 .Lepilogue_xop:
606         ret
607 .size   ${func}_xop,.-${func}_xop
608 ___
609 ######################################################################
610 # AVX+shrd code path
611 #
612 local *ror = sub { &shrd(@_[0],@_) };
613
614 $code.=<<___;
615 .type   ${func}_avx,\@function,6
616 .align  64
617 ${func}_avx:
618 .Lavx_shortcut:
619         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
620         push    %rbx
621         push    %rbp
622         push    %r12
623         push    %r13
624         push    %r14
625         push    %r15
626         mov     %rsp,%r11               # copy %rsp
627         sub     \$`$framesz+$win64*16*10`,%rsp
628         and     \$-64,%rsp              # align stack frame
629
630         shl     \$6,$len
631         sub     $inp,$out               # re-bias
632         sub     $inp,$in0
633         add     $inp,$len               # end of input
634
635         #mov    $inp,$_inp              # saved later
636         mov     $out,$_out
637         mov     $len,$_end
638         #mov    $key,$_key              # remains resident in $inp register
639         mov     $ivp,$_ivp
640         mov     $ctx,$_ctx
641         mov     $in0,$_in0
642         mov     %r11,$_rsp
643 ___
644 $code.=<<___ if ($win64);
645         movaps  %xmm6,`$framesz+16*0`(%rsp)
646         movaps  %xmm7,`$framesz+16*1`(%rsp)
647         movaps  %xmm8,`$framesz+16*2`(%rsp)
648         movaps  %xmm9,`$framesz+16*3`(%rsp)
649         movaps  %xmm10,`$framesz+16*4`(%rsp)
650         movaps  %xmm11,`$framesz+16*5`(%rsp)
651         movaps  %xmm12,`$framesz+16*6`(%rsp)
652         movaps  %xmm13,`$framesz+16*7`(%rsp)
653         movaps  %xmm14,`$framesz+16*8`(%rsp)
654         movaps  %xmm15,`$framesz+16*9`(%rsp)
655 ___
656 $code.=<<___;
657 .Lprologue_avx:
658         vzeroall
659
660         mov     $inp,%r12               # borrow $a4
661         lea     0x80($key),$inp         # size optimization, reassign
662         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
663         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
664         mov     $ctx,%r15               # borrow $a2
665         mov     $in0,%rsi               # borrow $a3
666         vmovdqu ($ivp),$iv              # load IV
667         sub     \$9,%r14
668
669         mov     $SZ*0(%r15),$A
670         mov     $SZ*1(%r15),$B
671         mov     $SZ*2(%r15),$C
672         mov     $SZ*3(%r15),$D
673         mov     $SZ*4(%r15),$E
674         mov     $SZ*5(%r15),$F
675         mov     $SZ*6(%r15),$G
676         mov     $SZ*7(%r15),$H
677
678         vmovdqa 0x00(%r13,%r14,8),$mask14
679         vmovdqa 0x10(%r13,%r14,8),$mask12
680         vmovdqa 0x20(%r13,%r14,8),$mask10
681         vmovdqu 0x00-0x80($inp),$roundkey
682 ___
683                                         if ($SZ==4) {   # SHA256
684     my @X = map("%xmm$_",(0..3));
685     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
686
687 $code.=<<___;
688         jmp     .Lloop_avx
689 .align  16
690 .Lloop_avx:
691         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
692         vmovdqu 0x00(%rsi,%r12),@X[0]
693         vmovdqu 0x10(%rsi,%r12),@X[1]
694         vmovdqu 0x20(%rsi,%r12),@X[2]
695         vmovdqu 0x30(%rsi,%r12),@X[3]
696         vpshufb $t3,@X[0],@X[0]
697         lea     $TABLE(%rip),$Tbl
698         vpshufb $t3,@X[1],@X[1]
699         vpshufb $t3,@X[2],@X[2]
700         vpaddd  0x00($Tbl),@X[0],$t0
701         vpshufb $t3,@X[3],@X[3]
702         vpaddd  0x20($Tbl),@X[1],$t1
703         vpaddd  0x40($Tbl),@X[2],$t2
704         vpaddd  0x60($Tbl),@X[3],$t3
705         vmovdqa $t0,0x00(%rsp)
706         mov     $A,$a1
707         vmovdqa $t1,0x10(%rsp)
708         mov     $B,$a3
709         vmovdqa $t2,0x20(%rsp)
710         xor     $C,$a3                  # magic
711         vmovdqa $t3,0x30(%rsp)
712         mov     $E,$a0
713         jmp     .Lavx_00_47
714
715 .align  16
716 .Lavx_00_47:
717         sub     \$-16*2*$SZ,$Tbl        # size optimization
718         vmovdqu (%r12),$inout           # $a4
719         mov     %r12,$_inp              # $a4
720 ___
721 sub Xupdate_256_AVX () {
722         (
723         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
724          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
725         '&vpsrld        ($t2,$t0,$sigma0[0]);',
726          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
727         '&vpsrld        ($t3,$t0,$sigma0[2])',
728         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
729         '&vpxor         ($t0,$t3,$t2)',
730          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
731         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
732         '&vpxor         ($t0,$t0,$t1)',
733         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
734         '&vpxor         ($t0,$t0,$t2)',
735          '&vpsrld       ($t2,$t3,$sigma1[2]);',
736         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
737          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
738         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
739          '&vpxor        ($t2,$t2,$t3);',
740          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
741          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
742          '&vpshufd      ($t2,$t2,0b10000100)',
743          '&vpsrldq      ($t2,$t2,8)',
744         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
745          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
746          '&vpsrld       ($t2,$t3,$sigma1[2])',
747          '&vpsrlq       ($t3,$t3,$sigma1[0])',
748          '&vpxor        ($t2,$t2,$t3);',
749          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
750          '&vpxor        ($t2,$t2,$t3)',
751          '&vpshufd      ($t2,$t2,0b11101000)',
752          '&vpslldq      ($t2,$t2,8)',
753         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
754         );
755 }
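
# Xupdate_256_AVX vectorizes the SHA256 message schedule four elements at
# a time; the scalar recurrence it implements is (illustration only)
#
#       W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16];
#
# with the sigma functions of FIPS 180-4 (rotate counts in @sigma0 and
# @sigma1 above).  Because W[i-2] of the last two lanes is itself being
# computed in the same step, the sigma1 contribution is applied in two
# halves: first from X[14..15], then from the freshly formed X[16..17].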
756
757 sub AVX_256_00_47 () {
758 my $j = shift;
759 my $body = shift;
760 my @X = @_;
761 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
762
763         foreach (Xupdate_256_AVX()) {           # 29 instructions
764             eval;
765             eval(shift(@insns));
766             eval(shift(@insns));
767             eval(shift(@insns));
768         }
769         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
770           foreach (@insns) { eval; }            # remaining instructions
771         &vmovdqa        (16*$j."(%rsp)",$t2);
772 }
773
774     $aesni_cbc_idx=0;
775     for ($i=0,$j=0; $j<4; $j++) {
776         &AVX_256_00_47($j,\&body_00_15,@X);
777         push(@X,shift(@X));                     # rotate(@X)
778     }
779         &mov            ("%r12",$_inp);         # borrow $a4
780         &vpand          ($temp,$temp,$mask14);
781         &mov            ("%r15",$_out);         # borrow $a2
782         &vpor           ($iv,$iv,$temp);
783         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
784         &lea            ("%r12","16(%r12)");    # inp++
785
786         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
787         &jne    (".Lavx_00_47");
788
789         &vmovdqu        ($inout,"(%r12)");
790         &mov            ($_inp,"%r12");
791
792     $aesni_cbc_idx=0;
793     for ($i=0; $i<16; ) {
794         foreach(body_00_15()) { eval; }
795     }
796
797                                         }
798 $code.=<<___;
799         mov     $_inp,%r12              # borrow $a4
800         mov     $_out,%r13              # borrow $a0
801         mov     $_ctx,%r15              # borrow $a2
802         mov     $_in0,%rsi              # borrow $a3
803
804         vpand   $mask14,$temp,$temp
805         mov     $a1,$A
806         vpor    $temp,$iv,$iv
807         vmovdqu $iv,(%r13,%r12)         # write output
808         lea     16(%r12),%r12           # inp++
809
810         add     $SZ*0(%r15),$A
811         add     $SZ*1(%r15),$B
812         add     $SZ*2(%r15),$C
813         add     $SZ*3(%r15),$D
814         add     $SZ*4(%r15),$E
815         add     $SZ*5(%r15),$F
816         add     $SZ*6(%r15),$G
817         add     $SZ*7(%r15),$H
818
819         cmp     $_end,%r12
820
821         mov     $A,$SZ*0(%r15)
822         mov     $B,$SZ*1(%r15)
823         mov     $C,$SZ*2(%r15)
824         mov     $D,$SZ*3(%r15)
825         mov     $E,$SZ*4(%r15)
826         mov     $F,$SZ*5(%r15)
827         mov     $G,$SZ*6(%r15)
828         mov     $H,$SZ*7(%r15)
829         jb      .Lloop_avx
830
831         mov     $_ivp,$ivp
832         mov     $_rsp,%rsi
833         vmovdqu $iv,($ivp)              # output IV
834         vzeroall
835 ___
836 $code.=<<___ if ($win64);
837         movaps  `$framesz+16*0`(%rsp),%xmm6
838         movaps  `$framesz+16*1`(%rsp),%xmm7
839         movaps  `$framesz+16*2`(%rsp),%xmm8
840         movaps  `$framesz+16*3`(%rsp),%xmm9
841         movaps  `$framesz+16*4`(%rsp),%xmm10
842         movaps  `$framesz+16*5`(%rsp),%xmm11
843         movaps  `$framesz+16*6`(%rsp),%xmm12
844         movaps  `$framesz+16*7`(%rsp),%xmm13
845         movaps  `$framesz+16*8`(%rsp),%xmm14
846         movaps  `$framesz+16*9`(%rsp),%xmm15
847 ___
848 $code.=<<___;
849         mov     (%rsi),%r15
850         mov     8(%rsi),%r14
851         mov     16(%rsi),%r13
852         mov     24(%rsi),%r12
853         mov     32(%rsi),%rbp
854         mov     40(%rsi),%rbx
855         lea     48(%rsi),%rsp
856 .Lepilogue_avx:
857         ret
858 .size   ${func}_avx,.-${func}_avx
859 ___
860
861 if ($avx>1) {{
862 ######################################################################
863 # AVX2+BMI code path
864 #
865 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
866 my $PUSH8=8*2*$SZ;
867 use integer;
868
869 sub bodyx_00_15 () {
870         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
871         (
872         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
873
874         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
875         '&and   ($a4,$e)',              # f&e
876         '&rorx  ($a0,$e,$Sigma1[2])',
877         '&rorx  ($a2,$e,$Sigma1[1])',
878
879         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
880         '&lea   ($h,"($h,$a4)")',
881         '&andn  ($a4,$e,$g)',           # ~e&g
882         '&xor   ($a0,$a2)',
883
884         '&rorx  ($a1,$e,$Sigma1[0])',
885         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
886         '&xor   ($a0,$a1)',             # Sigma1(e)
887         '&mov   ($a2,$a)',
888
889         '&rorx  ($a4,$a,$Sigma0[2])',
890         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
891         '&xor   ($a2,$b)',              # a^b, b^c in next round
892         '&rorx  ($a1,$a,$Sigma0[1])',
893
894         '&rorx  ($a0,$a,$Sigma0[0])',
895         '&lea   ($d,"($d,$h)")',        # d+=h
896         '&and   ($a3,$a2)',             # (b^c)&(a^b)
897         @aesni_cbc_block[$aesni_cbc_idx++].
898         '&xor   ($a1,$a4)',
899
900         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
901         '&xor   ($a1,$a0)',             # Sigma0(a)
902         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
903         '&mov   ($a4,$e)',              # copy of f in future
904
905         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
906         );
907         # and at the finish one has to $a+=$a1
908 }
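
# bodyx_00_15 is the BMI flavour of body_00_15: rorx computes the rotates
# without touching the flags, andn forms ~e&g directly, Ch(e,f,g) is
# summed as (e&f)+(~e&g) via lea, and each round's h+=Sigma0(a) is
# deferred into the following round (hence $a1 must start at zero and be
# added back at the end, see the 'add $a1,$A' after the loops).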
909
910 $code.=<<___;
911 .type   ${func}_avx2,\@function,6
912 .align  64
913 ${func}_avx2:
914 .Lavx2_shortcut:
915         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
916         push    %rbx
917         push    %rbp
918         push    %r12
919         push    %r13
920         push    %r14
921         push    %r15
922         mov     %rsp,%r11               # copy %rsp
923         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
924         and     \$-256*$SZ,%rsp         # align stack frame
925         add     \$`2*$SZ*($rounds-8)`,%rsp
926
927         shl     \$6,$len
928         sub     $inp,$out               # re-bias
929         sub     $inp,$in0
930         add     $inp,$len               # end of input
931
932         #mov    $inp,$_inp              # saved later
933         #mov    $out,$_out              # kept in $offload
934         mov     $len,$_end
935         #mov    $key,$_key              # remains resident in $inp register
936         mov     $ivp,$_ivp
937         mov     $ctx,$_ctx
938         mov     $in0,$_in0
939         mov     %r11,$_rsp
940 ___
941 $code.=<<___ if ($win64);
942         movaps  %xmm6,`$framesz+16*0`(%rsp)
943         movaps  %xmm7,`$framesz+16*1`(%rsp)
944         movaps  %xmm8,`$framesz+16*2`(%rsp)
945         movaps  %xmm9,`$framesz+16*3`(%rsp)
946         movaps  %xmm10,`$framesz+16*4`(%rsp)
947         movaps  %xmm11,`$framesz+16*5`(%rsp)
948         movaps  %xmm12,`$framesz+16*6`(%rsp)
949         movaps  %xmm13,`$framesz+16*7`(%rsp)
950         movaps  %xmm14,`$framesz+16*8`(%rsp)
951         movaps  %xmm15,`$framesz+16*9`(%rsp)
952 ___
953 $code.=<<___;
954 .Lprologue_avx2:
955         vzeroall
956
957         mov     $inp,%r13               # borrow $a0
958         vpinsrq \$1,$out,$offload,$offload
959         lea     0x80($key),$inp         # size optimization, reassign
960         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
961         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
962         mov     $ctx,%r15               # borrow $a2
963         mov     $in0,%rsi               # borrow $a3
964         vmovdqu ($ivp),$iv              # load IV
965         lea     -9(%r14),%r14
966
967         vmovdqa 0x00(%r12,%r14,8),$mask14
968         vmovdqa 0x10(%r12,%r14,8),$mask12
969         vmovdqa 0x20(%r12,%r14,8),$mask10
970
971         sub     \$-16*$SZ,%r13          # inp++, size optimization
972         mov     $SZ*0(%r15),$A
973         xor     %r12,%r12               # borrow $a0
974         mov     $SZ*1(%r15),$B
975         cmp     $len,%r13               # $_end
976         mov     $SZ*2(%r15),$C
977         sete    %r12b
978         mov     $SZ*3(%r15),$D
979         mov     $SZ*4(%r15),$E
980         mov     $SZ*5(%r15),$F
981         mov     $SZ*6(%r15),$G
982         mov     $SZ*7(%r15),$H
983         vmovdqu 0x00-0x80($inp),$roundkey
984 ___
985                                         if ($SZ==4) {   # SHA256
986     my @X = map("%ymm$_",(0..3));
987     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
988
989 $code.=<<___;
990         jmp     .Loop_avx2
991 .align  16
992 .Loop_avx2:
993         shl     \$`log(16*$SZ)/log(2)`,%r12
994         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
995         neg     %r12
996         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
997         add     %rsi,%r12               # next or same input block
998         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
999         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1000         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1001
1002         vinserti128     \$1,(%r12,%r13),@X[0],@X[0]
1003         vinserti128     \$1,16(%r12,%r13),@X[1],@X[1]
1004          vpshufb        $t3,@X[0],@X[0]
1005         vinserti128     \$1,32(%r12,%r13),@X[2],@X[2]
1006          vpshufb        $t3,@X[1],@X[1]
1007         vinserti128     \$1,48(%r12,%r13),@X[3],@X[3]
1008
1009         lea     $TABLE(%rip),$Tbl
1010         vpshufb $t3,@X[2],@X[2]
1011         lea     -16*$SZ(%r13),%r13
1012         vpaddd  0x00($Tbl),@X[0],$t0
1013         vpshufb $t3,@X[3],@X[3]
1014         vpaddd  0x20($Tbl),@X[1],$t1
1015         vpaddd  0x40($Tbl),@X[2],$t2
1016         vpaddd  0x60($Tbl),@X[3],$t3
1017         vmovdqa $t0,0x00(%rsp)
1018         xor     $a1,$a1
1019         vmovdqa $t1,0x20(%rsp)
1020         lea     -$PUSH8(%rsp),%rsp
1021         mov     $B,$a3
1022         vmovdqa $t2,0x00(%rsp)
1023         xor     $C,$a3                  # magic
1024         vmovdqa $t3,0x20(%rsp)
1025         mov     $F,$a4
1026         sub     \$-16*2*$SZ,$Tbl        # size optimization
1027         jmp     .Lavx2_00_47
1028
1029 .align  16
1030 .Lavx2_00_47:
1031         vmovdqu (%r13),$inout
1032         vpinsrq \$0,%r13,$offload,$offload
1033 ___
1034
1035 sub AVX2_256_00_47 () {
1036 my $j = shift;
1037 my $body = shift;
1038 my @X = @_;
1039 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1040 my $base = "+2*$PUSH8(%rsp)";
1041
1042         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1043         foreach (Xupdate_256_AVX()) {           # 29 instructions
1044             eval;
1045             eval(shift(@insns));
1046             eval(shift(@insns));
1047             eval(shift(@insns));
1048         }
1049         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1050           foreach (@insns) { eval; }            # remaining instructions
1051         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1052 }
1053     $aesni_cbc_idx=0;
1054     for ($i=0,$j=0; $j<4; $j++) {
1055         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1056         push(@X,shift(@X));                     # rotate(@X)
1057     }
1058         &vmovq          ("%r13",$offload);      # borrow $a0
1059         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1060         &vpand          ($temp,$temp,$mask14);
1061         &vpor           ($iv,$iv,$temp);
1062         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1063         &lea            ("%r13","16(%r13)");    # inp++
1064
1065         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1066         &cmpb   (($SZ-1)."($Tbl)",0);
1067         &jne    (".Lavx2_00_47");
1068
1069         &vmovdqu        ($inout,"(%r13)");
1070         &vpinsrq        ($offload,$offload,"%r13",0);
1071
1072     $aesni_cbc_idx=0;
1073     for ($i=0; $i<16; ) {
1074         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1075         foreach(bodyx_00_15()) { eval; }
1076     }
1077                                         }
1078 $code.=<<___;
1079         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1080         vmovq   $offload,%r13                   # $_inp, borrow $a0
1081         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1082         add     $a1,$A
1083         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1084
1085         vpand   $mask14,$temp,$temp
1086         vpor    $temp,$iv,$iv
1087         vmovdqu $iv,(%r12,%r13)                 # write output
1088         lea     16(%r13),%r13
1089
1090         add     $SZ*0(%r15),$A
1091         add     $SZ*1(%r15),$B
1092         add     $SZ*2(%r15),$C
1093         add     $SZ*3(%r15),$D
1094         add     $SZ*4(%r15),$E
1095         add     $SZ*5(%r15),$F
1096         add     $SZ*6(%r15),$G
1097         add     $SZ*7(%r15),$H
1098
1099         mov     $A,$SZ*0(%r15)
1100         mov     $B,$SZ*1(%r15)
1101         mov     $C,$SZ*2(%r15)
1102         mov     $D,$SZ*3(%r15)
1103         mov     $E,$SZ*4(%r15)
1104         mov     $F,$SZ*5(%r15)
1105         mov     $G,$SZ*6(%r15)
1106         mov     $H,$SZ*7(%r15)
1107
1108         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1109         je      .Ldone_avx2
1110
1111         xor     $a1,$a1
1112         mov     $B,$a3
1113         mov     $F,$a4
1114         xor     $C,$a3                  # magic
1115         jmp     .Lower_avx2
1116 .align  16
1117 .Lower_avx2:
1118         vmovdqu (%r13),$inout
1119         vpinsrq \$0,%r13,$offload,$offload
1120 ___
1121     $aesni_cbc_idx=0;
1122     for ($i=0; $i<16; ) {
1123         my $base="+16($Tbl)";
1124         foreach(bodyx_00_15()) { eval; }
1125         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1126     }
1127 $code.=<<___;
1128         vmovq   $offload,%r13                   # borrow $a0
1129         vpextrq \$1,$offload,%r15               # borrow $a2
1130         vpand   $mask14,$temp,$temp
1131         vpor    $temp,$iv,$iv
1132         lea     -$PUSH8($Tbl),$Tbl
1133         vmovdqu $iv,(%r15,%r13)                 # write output
1134         lea     16(%r13),%r13                   # inp++
1135         cmp     %rsp,$Tbl
1136         jae     .Lower_avx2
1137
1138         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1139         lea     16*$SZ(%r13),%r13
1140         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1141         add     $a1,$A
1142         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1143
1144         add     $SZ*0(%r15),$A
1145         add     $SZ*1(%r15),$B
1146         add     $SZ*2(%r15),$C
1147         add     $SZ*3(%r15),$D
1148         add     $SZ*4(%r15),$E
1149         add     $SZ*5(%r15),$F
1150         add     $SZ*6(%r15),$G
1151         xor     %r12,%r12
1152         add     $SZ*7(%r15),$H
1153
1154         cmp     $_end,%r13
1155
1156         mov     $A,$SZ*0(%r15)
1157         mov     $B,$SZ*1(%r15)
1158         mov     $C,$SZ*2(%r15)
1159         mov     $D,$SZ*3(%r15)
1160         mov     $E,$SZ*4(%r15)
1161         mov     $F,$SZ*5(%r15)
1162         mov     $G,$SZ*6(%r15)
1163         mov     $H,$SZ*7(%r15)
1164
1165         sete    %r12b
1166         jbe     .Loop_avx2
1167         lea     (%rsp),$Tbl
1168
1169 .Ldone_avx2:
1170         lea     ($Tbl),%rsp
1171         mov     $_ivp,$ivp
1172         mov     $_rsp,%rsi
1173         vmovdqu $iv,($ivp)              # output IV
1174         vzeroall
1175 ___
1176 $code.=<<___ if ($win64);
1177         movaps  `$framesz+16*0`(%rsp),%xmm6
1178         movaps  `$framesz+16*1`(%rsp),%xmm7
1179         movaps  `$framesz+16*2`(%rsp),%xmm8
1180         movaps  `$framesz+16*3`(%rsp),%xmm9
1181         movaps  `$framesz+16*4`(%rsp),%xmm10
1182         movaps  `$framesz+16*5`(%rsp),%xmm11
1183         movaps  `$framesz+16*6`(%rsp),%xmm12
1184         movaps  `$framesz+16*7`(%rsp),%xmm13
1185         movaps  `$framesz+16*8`(%rsp),%xmm14
1186         movaps  `$framesz+16*9`(%rsp),%xmm15
1187 ___
1188 $code.=<<___;
1189         mov     (%rsi),%r15
1190         mov     8(%rsi),%r14
1191         mov     16(%rsi),%r13
1192         mov     24(%rsi),%r12
1193         mov     32(%rsi),%rbp
1194         mov     40(%rsi),%rbx
1195         lea     48(%rsi),%rsp
1196 .Lepilogue_avx2:
1197         ret
1198 .size   ${func}_avx2,.-${func}_avx2
1199 ___
1200 }}
1201 }}}}}
1202
1203 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1204 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1205 if ($win64) {
1206 $rec="%rcx";
1207 $frame="%rdx";
1208 $context="%r8";
1209 $disp="%r9";
1210
1211 $code.=<<___ if ($avx);
1212 .extern __imp_RtlVirtualUnwind
1213 .type   se_handler,\@abi-omnipotent
1214 .align  16
1215 se_handler:
1216         push    %rsi
1217         push    %rdi
1218         push    %rbx
1219         push    %rbp
1220         push    %r12
1221         push    %r13
1222         push    %r14
1223         push    %r15
1224         pushfq
1225         sub     \$64,%rsp
1226
1227         mov     120($context),%rax      # pull context->Rax
1228         mov     248($context),%rbx      # pull context->Rip
1229
1230         mov     8($disp),%rsi           # disp->ImageBase
1231         mov     56($disp),%r11          # disp->HandlerData
1232
1233         mov     0(%r11),%r10d           # HandlerData[0]
1234         lea     (%rsi,%r10),%r10        # prologue label
1235         cmp     %r10,%rbx               # context->Rip<prologue label
1236         jb      .Lin_prologue
1237
1238         mov     152($context),%rax      # pull context->Rsp
1239
1240         mov     4(%r11),%r10d           # HandlerData[1]
1241         lea     (%rsi,%r10),%r10        # epilogue label
1242         cmp     %r10,%rbx               # context->Rip>=epilogue label
1243         jae     .Lin_prologue
1244 ___
1245 $code.=<<___ if ($avx>1);
1246         lea     .Lavx2_shortcut(%rip),%r10
1247         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1248         jb      .Lnot_in_avx2
1249
1250         and     \$-256*$SZ,%rax
1251         add     \$`2*$SZ*($rounds-8)`,%rax
1252 .Lnot_in_avx2:
1253 ___
1254 $code.=<<___;
1255         mov     %rax,%rsi               # put aside Rsp
1256         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1257         lea     48(%rax),%rax
1258
1259         mov     -8(%rax),%rbx
1260         mov     -16(%rax),%rbp
1261         mov     -24(%rax),%r12
1262         mov     -32(%rax),%r13
1263         mov     -40(%rax),%r14
1264         mov     -48(%rax),%r15
1265         mov     %rbx,144($context)      # restore context->Rbx
1266         mov     %rbp,160($context)      # restore context->Rbp
1267         mov     %r12,216($context)      # restore context->R12
1268         mov     %r13,224($context)      # restore context->R13
1269         mov     %r14,232($context)      # restore context->R14
1270         mov     %r15,240($context)      # restore context->R15
1271
1272         lea     .Lepilogue(%rip),%r10
1273         cmp     %r10,%rbx
1274         jb      .Lin_prologue           # non-AVX code
1275
1276         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1277         lea     512($context),%rdi      # &context.Xmm6
1278         mov     \$20,%ecx
1279         .long   0xa548f3fc              # cld; rep movsq
1280
1281 .Lin_prologue:
1282         mov     8(%rax),%rdi
1283         mov     16(%rax),%rsi
1284         mov     %rax,152($context)      # restore context->Rsp
1285         mov     %rsi,168($context)      # restore context->Rsi
1286         mov     %rdi,176($context)      # restore context->Rdi
1287
1288         mov     40($disp),%rdi          # disp->ContextRecord
1289         mov     $context,%rsi           # context
1290         mov     \$154,%ecx              # sizeof(CONTEXT)
1291         .long   0xa548f3fc              # cld; rep movsq
1292
1293         mov     $disp,%rsi
1294         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1295         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1296         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1297         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1298         mov     40(%rsi),%r10           # disp->ContextRecord
1299         lea     56(%rsi),%r11           # &disp->HandlerData
1300         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1301         mov     %r10,32(%rsp)           # arg5
1302         mov     %r11,40(%rsp)           # arg6
1303         mov     %r12,48(%rsp)           # arg7
1304         mov     %rcx,56(%rsp)           # arg8, (NULL)
1305         call    *__imp_RtlVirtualUnwind(%rip)
1306
1307         mov     \$1,%eax                # ExceptionContinueSearch
1308         add     \$64,%rsp
1309         popfq
1310         pop     %r15
1311         pop     %r14
1312         pop     %r13
1313         pop     %r12
1314         pop     %rbp
1315         pop     %rbx
1316         pop     %rdi
1317         pop     %rsi
1318         ret
1319 .size   se_handler,.-se_handler
1320
1321 .section        .pdata
1322         .rva    .LSEH_begin_${func}_xop
1323         .rva    .LSEH_end_${func}_xop
1324         .rva    .LSEH_info_${func}_xop
1325
1326         .rva    .LSEH_begin_${func}_avx
1327         .rva    .LSEH_end_${func}_avx
1328         .rva    .LSEH_info_${func}_avx
1329 ___
1330 $code.=<<___ if ($avx>1);
1331         .rva    .LSEH_begin_${func}_avx2
1332         .rva    .LSEH_end_${func}_avx2
1333         .rva    .LSEH_info_${func}_avx2
1334 ___
1335 $code.=<<___ if ($avx);
1336 .section        .xdata
1337 .align  8
1338 .LSEH_info_${func}_xop:
1339         .byte   9,0,0,0
1340         .rva    se_handler
1341         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1342
1343 .LSEH_info_${func}_avx:
1344         .byte   9,0,0,0
1345         .rva    se_handler
1346         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1347 ___
1348 $code.=<<___ if ($avx>1);
1349 .LSEH_info_${func}_avx2:
1350         .byte   9,0,0,0
1351         .rva    se_handler
1352         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1353 ___
1354 }
1355
1356 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1357 print $code;
1358 close STDOUT;