1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
13 # spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
14 # is that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and hence better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte (less is better)
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
29 #
30 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
31 #       Westmere is left out; this is because the gain was not
32 #       estimated to be high enough to justify the effort;
33 # (**)  these are EVP-free results; results obtained with 'speed
34 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
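#
# A worked reading of the "gain" column above (derived from the table,
# not an extra measurement): for Sandy Bridge with AES-128, running the
# two primitives back to back costs 5.05+11.6 = 16.65 cycles per byte,
# while the stitched subroutine costs 13.0, i.e. 16.65/13.0 ~= 1.28,
# hence the quoted +28%.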
35
36 $flavour = shift;
37 $output  = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49         $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59         $avx = ($1>=10) + ($1>=11);
60 }
61
62 open OUT,"| \"$^X\" $xlate $flavour $output";
63 *STDOUT=*OUT;
64
65 $func="aesni_cbc_sha256_enc";
66 $TABLE="K256";
67 $SZ=4;
68 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
69                                 "%r8d","%r9d","%r10d","%r11d");
70 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
71 @Sigma0=( 2,13,22);
72 @Sigma1=( 6,11,25);
73 @sigma0=( 7,18, 3);
74 @sigma1=(17,19,10);
75 $rounds=64;
76
77 ########################################################################
78 # void aesni_cbc_sha256_enc(const void *inp,
79 #                       void *out,
80 #                       size_t length,
81 #                       const AES_KEY *key,
82 #                       unsigned char *iv,
83 #                       SHA256_CTX *ctx,
84 #                       const void *in0);
85 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
86 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
87
88 $Tbl="%rbp";
89
90 $_inp="16*$SZ+0*8(%rsp)";
91 $_out="16*$SZ+1*8(%rsp)";
92 $_end="16*$SZ+2*8(%rsp)";
93 $_key="16*$SZ+3*8(%rsp)";
94 $_ivp="16*$SZ+4*8(%rsp)";
95 $_ctx="16*$SZ+5*8(%rsp)";
96 $_in0="16*$SZ+6*8(%rsp)";
97 $_rsp="16*$SZ+7*8(%rsp)";
98 $framesz=16*$SZ+8*8;
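
# A sketch of the stack frame implied by the definitions above, for the
# XOP and AVX1 paths (the AVX2 path lays its frame out differently):
#
#       0 .. 16*$SZ-1      X[i]+K[i], the pre-added message schedule
#                          words spilled to the stack by the SIMD code
#       16*$SZ+0*8 .. +6*8 slots for the $inp, $out, end-of-input,
#                          $key, $ivp, $ctx and $in0 arguments (some
#                          stay resident in registers, see prologues)
#       16*$SZ+7*8         %rsp as left by the push sequence, used by
#                          the epilogue and the SEH handler
#       $framesz ..        %xmm6-%xmm15 save area (Win64 only)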
99
100 $code=<<___;
101 .text
102
103 .extern OPENSSL_ia32cap_P
104 .globl  $func
105 .type   $func,\@abi-omnipotent
106 .align  16
107 $func:
108 ___
109 $code.=<<___ if ($avx);
110         lea     OPENSSL_ia32cap_P(%rip),%r11
111         mov     \$1,%eax
112         cmp     \$0,`$win64?"%rcx":"%rdi"`
113         je      .Lprobe
114         mov     0(%r11),%eax
115         mov     4(%r11),%r10d
116         mov     8(%r11),%r11d
117
118         test    \$`1<<11`,%r10d                 # check for XOP
119         jnz     ${func}_xop
120 ___
121 $code.=<<___ if ($avx>1);
122         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
123         cmp     \$`1<<8|1<<5|1<<3`,%r11d
124         je      ${func}_avx2
125 ___
126 $code.=<<___ if ($avx);
127         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
128         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
129         or      %eax,%r10d
130         cmp     \$`1<<28|1<<9|1<<30`,%r10d
131         je      ${func}_avx
132         ud2
133 ___
134 $code.=<<___;
135         xor     %eax,%eax
136         cmp     \$0,`$win64?"%rcx":"%rdi"`
137         je      .Lprobe
138         ud2
139 .Lprobe:
140         ret
141 .size   $func,.-$func
142
143 .align  64
144 .type   $TABLE,\@object
145 $TABLE:
146         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
147         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
148         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
149         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
150         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
151         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
152         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
153         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
154         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
155         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
156         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
158         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
159         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
160         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
161         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
162         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
163         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
164         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
165         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
166         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
167         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
168         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
169         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
170         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
171         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
172         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
173         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
174         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
175         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
176         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
177         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
178
179         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
180         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
181         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
182         .long   0,0,0,0,   0,0,0,0
183         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
184 .align  64
185 ___
186
187 ######################################################################
188 # SIMD code paths
189 #
190 {{{
191 ($iv,$inout,$roundkey,$temp,
192  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
193
194 $aesni_cbc_idx=0;
195 @aesni_cbc_block = (
196 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
197 ##      &vmovdqu        ($inout,($inp));
198 ##      &mov            ($_inp,$inp);
199
200         '&vpxor         ($inout,$inout,$roundkey);'.
201         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
202
203         '&vpxor         ($inout,$inout,$iv);',
204
205         '&vaesenc       ($inout,$inout,$roundkey);'.
206         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
207
208         '&vaesenc       ($inout,$inout,$roundkey);'.
209         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
210
211         '&vaesenc       ($inout,$inout,$roundkey);'.
212         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
213
214         '&vaesenc       ($inout,$inout,$roundkey);'.
215         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
216
217         '&vaesenc       ($inout,$inout,$roundkey);'.
218         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
219
220         '&vaesenc       ($inout,$inout,$roundkey);'.
221         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
222
223         '&vaesenc       ($inout,$inout,$roundkey);'.
224         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
225
226         '&vaesenc       ($inout,$inout,$roundkey);'.
227         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
228
229         '&vaesenc       ($inout,$inout,$roundkey);'.
230         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
231
232         '&vaesenclast   ($temp,$inout,$roundkey);'.
233         ' &vaesenc      ($inout,$inout,$roundkey);'.
234         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
235
236         '&vpand         ($iv,$temp,$mask10);'.
237         ' &vaesenc      ($inout,$inout,$roundkey);'.
238         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
239
240         '&vaesenclast   ($temp,$inout,$roundkey);'.
241         ' &vaesenc      ($inout,$inout,$roundkey);'.
242         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
243
244         '&vpand         ($temp,$temp,$mask12);'.
245         ' &vaesenc      ($inout,$inout,$roundkey);'.
246          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
247
248         '&vpor          ($iv,$iv,$temp);'.
249         ' &vaesenclast  ($temp,$inout,$roundkey);'.
250         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
251
252 ##      &mov            ($inp,$_inp);
253 ##      &mov            ($out,$_out);
254 ##      &vpand          ($temp,$temp,$mask14);
255 ##      &vpor           ($iv,$iv,$temp);
256 ##      &vmovdqu        ("($out,$inp)",$iv);
257 ##      &lea            ($inp,"16($inp)");
258 );
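
# A note on the schedule above: it holds 16 code fragments and
# body_00_15()/bodyx_00_15() below splice exactly one of them,
# @aesni_cbc_block[$aesni_cbc_idx++], into each SHA256 round, so one
# CBC block is encrypted per 16 rounds. vaesenclast is issued
# speculatively at the points where a 10-, 12- or 14-round key schedule
# would end, and $mask10/$mask12/$mask14 (selected according to the key
# schedule's round count) blend the three candidate results so that the
# correct ciphertext block ends up in $iv for any key length.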
259
260 my $a4=$T1;
261 my ($a,$b,$c,$d,$e,$f,$g,$h);
262
263 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
264 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
265   my $arg = pop;
266     $arg = "\$$arg" if ($arg*1 eq $arg);
267     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
268 }
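
# For instance, in the XOP path a call such as
# &ror($a0,$Sigma1[2]-$Sigma1[1]) falls through to AUTOLOAD above and
# appends "\tror\t\$14,%r13d\n" to $code: a numeric operand gets a '$'
# prefix and operands are emitted in AT&T order, i.e. reversed relative
# to the Perl argument list. (The AVX path later overrides &ror with
# &shrd, see below.)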
269
270 sub body_00_15 () {
271         (
272         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
273
274         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
275         '&mov   ($a,$a1)',
276         '&mov   ($a4,$f)',
277
278         '&xor   ($a0,$e)',
279         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
280         '&xor   ($a4,$g)',                      # f^g
281
282         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
283         '&xor   ($a1,$a)',
284         '&and   ($a4,$e)',                      # (f^g)&e
285
286         @aesni_cbc_block[$aesni_cbc_idx++].
287         '&xor   ($a0,$e)',
288         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
289         '&mov   ($a2,$a)',
290
291         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
292         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
293         '&xor   ($a2,$b)',                      # a^b, b^c in next round
294
295         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
296         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
297         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
298
299         '&xor   ($a1,$a)',
300         '&add   ($h,$a0)',                      # h+=Sigma1(e)
301         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
302
303         '&add   ($d,$h)',                       # d+=h
304         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
305         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
306
307         '&mov   ($a0,$d)',
308         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
309         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
310         );
311 }
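
# body_00_15() does not emit code directly; it returns a list of
# strings that the XOP/AVX Xupdate loops below eval a few at a time,
# interleaving these scalar SHA256 round instructions (plus one
# @aesni_cbc_block fragment per round) between the vector message
# schedule instructions.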
312
313 if ($avx) {{
314 ######################################################################
315 # XOP code path
316 #
317 $code.=<<___;
318 .type   ${func}_xop,\@function,6
319 .align  64
320 ${func}_xop:
321 .Lxop_shortcut:
322         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
323         push    %rbx
324         push    %rbp
325         push    %r12
326         push    %r13
327         push    %r14
328         push    %r15
329         mov     %rsp,%r11               # copy %rsp
330         sub     \$`$framesz+$win64*16*10`,%rsp
331         and     \$-64,%rsp              # align stack frame
332
333         shl     \$6,$len
334         sub     $inp,$out               # re-bias
335         sub     $inp,$in0
336         add     $inp,$len               # end of input
337
338         #mov    $inp,$_inp              # saved later
339         mov     $out,$_out
340         mov     $len,$_end
341         #mov    $key,$_key              # remains resident in $inp register
342         mov     $ivp,$_ivp
343         mov     $ctx,$_ctx
344         mov     $in0,$_in0
345         mov     %r11,$_rsp
346 ___
347 $code.=<<___ if ($win64);
348         movaps  %xmm6,`$framesz+16*0`(%rsp)
349         movaps  %xmm7,`$framesz+16*1`(%rsp)
350         movaps  %xmm8,`$framesz+16*2`(%rsp)
351         movaps  %xmm9,`$framesz+16*3`(%rsp)
352         movaps  %xmm10,`$framesz+16*4`(%rsp)
353         movaps  %xmm11,`$framesz+16*5`(%rsp)
354         movaps  %xmm12,`$framesz+16*6`(%rsp)
355         movaps  %xmm13,`$framesz+16*7`(%rsp)
356         movaps  %xmm14,`$framesz+16*8`(%rsp)
357         movaps  %xmm15,`$framesz+16*9`(%rsp)
358 ___
359 $code.=<<___;
360 .Lprologue_xop:
361         vzeroall
362
363         mov     $inp,%r12               # borrow $a4
364         lea     0x80($key),$inp         # size optimization, reassign
365         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
366         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
367         mov     $ctx,%r15               # borrow $a2
368         mov     $in0,%rsi               # borrow $a3
369         vmovdqu ($ivp),$iv              # load IV
370         sub     \$9,%r14
371
372         mov     $SZ*0(%r15),$A
373         mov     $SZ*1(%r15),$B
374         mov     $SZ*2(%r15),$C
375         mov     $SZ*3(%r15),$D
376         mov     $SZ*4(%r15),$E
377         mov     $SZ*5(%r15),$F
378         mov     $SZ*6(%r15),$G
379         mov     $SZ*7(%r15),$H
380
381         vmovdqa 0x00(%r13,%r14,8),$mask14
382         vmovdqa 0x10(%r13,%r14,8),$mask12
383         vmovdqa 0x20(%r13,%r14,8),$mask10
384         vmovdqu 0x00-0x80($inp),$roundkey
385         jmp     .Lloop_xop
386 ___
387                                         if ($SZ==4) {   # SHA256
388     my @X = map("%xmm$_",(0..3));
389     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
390
391 $code.=<<___;
392 .align  16
393 .Lloop_xop:
394         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
395         vmovdqu 0x00(%rsi,%r12),@X[0]
396         vmovdqu 0x10(%rsi,%r12),@X[1]
397         vmovdqu 0x20(%rsi,%r12),@X[2]
398         vmovdqu 0x30(%rsi,%r12),@X[3]
399         vpshufb $t3,@X[0],@X[0]
400         lea     $TABLE(%rip),$Tbl
401         vpshufb $t3,@X[1],@X[1]
402         vpshufb $t3,@X[2],@X[2]
403         vpaddd  0x00($Tbl),@X[0],$t0
404         vpshufb $t3,@X[3],@X[3]
405         vpaddd  0x20($Tbl),@X[1],$t1
406         vpaddd  0x40($Tbl),@X[2],$t2
407         vpaddd  0x60($Tbl),@X[3],$t3
408         vmovdqa $t0,0x00(%rsp)
409         mov     $A,$a1
410         vmovdqa $t1,0x10(%rsp)
411         mov     $B,$a3
412         vmovdqa $t2,0x20(%rsp)
413         xor     $C,$a3                  # magic
414         vmovdqa $t3,0x30(%rsp)
415         mov     $E,$a0
416         jmp     .Lxop_00_47
417
418 .align  16
419 .Lxop_00_47:
420         sub     \$-16*2*$SZ,$Tbl        # size optimization
421         vmovdqu (%r12),$inout           # $a4
422         mov     %r12,$_inp              # $a4
423 ___
424 sub XOP_256_00_47 () {
425 my $j = shift;
426 my $body = shift;
427 my @X = @_;
428 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
429
430         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
431           eval(shift(@insns));
432           eval(shift(@insns));
433          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
434           eval(shift(@insns));
435           eval(shift(@insns));
436         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
437           eval(shift(@insns));
438           eval(shift(@insns));
439         &vpsrld         ($t0,$t0,$sigma0[2]);
440           eval(shift(@insns));
441           eval(shift(@insns));
442          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
443           eval(shift(@insns));
444           eval(shift(@insns));
445           eval(shift(@insns));
446           eval(shift(@insns));
447         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
448           eval(shift(@insns));
449           eval(shift(@insns));
450         &vpxor          ($t0,$t0,$t1);
451           eval(shift(@insns));
452           eval(shift(@insns));
453           eval(shift(@insns));
454           eval(shift(@insns));
455          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
456           eval(shift(@insns));
457           eval(shift(@insns));
458         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
459           eval(shift(@insns));
460           eval(shift(@insns));
461          &vpsrld        ($t2,@X[3],$sigma1[2]);
462           eval(shift(@insns));
463           eval(shift(@insns));
464         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
465           eval(shift(@insns));
466           eval(shift(@insns));
467          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
468           eval(shift(@insns));
469           eval(shift(@insns));
470          &vpxor         ($t3,$t3,$t2);
471           eval(shift(@insns));
472           eval(shift(@insns));
473           eval(shift(@insns));
474           eval(shift(@insns));
475          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
476           eval(shift(@insns));
477           eval(shift(@insns));
478           eval(shift(@insns));
479           eval(shift(@insns));
480         &vpsrldq        ($t3,$t3,8);
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489           eval(shift(@insns));
490          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
491           eval(shift(@insns));
492           eval(shift(@insns));
493          &vpsrld        ($t2,@X[0],$sigma1[2]);
494           eval(shift(@insns));
495           eval(shift(@insns));
496          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
497           eval(shift(@insns));
498           eval(shift(@insns));
499          &vpxor         ($t3,$t3,$t2);
500           eval(shift(@insns));
501           eval(shift(@insns));
502           eval(shift(@insns));
503           eval(shift(@insns));
504          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
505           eval(shift(@insns));
506           eval(shift(@insns));
507           eval(shift(@insns));
508           eval(shift(@insns));
509         &vpslldq        ($t3,$t3,8);            # 22 instructions
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
520           foreach (@insns) { eval; }            # remaining instructions
521         &vmovdqa        (16*$j."(%rsp)",$t2);
522 }
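
# The XOP schedule above relies on vprotd, XOP's packed rotate, so each
# sigma rotation is a single instruction; the plain-AVX path below has
# to synthesize the same rotations from shift/shift/xor sequences.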
523
524     $aesni_cbc_idx=0;
525     for ($i=0,$j=0; $j<4; $j++) {
526         &XOP_256_00_47($j,\&body_00_15,@X);
527         push(@X,shift(@X));                     # rotate(@X)
528     }
529         &mov            ("%r12",$_inp);         # borrow $a4
530         &vpand          ($temp,$temp,$mask14);
531         &mov            ("%r15",$_out);         # borrow $a2
532         &vpor           ($iv,$iv,$temp);
533         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
534         &lea            ("%r12","16(%r12)");    # inp++
535
536         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
537         &jne    (".Lxop_00_47");
538
539         &vmovdqu        ($inout,"(%r12)");
540         &mov            ($_inp,"%r12");
541
542     $aesni_cbc_idx=0;
543     for ($i=0; $i<16; ) {
544         foreach(body_00_15()) { eval; }
545     }
546                                         }
547 $code.=<<___;
548         mov     $_inp,%r12              # borrow $a4
549         mov     $_out,%r13              # borrow $a0
550         mov     $_ctx,%r15              # borrow $a2
551         mov     $_in0,%rsi              # borrow $a3
552
553         vpand   $mask14,$temp,$temp
554         mov     $a1,$A
555         vpor    $temp,$iv,$iv
556         vmovdqu $iv,(%r13,%r12)         # write output
557         lea     16(%r12),%r12           # inp++
558
559         add     $SZ*0(%r15),$A
560         add     $SZ*1(%r15),$B
561         add     $SZ*2(%r15),$C
562         add     $SZ*3(%r15),$D
563         add     $SZ*4(%r15),$E
564         add     $SZ*5(%r15),$F
565         add     $SZ*6(%r15),$G
566         add     $SZ*7(%r15),$H
567
568         cmp     $_end,%r12
569
570         mov     $A,$SZ*0(%r15)
571         mov     $B,$SZ*1(%r15)
572         mov     $C,$SZ*2(%r15)
573         mov     $D,$SZ*3(%r15)
574         mov     $E,$SZ*4(%r15)
575         mov     $F,$SZ*5(%r15)
576         mov     $G,$SZ*6(%r15)
577         mov     $H,$SZ*7(%r15)
578
579         jb      .Lloop_xop
580
581         mov     $_ivp,$ivp
582         mov     $_rsp,%rsi
583         vmovdqu $iv,($ivp)              # output IV
584         vzeroall
585 ___
586 $code.=<<___ if ($win64);
587         movaps  `$framesz+16*0`(%rsp),%xmm6
588         movaps  `$framesz+16*1`(%rsp),%xmm7
589         movaps  `$framesz+16*2`(%rsp),%xmm8
590         movaps  `$framesz+16*3`(%rsp),%xmm9
591         movaps  `$framesz+16*4`(%rsp),%xmm10
592         movaps  `$framesz+16*5`(%rsp),%xmm11
593         movaps  `$framesz+16*6`(%rsp),%xmm12
594         movaps  `$framesz+16*7`(%rsp),%xmm13
595         movaps  `$framesz+16*8`(%rsp),%xmm14
596         movaps  `$framesz+16*9`(%rsp),%xmm15
597 ___
598 $code.=<<___;
599         mov     (%rsi),%r15
600         mov     8(%rsi),%r14
601         mov     16(%rsi),%r13
602         mov     24(%rsi),%r12
603         mov     32(%rsi),%rbp
604         mov     40(%rsi),%rbx
605         lea     48(%rsi),%rsp
606 .Lepilogue_xop:
607         ret
608 .size   ${func}_xop,.-${func}_xop
609 ___
610 ######################################################################
611 # AVX+shrd code path
612 #
613 local *ror = sub { &shrd(@_[0],@_) };
614
615 $code.=<<___;
616 .type   ${func}_avx,\@function,6
617 .align  64
618 ${func}_avx:
619 .Lavx_shortcut:
620         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
621         push    %rbx
622         push    %rbp
623         push    %r12
624         push    %r13
625         push    %r14
626         push    %r15
627         mov     %rsp,%r11               # copy %rsp
628         sub     \$`$framesz+$win64*16*10`,%rsp
629         and     \$-64,%rsp              # align stack frame
630
631         shl     \$6,$len
632         sub     $inp,$out               # re-bias
633         sub     $inp,$in0
634         add     $inp,$len               # end of input
635
636         #mov    $inp,$_inp              # saved later
637         mov     $out,$_out
638         mov     $len,$_end
639         #mov    $key,$_key              # remains resident in $inp register
640         mov     $ivp,$_ivp
641         mov     $ctx,$_ctx
642         mov     $in0,$_in0
643         mov     %r11,$_rsp
644 ___
645 $code.=<<___ if ($win64);
646         movaps  %xmm6,`$framesz+16*0`(%rsp)
647         movaps  %xmm7,`$framesz+16*1`(%rsp)
648         movaps  %xmm8,`$framesz+16*2`(%rsp)
649         movaps  %xmm9,`$framesz+16*3`(%rsp)
650         movaps  %xmm10,`$framesz+16*4`(%rsp)
651         movaps  %xmm11,`$framesz+16*5`(%rsp)
652         movaps  %xmm12,`$framesz+16*6`(%rsp)
653         movaps  %xmm13,`$framesz+16*7`(%rsp)
654         movaps  %xmm14,`$framesz+16*8`(%rsp)
655         movaps  %xmm15,`$framesz+16*9`(%rsp)
656 ___
657 $code.=<<___;
658 .Lprologue_avx:
659         vzeroall
660
661         mov     $inp,%r12               # borrow $a4
662         lea     0x80($key),$inp         # size optimization, reassign
663         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
664         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
665         mov     $ctx,%r15               # borrow $a2
666         mov     $in0,%rsi               # borrow $a3
667         vmovdqu ($ivp),$iv              # load IV
668         sub     \$9,%r14
669
670         mov     $SZ*0(%r15),$A
671         mov     $SZ*1(%r15),$B
672         mov     $SZ*2(%r15),$C
673         mov     $SZ*3(%r15),$D
674         mov     $SZ*4(%r15),$E
675         mov     $SZ*5(%r15),$F
676         mov     $SZ*6(%r15),$G
677         mov     $SZ*7(%r15),$H
678
679         vmovdqa 0x00(%r13,%r14,8),$mask14
680         vmovdqa 0x10(%r13,%r14,8),$mask12
681         vmovdqa 0x20(%r13,%r14,8),$mask10
682         vmovdqu 0x00-0x80($inp),$roundkey
683 ___
684                                         if ($SZ==4) {   # SHA256
685     my @X = map("%xmm$_",(0..3));
686     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
687
688 $code.=<<___;
689         jmp     .Lloop_avx
690 .align  16
691 .Lloop_avx:
692         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
693         vmovdqu 0x00(%rsi,%r12),@X[0]
694         vmovdqu 0x10(%rsi,%r12),@X[1]
695         vmovdqu 0x20(%rsi,%r12),@X[2]
696         vmovdqu 0x30(%rsi,%r12),@X[3]
697         vpshufb $t3,@X[0],@X[0]
698         lea     $TABLE(%rip),$Tbl
699         vpshufb $t3,@X[1],@X[1]
700         vpshufb $t3,@X[2],@X[2]
701         vpaddd  0x00($Tbl),@X[0],$t0
702         vpshufb $t3,@X[3],@X[3]
703         vpaddd  0x20($Tbl),@X[1],$t1
704         vpaddd  0x40($Tbl),@X[2],$t2
705         vpaddd  0x60($Tbl),@X[3],$t3
706         vmovdqa $t0,0x00(%rsp)
707         mov     $A,$a1
708         vmovdqa $t1,0x10(%rsp)
709         mov     $B,$a3
710         vmovdqa $t2,0x20(%rsp)
711         xor     $C,$a3                  # magic
712         vmovdqa $t3,0x30(%rsp)
713         mov     $E,$a0
714         jmp     .Lavx_00_47
715
716 .align  16
717 .Lavx_00_47:
718         sub     \$-16*2*$SZ,$Tbl        # size optimization
719         vmovdqu (%r12),$inout           # $a4
720         mov     %r12,$_inp              # $a4
721 ___
722 sub Xupdate_256_AVX () {
723         (
724         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
725          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
726         '&vpsrld        ($t2,$t0,$sigma0[0]);',
727          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
728         '&vpsrld        ($t3,$t0,$sigma0[2])',
729         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
730         '&vpxor         ($t0,$t3,$t2)',
731          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
732         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
733         '&vpxor         ($t0,$t0,$t1)',
734         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
735         '&vpxor         ($t0,$t0,$t2)',
736          '&vpsrld       ($t2,$t3,$sigma1[2]);',
737         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
738          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
739         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
740          '&vpxor        ($t2,$t2,$t3);',
741          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
742          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
743          '&vpshufd      ($t2,$t2,0b10000100)',
744          '&vpsrldq      ($t2,$t2,8)',
745         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
746          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
747          '&vpsrld       ($t2,$t3,$sigma1[2])',
748          '&vpsrlq       ($t3,$t3,$sigma1[0])',
749          '&vpxor        ($t2,$t2,$t3);',
750          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
751          '&vpxor        ($t2,$t2,$t3)',
752          '&vpshufd      ($t2,$t2,0b11101000)',
753          '&vpslldq      ($t2,$t2,8)',
754         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
755         );
756 }
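
# Xupdate_256_AVX() computes four message schedule words per call,
# X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16], with the
# rotations synthesized from shifts and XORs (no XOP rotates here).
# sigma1 is applied in two passes because only two of the four lanes
# (X[14..15], then the freshly computed X[16..17]) are available at a
# time.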
757
758 sub AVX_256_00_47 () {
759 my $j = shift;
760 my $body = shift;
761 my @X = @_;
762 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
763
764         foreach (Xupdate_256_AVX()) {           # 29 instructions
765             eval;
766             eval(shift(@insns));
767             eval(shift(@insns));
768             eval(shift(@insns));
769         }
770         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
771           foreach (@insns) { eval; }            # remaining instructions
772         &vmovdqa        (16*$j."(%rsp)",$t2);
773 }
774
775     $aesni_cbc_idx=0;
776     for ($i=0,$j=0; $j<4; $j++) {
777         &AVX_256_00_47($j,\&body_00_15,@X);
778         push(@X,shift(@X));                     # rotate(@X)
779     }
780         &mov            ("%r12",$_inp);         # borrow $a4
781         &vpand          ($temp,$temp,$mask14);
782         &mov            ("%r15",$_out);         # borrow $a2
783         &vpor           ($iv,$iv,$temp);
784         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
785         &lea            ("%r12","16(%r12)");    # inp++
786
787         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
788         &jne    (".Lavx_00_47");
789
790         &vmovdqu        ($inout,"(%r12)");
791         &mov            ($_inp,"%r12");
792
793     $aesni_cbc_idx=0;
794     for ($i=0; $i<16; ) {
795         foreach(body_00_15()) { eval; }
796     }
797
798                                         }
799 $code.=<<___;
800         mov     $_inp,%r12              # borrow $a4
801         mov     $_out,%r13              # borrow $a0
802         mov     $_ctx,%r15              # borrow $a2
803         mov     $_in0,%rsi              # borrow $a3
804
805         vpand   $mask14,$temp,$temp
806         mov     $a1,$A
807         vpor    $temp,$iv,$iv
808         vmovdqu $iv,(%r13,%r12)         # write output
809         lea     16(%r12),%r12           # inp++
810
811         add     $SZ*0(%r15),$A
812         add     $SZ*1(%r15),$B
813         add     $SZ*2(%r15),$C
814         add     $SZ*3(%r15),$D
815         add     $SZ*4(%r15),$E
816         add     $SZ*5(%r15),$F
817         add     $SZ*6(%r15),$G
818         add     $SZ*7(%r15),$H
819
820         cmp     $_end,%r12
821
822         mov     $A,$SZ*0(%r15)
823         mov     $B,$SZ*1(%r15)
824         mov     $C,$SZ*2(%r15)
825         mov     $D,$SZ*3(%r15)
826         mov     $E,$SZ*4(%r15)
827         mov     $F,$SZ*5(%r15)
828         mov     $G,$SZ*6(%r15)
829         mov     $H,$SZ*7(%r15)
830         jb      .Lloop_avx
831
832         mov     $_ivp,$ivp
833         mov     $_rsp,%rsi
834         vmovdqu $iv,($ivp)              # output IV
835         vzeroall
836 ___
837 $code.=<<___ if ($win64);
838         movaps  `$framesz+16*0`(%rsp),%xmm6
839         movaps  `$framesz+16*1`(%rsp),%xmm7
840         movaps  `$framesz+16*2`(%rsp),%xmm8
841         movaps  `$framesz+16*3`(%rsp),%xmm9
842         movaps  `$framesz+16*4`(%rsp),%xmm10
843         movaps  `$framesz+16*5`(%rsp),%xmm11
844         movaps  `$framesz+16*6`(%rsp),%xmm12
845         movaps  `$framesz+16*7`(%rsp),%xmm13
846         movaps  `$framesz+16*8`(%rsp),%xmm14
847         movaps  `$framesz+16*9`(%rsp),%xmm15
848 ___
849 $code.=<<___;
850         mov     (%rsi),%r15
851         mov     8(%rsi),%r14
852         mov     16(%rsi),%r13
853         mov     24(%rsi),%r12
854         mov     32(%rsi),%rbp
855         mov     40(%rsi),%rbx
856         lea     48(%rsi),%rsp
857 .Lepilogue_avx:
858         ret
859 .size   ${func}_avx,.-${func}_avx
860 ___
861
862 if ($avx>1) {{
863 ######################################################################
864 # AVX2+BMI code path
865 #
866 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
867 my $PUSH8=8*2*$SZ;
868 use integer;
869
870 sub bodyx_00_15 () {
871         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
872         (
873         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
874
875         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
876         '&and   ($a4,$e)',              # f&e
877         '&rorx  ($a0,$e,$Sigma1[2])',
878         '&rorx  ($a2,$e,$Sigma1[1])',
879
880         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
881         '&lea   ($h,"($h,$a4)")',
882         '&andn  ($a4,$e,$g)',           # ~e&g
883         '&xor   ($a0,$a2)',
884
885         '&rorx  ($a1,$e,$Sigma1[0])',
886         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
887         '&xor   ($a0,$a1)',             # Sigma1(e)
888         '&mov   ($a2,$a)',
889
890         '&rorx  ($a4,$a,$Sigma0[2])',
891         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
892         '&xor   ($a2,$b)',              # a^b, b^c in next round
893         '&rorx  ($a1,$a,$Sigma0[1])',
894
895         '&rorx  ($a0,$a,$Sigma0[0])',
896         '&lea   ($d,"($d,$h)")',        # d+=h
897         '&and   ($a3,$a2)',             # (b^c)&(a^b)
898         @aesni_cbc_block[$aesni_cbc_idx++].
899         '&xor   ($a1,$a4)',
900
901         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
902         '&xor   ($a1,$a0)',             # Sigma0(a)
903         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
904         '&mov   ($a4,$e)',              # copy of f in future
905
906         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
907         );
908         # and at the finish one has to $a+=$a1
909 }
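
# A note on bodyx_00_15(): h+=Sigma0(a) is deferred; $a1 carries
# Sigma0(a) "from the past" and is folded in by the next round's
# '&lea ($a,"($a,$a1)")', which is why $a1 must be zero on entry (see
# the comment above) and why the callers finish with 'add $a1,$A'.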
910
911 $code.=<<___;
912 .type   ${func}_avx2,\@function,6
913 .align  64
914 ${func}_avx2:
915 .Lavx2_shortcut:
916         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
917         push    %rbx
918         push    %rbp
919         push    %r12
920         push    %r13
921         push    %r14
922         push    %r15
923         mov     %rsp,%r11               # copy %rsp
924         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
925         and     \$-256*$SZ,%rsp         # align stack frame
926         add     \$`2*$SZ*($rounds-8)`,%rsp
927
928         shl     \$6,$len
929         sub     $inp,$out               # re-bias
930         sub     $inp,$in0
931         add     $inp,$len               # end of input
932
933         #mov    $inp,$_inp              # saved later
934         #mov    $out,$_out              # kept in $offload
935         mov     $len,$_end
936         #mov    $key,$_key              # remains resident in $inp register
937         mov     $ivp,$_ivp
938         mov     $ctx,$_ctx
939         mov     $in0,$_in0
940         mov     %r11,$_rsp
941 ___
942 $code.=<<___ if ($win64);
943         movaps  %xmm6,`$framesz+16*0`(%rsp)
944         movaps  %xmm7,`$framesz+16*1`(%rsp)
945         movaps  %xmm8,`$framesz+16*2`(%rsp)
946         movaps  %xmm9,`$framesz+16*3`(%rsp)
947         movaps  %xmm10,`$framesz+16*4`(%rsp)
948         movaps  %xmm11,`$framesz+16*5`(%rsp)
949         movaps  %xmm12,`$framesz+16*6`(%rsp)
950         movaps  %xmm13,`$framesz+16*7`(%rsp)
951         movaps  %xmm14,`$framesz+16*8`(%rsp)
952         movaps  %xmm15,`$framesz+16*9`(%rsp)
953 ___
954 $code.=<<___;
955 .Lprologue_avx2:
956         vzeroall
957
958         mov     $inp,%r13               # borrow $a0
959         vpinsrq \$1,$out,$offload,$offload
960         lea     0x80($key),$inp         # size optimization, reassign
961         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
962         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
963         mov     $ctx,%r15               # borrow $a2
964         mov     $in0,%rsi               # borrow $a3
965         vmovdqu ($ivp),$iv              # load IV
966         lea     -9(%r14),%r14
967
968         vmovdqa 0x00(%r12,%r14,8),$mask14
969         vmovdqa 0x10(%r12,%r14,8),$mask12
970         vmovdqa 0x20(%r12,%r14,8),$mask10
971
972         sub     \$-16*$SZ,%r13          # inp++, size optimization
973         mov     $SZ*0(%r15),$A
974         lea     (%rsi,%r13),%r12        # borrow $a0
975         mov     $SZ*1(%r15),$B
976         cmp     $len,%r13               # $_end
977         mov     $SZ*2(%r15),$C
978         cmove   %rsp,%r12               # next block or random data
979         mov     $SZ*3(%r15),$D
980         mov     $SZ*4(%r15),$E
981         mov     $SZ*5(%r15),$F
982         mov     $SZ*6(%r15),$G
983         mov     $SZ*7(%r15),$H
984         vmovdqu 0x00-0x80($inp),$roundkey
985 ___
986                                         if ($SZ==4) {   # SHA256
987     my @X = map("%ymm$_",(0..3));
988     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
989
990 $code.=<<___;
991         jmp     .Loop_avx2
992 .align  16
993 .Loop_avx2:
994         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
995         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
996         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
997         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
998         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
999
1000         vinserti128     \$1,(%r12),@X[0],@X[0]
1001         vinserti128     \$1,16(%r12),@X[1],@X[1]
1002          vpshufb        $t3,@X[0],@X[0]
1003         vinserti128     \$1,32(%r12),@X[2],@X[2]
1004          vpshufb        $t3,@X[1],@X[1]
1005         vinserti128     \$1,48(%r12),@X[3],@X[3]
1006
1007         lea     $TABLE(%rip),$Tbl
1008         vpshufb $t3,@X[2],@X[2]
1009         lea     -16*$SZ(%r13),%r13
1010         vpaddd  0x00($Tbl),@X[0],$t0
1011         vpshufb $t3,@X[3],@X[3]
1012         vpaddd  0x20($Tbl),@X[1],$t1
1013         vpaddd  0x40($Tbl),@X[2],$t2
1014         vpaddd  0x60($Tbl),@X[3],$t3
1015         vmovdqa $t0,0x00(%rsp)
1016         xor     $a1,$a1
1017         vmovdqa $t1,0x20(%rsp)
1018         lea     -$PUSH8(%rsp),%rsp
1019         mov     $B,$a3
1020         vmovdqa $t2,0x00(%rsp)
1021         xor     $C,$a3                  # magic
1022         vmovdqa $t3,0x20(%rsp)
1023         mov     $F,$a4
1024         sub     \$-16*2*$SZ,$Tbl        # size optimization
1025         jmp     .Lavx2_00_47
1026
1027 .align  16
1028 .Lavx2_00_47:
1029         vmovdqu (%r13),$inout
1030         vpinsrq \$0,%r13,$offload,$offload
1031 ___
1032
1033 sub AVX2_256_00_47 () {
1034 my $j = shift;
1035 my $body = shift;
1036 my @X = @_;
1037 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1038 my $base = "+2*$PUSH8(%rsp)";
1039
1040         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1041         foreach (Xupdate_256_AVX()) {           # 29 instructions
1042             eval;
1043             eval(shift(@insns));
1044             eval(shift(@insns));
1045             eval(shift(@insns));
1046         }
1047         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1048           foreach (@insns) { eval; }            # remaining instructions
1049         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1050 }
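
# A note on the AVX2 flow: each %ymm schedule register holds the
# current 64-byte block in its low 128-bit lane and the *next* block in
# the high lane (vinserti128 from %r12, which points either at the next
# input block or, past the end, at harmless stack data). The main
# .Lavx2_00_47 pass stacks the pre-added X[i]+K[i] words for both
# lanes, and the .Lower_avx2 pass below replays the stacked words to
# finish the second block without recomputing the schedule.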
1051     $aesni_cbc_idx=0;
1052     for ($i=0,$j=0; $j<4; $j++) {
1053         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1054         push(@X,shift(@X));                     # rotate(@X)
1055     }
1056         &vmovq          ("%r13",$offload);      # borrow $a0
1057         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1058         &vpand          ($temp,$temp,$mask14);
1059         &vpor           ($iv,$iv,$temp);
1060         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1061         &lea            ("%r13","16(%r13)");    # inp++
1062
1063         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1064         &cmpb   (($SZ-1)."($Tbl)",0);
1065         &jne    (".Lavx2_00_47");
1066
1067         &vmovdqu        ($inout,"(%r13)");
1068         &vpinsrq        ($offload,$offload,"%r13",0);
1069
1070     $aesni_cbc_idx=0;
1071     for ($i=0; $i<16; ) {
1072         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1073         foreach(bodyx_00_15()) { eval; }
1074     }
1075                                         }
1076 $code.=<<___;
1077         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1078         vmovq   $offload,%r13                   # $_inp, borrow $a0
1079         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1080         add     $a1,$A
1081         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1082
1083         vpand   $mask14,$temp,$temp
1084         vpor    $temp,$iv,$iv
1085         vmovdqu $iv,(%r12,%r13)                 # write output
1086         lea     16(%r13),%r13
1087
1088         add     $SZ*0(%r15),$A
1089         add     $SZ*1(%r15),$B
1090         add     $SZ*2(%r15),$C
1091         add     $SZ*3(%r15),$D
1092         add     $SZ*4(%r15),$E
1093         add     $SZ*5(%r15),$F
1094         add     $SZ*6(%r15),$G
1095         add     $SZ*7(%r15),$H
1096
1097         mov     $A,$SZ*0(%r15)
1098         mov     $B,$SZ*1(%r15)
1099         mov     $C,$SZ*2(%r15)
1100         mov     $D,$SZ*3(%r15)
1101         mov     $E,$SZ*4(%r15)
1102         mov     $F,$SZ*5(%r15)
1103         mov     $G,$SZ*6(%r15)
1104         mov     $H,$SZ*7(%r15)
1105
1106         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1107         je      .Ldone_avx2
1108
1109         xor     $a1,$a1
1110         mov     $B,$a3
1111         mov     $F,$a4
1112         xor     $C,$a3                  # magic
1113         jmp     .Lower_avx2
1114 .align  16
1115 .Lower_avx2:
1116         vmovdqu (%r13),$inout
1117         vpinsrq \$0,%r13,$offload,$offload
1118 ___
1119     $aesni_cbc_idx=0;
1120     for ($i=0; $i<16; ) {
1121         my $base="+16($Tbl)";
1122         foreach(bodyx_00_15()) { eval; }
1123         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1124     }
1125 $code.=<<___;
1126         vmovq   $offload,%r13                   # borrow $a0
1127         vpextrq \$1,$offload,%r15               # borrow $a2
1128         vpand   $mask14,$temp,$temp
1129         vpor    $temp,$iv,$iv
1130         lea     -$PUSH8($Tbl),$Tbl
1131         vmovdqu $iv,(%r15,%r13)                 # write output
1132         lea     16(%r13),%r13                   # inp++
1133         cmp     %rsp,$Tbl
1134         jae     .Lower_avx2
1135
1136         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1137         lea     16*$SZ(%r13),%r13
1138         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1139         add     $a1,$A
1140         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1141
1142         add     $SZ*0(%r15),$A
1143         add     $SZ*1(%r15),$B
1144         add     $SZ*2(%r15),$C
1145         add     $SZ*3(%r15),$D
1146         add     $SZ*4(%r15),$E
1147         add     $SZ*5(%r15),$F
1148         add     $SZ*6(%r15),$G
1149         lea     (%rsi,%r13),%r12
1150         add     $SZ*7(%r15),$H
1151
1152         cmp     $_end,%r13
1153
1154         mov     $A,$SZ*0(%r15)
1155         cmove   %rsp,%r12               # next block or stale data
1156         mov     $B,$SZ*1(%r15)
1157         mov     $C,$SZ*2(%r15)
1158         mov     $D,$SZ*3(%r15)
1159         mov     $E,$SZ*4(%r15)
1160         mov     $F,$SZ*5(%r15)
1161         mov     $G,$SZ*6(%r15)
1162         mov     $H,$SZ*7(%r15)
1163
1164         jbe     .Loop_avx2
1165         lea     (%rsp),$Tbl
1166
1167 .Ldone_avx2:
1168         lea     ($Tbl),%rsp
1169         mov     $_ivp,$ivp
1170         mov     $_rsp,%rsi
1171         vmovdqu $iv,($ivp)              # output IV
1172         vzeroall
1173 ___
1174 $code.=<<___ if ($win64);
1175         movaps  `$framesz+16*0`(%rsp),%xmm6
1176         movaps  `$framesz+16*1`(%rsp),%xmm7
1177         movaps  `$framesz+16*2`(%rsp),%xmm8
1178         movaps  `$framesz+16*3`(%rsp),%xmm9
1179         movaps  `$framesz+16*4`(%rsp),%xmm10
1180         movaps  `$framesz+16*5`(%rsp),%xmm11
1181         movaps  `$framesz+16*6`(%rsp),%xmm12
1182         movaps  `$framesz+16*7`(%rsp),%xmm13
1183         movaps  `$framesz+16*8`(%rsp),%xmm14
1184         movaps  `$framesz+16*9`(%rsp),%xmm15
1185 ___
1186 $code.=<<___;
1187         mov     (%rsi),%r15
1188         mov     8(%rsi),%r14
1189         mov     16(%rsi),%r13
1190         mov     24(%rsi),%r12
1191         mov     32(%rsi),%rbp
1192         mov     40(%rsi),%rbx
1193         lea     48(%rsi),%rsp
1194 .Lepilogue_avx2:
1195         ret
1196 .size   ${func}_avx2,.-${func}_avx2
1197 ___
1198 }}
1199 }}}}}
1200
1201 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1202 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1203 if ($win64) {
1204 $rec="%rcx";
1205 $frame="%rdx";
1206 $context="%r8";
1207 $disp="%r9";
1208
1209 $code.=<<___ if ($avx);
1210 .extern __imp_RtlVirtualUnwind
1211 .type   se_handler,\@abi-omnipotent
1212 .align  16
1213 se_handler:
1214         push    %rsi
1215         push    %rdi
1216         push    %rbx
1217         push    %rbp
1218         push    %r12
1219         push    %r13
1220         push    %r14
1221         push    %r15
1222         pushfq
1223         sub     \$64,%rsp
1224
1225         mov     120($context),%rax      # pull context->Rax
1226         mov     248($context),%rbx      # pull context->Rip
1227
1228         mov     8($disp),%rsi           # disp->ImageBase
1229         mov     56($disp),%r11          # disp->HandlerData
1230
1231         mov     0(%r11),%r10d           # HandlerData[0]
1232         lea     (%rsi,%r10),%r10        # prologue label
1233         cmp     %r10,%rbx               # context->Rip<prologue label
1234         jb      .Lin_prologue
1235
1236         mov     152($context),%rax      # pull context->Rsp
1237
1238         mov     4(%r11),%r10d           # HandlerData[1]
1239         lea     (%rsi,%r10),%r10        # epilogue label
1240         cmp     %r10,%rbx               # context->Rip>=epilogue label
1241         jae     .Lin_prologue
1242 ___
1243 $code.=<<___ if ($avx>1);
1244         lea     .Lavx2_shortcut(%rip),%r10
1245         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1246         jb      .Lnot_in_avx2
1247
1248         and     \$-256*$SZ,%rax
1249         add     \$`2*$SZ*($rounds-8)`,%rax
1250 .Lnot_in_avx2:
1251 ___
1252 $code.=<<___;
1253         mov     %rax,%rsi               # put aside Rsp
1254         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1255         lea     48(%rax),%rax
1256
1257         mov     -8(%rax),%rbx
1258         mov     -16(%rax),%rbp
1259         mov     -24(%rax),%r12
1260         mov     -32(%rax),%r13
1261         mov     -40(%rax),%r14
1262         mov     -48(%rax),%r15
1263         mov     %rbx,144($context)      # restore context->Rbx
1264         mov     %rbp,160($context)      # restore context->Rbp
1265         mov     %r12,216($context)      # restore context->R12
1266         mov     %r13,224($context)      # restore context->R13
1267         mov     %r14,232($context)      # restore context->R14
1268         mov     %r15,240($context)      # restore context->R15
1269
1273
1274         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6..Xmm15 save area
1275         lea     512($context),%rdi      # &context.Xmm6
1276         mov     \$20,%ecx
1277         .long   0xa548f3fc              # cld; rep movsq
1278
1279 .Lin_prologue:
1280         mov     8(%rax),%rdi
1281         mov     16(%rax),%rsi
1282         mov     %rax,152($context)      # restore context->Rsp
1283         mov     %rsi,168($context)      # restore context->Rsi
1284         mov     %rdi,176($context)      # restore context->Rdi
1285
1286         mov     40($disp),%rdi          # disp->ContextRecord
1287         mov     $context,%rsi           # context
1288         mov     \$154,%ecx              # sizeof(CONTEXT)
1289         .long   0xa548f3fc              # cld; rep movsq
1290
1291         mov     $disp,%rsi
1292         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1293         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1294         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1295         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1296         mov     40(%rsi),%r10           # disp->ContextRecord
1297         lea     56(%rsi),%r11           # &disp->HandlerData
1298         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1299         mov     %r10,32(%rsp)           # arg5
1300         mov     %r11,40(%rsp)           # arg6
1301         mov     %r12,48(%rsp)           # arg7
1302         mov     %rcx,56(%rsp)           # arg8, (NULL)
1303         call    *__imp_RtlVirtualUnwind(%rip)
1304
1305         mov     \$1,%eax                # ExceptionContinueSearch
1306         add     \$64,%rsp
1307         popfq
1308         pop     %r15
1309         pop     %r14
1310         pop     %r13
1311         pop     %r12
1312         pop     %rbp
1313         pop     %rbx
1314         pop     %rdi
1315         pop     %rsi
1316         ret
1317 .size   se_handler,.-se_handler
1318
1319 .section        .pdata
1320         .rva    .LSEH_begin_${func}_xop
1321         .rva    .LSEH_end_${func}_xop
1322         .rva    .LSEH_info_${func}_xop
1323
1324         .rva    .LSEH_begin_${func}_avx
1325         .rva    .LSEH_end_${func}_avx
1326         .rva    .LSEH_info_${func}_avx
1327 ___
1328 $code.=<<___ if ($avx>1);
1329         .rva    .LSEH_begin_${func}_avx2
1330         .rva    .LSEH_end_${func}_avx2
1331         .rva    .LSEH_info_${func}_avx2
1332 ___
1333 $code.=<<___ if ($avx);
1334 .section        .xdata
1335 .align  8
1336 .LSEH_info_${func}_xop:
1337         .byte   9,0,0,0
1338         .rva    se_handler
1339         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1340
1341 .LSEH_info_${func}_avx:
1342         .byte   9,0,0,0
1343         .rva    se_handler
1344         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1345 ___
1346 $code.=<<___ if ($avx>1);
1347 .LSEH_info_${func}_avx2:
1348         .byte   9,0,0,0
1349         .rva    se_handler
1350         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1351 ___
1352 }
1353
1354 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1355 print $code;
1356 close STDOUT;