crypto/chacha/asm/chacha-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # November 2014
18 #
19 # ChaCha20 for x86_64.
20 #
21 # December 2016
22 #
23 # Add AVX512F code path.
24 #
25 # December 2017
26 #
27 # Add AVX512VL code path.
28 #
29 # Performance in cycles per byte out of large buffer.
30 #
31 #               IALU/gcc 4.8(i) 1x/2xSSSE3(ii)  4xSSSE3     NxAVX(v)
32 #
33 # P4            9.48/+99%       -               -
34 # Core2         7.83/+55%       7.90/5.76       4.35
35 # Westmere      7.19/+50%       5.60/4.50       3.00
36 # Sandy Bridge  8.31/+42%       5.45/4.00       2.72
37 # Ivy Bridge    6.71/+46%       5.40/?          2.41
38 # Haswell       5.92/+43%       5.20/3.45       2.42        1.23
39 # Skylake[-X]   5.87/+39%       4.70/3.22       2.31        1.19[0.80(vi)]
40 # Silvermont    12.0/+33%       7.75/6.90       7.03(iii)
41 # Knights L     11.7/-          ?               9.60(iii)   0.80
42 # Goldmont      10.6/+17%       5.10/3.52       3.28
43 # Sledgehammer  7.28/+52%       -               -
44 # Bulldozer     9.66/+28%       9.85/5.35(iv)   3.06(iv)
45 # Ryzen         5.96/+50%       5.19/3.00       2.40        2.09
46 # VIA Nano      10.5/+46%       6.72/6.88       6.05
47 #
48 # (i)   compared to older gcc 3.x, one can observe a >2x improvement on
49 #       most platforms;
50 # (ii)  2xSSSE3 is the code path optimized specifically for the 128-byte
51 #       inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
52 # (iii) this is not an optimal result for Atom because of MSROM
53 #       limitations; SSE2 can do better, but the gain is considered too
54 #       low to justify the [maintenance] effort;
55 # (iv)  Bulldozer actually executes the 4xXOP code path, which delivers
56 #       2.20 and 4.85 for 128-byte inputs;
57 # (v)   8xAVX2, 8xAVX512VL or 16xAVX512F, whichever is best applicable;
58 # (vi)  even though Skylake-X can execute AVX512F code and deliver 0.57
59 #       cpb in a single thread, the corresponding capability is suppressed;
60
61 $flavour = shift;
62 $output  = shift;
63 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
64
65 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
66
67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
69 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
70 die "can't locate x86_64-xlate.pl";
71
72 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
73                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
74         $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
75 }
76
77 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
78            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
79         $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
80         $avx += 1 if ($1==2.11 && $2>=8);
81 }
82
83 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
84            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
85         $avx = ($1>=10) + ($1>=11);
86 }
87
88 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
89         $avx = ($2>=3.0) + ($2>3.0);
90 }
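# A non-zero $avx enables the AVX/XOP paths below, $avx>1 additionally
# enables the 8x AVX2 path, and $avx>2 the AVX512F/AVX512VL paths; the value
# simply reflects how recent the detected assembler (GNU as, nasm, ml64 or
# clang) is, i.e. whether it can encode the corresponding instructions.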
91
92 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
93 *STDOUT=*OUT;
94
95 # input parameter block
96 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
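# The five arguments correspond to a C prototype along the lines of (hedged
# sketch; see the ChaCha header in the tree for the authoritative declaration):
#
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);
#
# key[0..7] and counter[0..3] fill rows 1-3 of the ChaCha20 state, row 0
# being the .Lsigma constants; only the low 32 bits of the counter block,
# counter[0], are incremented per 64-byte block (see .Lone below).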
97
98 $code.=<<___;
99 .text
100
101 .extern OPENSSL_ia32cap_P
102
103 .align  64
104 .Lzero:
105 .long   0,0,0,0
106 .Lone:
107 .long   1,0,0,0
108 .Linc:
109 .long   0,1,2,3
110 .Lfour:
111 .long   4,4,4,4
112 .Lincy:
113 .long   0,2,4,6,1,3,5,7
114 .Leight:
115 .long   8,8,8,8,8,8,8,8
116 .Lrot16:
117 .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
118 .Lrot24:
119 .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
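# .Lrot16 and .Lrot24 are pshufb masks that rotate every 32-bit lane by a
# whole number of bytes: .Lrot16 implements the <<<16 rotation and .Lrot24
# the <<<8 (i.e. >>>24) rotation, sparing the shift/shift/or sequence for
# those two rotate amounts.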
120 .Ltwoy:
121 .long   2,0,0,0, 2,0,0,0
122 .align  64
123 .Lzeroz:
124 .long   0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
125 .Lfourz:
126 .long   4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
127 .Lincz:
128 .long   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
129 .Lsixteen:
130 .long   16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
131 .Lsigma:
132 .asciz  "expand 32-byte k"
133 .asciz  "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
134 ___
135
136 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
137 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
138   my $arg = pop;
139     $arg = "\$$arg" if ($arg*1 eq $arg);
140     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
141 }
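# For example, &rol("%eax",16) is caught by AUTOLOAD and appends
# "\trol\t\$16,%eax\n" to $code: a numeric last argument becomes an
# immediate, and the arguments given in (dst,src) order are flipped into
# AT&T (src,dst) operand order.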
142
143 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
144     "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
145 @t=("%esi","%edi");
146
147 sub ROUND {                     # critical path is 24 cycles per round
148 my ($a0,$b0,$c0,$d0)=@_;
149 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
150 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
151 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
152 my ($xc,$xc_)=map("\"$_\"",@t);
153 my @x=map("\"$_\"",@x);
154
155         # Consider the order in which variables are addressed by their
156         # index:
157         #
158         #       a   b   c   d
159         #
160         #       0   4   8  12 < even round
161         #       1   5   9  13
162         #       2   6  10  14
163         #       3   7  11  15
164         #       0   5  10  15 < odd round
165         #       1   6  11  12
166         #       2   7   8  13
167         #       3   4   9  14
168         #
169         # 'a', 'b' and 'd's are permanently allocated in registers,
170         # @x[0..7,12..15], while 'c's are maintained in memory. If
171         # you observe the 'c' column, you'll notice that a pair of 'c's
172         # is invariant between rounds. This means that we only have to
173         # reload them once per round, in the middle. This is why you'll
174         # see a bunch of 'c' stores and loads in the middle, but none at
175         # the beginning or end.
176
177         # Normally instructions would be interleaved to favour in-order
178         # execution. Generally out-of-order cores manage it gracefully,
179         # but not this time for some reason. As in-order execution
180         # cores are a dying breed and old Atom is the only one around,
181         # the instructions are left uninterleaved. Besides, Atom is
182         # better off executing the 1xSSSE3 code anyway...
183
184         (
185         "&add   (@x[$a0],@x[$b0])",     # Q1
186         "&xor   (@x[$d0],@x[$a0])",
187         "&rol   (@x[$d0],16)",
188          "&add  (@x[$a1],@x[$b1])",     # Q2
189          "&xor  (@x[$d1],@x[$a1])",
190          "&rol  (@x[$d1],16)",
191
192         "&add   ($xc,@x[$d0])",
193         "&xor   (@x[$b0],$xc)",
194         "&rol   (@x[$b0],12)",
195          "&add  ($xc_,@x[$d1])",
196          "&xor  (@x[$b1],$xc_)",
197          "&rol  (@x[$b1],12)",
198
199         "&add   (@x[$a0],@x[$b0])",
200         "&xor   (@x[$d0],@x[$a0])",
201         "&rol   (@x[$d0],8)",
202          "&add  (@x[$a1],@x[$b1])",
203          "&xor  (@x[$d1],@x[$a1])",
204          "&rol  (@x[$d1],8)",
205
206         "&add   ($xc,@x[$d0])",
207         "&xor   (@x[$b0],$xc)",
208         "&rol   (@x[$b0],7)",
209          "&add  ($xc_,@x[$d1])",
210          "&xor  (@x[$b1],$xc_)",
211          "&rol  (@x[$b1],7)",
212
213         "&mov   (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
214          "&mov  (\"4*$c1(%rsp)\",$xc_)",
215         "&mov   ($xc,\"4*$c2(%rsp)\")",
216          "&mov  ($xc_,\"4*$c3(%rsp)\")",
217
218         "&add   (@x[$a2],@x[$b2])",     # Q3
219         "&xor   (@x[$d2],@x[$a2])",
220         "&rol   (@x[$d2],16)",
221          "&add  (@x[$a3],@x[$b3])",     # Q4
222          "&xor  (@x[$d3],@x[$a3])",
223          "&rol  (@x[$d3],16)",
224
225         "&add   ($xc,@x[$d2])",
226         "&xor   (@x[$b2],$xc)",
227         "&rol   (@x[$b2],12)",
228          "&add  ($xc_,@x[$d3])",
229          "&xor  (@x[$b3],$xc_)",
230          "&rol  (@x[$b3],12)",
231
232         "&add   (@x[$a2],@x[$b2])",
233         "&xor   (@x[$d2],@x[$a2])",
234         "&rol   (@x[$d2],8)",
235          "&add  (@x[$a3],@x[$b3])",
236          "&xor  (@x[$d3],@x[$a3])",
237          "&rol  (@x[$d3],8)",
238
239         "&add   ($xc,@x[$d2])",
240         "&xor   (@x[$b2],$xc)",
241         "&rol   (@x[$b2],7)",
242          "&add  ($xc_,@x[$d3])",
243          "&xor  (@x[$b3],$xc_)",
244          "&rol  (@x[$b3],7)"
245         );
246 }
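# For reference, the instruction stream emitted by ROUND computes the standard
# ChaCha20 quarter-round on each column (even round) or diagonal (odd round).
# A minimal Perl sketch of one quarter-round on 32-bit words follows; it is an
# illustrative helper only and is not used by the generator:
sub chacha_ref_quarter_round {
my ($a,$b,$c,$d)=@_;
my $rol32 = sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n))) & 0xffffffff; };
    $a=($a+$b)&0xffffffff;  $d=$rol32->($d^$a,16);
    $c=($c+$d)&0xffffffff;  $b=$rol32->($b^$c,12);
    $a=($a+$b)&0xffffffff;  $d=$rol32->($d^$a,8);
    $c=($c+$d)&0xffffffff;  $b=$rol32->($b^$c,7);
    ($a,$b,$c,$d);
}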
247
248 ########################################################################
249 # Generic code path that handles all lengths on pre-SSSE3 processors.
250 $code.=<<___;
251 .globl  ChaCha20_ctr32
252 .type   ChaCha20_ctr32,\@function,5
253 .align  64
254 ChaCha20_ctr32:
255 .cfi_startproc
256         cmp     \$0,$len
257         je      .Lno_data
258         mov     OPENSSL_ia32cap_P+4(%rip),%r10
259 ___
260 $code.=<<___    if ($avx>2);
261         bt      \$48,%r10               # check for AVX512F
262         jc      .LChaCha20_avx512
263         test    %r10,%r10               # check for AVX512VL
264         js      .LChaCha20_avx512vl
265 ___
266 $code.=<<___;
267         test    \$`1<<(41-32)`,%r10d
268         jnz     .LChaCha20_ssse3
269
270         push    %rbx
271 .cfi_push       %rbx
272         push    %rbp
273 .cfi_push       %rbp
274         push    %r12
275 .cfi_push       %r12
276         push    %r13
277 .cfi_push       %r13
278         push    %r14
279 .cfi_push       %r14
280         push    %r15
281 .cfi_push       %r15
282         sub     \$64+24,%rsp
283 .cfi_adjust_cfa_offset  64+24
284 .Lctr32_body:
285
286         #movdqa .Lsigma(%rip),%xmm0
287         movdqu  ($key),%xmm1
288         movdqu  16($key),%xmm2
289         movdqu  ($counter),%xmm3
290         movdqa  .Lone(%rip),%xmm4
291
292         #movdqa %xmm0,4*0(%rsp)         # key[0]
293         movdqa  %xmm1,4*4(%rsp)         # key[1]
294         movdqa  %xmm2,4*8(%rsp)         # key[2]
295         movdqa  %xmm3,4*12(%rsp)        # key[3]
296         mov     $len,%rbp               # reassign $len
297         jmp     .Loop_outer
298
299 .align  32
300 .Loop_outer:
301         mov     \$0x61707865,@x[0]      # 'expa'
302         mov     \$0x3320646e,@x[1]      # 'nd 3'
303         mov     \$0x79622d32,@x[2]      # '2-by'
304         mov     \$0x6b206574,@x[3]      # 'te k'
305         mov     4*4(%rsp),@x[4]
306         mov     4*5(%rsp),@x[5]
307         mov     4*6(%rsp),@x[6]
308         mov     4*7(%rsp),@x[7]
309         movd    %xmm3,@x[12]
310         mov     4*13(%rsp),@x[13]
311         mov     4*14(%rsp),@x[14]
312         mov     4*15(%rsp),@x[15]
313
314         mov     %rbp,64+0(%rsp)         # save len
315         mov     \$10,%ebp
316         mov     $inp,64+8(%rsp)         # save inp
317         movq    %xmm2,%rsi              # "@x[8]"
318         mov     $out,64+16(%rsp)        # save out
319         mov     %rsi,%rdi
320         shr     \$32,%rdi               # "@x[9]"
321         jmp     .Loop
322
323 .align  32
324 .Loop:
325 ___
326         foreach (&ROUND (0, 4, 8,12)) { eval; }         # even (column) round
327         foreach (&ROUND (0, 5,10,15)) { eval; }         # odd (diagonal) round
328         &dec    ("%ebp");
329         &jnz    (".Loop");
330
331 $code.=<<___;
332         mov     @t[1],4*9(%rsp)         # modulo-scheduled
333         mov     @t[0],4*8(%rsp)
334         mov     64(%rsp),%rbp           # load len
335         movdqa  %xmm2,%xmm1
336         mov     64+8(%rsp),$inp         # load inp
337         paddd   %xmm4,%xmm3             # increment counter
338         mov     64+16(%rsp),$out        # load out
339
340         add     \$0x61707865,@x[0]      # 'expa'
341         add     \$0x3320646e,@x[1]      # 'nd 3'
342         add     \$0x79622d32,@x[2]      # '2-by'
343         add     \$0x6b206574,@x[3]      # 'te k'
344         add     4*4(%rsp),@x[4]
345         add     4*5(%rsp),@x[5]
346         add     4*6(%rsp),@x[6]
347         add     4*7(%rsp),@x[7]
348         add     4*12(%rsp),@x[12]
349         add     4*13(%rsp),@x[13]
350         add     4*14(%rsp),@x[14]
351         add     4*15(%rsp),@x[15]
352         paddd   4*8(%rsp),%xmm1
353
354         cmp     \$64,%rbp
355         jb      .Ltail
356
357         xor     4*0($inp),@x[0]         # xor with input
358         xor     4*1($inp),@x[1]
359         xor     4*2($inp),@x[2]
360         xor     4*3($inp),@x[3]
361         xor     4*4($inp),@x[4]
362         xor     4*5($inp),@x[5]
363         xor     4*6($inp),@x[6]
364         xor     4*7($inp),@x[7]
365         movdqu  4*8($inp),%xmm0
366         xor     4*12($inp),@x[12]
367         xor     4*13($inp),@x[13]
368         xor     4*14($inp),@x[14]
369         xor     4*15($inp),@x[15]
370         lea     4*16($inp),$inp         # inp+=64
371         pxor    %xmm1,%xmm0
372
373         movdqa  %xmm2,4*8(%rsp)
374         movd    %xmm3,4*12(%rsp)
375
376         mov     @x[0],4*0($out)         # write output
377         mov     @x[1],4*1($out)
378         mov     @x[2],4*2($out)
379         mov     @x[3],4*3($out)
380         mov     @x[4],4*4($out)
381         mov     @x[5],4*5($out)
382         mov     @x[6],4*6($out)
383         mov     @x[7],4*7($out)
384         movdqu  %xmm0,4*8($out)
385         mov     @x[12],4*12($out)
386         mov     @x[13],4*13($out)
387         mov     @x[14],4*14($out)
388         mov     @x[15],4*15($out)
389         lea     4*16($out),$out         # out+=64
390
391         sub     \$64,%rbp
392         jnz     .Loop_outer
393
394         jmp     .Ldone
395
396 .align  16
397 .Ltail:
398         mov     @x[0],4*0(%rsp)
399         mov     @x[1],4*1(%rsp)
400         xor     %rbx,%rbx
401         mov     @x[2],4*2(%rsp)
402         mov     @x[3],4*3(%rsp)
403         mov     @x[4],4*4(%rsp)
404         mov     @x[5],4*5(%rsp)
405         mov     @x[6],4*6(%rsp)
406         mov     @x[7],4*7(%rsp)
407         movdqa  %xmm1,4*8(%rsp)
408         mov     @x[12],4*12(%rsp)
409         mov     @x[13],4*13(%rsp)
410         mov     @x[14],4*14(%rsp)
411         mov     @x[15],4*15(%rsp)
412
413 .Loop_tail:
414         movzb   ($inp,%rbx),%eax
415         movzb   (%rsp,%rbx),%edx
416         lea     1(%rbx),%rbx
417         xor     %edx,%eax
418         mov     %al,-1($out,%rbx)
419         dec     %rbp
420         jnz     .Loop_tail
421
422 .Ldone:
423         lea     64+24+48(%rsp),%rsi
424 .cfi_def_cfa    %rsi,8
425         mov     -48(%rsi),%r15
426 .cfi_restore    %r15
427         mov     -40(%rsi),%r14
428 .cfi_restore    %r14
429         mov     -32(%rsi),%r13
430 .cfi_restore    %r13
431         mov     -24(%rsi),%r12
432 .cfi_restore    %r12
433         mov     -16(%rsi),%rbp
434 .cfi_restore    %rbp
435         mov     -8(%rsi),%rbx
436 .cfi_restore    %rbx
437         lea     (%rsi),%rsp
438 .cfi_def_cfa_register   %rsp
439 .Lno_data:
440         ret
441 .cfi_endproc
442 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
443 ___
444
445 ########################################################################
446 # SSSE3 code path that handles shorter lengths
447 {
448 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
449
450 sub SSSE3ROUND {        # critical path is 20 "SIMD ticks" per round
451         &paddd  ($a,$b);
452         &pxor   ($d,$a);
453         &pshufb ($d,$rot16);
454
455         &paddd  ($c,$d);
456         &pxor   ($b,$c);
457         &movdqa ($t,$b);
458         &psrld  ($b,20);
459         &pslld  ($t,12);
460         &por    ($b,$t);
461
462         &paddd  ($a,$b);
463         &pxor   ($d,$a);
464         &pshufb ($d,$rot24);
465
466         &paddd  ($c,$d);
467         &pxor   ($b,$c);
468         &movdqa ($t,$b);
469         &psrld  ($b,25);
470         &pslld  ($t,7);
471         &por    ($b,$t);
472 }
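# Note that SSSE3 has no vector rotate: the 12- and 7-bit rotations above are
# synthesized as pslld/psrld/por pairs, while the byte-aligned 16- and 8-bit
# rotations take a single pshufb through .Lrot16/.Lrot24.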
473
474 my $xframe = $win64 ? 32+8 : 8;
475
476 $code.=<<___;
477 .type   ChaCha20_ssse3,\@function,5
478 .align  32
479 ChaCha20_ssse3:
480 .cfi_startproc
481 .LChaCha20_ssse3:
482         mov     %rsp,%r9                # frame pointer
483 .cfi_def_cfa_register   %r9
484 ___
485 $code.=<<___    if ($avx);
486         test    \$`1<<(43-32)`,%r10d
487         jnz     .LChaCha20_4xop         # XOP is fastest even if we use 1/4
488 ___
489 $code.=<<___;
490         cmp     \$128,$len              # we might throw away some data,
491         je      .LChaCha20_128
492         ja      .LChaCha20_4x           # but overall it won't be slower
493
494 .Ldo_sse3_after_all:
495         sub     \$64+$xframe,%rsp
496 ___
497 $code.=<<___    if ($win64);
498         movaps  %xmm6,-0x28(%r9)
499         movaps  %xmm7,-0x18(%r9)
500 .Lssse3_body:
501 ___
502 $code.=<<___;
503         movdqa  .Lsigma(%rip),$a
504         movdqu  ($key),$b
505         movdqu  16($key),$c
506         movdqu  ($counter),$d
507         movdqa  .Lrot16(%rip),$rot16
508         movdqa  .Lrot24(%rip),$rot24
509
510         movdqa  $a,0x00(%rsp)
511         movdqa  $b,0x10(%rsp)
512         movdqa  $c,0x20(%rsp)
513         movdqa  $d,0x30(%rsp)
514         mov     \$10,$counter           # reuse $counter
515         jmp     .Loop_ssse3
516
517 .align  32
518 .Loop_outer_ssse3:
519         movdqa  .Lone(%rip),$d
520         movdqa  0x00(%rsp),$a
521         movdqa  0x10(%rsp),$b
522         movdqa  0x20(%rsp),$c
523         paddd   0x30(%rsp),$d
524         mov     \$10,$counter
525         movdqa  $d,0x30(%rsp)
526         jmp     .Loop_ssse3
527
528 .align  32
529 .Loop_ssse3:
530 ___
531         &SSSE3ROUND();
532         &pshufd ($c,$c,0b01001110);     # rotate lanes of b, c and d so that
533         &pshufd ($b,$b,0b00111001);     # the next round works on diagonals
534         &pshufd ($d,$d,0b10010011);
535         &nop    ();
536
537         &SSSE3ROUND();
538         &pshufd ($c,$c,0b01001110);     # rotate lanes back to column order
539         &pshufd ($b,$b,0b10010011);
540         &pshufd ($d,$d,0b00111001);
541
542         &dec    ($counter);
543         &jnz    (".Loop_ssse3");
544
545 $code.=<<___;
546         paddd   0x00(%rsp),$a
547         paddd   0x10(%rsp),$b
548         paddd   0x20(%rsp),$c
549         paddd   0x30(%rsp),$d
550
551         cmp     \$64,$len
552         jb      .Ltail_ssse3
553
554         movdqu  0x00($inp),$t
555         movdqu  0x10($inp),$t1
556         pxor    $t,$a                   # xor with input
557         movdqu  0x20($inp),$t
558         pxor    $t1,$b
559         movdqu  0x30($inp),$t1
560         lea     0x40($inp),$inp         # inp+=64
561         pxor    $t,$c
562         pxor    $t1,$d
563
564         movdqu  $a,0x00($out)           # write output
565         movdqu  $b,0x10($out)
566         movdqu  $c,0x20($out)
567         movdqu  $d,0x30($out)
568         lea     0x40($out),$out         # out+=64
569
570         sub     \$64,$len
571         jnz     .Loop_outer_ssse3
572
573         jmp     .Ldone_ssse3
574
575 .align  16
576 .Ltail_ssse3:
577         movdqa  $a,0x00(%rsp)
578         movdqa  $b,0x10(%rsp)
579         movdqa  $c,0x20(%rsp)
580         movdqa  $d,0x30(%rsp)
581         xor     $counter,$counter
582
583 .Loop_tail_ssse3:
584         movzb   ($inp,$counter),%eax
585         movzb   (%rsp,$counter),%ecx
586         lea     1($counter),$counter
587         xor     %ecx,%eax
588         mov     %al,-1($out,$counter)
589         dec     $len
590         jnz     .Loop_tail_ssse3
591
592 .Ldone_ssse3:
593 ___
594 $code.=<<___    if ($win64);
595         movaps  -0x28(%r9),%xmm6
596         movaps  -0x18(%r9),%xmm7
597 ___
598 $code.=<<___;
599         lea     (%r9),%rsp
600 .cfi_def_cfa_register   %rsp
601 .Lssse3_epilogue:
602         ret
603 .cfi_endproc
604 .size   ChaCha20_ssse3,.-ChaCha20_ssse3
605 ___
606 }
607
608 ########################################################################
609 # SSSE3 code path that handles 128-byte inputs
610 {
611 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
612 my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
613
614 sub SSSE3ROUND_2x {
615         &paddd  ($a,$b);
616         &pxor   ($d,$a);
617          &paddd ($a1,$b1);
618          &pxor  ($d1,$a1);
619         &pshufb ($d,$rot16);
620          &pshufb($d1,$rot16);
621
622         &paddd  ($c,$d);
623          &paddd ($c1,$d1);
624         &pxor   ($b,$c);
625          &pxor  ($b1,$c1);
626         &movdqa ($t,$b);
627         &psrld  ($b,20);
628          &movdqa($t1,$b1);
629         &pslld  ($t,12);
630          &psrld ($b1,20);
631         &por    ($b,$t);
632          &pslld ($t1,12);
633          &por   ($b1,$t1);
634
635         &paddd  ($a,$b);
636         &pxor   ($d,$a);
637          &paddd ($a1,$b1);
638          &pxor  ($d1,$a1);
639         &pshufb ($d,$rot24);
640          &pshufb($d1,$rot24);
641
642         &paddd  ($c,$d);
643          &paddd ($c1,$d1);
644         &pxor   ($b,$c);
645          &pxor  ($b1,$c1);
646         &movdqa ($t,$b);
647         &psrld  ($b,25);
648          &movdqa($t1,$b1);
649         &pslld  ($t,7);
650          &psrld ($b1,25);
651         &por    ($b,$t);
652          &pslld ($t1,7);
653          &por   ($b1,$t1);
654 }
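# SSSE3ROUND_2x interleaves two independent 64-byte blocks (the second set of
# registers runs with the counter advanced by .Lone) so that the two dependency
# chains hide each other's latency; this is the 2xSSSE3 path singled out in
# note (ii) above for the 128-byte inputs of chacha20_poly1305_tls_cipher.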
655
656 my $xframe = $win64 ? 0x68 : 8;
657
658 $code.=<<___;
659 .type   ChaCha20_128,\@function,5
660 .align  32
661 ChaCha20_128:
662 .cfi_startproc
663 .LChaCha20_128:
664         mov     %rsp,%r9                # frame pointer
665 .cfi_def_cfa_register   %r9
666         sub     \$64+$xframe,%rsp
667 ___
668 $code.=<<___    if ($win64);
669         movaps  %xmm6,-0x68(%r9)
670         movaps  %xmm7,-0x58(%r9)
671         movaps  %xmm8,-0x48(%r9)
672         movaps  %xmm9,-0x38(%r9)
673         movaps  %xmm10,-0x28(%r9)
674         movaps  %xmm11,-0x18(%r9)
675 .L128_body:
676 ___
677 $code.=<<___;
678         movdqa  .Lsigma(%rip),$a
679         movdqu  ($key),$b
680         movdqu  16($key),$c
681         movdqu  ($counter),$d
682         movdqa  .Lone(%rip),$d1
683         movdqa  .Lrot16(%rip),$rot16
684         movdqa  .Lrot24(%rip),$rot24
685
686         movdqa  $a,$a1
687         movdqa  $a,0x00(%rsp)
688         movdqa  $b,$b1
689         movdqa  $b,0x10(%rsp)
690         movdqa  $c,$c1
691         movdqa  $c,0x20(%rsp)
692         paddd   $d,$d1
693         movdqa  $d,0x30(%rsp)
694         mov     \$10,$counter           # reuse $counter
695         jmp     .Loop_128
696
697 .align  32
698 .Loop_128:
699 ___
700         &SSSE3ROUND_2x();
701         &pshufd ($c,$c,0b01001110);
702         &pshufd ($b,$b,0b00111001);
703         &pshufd ($d,$d,0b10010011);
704         &pshufd ($c1,$c1,0b01001110);
705         &pshufd ($b1,$b1,0b00111001);
706         &pshufd ($d1,$d1,0b10010011);
707
708         &SSSE3ROUND_2x();
709         &pshufd ($c,$c,0b01001110);
710         &pshufd ($b,$b,0b10010011);
711         &pshufd ($d,$d,0b00111001);
712         &pshufd ($c1,$c1,0b01001110);
713         &pshufd ($b1,$b1,0b10010011);
714         &pshufd ($d1,$d1,0b00111001);
715
716         &dec    ($counter);
717         &jnz    (".Loop_128");
718
719 $code.=<<___;
720         paddd   0x00(%rsp),$a
721         paddd   0x10(%rsp),$b
722         paddd   0x20(%rsp),$c
723         paddd   0x30(%rsp),$d
724         paddd   .Lone(%rip),$d1
725         paddd   0x00(%rsp),$a1
726         paddd   0x10(%rsp),$b1
727         paddd   0x20(%rsp),$c1
728         paddd   0x30(%rsp),$d1
729
730         movdqu  0x00($inp),$t
731         movdqu  0x10($inp),$t1
732         pxor    $t,$a                   # xor with input
733         movdqu  0x20($inp),$t
734         pxor    $t1,$b
735         movdqu  0x30($inp),$t1
736         pxor    $t,$c
737         movdqu  0x40($inp),$t
738         pxor    $t1,$d
739         movdqu  0x50($inp),$t1
740         pxor    $t,$a1
741         movdqu  0x60($inp),$t
742         pxor    $t1,$b1
743         movdqu  0x70($inp),$t1
744         pxor    $t,$c1
745         pxor    $t1,$d1
746
747         movdqu  $a,0x00($out)           # write output
748         movdqu  $b,0x10($out)
749         movdqu  $c,0x20($out)
750         movdqu  $d,0x30($out)
751         movdqu  $a1,0x40($out)
752         movdqu  $b1,0x50($out)
753         movdqu  $c1,0x60($out)
754         movdqu  $d1,0x70($out)
755 ___
756 $code.=<<___    if ($win64);
757         movaps  -0x68(%r9),%xmm6
758         movaps  -0x58(%r9),%xmm7
759         movaps  -0x48(%r9),%xmm8
760         movaps  -0x38(%r9),%xmm9
761         movaps  -0x28(%r9),%xmm10
762         movaps  -0x18(%r9),%xmm11
763 ___
764 $code.=<<___;
765         lea     (%r9),%rsp
766 .cfi_def_cfa_register   %rsp
767 .L128_epilogue:
768         ret
769 .cfi_endproc
770 .size   ChaCha20_128,.-ChaCha20_128
771 ___
772 }
773
774 ########################################################################
775 # SSSE3 code path that handles longer messages.
776 {
777 # assign variables to favor Atom front-end
778 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
779     $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
780 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
781         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
782
783 sub SSSE3_lane_ROUND {
784 my ($a0,$b0,$c0,$d0)=@_;
785 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
786 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
787 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
788 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
789 my @x=map("\"$_\"",@xx);
790
791         # Consider the order in which variables are addressed by their
792         # index:
793         #
794         #       a   b   c   d
795         #
796         #       0   4   8  12 < even round
797         #       1   5   9  13
798         #       2   6  10  14
799         #       3   7  11  15
800         #       0   5  10  15 < odd round
801         #       1   6  11  12
802         #       2   7   8  13
803         #       3   4   9  14
804         #
805         # 'a', 'b' and 'd's are permanently allocated in registers,
806         # @x[0..7,12..15], while 'c's are maintained in memory. If
807         # you observe the 'c' column, you'll notice that a pair of 'c's
808         # is invariant between rounds. This means that we only have to
809         # reload them once per round, in the middle. This is why you'll
810         # see a bunch of 'c' stores and loads in the middle, but none at
811         # the beginning or end.
812
813         (
814         "&paddd         (@x[$a0],@x[$b0])",     # Q1
815          "&paddd        (@x[$a1],@x[$b1])",     # Q2
816         "&pxor          (@x[$d0],@x[$a0])",
817          "&pxor         (@x[$d1],@x[$a1])",
818         "&pshufb        (@x[$d0],$t1)",
819          "&pshufb       (@x[$d1],$t1)",
820
821         "&paddd         ($xc,@x[$d0])",
822          "&paddd        ($xc_,@x[$d1])",
823         "&pxor          (@x[$b0],$xc)",
824          "&pxor         (@x[$b1],$xc_)",
825         "&movdqa        ($t0,@x[$b0])",
826         "&pslld         (@x[$b0],12)",
827         "&psrld         ($t0,20)",
828          "&movdqa       ($t1,@x[$b1])",
829          "&pslld        (@x[$b1],12)",
830         "&por           (@x[$b0],$t0)",
831          "&psrld        ($t1,20)",
832         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
833          "&por          (@x[$b1],$t1)",
834
835         "&paddd         (@x[$a0],@x[$b0])",
836          "&paddd        (@x[$a1],@x[$b1])",
837         "&pxor          (@x[$d0],@x[$a0])",
838          "&pxor         (@x[$d1],@x[$a1])",
839         "&pshufb        (@x[$d0],$t0)",
840          "&pshufb       (@x[$d1],$t0)",
841
842         "&paddd         ($xc,@x[$d0])",
843          "&paddd        ($xc_,@x[$d1])",
844         "&pxor          (@x[$b0],$xc)",
845          "&pxor         (@x[$b1],$xc_)",
846         "&movdqa        ($t1,@x[$b0])",
847         "&pslld         (@x[$b0],7)",
848         "&psrld         ($t1,25)",
849          "&movdqa       ($t0,@x[$b1])",
850          "&pslld        (@x[$b1],7)",
851         "&por           (@x[$b0],$t1)",
852          "&psrld        ($t0,25)",
853         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
854          "&por          (@x[$b1],$t0)",
855
856         "&movdqa        (\"`16*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
857          "&movdqa       (\"`16*($c1-8)`(%rsp)\",$xc_)",
858         "&movdqa        ($xc,\"`16*($c2-8)`(%rsp)\")",
859          "&movdqa       ($xc_,\"`16*($c3-8)`(%rsp)\")",
860
861         "&paddd         (@x[$a2],@x[$b2])",     # Q3
862          "&paddd        (@x[$a3],@x[$b3])",     # Q4
863         "&pxor          (@x[$d2],@x[$a2])",
864          "&pxor         (@x[$d3],@x[$a3])",
865         "&pshufb        (@x[$d2],$t1)",
866          "&pshufb       (@x[$d3],$t1)",
867
868         "&paddd         ($xc,@x[$d2])",
869          "&paddd        ($xc_,@x[$d3])",
870         "&pxor          (@x[$b2],$xc)",
871          "&pxor         (@x[$b3],$xc_)",
872         "&movdqa        ($t0,@x[$b2])",
873         "&pslld         (@x[$b2],12)",
874         "&psrld         ($t0,20)",
875          "&movdqa       ($t1,@x[$b3])",
876          "&pslld        (@x[$b3],12)",
877         "&por           (@x[$b2],$t0)",
878          "&psrld        ($t1,20)",
879         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
880          "&por          (@x[$b3],$t1)",
881
882         "&paddd         (@x[$a2],@x[$b2])",
883          "&paddd        (@x[$a3],@x[$b3])",
884         "&pxor          (@x[$d2],@x[$a2])",
885          "&pxor         (@x[$d3],@x[$a3])",
886         "&pshufb        (@x[$d2],$t0)",
887          "&pshufb       (@x[$d3],$t0)",
888
889         "&paddd         ($xc,@x[$d2])",
890          "&paddd        ($xc_,@x[$d3])",
891         "&pxor          (@x[$b2],$xc)",
892          "&pxor         (@x[$b3],$xc_)",
893         "&movdqa        ($t1,@x[$b2])",
894         "&pslld         (@x[$b2],7)",
895         "&psrld         ($t1,25)",
896          "&movdqa       ($t0,@x[$b3])",
897          "&pslld        (@x[$b3],7)",
898         "&por           (@x[$b2],$t1)",
899          "&psrld        ($t0,25)",
900         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
901          "&por          (@x[$b3],$t0)"
902         );
903 }
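# The 4x path keeps the state "smashed by lanes": each xmm register holds one
# of the sixteen state words for four consecutive blocks, with .Linc/.Lfour
# maintaining the four block counters, so every instruction above advances
# four keystreams at once.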
904
905 my $xframe = $win64 ? 0xa8 : 8;
906
907 $code.=<<___;
908 .type   ChaCha20_4x,\@function,5
909 .align  32
910 ChaCha20_4x:
911 .cfi_startproc
912 .LChaCha20_4x:
913         mov             %rsp,%r9                # frame pointer
914 .cfi_def_cfa_register   %r9
915         mov             %r10,%r11
916 ___
917 $code.=<<___    if ($avx>1);
918         shr             \$32,%r10               # OPENSSL_ia32cap_P+8
919         test            \$`1<<5`,%r10           # test AVX2
920         jnz             .LChaCha20_8x
921 ___
922 $code.=<<___;
923         cmp             \$192,$len
924         ja              .Lproceed4x
925
926         and             \$`1<<26|1<<22`,%r11    # isolate XSAVE+MOVBE
927         cmp             \$`1<<22`,%r11          # check for MOVBE without XSAVE
928         je              .Ldo_sse3_after_all     # to detect Atom
929
930 .Lproceed4x:
931         sub             \$0x140+$xframe,%rsp
932 ___
933         ################ stack layout
934         # +0x00         SIMD equivalent of @x[8-12]
935         # ...
936         # +0x40         constant copy of key[0-2] smashed by lanes
937         # ...
938         # +0x100        SIMD counters (with nonce smashed by lanes)
939         # ...
940         # +0x140
941 $code.=<<___    if ($win64);
942         movaps          %xmm6,-0xa8(%r9)
943         movaps          %xmm7,-0x98(%r9)
944         movaps          %xmm8,-0x88(%r9)
945         movaps          %xmm9,-0x78(%r9)
946         movaps          %xmm10,-0x68(%r9)
947         movaps          %xmm11,-0x58(%r9)
948         movaps          %xmm12,-0x48(%r9)
949         movaps          %xmm13,-0x38(%r9)
950         movaps          %xmm14,-0x28(%r9)
951         movaps          %xmm15,-0x18(%r9)
952 .L4x_body:
953 ___
954 $code.=<<___;
955         movdqa          .Lsigma(%rip),$xa3      # key[0]
956         movdqu          ($key),$xb3             # key[1]
957         movdqu          16($key),$xt3           # key[2]
958         movdqu          ($counter),$xd3         # key[3]
959         lea             0x100(%rsp),%rcx        # size optimization
960         lea             .Lrot16(%rip),%r10
961         lea             .Lrot24(%rip),%r11
962
963         pshufd          \$0x00,$xa3,$xa0        # smash key by lanes...
964         pshufd          \$0x55,$xa3,$xa1
965         movdqa          $xa0,0x40(%rsp)         # ... and offload
966         pshufd          \$0xaa,$xa3,$xa2
967         movdqa          $xa1,0x50(%rsp)
968         pshufd          \$0xff,$xa3,$xa3
969         movdqa          $xa2,0x60(%rsp)
970         movdqa          $xa3,0x70(%rsp)
971
972         pshufd          \$0x00,$xb3,$xb0
973         pshufd          \$0x55,$xb3,$xb1
974         movdqa          $xb0,0x80-0x100(%rcx)
975         pshufd          \$0xaa,$xb3,$xb2
976         movdqa          $xb1,0x90-0x100(%rcx)
977         pshufd          \$0xff,$xb3,$xb3
978         movdqa          $xb2,0xa0-0x100(%rcx)
979         movdqa          $xb3,0xb0-0x100(%rcx)
980
981         pshufd          \$0x00,$xt3,$xt0        # "$xc0"
982         pshufd          \$0x55,$xt3,$xt1        # "$xc1"
983         movdqa          $xt0,0xc0-0x100(%rcx)
984         pshufd          \$0xaa,$xt3,$xt2        # "$xc2"
985         movdqa          $xt1,0xd0-0x100(%rcx)
986         pshufd          \$0xff,$xt3,$xt3        # "$xc3"
987         movdqa          $xt2,0xe0-0x100(%rcx)
988         movdqa          $xt3,0xf0-0x100(%rcx)
989
990         pshufd          \$0x00,$xd3,$xd0
991         pshufd          \$0x55,$xd3,$xd1
992         paddd           .Linc(%rip),$xd0        # don't save counters yet
993         pshufd          \$0xaa,$xd3,$xd2
994         movdqa          $xd1,0x110-0x100(%rcx)
995         pshufd          \$0xff,$xd3,$xd3
996         movdqa          $xd2,0x120-0x100(%rcx)
997         movdqa          $xd3,0x130-0x100(%rcx)
998
999         jmp             .Loop_enter4x
1000
1001 .align  32
1002 .Loop_outer4x:
1003         movdqa          0x40(%rsp),$xa0         # re-load smashed key
1004         movdqa          0x50(%rsp),$xa1
1005         movdqa          0x60(%rsp),$xa2
1006         movdqa          0x70(%rsp),$xa3
1007         movdqa          0x80-0x100(%rcx),$xb0
1008         movdqa          0x90-0x100(%rcx),$xb1
1009         movdqa          0xa0-0x100(%rcx),$xb2
1010         movdqa          0xb0-0x100(%rcx),$xb3
1011         movdqa          0xc0-0x100(%rcx),$xt0   # "$xc0"
1012         movdqa          0xd0-0x100(%rcx),$xt1   # "$xc1"
1013         movdqa          0xe0-0x100(%rcx),$xt2   # "$xc2"
1014         movdqa          0xf0-0x100(%rcx),$xt3   # "$xc3"
1015         movdqa          0x100-0x100(%rcx),$xd0
1016         movdqa          0x110-0x100(%rcx),$xd1
1017         movdqa          0x120-0x100(%rcx),$xd2
1018         movdqa          0x130-0x100(%rcx),$xd3
1019         paddd           .Lfour(%rip),$xd0       # next SIMD counters
1020
1021 .Loop_enter4x:
1022         movdqa          $xt2,0x20(%rsp)         # SIMD equivalent of "@x[10]"
1023         movdqa          $xt3,0x30(%rsp)         # SIMD equivalent of "@x[11]"
1024         movdqa          (%r10),$xt3             # .Lrot16(%rip)
1025         mov             \$10,%eax
1026         movdqa          $xd0,0x100-0x100(%rcx)  # save SIMD counters
1027         jmp             .Loop4x
1028
1029 .align  32
1030 .Loop4x:
1031 ___
1032         foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
1033         foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
1034 $code.=<<___;
1035         dec             %eax
1036         jnz             .Loop4x
1037
1038         paddd           0x40(%rsp),$xa0         # accumulate key material
1039         paddd           0x50(%rsp),$xa1
1040         paddd           0x60(%rsp),$xa2
1041         paddd           0x70(%rsp),$xa3
1042
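        # At this point the four 'a' registers hold state words 0..3, one
        # lane per block; the punpck{l,h}dq/punpck{l,h}qdq sequence below is
        # a 4x4 dword transpose, after which each register holds words 0..3
        # of a single block, i.e. 16 contiguous keystream bytes ready to be
        # XORed with the input.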
1043         movdqa          $xa0,$xt2               # "de-interlace" data
1044         punpckldq       $xa1,$xa0
1045         movdqa          $xa2,$xt3
1046         punpckldq       $xa3,$xa2
1047         punpckhdq       $xa1,$xt2
1048         punpckhdq       $xa3,$xt3
1049         movdqa          $xa0,$xa1
1050         punpcklqdq      $xa2,$xa0               # "a0"
1051         movdqa          $xt2,$xa3
1052         punpcklqdq      $xt3,$xt2               # "a2"
1053         punpckhqdq      $xa2,$xa1               # "a1"
1054         punpckhqdq      $xt3,$xa3               # "a3"
1055 ___
1056         ($xa2,$xt2)=($xt2,$xa2);       # swap Perl names so that $xa2 refers to the register now holding "a2"
1057 $code.=<<___;
1058         paddd           0x80-0x100(%rcx),$xb0
1059         paddd           0x90-0x100(%rcx),$xb1
1060         paddd           0xa0-0x100(%rcx),$xb2
1061         paddd           0xb0-0x100(%rcx),$xb3
1062
1063         movdqa          $xa0,0x00(%rsp)         # offload $xaN
1064         movdqa          $xa1,0x10(%rsp)
1065         movdqa          0x20(%rsp),$xa0         # "xc2"
1066         movdqa          0x30(%rsp),$xa1         # "xc3"
1067
1068         movdqa          $xb0,$xt2
1069         punpckldq       $xb1,$xb0
1070         movdqa          $xb2,$xt3
1071         punpckldq       $xb3,$xb2
1072         punpckhdq       $xb1,$xt2
1073         punpckhdq       $xb3,$xt3
1074         movdqa          $xb0,$xb1
1075         punpcklqdq      $xb2,$xb0               # "b0"
1076         movdqa          $xt2,$xb3
1077         punpcklqdq      $xt3,$xt2               # "b2"
1078         punpckhqdq      $xb2,$xb1               # "b1"
1079         punpckhqdq      $xt3,$xb3               # "b3"
1080 ___
1081         ($xb2,$xt2)=($xt2,$xb2);
1082         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1083 $code.=<<___;
1084         paddd           0xc0-0x100(%rcx),$xc0
1085         paddd           0xd0-0x100(%rcx),$xc1
1086         paddd           0xe0-0x100(%rcx),$xc2
1087         paddd           0xf0-0x100(%rcx),$xc3
1088
1089         movdqa          $xa2,0x20(%rsp)         # keep offloading $xaN
1090         movdqa          $xa3,0x30(%rsp)
1091
1092         movdqa          $xc0,$xt2
1093         punpckldq       $xc1,$xc0
1094         movdqa          $xc2,$xt3
1095         punpckldq       $xc3,$xc2
1096         punpckhdq       $xc1,$xt2
1097         punpckhdq       $xc3,$xt3
1098         movdqa          $xc0,$xc1
1099         punpcklqdq      $xc2,$xc0               # "c0"
1100         movdqa          $xt2,$xc3
1101         punpcklqdq      $xt3,$xt2               # "c2"
1102         punpckhqdq      $xc2,$xc1               # "c1"
1103         punpckhqdq      $xt3,$xc3               # "c3"
1104 ___
1105         ($xc2,$xt2)=($xt2,$xc2);
1106         ($xt0,$xt1)=($xa2,$xa3);                # use $xaN as temporary
1107 $code.=<<___;
1108         paddd           0x100-0x100(%rcx),$xd0
1109         paddd           0x110-0x100(%rcx),$xd1
1110         paddd           0x120-0x100(%rcx),$xd2
1111         paddd           0x130-0x100(%rcx),$xd3
1112
1113         movdqa          $xd0,$xt2
1114         punpckldq       $xd1,$xd0
1115         movdqa          $xd2,$xt3
1116         punpckldq       $xd3,$xd2
1117         punpckhdq       $xd1,$xt2
1118         punpckhdq       $xd3,$xt3
1119         movdqa          $xd0,$xd1
1120         punpcklqdq      $xd2,$xd0               # "d0"
1121         movdqa          $xt2,$xd3
1122         punpcklqdq      $xt3,$xt2               # "d2"
1123         punpckhqdq      $xd2,$xd1               # "d1"
1124         punpckhqdq      $xt3,$xd3               # "d3"
1125 ___
1126         ($xd2,$xt2)=($xt2,$xd2);
1127 $code.=<<___;
1128         cmp             \$64*4,$len
1129         jb              .Ltail4x
1130
1131         movdqu          0x00($inp),$xt0         # xor with input
1132         movdqu          0x10($inp),$xt1
1133         movdqu          0x20($inp),$xt2
1134         movdqu          0x30($inp),$xt3
1135         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1136         pxor            $xb0,$xt1
1137         pxor            $xc0,$xt2
1138         pxor            $xd0,$xt3
1139
1140          movdqu         $xt0,0x00($out)
1141         movdqu          0x40($inp),$xt0
1142          movdqu         $xt1,0x10($out)
1143         movdqu          0x50($inp),$xt1
1144          movdqu         $xt2,0x20($out)
1145         movdqu          0x60($inp),$xt2
1146          movdqu         $xt3,0x30($out)
1147         movdqu          0x70($inp),$xt3
1148         lea             0x80($inp),$inp         # size optimization
1149         pxor            0x10(%rsp),$xt0
1150         pxor            $xb1,$xt1
1151         pxor            $xc1,$xt2
1152         pxor            $xd1,$xt3
1153
1154          movdqu         $xt0,0x40($out)
1155         movdqu          0x00($inp),$xt0
1156          movdqu         $xt1,0x50($out)
1157         movdqu          0x10($inp),$xt1
1158          movdqu         $xt2,0x60($out)
1159         movdqu          0x20($inp),$xt2
1160          movdqu         $xt3,0x70($out)
1161          lea            0x80($out),$out         # size optimization
1162         movdqu          0x30($inp),$xt3
1163         pxor            0x20(%rsp),$xt0
1164         pxor            $xb2,$xt1
1165         pxor            $xc2,$xt2
1166         pxor            $xd2,$xt3
1167
1168          movdqu         $xt0,0x00($out)
1169         movdqu          0x40($inp),$xt0
1170          movdqu         $xt1,0x10($out)
1171         movdqu          0x50($inp),$xt1
1172          movdqu         $xt2,0x20($out)
1173         movdqu          0x60($inp),$xt2
1174          movdqu         $xt3,0x30($out)
1175         movdqu          0x70($inp),$xt3
1176         lea             0x80($inp),$inp         # inp+=64*4
1177         pxor            0x30(%rsp),$xt0
1178         pxor            $xb3,$xt1
1179         pxor            $xc3,$xt2
1180         pxor            $xd3,$xt3
1181         movdqu          $xt0,0x40($out)
1182         movdqu          $xt1,0x50($out)
1183         movdqu          $xt2,0x60($out)
1184         movdqu          $xt3,0x70($out)
1185         lea             0x80($out),$out         # out+=64*4
1186
1187         sub             \$64*4,$len
1188         jnz             .Loop_outer4x
1189
1190         jmp             .Ldone4x
1191
1192 .Ltail4x:
1193         cmp             \$192,$len
1194         jae             .L192_or_more4x
1195         cmp             \$128,$len
1196         jae             .L128_or_more4x
1197         cmp             \$64,$len
1198         jae             .L64_or_more4x
1199
1200         #movdqa         0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1201         xor             %r10,%r10
1202         #movdqa         $xt0,0x00(%rsp)
1203         movdqa          $xb0,0x10(%rsp)
1204         movdqa          $xc0,0x20(%rsp)
1205         movdqa          $xd0,0x30(%rsp)
1206         jmp             .Loop_tail4x
1207
1208 .align  32
1209 .L64_or_more4x:
1210         movdqu          0x00($inp),$xt0         # xor with input
1211         movdqu          0x10($inp),$xt1
1212         movdqu          0x20($inp),$xt2
1213         movdqu          0x30($inp),$xt3
1214         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1215         pxor            $xb0,$xt1
1216         pxor            $xc0,$xt2
1217         pxor            $xd0,$xt3
1218         movdqu          $xt0,0x00($out)
1219         movdqu          $xt1,0x10($out)
1220         movdqu          $xt2,0x20($out)
1221         movdqu          $xt3,0x30($out)
1222         je              .Ldone4x
1223
1224         movdqa          0x10(%rsp),$xt0         # $xaN is offloaded, remember?
1225         lea             0x40($inp),$inp         # inp+=64*1
1226         xor             %r10,%r10
1227         movdqa          $xt0,0x00(%rsp)
1228         movdqa          $xb1,0x10(%rsp)
1229         lea             0x40($out),$out         # out+=64*1
1230         movdqa          $xc1,0x20(%rsp)
1231         sub             \$64,$len               # len-=64*1
1232         movdqa          $xd1,0x30(%rsp)
1233         jmp             .Loop_tail4x
1234
1235 .align  32
1236 .L128_or_more4x:
1237         movdqu          0x00($inp),$xt0         # xor with input
1238         movdqu          0x10($inp),$xt1
1239         movdqu          0x20($inp),$xt2
1240         movdqu          0x30($inp),$xt3
1241         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1242         pxor            $xb0,$xt1
1243         pxor            $xc0,$xt2
1244         pxor            $xd0,$xt3
1245
1246          movdqu         $xt0,0x00($out)
1247         movdqu          0x40($inp),$xt0
1248          movdqu         $xt1,0x10($out)
1249         movdqu          0x50($inp),$xt1
1250          movdqu         $xt2,0x20($out)
1251         movdqu          0x60($inp),$xt2
1252          movdqu         $xt3,0x30($out)
1253         movdqu          0x70($inp),$xt3
1254         pxor            0x10(%rsp),$xt0
1255         pxor            $xb1,$xt1
1256         pxor            $xc1,$xt2
1257         pxor            $xd1,$xt3
1258         movdqu          $xt0,0x40($out)
1259         movdqu          $xt1,0x50($out)
1260         movdqu          $xt2,0x60($out)
1261         movdqu          $xt3,0x70($out)
1262         je              .Ldone4x
1263
1264         movdqa          0x20(%rsp),$xt0         # $xaN is offloaded, remember?
1265         lea             0x80($inp),$inp         # inp+=64*2
1266         xor             %r10,%r10
1267         movdqa          $xt0,0x00(%rsp)
1268         movdqa          $xb2,0x10(%rsp)
1269         lea             0x80($out),$out         # out+=64*2
1270         movdqa          $xc2,0x20(%rsp)
1271         sub             \$128,$len              # len-=64*2
1272         movdqa          $xd2,0x30(%rsp)
1273         jmp             .Loop_tail4x
1274
1275 .align  32
1276 .L192_or_more4x:
1277         movdqu          0x00($inp),$xt0         # xor with input
1278         movdqu          0x10($inp),$xt1
1279         movdqu          0x20($inp),$xt2
1280         movdqu          0x30($inp),$xt3
1281         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1282         pxor            $xb0,$xt1
1283         pxor            $xc0,$xt2
1284         pxor            $xd0,$xt3
1285
1286          movdqu         $xt0,0x00($out)
1287         movdqu          0x40($inp),$xt0
1288          movdqu         $xt1,0x10($out)
1289         movdqu          0x50($inp),$xt1
1290          movdqu         $xt2,0x20($out)
1291         movdqu          0x60($inp),$xt2
1292          movdqu         $xt3,0x30($out)
1293         movdqu          0x70($inp),$xt3
1294         lea             0x80($inp),$inp         # size optimization
1295         pxor            0x10(%rsp),$xt0
1296         pxor            $xb1,$xt1
1297         pxor            $xc1,$xt2
1298         pxor            $xd1,$xt3
1299
1300          movdqu         $xt0,0x40($out)
1301         movdqu          0x00($inp),$xt0
1302          movdqu         $xt1,0x50($out)
1303         movdqu          0x10($inp),$xt1
1304          movdqu         $xt2,0x60($out)
1305         movdqu          0x20($inp),$xt2
1306          movdqu         $xt3,0x70($out)
1307          lea            0x80($out),$out         # size optimization
1308         movdqu          0x30($inp),$xt3
1309         pxor            0x20(%rsp),$xt0
1310         pxor            $xb2,$xt1
1311         pxor            $xc2,$xt2
1312         pxor            $xd2,$xt3
1313         movdqu          $xt0,0x00($out)
1314         movdqu          $xt1,0x10($out)
1315         movdqu          $xt2,0x20($out)
1316         movdqu          $xt3,0x30($out)
1317         je              .Ldone4x
1318
1319         movdqa          0x30(%rsp),$xt0         # $xaN is offloaded, remember?
1320         lea             0x40($inp),$inp         # inp+=64*3
1321         xor             %r10,%r10
1322         movdqa          $xt0,0x00(%rsp)
1323         movdqa          $xb3,0x10(%rsp)
1324         lea             0x40($out),$out         # out+=64*3
1325         movdqa          $xc3,0x20(%rsp)
1326         sub             \$192,$len              # len-=64*3
1327         movdqa          $xd3,0x30(%rsp)
1328
1329 .Loop_tail4x:
1330         movzb           ($inp,%r10),%eax
1331         movzb           (%rsp,%r10),%ecx
1332         lea             1(%r10),%r10
1333         xor             %ecx,%eax
1334         mov             %al,-1($out,%r10)
1335         dec             $len
1336         jnz             .Loop_tail4x
1337
1338 .Ldone4x:
1339 ___
1340 $code.=<<___    if ($win64);
1341         movaps          -0xa8(%r9),%xmm6
1342         movaps          -0x98(%r9),%xmm7
1343         movaps          -0x88(%r9),%xmm8
1344         movaps          -0x78(%r9),%xmm9
1345         movaps          -0x68(%r9),%xmm10
1346         movaps          -0x58(%r9),%xmm11
1347         movaps          -0x48(%r9),%xmm12
1348         movaps          -0x38(%r9),%xmm13
1349         movaps          -0x28(%r9),%xmm14
1350         movaps          -0x18(%r9),%xmm15
1351 ___
1352 $code.=<<___;
1353         lea             (%r9),%rsp
1354 .cfi_def_cfa_register   %rsp
1355 .L4x_epilogue:
1356         ret
1357 .cfi_endproc
1358 .size   ChaCha20_4x,.-ChaCha20_4x
1359 ___
1360 }
1361
1362 ########################################################################
1363 # XOP code path that handles all lengths.
1364 if ($avx) {
1365 # There is some "anomaly" observed depending on instruction size or
1366 # alignment. If you look closely at the code below, you'll notice that
1367 # the argument order sometimes varies. The order affects the instruction
1368 # encoding by making it larger, and such fiddling gives a 5% performance
1369 # improvement. This is on FX-4100...
1370
1371 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1372     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1373 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1374          $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1375
1376 sub XOP_lane_ROUND {
1377 my ($a0,$b0,$c0,$d0)=@_;
1378 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1379 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1380 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1381 my @x=map("\"$_\"",@xx);
1382
1383         (
1384         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1385          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1386           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1387            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1388         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1389          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1390           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1391            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1392         "&vprotd        (@x[$d0],@x[$d0],16)",
1393          "&vprotd       (@x[$d1],@x[$d1],16)",
1394           "&vprotd      (@x[$d2],@x[$d2],16)",
1395            "&vprotd     (@x[$d3],@x[$d3],16)",
1396
1397         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1398          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1399           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1400            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1401         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1402          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1403           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1404            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1405         "&vprotd        (@x[$b0],@x[$b0],12)",
1406          "&vprotd       (@x[$b1],@x[$b1],12)",
1407           "&vprotd      (@x[$b2],@x[$b2],12)",
1408            "&vprotd     (@x[$b3],@x[$b3],12)",
1409
1410         "&vpaddd        (@x[$a0],@x[$b0],@x[$a0])",     # flip
1411          "&vpaddd       (@x[$a1],@x[$b1],@x[$a1])",     # flip
1412           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
1413            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
1414         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1415          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1416           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1417            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1418         "&vprotd        (@x[$d0],@x[$d0],8)",
1419          "&vprotd       (@x[$d1],@x[$d1],8)",
1420           "&vprotd      (@x[$d2],@x[$d2],8)",
1421            "&vprotd     (@x[$d3],@x[$d3],8)",
1422
1423         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1424          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1425           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1426            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1427         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1428          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1429           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1430            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1431         "&vprotd        (@x[$b0],@x[$b0],7)",
1432          "&vprotd       (@x[$b1],@x[$b1],7)",
1433           "&vprotd      (@x[$b2],@x[$b2],7)",
1434            "&vprotd     (@x[$b3],@x[$b3],7)"
1435         );
1436 }
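# XOP provides vprotd, a true per-dword rotate, so all four rotation amounts
# collapse to a single instruction each; no pshufb masks or pslld/psrld/por
# pairs are needed, which is why the XOP path is preferred whenever XOP is
# available (see note (iv) and the dispatch in ChaCha20_ssse3).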
1437
1438 my $xframe = $win64 ? 0xa8 : 8;
1439
1440 $code.=<<___;
1441 .type   ChaCha20_4xop,\@function,5
1442 .align  32
1443 ChaCha20_4xop:
1444 .cfi_startproc
1445 .LChaCha20_4xop:
1446         mov             %rsp,%r9                # frame pointer
1447 .cfi_def_cfa_register   %r9
1448         sub             \$0x140+$xframe,%rsp
1449 ___
1450         ################ stack layout
1451         # +0x00         SIMD equivalent of @x[8-12]
1452         # ...
1453         # +0x40         constant copy of key[0-2] smashed by lanes
1454         # ...
1455         # +0x100        SIMD counters (with nonce smashed by lanes)
1456         # ...
1457         # +0x140
1458 $code.=<<___    if ($win64);
1459         movaps          %xmm6,-0xa8(%r9)
1460         movaps          %xmm7,-0x98(%r9)
1461         movaps          %xmm8,-0x88(%r9)
1462         movaps          %xmm9,-0x78(%r9)
1463         movaps          %xmm10,-0x68(%r9)
1464         movaps          %xmm11,-0x58(%r9)
1465         movaps          %xmm12,-0x48(%r9)
1466         movaps          %xmm13,-0x38(%r9)
1467         movaps          %xmm14,-0x28(%r9)
1468         movaps          %xmm15,-0x18(%r9)
1469 .L4xop_body:
1470 ___
1471 $code.=<<___;
1472         vzeroupper
1473
1474         vmovdqa         .Lsigma(%rip),$xa3      # key[0]
1475         vmovdqu         ($key),$xb3             # key[1]
1476         vmovdqu         16($key),$xt3           # key[2]
1477         vmovdqu         ($counter),$xd3         # key[3]
1478         lea             0x100(%rsp),%rcx        # size optimization
1479
1480         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1481         vpshufd         \$0x55,$xa3,$xa1
1482         vmovdqa         $xa0,0x40(%rsp)         # ... and offload
1483         vpshufd         \$0xaa,$xa3,$xa2
1484         vmovdqa         $xa1,0x50(%rsp)
1485         vpshufd         \$0xff,$xa3,$xa3
1486         vmovdqa         $xa2,0x60(%rsp)
1487         vmovdqa         $xa3,0x70(%rsp)
1488
1489         vpshufd         \$0x00,$xb3,$xb0
1490         vpshufd         \$0x55,$xb3,$xb1
1491         vmovdqa         $xb0,0x80-0x100(%rcx)
1492         vpshufd         \$0xaa,$xb3,$xb2
1493         vmovdqa         $xb1,0x90-0x100(%rcx)
1494         vpshufd         \$0xff,$xb3,$xb3
1495         vmovdqa         $xb2,0xa0-0x100(%rcx)
1496         vmovdqa         $xb3,0xb0-0x100(%rcx)
1497
1498         vpshufd         \$0x00,$xt3,$xt0        # "$xc0"
1499         vpshufd         \$0x55,$xt3,$xt1        # "$xc1"
1500         vmovdqa         $xt0,0xc0-0x100(%rcx)
1501         vpshufd         \$0xaa,$xt3,$xt2        # "$xc2"
1502         vmovdqa         $xt1,0xd0-0x100(%rcx)
1503         vpshufd         \$0xff,$xt3,$xt3        # "$xc3"
1504         vmovdqa         $xt2,0xe0-0x100(%rcx)
1505         vmovdqa         $xt3,0xf0-0x100(%rcx)
1506
1507         vpshufd         \$0x00,$xd3,$xd0
1508         vpshufd         \$0x55,$xd3,$xd1
1509         vpaddd          .Linc(%rip),$xd0,$xd0   # don't save counters yet
1510         vpshufd         \$0xaa,$xd3,$xd2
1511         vmovdqa         $xd1,0x110-0x100(%rcx)
1512         vpshufd         \$0xff,$xd3,$xd3
1513         vmovdqa         $xd2,0x120-0x100(%rcx)
1514         vmovdqa         $xd3,0x130-0x100(%rcx)
1515
1516         jmp             .Loop_enter4xop
1517
1518 .align  32
1519 .Loop_outer4xop:
1520         vmovdqa         0x40(%rsp),$xa0         # re-load smashed key
1521         vmovdqa         0x50(%rsp),$xa1
1522         vmovdqa         0x60(%rsp),$xa2
1523         vmovdqa         0x70(%rsp),$xa3
1524         vmovdqa         0x80-0x100(%rcx),$xb0
1525         vmovdqa         0x90-0x100(%rcx),$xb1
1526         vmovdqa         0xa0-0x100(%rcx),$xb2
1527         vmovdqa         0xb0-0x100(%rcx),$xb3
1528         vmovdqa         0xc0-0x100(%rcx),$xt0   # "$xc0"
1529         vmovdqa         0xd0-0x100(%rcx),$xt1   # "$xc1"
1530         vmovdqa         0xe0-0x100(%rcx),$xt2   # "$xc2"
1531         vmovdqa         0xf0-0x100(%rcx),$xt3   # "$xc3"
1532         vmovdqa         0x100-0x100(%rcx),$xd0
1533         vmovdqa         0x110-0x100(%rcx),$xd1
1534         vmovdqa         0x120-0x100(%rcx),$xd2
1535         vmovdqa         0x130-0x100(%rcx),$xd3
1536         vpaddd          .Lfour(%rip),$xd0,$xd0  # next SIMD counters
1537
1538 .Loop_enter4xop:
1539         mov             \$10,%eax
1540         vmovdqa         $xd0,0x100-0x100(%rcx)  # save SIMD counters
1541         jmp             .Loop4xop
1542
1543 .align  32
1544 .Loop4xop:
1545 ___
1546         foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1547         foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1548 $code.=<<___;
1549         dec             %eax
1550         jnz             .Loop4xop
1551
1552         vpaddd          0x40(%rsp),$xa0,$xa0    # accumulate key material
1553         vpaddd          0x50(%rsp),$xa1,$xa1
1554         vpaddd          0x60(%rsp),$xa2,$xa2
1555         vpaddd          0x70(%rsp),$xa3,$xa3
1556
1557         vmovdqa         $xt2,0x20(%rsp)         # offload $xc2,3
1558         vmovdqa         $xt3,0x30(%rsp)
1559
1560         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1561         vpunpckldq      $xa3,$xa2,$xt3
1562         vpunpckhdq      $xa1,$xa0,$xa0
1563         vpunpckhdq      $xa3,$xa2,$xa2
1564         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1565         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1566         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1567         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1568 ___
1569         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1570 $code.=<<___;
1571         vpaddd          0x80-0x100(%rcx),$xb0,$xb0
1572         vpaddd          0x90-0x100(%rcx),$xb1,$xb1
1573         vpaddd          0xa0-0x100(%rcx),$xb2,$xb2
1574         vpaddd          0xb0-0x100(%rcx),$xb3,$xb3
1575
1576         vmovdqa         $xa0,0x00(%rsp)         # offload $xa0,1
1577         vmovdqa         $xa1,0x10(%rsp)
1578         vmovdqa         0x20(%rsp),$xa0         # "xc2"
1579         vmovdqa         0x30(%rsp),$xa1         # "xc3"
1580
1581         vpunpckldq      $xb1,$xb0,$xt2
1582         vpunpckldq      $xb3,$xb2,$xt3
1583         vpunpckhdq      $xb1,$xb0,$xb0
1584         vpunpckhdq      $xb3,$xb2,$xb2
1585         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1586         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1587         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1588         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1589 ___
1590         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1591         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1592 $code.=<<___;
1593         vpaddd          0xc0-0x100(%rcx),$xc0,$xc0
1594         vpaddd          0xd0-0x100(%rcx),$xc1,$xc1
1595         vpaddd          0xe0-0x100(%rcx),$xc2,$xc2
1596         vpaddd          0xf0-0x100(%rcx),$xc3,$xc3
1597
1598         vpunpckldq      $xc1,$xc0,$xt2
1599         vpunpckldq      $xc3,$xc2,$xt3
1600         vpunpckhdq      $xc1,$xc0,$xc0
1601         vpunpckhdq      $xc3,$xc2,$xc2
1602         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1603         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1604         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1605         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1606 ___
1607         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1608 $code.=<<___;
1609         vpaddd          0x100-0x100(%rcx),$xd0,$xd0
1610         vpaddd          0x110-0x100(%rcx),$xd1,$xd1
1611         vpaddd          0x120-0x100(%rcx),$xd2,$xd2
1612         vpaddd          0x130-0x100(%rcx),$xd3,$xd3
1613
1614         vpunpckldq      $xd1,$xd0,$xt2
1615         vpunpckldq      $xd3,$xd2,$xt3
1616         vpunpckhdq      $xd1,$xd0,$xd0
1617         vpunpckhdq      $xd3,$xd2,$xd2
1618         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1619         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1620         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1621         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1622 ___
1623         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1624         ($xa0,$xa1)=($xt2,$xt3);
1625 $code.=<<___;
1626         vmovdqa         0x00(%rsp),$xa0         # restore $xa0,1
1627         vmovdqa         0x10(%rsp),$xa1
1628
1629         cmp             \$64*4,$len
1630         jb              .Ltail4xop
1631
1632         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1633         vpxor           0x10($inp),$xb0,$xb0
1634         vpxor           0x20($inp),$xc0,$xc0
1635         vpxor           0x30($inp),$xd0,$xd0
1636         vpxor           0x40($inp),$xa1,$xa1
1637         vpxor           0x50($inp),$xb1,$xb1
1638         vpxor           0x60($inp),$xc1,$xc1
1639         vpxor           0x70($inp),$xd1,$xd1
1640         lea             0x80($inp),$inp         # size optimization
1641         vpxor           0x00($inp),$xa2,$xa2
1642         vpxor           0x10($inp),$xb2,$xb2
1643         vpxor           0x20($inp),$xc2,$xc2
1644         vpxor           0x30($inp),$xd2,$xd2
1645         vpxor           0x40($inp),$xa3,$xa3
1646         vpxor           0x50($inp),$xb3,$xb3
1647         vpxor           0x60($inp),$xc3,$xc3
1648         vpxor           0x70($inp),$xd3,$xd3
1649         lea             0x80($inp),$inp         # inp+=64*4
1650
1651         vmovdqu         $xa0,0x00($out)
1652         vmovdqu         $xb0,0x10($out)
1653         vmovdqu         $xc0,0x20($out)
1654         vmovdqu         $xd0,0x30($out)
1655         vmovdqu         $xa1,0x40($out)
1656         vmovdqu         $xb1,0x50($out)
1657         vmovdqu         $xc1,0x60($out)
1658         vmovdqu         $xd1,0x70($out)
1659         lea             0x80($out),$out         # size optimization
1660         vmovdqu         $xa2,0x00($out)
1661         vmovdqu         $xb2,0x10($out)
1662         vmovdqu         $xc2,0x20($out)
1663         vmovdqu         $xd2,0x30($out)
1664         vmovdqu         $xa3,0x40($out)
1665         vmovdqu         $xb3,0x50($out)
1666         vmovdqu         $xc3,0x60($out)
1667         vmovdqu         $xd3,0x70($out)
1668         lea             0x80($out),$out         # out+=64*4
1669
1670         sub             \$64*4,$len
1671         jnz             .Loop_outer4xop
1672
1673         jmp             .Ldone4xop
1674
1675 .align  32
1676 .Ltail4xop:
1677         cmp             \$192,$len
1678         jae             .L192_or_more4xop
1679         cmp             \$128,$len
1680         jae             .L128_or_more4xop
1681         cmp             \$64,$len
1682         jae             .L64_or_more4xop
1683
1684         xor             %r10,%r10
1685         vmovdqa         $xa0,0x00(%rsp)
1686         vmovdqa         $xb0,0x10(%rsp)
1687         vmovdqa         $xc0,0x20(%rsp)
1688         vmovdqa         $xd0,0x30(%rsp)
1689         jmp             .Loop_tail4xop
1690
1691 .align  32
1692 .L64_or_more4xop:
1693         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1694         vpxor           0x10($inp),$xb0,$xb0
1695         vpxor           0x20($inp),$xc0,$xc0
1696         vpxor           0x30($inp),$xd0,$xd0
1697         vmovdqu         $xa0,0x00($out)
1698         vmovdqu         $xb0,0x10($out)
1699         vmovdqu         $xc0,0x20($out)
1700         vmovdqu         $xd0,0x30($out)
1701         je              .Ldone4xop
1702
1703         lea             0x40($inp),$inp         # inp+=64*1
1704         vmovdqa         $xa1,0x00(%rsp)
1705         xor             %r10,%r10
1706         vmovdqa         $xb1,0x10(%rsp)
1707         lea             0x40($out),$out         # out+=64*1
1708         vmovdqa         $xc1,0x20(%rsp)
1709         sub             \$64,$len               # len-=64*1
1710         vmovdqa         $xd1,0x30(%rsp)
1711         jmp             .Loop_tail4xop
1712
1713 .align  32
1714 .L128_or_more4xop:
1715         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1716         vpxor           0x10($inp),$xb0,$xb0
1717         vpxor           0x20($inp),$xc0,$xc0
1718         vpxor           0x30($inp),$xd0,$xd0
1719         vpxor           0x40($inp),$xa1,$xa1
1720         vpxor           0x50($inp),$xb1,$xb1
1721         vpxor           0x60($inp),$xc1,$xc1
1722         vpxor           0x70($inp),$xd1,$xd1
1723
1724         vmovdqu         $xa0,0x00($out)
1725         vmovdqu         $xb0,0x10($out)
1726         vmovdqu         $xc0,0x20($out)
1727         vmovdqu         $xd0,0x30($out)
1728         vmovdqu         $xa1,0x40($out)
1729         vmovdqu         $xb1,0x50($out)
1730         vmovdqu         $xc1,0x60($out)
1731         vmovdqu         $xd1,0x70($out)
1732         je              .Ldone4xop
1733
1734         lea             0x80($inp),$inp         # inp+=64*2
1735         vmovdqa         $xa2,0x00(%rsp)
1736         xor             %r10,%r10
1737         vmovdqa         $xb2,0x10(%rsp)
1738         lea             0x80($out),$out         # out+=64*2
1739         vmovdqa         $xc2,0x20(%rsp)
1740         sub             \$128,$len              # len-=64*2
1741         vmovdqa         $xd2,0x30(%rsp)
1742         jmp             .Loop_tail4xop
1743
1744 .align  32
1745 .L192_or_more4xop:
1746         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1747         vpxor           0x10($inp),$xb0,$xb0
1748         vpxor           0x20($inp),$xc0,$xc0
1749         vpxor           0x30($inp),$xd0,$xd0
1750         vpxor           0x40($inp),$xa1,$xa1
1751         vpxor           0x50($inp),$xb1,$xb1
1752         vpxor           0x60($inp),$xc1,$xc1
1753         vpxor           0x70($inp),$xd1,$xd1
1754         lea             0x80($inp),$inp         # size optimization
1755         vpxor           0x00($inp),$xa2,$xa2
1756         vpxor           0x10($inp),$xb2,$xb2
1757         vpxor           0x20($inp),$xc2,$xc2
1758         vpxor           0x30($inp),$xd2,$xd2
1759
1760         vmovdqu         $xa0,0x00($out)
1761         vmovdqu         $xb0,0x10($out)
1762         vmovdqu         $xc0,0x20($out)
1763         vmovdqu         $xd0,0x30($out)
1764         vmovdqu         $xa1,0x40($out)
1765         vmovdqu         $xb1,0x50($out)
1766         vmovdqu         $xc1,0x60($out)
1767         vmovdqu         $xd1,0x70($out)
1768         lea             0x80($out),$out         # size optimization
1769         vmovdqu         $xa2,0x00($out)
1770         vmovdqu         $xb2,0x10($out)
1771         vmovdqu         $xc2,0x20($out)
1772         vmovdqu         $xd2,0x30($out)
1773         je              .Ldone4xop
1774
1775         lea             0x40($inp),$inp         # inp+=64*3
1776         vmovdqa         $xa3,0x00(%rsp)
1777         xor             %r10,%r10
1778         vmovdqa         $xb3,0x10(%rsp)
1779         lea             0x40($out),$out         # out+=64*3
1780         vmovdqa         $xc3,0x20(%rsp)
1781         sub             \$192,$len              # len-=64*3
1782         vmovdqa         $xd3,0x30(%rsp)
1783
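	# xor the last 1..63 bytes against the keystream block stashed at (%rsp)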
1784 .Loop_tail4xop:
1785         movzb           ($inp,%r10),%eax
1786         movzb           (%rsp,%r10),%ecx
1787         lea             1(%r10),%r10
1788         xor             %ecx,%eax
1789         mov             %al,-1($out,%r10)
1790         dec             $len
1791         jnz             .Loop_tail4xop
1792
1793 .Ldone4xop:
1794         vzeroupper
1795 ___
1796 $code.=<<___    if ($win64);
1797         movaps          -0xa8(%r9),%xmm6
1798         movaps          -0x98(%r9),%xmm7
1799         movaps          -0x88(%r9),%xmm8
1800         movaps          -0x78(%r9),%xmm9
1801         movaps          -0x68(%r9),%xmm10
1802         movaps          -0x58(%r9),%xmm11
1803         movaps          -0x48(%r9),%xmm12
1804         movaps          -0x38(%r9),%xmm13
1805         movaps          -0x28(%r9),%xmm14
1806         movaps          -0x18(%r9),%xmm15
1807 ___
1808 $code.=<<___;
1809         lea             (%r9),%rsp
1810 .cfi_def_cfa_register   %rsp
1811 .L4xop_epilogue:
1812         ret
1813 .cfi_endproc
1814 .size   ChaCha20_4xop,.-ChaCha20_4xop
1815 ___
1816 }
1817
1818 ########################################################################
1819 # AVX2 code path
1820 if ($avx>1) {
1821 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1822     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1823 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1824         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
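# (the "%nox" entries are placeholders for the four 'c' words, which are
#  kept on the stack rather than in registers; see the comment inside
#  AVX2_lane_ROUND below)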
1825
1826 sub AVX2_lane_ROUND {
1827 my ($a0,$b0,$c0,$d0)=@_;
1828 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1829 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1830 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1831 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1832 my @x=map("\"$_\"",@xx);
1833
1834         # Consider the order in which the variables are addressed by their
1835         # index:
1836         #
1837         #       a   b   c   d
1838         #
1839         #       0   4   8  12 < even round
1840         #       1   5   9  13
1841         #       2   6  10  14
1842         #       3   7  11  15
1843         #       0   5  10  15 < odd round
1844         #       1   6  11  12
1845         #       2   7   8  13
1846         #       3   4   9  14
1847         #
1848         # 'a', 'b' and 'd's are permanently allocated in registers,
1849         # @x[0..7,12..15], while 'c's are maintained in memory. If
1850         # you observe the 'c' column, you'll notice that a pair of 'c's
1851         # is invariant between rounds. This means that we have to reload
1852         # them only once per round, in the middle. This is why you'll see
1853         # a bunch of 'c' stores and loads in the middle, but none at the
1854         # beginning or end.
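	#
	# As a concrete check of the index arithmetic: the map expression
	# ($_&~3)+(($_+1)&3) used above rotates an index within its group
	# of four, e.g. 4->5, 7->4, 12->13, 15->12.  Starting from
	# (0,4,8,12) it yields the remaining even-round columns (1,5,9,13),
	# (2,6,10,14) and (3,7,11,15); starting from (0,5,10,15) it yields
	# the odd-round diagonals (1,6,11,12), (2,7,8,13) and (3,4,9,14),
	# matching the table above.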
1855
1856         (
1857         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1858         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1859         "&vpshufb       (@x[$d0],@x[$d0],$t1)",
1860          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1861          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1862          "&vpshufb      (@x[$d1],@x[$d1],$t1)",
1863
1864         "&vpaddd        ($xc,$xc,@x[$d0])",
1865         "&vpxor         (@x[$b0],$xc,@x[$b0])",
1866         "&vpslld        ($t0,@x[$b0],12)",
1867         "&vpsrld        (@x[$b0],@x[$b0],20)",
1868         "&vpor          (@x[$b0],$t0,@x[$b0])",
1869         "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
1870          "&vpaddd       ($xc_,$xc_,@x[$d1])",
1871          "&vpxor        (@x[$b1],$xc_,@x[$b1])",
1872          "&vpslld       ($t1,@x[$b1],12)",
1873          "&vpsrld       (@x[$b1],@x[$b1],20)",
1874          "&vpor         (@x[$b1],$t1,@x[$b1])",
1875
1876         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
1877         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1878         "&vpshufb       (@x[$d0],@x[$d0],$t0)",
1879          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
1880          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1881          "&vpshufb      (@x[$d1],@x[$d1],$t0)",
1882
1883         "&vpaddd        ($xc,$xc,@x[$d0])",
1884         "&vpxor         (@x[$b0],$xc,@x[$b0])",
1885         "&vpslld        ($t1,@x[$b0],7)",
1886         "&vpsrld        (@x[$b0],@x[$b0],25)",
1887         "&vpor          (@x[$b0],$t1,@x[$b0])",
1888         "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
1889          "&vpaddd       ($xc_,$xc_,@x[$d1])",
1890          "&vpxor        (@x[$b1],$xc_,@x[$b1])",
1891          "&vpslld       ($t0,@x[$b1],7)",
1892          "&vpsrld       (@x[$b1],@x[$b1],25)",
1893          "&vpor         (@x[$b1],$t0,@x[$b1])",
1894
1895         "&vmovdqa       (\"`32*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
1896          "&vmovdqa      (\"`32*($c1-8)`(%rsp)\",$xc_)",
1897         "&vmovdqa       ($xc,\"`32*($c2-8)`(%rsp)\")",
1898          "&vmovdqa      ($xc_,\"`32*($c3-8)`(%rsp)\")",
1899
1900         "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1901         "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
1902         "&vpshufb       (@x[$d2],@x[$d2],$t1)",
1903          "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1904          "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
1905          "&vpshufb      (@x[$d3],@x[$d3],$t1)",
1906
1907         "&vpaddd        ($xc,$xc,@x[$d2])",
1908         "&vpxor         (@x[$b2],$xc,@x[$b2])",
1909         "&vpslld        ($t0,@x[$b2],12)",
1910         "&vpsrld        (@x[$b2],@x[$b2],20)",
1911         "&vpor          (@x[$b2],$t0,@x[$b2])",
1912         "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
1913          "&vpaddd       ($xc_,$xc_,@x[$d3])",
1914          "&vpxor        (@x[$b3],$xc_,@x[$b3])",
1915          "&vpslld       ($t1,@x[$b3],12)",
1916          "&vpsrld       (@x[$b3],@x[$b3],20)",
1917          "&vpor         (@x[$b3],$t1,@x[$b3])",
1918
1919         "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",
1920         "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
1921         "&vpshufb       (@x[$d2],@x[$d2],$t0)",
1922          "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",
1923          "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
1924          "&vpshufb      (@x[$d3],@x[$d3],$t0)",
1925
1926         "&vpaddd        ($xc,$xc,@x[$d2])",
1927         "&vpxor         (@x[$b2],$xc,@x[$b2])",
1928         "&vpslld        ($t1,@x[$b2],7)",
1929         "&vpsrld        (@x[$b2],@x[$b2],25)",
1930         "&vpor          (@x[$b2],$t1,@x[$b2])",
1931         "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
1932          "&vpaddd       ($xc_,$xc_,@x[$d3])",
1933          "&vpxor        (@x[$b3],$xc_,@x[$b3])",
1934          "&vpslld       ($t0,@x[$b3],7)",
1935          "&vpsrld       (@x[$b3],@x[$b3],25)",
1936          "&vpor         (@x[$b3],$t0,@x[$b3])"
1937         );
1938 }
1939
1940 my $xframe = $win64 ? 0xa8 : 8;
1941
1942 $code.=<<___;
1943 .type   ChaCha20_8x,\@function,5
1944 .align  32
1945 ChaCha20_8x:
1946 .cfi_startproc
1947 .LChaCha20_8x:
1948         mov             %rsp,%r9                # frame register
1949 .cfi_def_cfa_register   %r9
1950         sub             \$0x280+$xframe,%rsp
1951         and             \$-32,%rsp
1952 ___
1953 $code.=<<___    if ($win64);
1954         movaps          %xmm6,-0xa8(%r9)
1955         movaps          %xmm7,-0x98(%r9)
1956         movaps          %xmm8,-0x88(%r9)
1957         movaps          %xmm9,-0x78(%r9)
1958         movaps          %xmm10,-0x68(%r9)
1959         movaps          %xmm11,-0x58(%r9)
1960         movaps          %xmm12,-0x48(%r9)
1961         movaps          %xmm13,-0x38(%r9)
1962         movaps          %xmm14,-0x28(%r9)
1963         movaps          %xmm15,-0x18(%r9)
1964 .L8x_body:
1965 ___
1966 $code.=<<___;
1967         vzeroupper
1968
1969         ################ stack layout
1970         # +0x00         SIMD equivalent of @x[8-12]
1971         # ...
1972         # +0x80         constant copy of key[0-2] smashed by lanes
1973         # ...
1974         # +0x200        SIMD counters (with nonce smashed by lanes)
1975         # ...
1976         # +0x280
1977
1978         vbroadcasti128  .Lsigma(%rip),$xa3      # key[0]
1979         vbroadcasti128  ($key),$xb3             # key[1]
1980         vbroadcasti128  16($key),$xt3           # key[2]
1981         vbroadcasti128  ($counter),$xd3         # key[3]
1982         lea             0x100(%rsp),%rcx        # size optimization
1983         lea             0x200(%rsp),%rax        # size optimization
1984         lea             .Lrot16(%rip),%r10
1985         lea             .Lrot24(%rip),%r11
1986
1987         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1988         vpshufd         \$0x55,$xa3,$xa1
1989         vmovdqa         $xa0,0x80-0x100(%rcx)   # ... and offload
1990         vpshufd         \$0xaa,$xa3,$xa2
1991         vmovdqa         $xa1,0xa0-0x100(%rcx)
1992         vpshufd         \$0xff,$xa3,$xa3
1993         vmovdqa         $xa2,0xc0-0x100(%rcx)
1994         vmovdqa         $xa3,0xe0-0x100(%rcx)
1995
1996         vpshufd         \$0x00,$xb3,$xb0
1997         vpshufd         \$0x55,$xb3,$xb1
1998         vmovdqa         $xb0,0x100-0x100(%rcx)
1999         vpshufd         \$0xaa,$xb3,$xb2
2000         vmovdqa         $xb1,0x120-0x100(%rcx)
2001         vpshufd         \$0xff,$xb3,$xb3
2002         vmovdqa         $xb2,0x140-0x100(%rcx)
2003         vmovdqa         $xb3,0x160-0x100(%rcx)
2004
2005         vpshufd         \$0x00,$xt3,$xt0        # "xc0"
2006         vpshufd         \$0x55,$xt3,$xt1        # "xc1"
2007         vmovdqa         $xt0,0x180-0x200(%rax)
2008         vpshufd         \$0xaa,$xt3,$xt2        # "xc2"
2009         vmovdqa         $xt1,0x1a0-0x200(%rax)
2010         vpshufd         \$0xff,$xt3,$xt3        # "xc3"
2011         vmovdqa         $xt2,0x1c0-0x200(%rax)
2012         vmovdqa         $xt3,0x1e0-0x200(%rax)
2013
2014         vpshufd         \$0x00,$xd3,$xd0
2015         vpshufd         \$0x55,$xd3,$xd1
2016         vpaddd          .Lincy(%rip),$xd0,$xd0  # don't save counters yet
2017         vpshufd         \$0xaa,$xd3,$xd2
2018         vmovdqa         $xd1,0x220-0x200(%rax)
2019         vpshufd         \$0xff,$xd3,$xd3
2020         vmovdqa         $xd2,0x240-0x200(%rax)
2021         vmovdqa         $xd3,0x260-0x200(%rax)
2022
2023         jmp             .Loop_enter8x
2024
2025 .align  32
2026 .Loop_outer8x:
2027         vmovdqa         0x80-0x100(%rcx),$xa0   # re-load smashed key
2028         vmovdqa         0xa0-0x100(%rcx),$xa1
2029         vmovdqa         0xc0-0x100(%rcx),$xa2
2030         vmovdqa         0xe0-0x100(%rcx),$xa3
2031         vmovdqa         0x100-0x100(%rcx),$xb0
2032         vmovdqa         0x120-0x100(%rcx),$xb1
2033         vmovdqa         0x140-0x100(%rcx),$xb2
2034         vmovdqa         0x160-0x100(%rcx),$xb3
2035         vmovdqa         0x180-0x200(%rax),$xt0  # "xc0"
2036         vmovdqa         0x1a0-0x200(%rax),$xt1  # "xc1"
2037         vmovdqa         0x1c0-0x200(%rax),$xt2  # "xc2"
2038         vmovdqa         0x1e0-0x200(%rax),$xt3  # "xc3"
2039         vmovdqa         0x200-0x200(%rax),$xd0
2040         vmovdqa         0x220-0x200(%rax),$xd1
2041         vmovdqa         0x240-0x200(%rax),$xd2
2042         vmovdqa         0x260-0x200(%rax),$xd3
2043         vpaddd          .Leight(%rip),$xd0,$xd0 # next SIMD counters
2044
2045 .Loop_enter8x:
2046         vmovdqa         $xt2,0x40(%rsp)         # SIMD equivalent of "@x[10]"
2047         vmovdqa         $xt3,0x60(%rsp)         # SIMD equivalent of "@x[11]"
2048         vbroadcasti128  (%r10),$xt3
2049         vmovdqa         $xd0,0x200-0x200(%rax)  # save SIMD counters
2050         mov             \$10,%eax
2051         jmp             .Loop8x
2052
2053 .align  32
2054 .Loop8x:
2055 ___
2056         foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2057         foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
2058 $code.=<<___;
2059         dec             %eax
2060         jnz             .Loop8x
2061
2062         lea             0x200(%rsp),%rax        # size optimization
2063         vpaddd          0x80-0x100(%rcx),$xa0,$xa0      # accumulate key
2064         vpaddd          0xa0-0x100(%rcx),$xa1,$xa1
2065         vpaddd          0xc0-0x100(%rcx),$xa2,$xa2
2066         vpaddd          0xe0-0x100(%rcx),$xa3,$xa3
2067
2068         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
2069         vpunpckldq      $xa3,$xa2,$xt3
2070         vpunpckhdq      $xa1,$xa0,$xa0
2071         vpunpckhdq      $xa3,$xa2,$xa2
2072         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
2073         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
2074         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
2075         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
2076 ___
2077         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2078 $code.=<<___;
2079         vpaddd          0x100-0x100(%rcx),$xb0,$xb0
2080         vpaddd          0x120-0x100(%rcx),$xb1,$xb1
2081         vpaddd          0x140-0x100(%rcx),$xb2,$xb2
2082         vpaddd          0x160-0x100(%rcx),$xb3,$xb3
2083
2084         vpunpckldq      $xb1,$xb0,$xt2
2085         vpunpckldq      $xb3,$xb2,$xt3
2086         vpunpckhdq      $xb1,$xb0,$xb0
2087         vpunpckhdq      $xb3,$xb2,$xb2
2088         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
2089         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
2090         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
2091         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
2092 ___
2093         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2094 $code.=<<___;
2095         vperm2i128      \$0x20,$xb0,$xa0,$xt3   # "de-interlace" further
2096         vperm2i128      \$0x31,$xb0,$xa0,$xb0
2097         vperm2i128      \$0x20,$xb1,$xa1,$xa0
2098         vperm2i128      \$0x31,$xb1,$xa1,$xb1
2099         vperm2i128      \$0x20,$xb2,$xa2,$xa1
2100         vperm2i128      \$0x31,$xb2,$xa2,$xb2
2101         vperm2i128      \$0x20,$xb3,$xa3,$xa2
2102         vperm2i128      \$0x31,$xb3,$xa3,$xb3
2103 ___
2104         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2105         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
2106 $code.=<<___;
2107         vmovdqa         $xa0,0x00(%rsp)         # offload $xaN
2108         vmovdqa         $xa1,0x20(%rsp)
2109         vmovdqa         0x40(%rsp),$xc2         # $xa0
2110         vmovdqa         0x60(%rsp),$xc3         # $xa1
2111
2112         vpaddd          0x180-0x200(%rax),$xc0,$xc0
2113         vpaddd          0x1a0-0x200(%rax),$xc1,$xc1
2114         vpaddd          0x1c0-0x200(%rax),$xc2,$xc2
2115         vpaddd          0x1e0-0x200(%rax),$xc3,$xc3
2116
2117         vpunpckldq      $xc1,$xc0,$xt2
2118         vpunpckldq      $xc3,$xc2,$xt3
2119         vpunpckhdq      $xc1,$xc0,$xc0
2120         vpunpckhdq      $xc3,$xc2,$xc2
2121         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
2122         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
2123         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
2124         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
2125 ___
2126         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2127 $code.=<<___;
2128         vpaddd          0x200-0x200(%rax),$xd0,$xd0
2129         vpaddd          0x220-0x200(%rax),$xd1,$xd1
2130         vpaddd          0x240-0x200(%rax),$xd2,$xd2
2131         vpaddd          0x260-0x200(%rax),$xd3,$xd3
2132
2133         vpunpckldq      $xd1,$xd0,$xt2
2134         vpunpckldq      $xd3,$xd2,$xt3
2135         vpunpckhdq      $xd1,$xd0,$xd0
2136         vpunpckhdq      $xd3,$xd2,$xd2
2137         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
2138         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
2139         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
2140         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
2141 ___
2142         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2143 $code.=<<___;
2144         vperm2i128      \$0x20,$xd0,$xc0,$xt3   # "de-interlace" further
2145         vperm2i128      \$0x31,$xd0,$xc0,$xd0
2146         vperm2i128      \$0x20,$xd1,$xc1,$xc0
2147         vperm2i128      \$0x31,$xd1,$xc1,$xd1
2148         vperm2i128      \$0x20,$xd2,$xc2,$xc1
2149         vperm2i128      \$0x31,$xd2,$xc2,$xd2
2150         vperm2i128      \$0x20,$xd3,$xc3,$xc2
2151         vperm2i128      \$0x31,$xd3,$xc3,$xd3
2152 ___
2153         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2154         ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
2155         ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
2156         ($xa0,$xa1)=($xt2,$xt3);
2157 $code.=<<___;
2158         vmovdqa         0x00(%rsp),$xa0         # $xaN was offloaded, remember?
2159         vmovdqa         0x20(%rsp),$xa1
2160
2161         cmp             \$64*8,$len
2162         jb              .Ltail8x
2163
2164         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2165         vpxor           0x20($inp),$xb0,$xb0
2166         vpxor           0x40($inp),$xc0,$xc0
2167         vpxor           0x60($inp),$xd0,$xd0
2168         lea             0x80($inp),$inp         # size optimization
2169         vmovdqu         $xa0,0x00($out)
2170         vmovdqu         $xb0,0x20($out)
2171         vmovdqu         $xc0,0x40($out)
2172         vmovdqu         $xd0,0x60($out)
2173         lea             0x80($out),$out         # size optimization
2174
2175         vpxor           0x00($inp),$xa1,$xa1
2176         vpxor           0x20($inp),$xb1,$xb1
2177         vpxor           0x40($inp),$xc1,$xc1
2178         vpxor           0x60($inp),$xd1,$xd1
2179         lea             0x80($inp),$inp         # size optimization
2180         vmovdqu         $xa1,0x00($out)
2181         vmovdqu         $xb1,0x20($out)
2182         vmovdqu         $xc1,0x40($out)
2183         vmovdqu         $xd1,0x60($out)
2184         lea             0x80($out),$out         # size optimization
2185
2186         vpxor           0x00($inp),$xa2,$xa2
2187         vpxor           0x20($inp),$xb2,$xb2
2188         vpxor           0x40($inp),$xc2,$xc2
2189         vpxor           0x60($inp),$xd2,$xd2
2190         lea             0x80($inp),$inp         # size optimization
2191         vmovdqu         $xa2,0x00($out)
2192         vmovdqu         $xb2,0x20($out)
2193         vmovdqu         $xc2,0x40($out)
2194         vmovdqu         $xd2,0x60($out)
2195         lea             0x80($out),$out         # size optimization
2196
2197         vpxor           0x00($inp),$xa3,$xa3
2198         vpxor           0x20($inp),$xb3,$xb3
2199         vpxor           0x40($inp),$xc3,$xc3
2200         vpxor           0x60($inp),$xd3,$xd3
2201         lea             0x80($inp),$inp         # size optimization
2202         vmovdqu         $xa3,0x00($out)
2203         vmovdqu         $xb3,0x20($out)
2204         vmovdqu         $xc3,0x40($out)
2205         vmovdqu         $xd3,0x60($out)
2206         lea             0x80($out),$out         # size optimization
2207
2208         sub             \$64*8,$len
2209         jnz             .Loop_outer8x
2210
2211         jmp             .Ldone8x
2212
2213 .Ltail8x:
2214         cmp             \$448,$len
2215         jae             .L448_or_more8x
2216         cmp             \$384,$len
2217         jae             .L384_or_more8x
2218         cmp             \$320,$len
2219         jae             .L320_or_more8x
2220         cmp             \$256,$len
2221         jae             .L256_or_more8x
2222         cmp             \$192,$len
2223         jae             .L192_or_more8x
2224         cmp             \$128,$len
2225         jae             .L128_or_more8x
2226         cmp             \$64,$len
2227         jae             .L64_or_more8x
2228
2229         xor             %r10,%r10
2230         vmovdqa         $xa0,0x00(%rsp)
2231         vmovdqa         $xb0,0x20(%rsp)
2232         jmp             .Loop_tail8x
2233
2234 .align  32
2235 .L64_or_more8x:
2236         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2237         vpxor           0x20($inp),$xb0,$xb0
2238         vmovdqu         $xa0,0x00($out)
2239         vmovdqu         $xb0,0x20($out)
2240         je              .Ldone8x
2241
2242         lea             0x40($inp),$inp         # inp+=64*1
2243         xor             %r10,%r10
2244         vmovdqa         $xc0,0x00(%rsp)
2245         lea             0x40($out),$out         # out+=64*1
2246         sub             \$64,$len               # len-=64*1
2247         vmovdqa         $xd0,0x20(%rsp)
2248         jmp             .Loop_tail8x
2249
2250 .align  32
2251 .L128_or_more8x:
2252         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2253         vpxor           0x20($inp),$xb0,$xb0
2254         vpxor           0x40($inp),$xc0,$xc0
2255         vpxor           0x60($inp),$xd0,$xd0
2256         vmovdqu         $xa0,0x00($out)
2257         vmovdqu         $xb0,0x20($out)
2258         vmovdqu         $xc0,0x40($out)
2259         vmovdqu         $xd0,0x60($out)
2260         je              .Ldone8x
2261
2262         lea             0x80($inp),$inp         # inp+=64*2
2263         xor             %r10,%r10
2264         vmovdqa         $xa1,0x00(%rsp)
2265         lea             0x80($out),$out         # out+=64*2
2266         sub             \$128,$len              # len-=64*2
2267         vmovdqa         $xb1,0x20(%rsp)
2268         jmp             .Loop_tail8x
2269
2270 .align  32
2271 .L192_or_more8x:
2272         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2273         vpxor           0x20($inp),$xb0,$xb0
2274         vpxor           0x40($inp),$xc0,$xc0
2275         vpxor           0x60($inp),$xd0,$xd0
2276         vpxor           0x80($inp),$xa1,$xa1
2277         vpxor           0xa0($inp),$xb1,$xb1
2278         vmovdqu         $xa0,0x00($out)
2279         vmovdqu         $xb0,0x20($out)
2280         vmovdqu         $xc0,0x40($out)
2281         vmovdqu         $xd0,0x60($out)
2282         vmovdqu         $xa1,0x80($out)
2283         vmovdqu         $xb1,0xa0($out)
2284         je              .Ldone8x
2285
2286         lea             0xc0($inp),$inp         # inp+=64*3
2287         xor             %r10,%r10
2288         vmovdqa         $xc1,0x00(%rsp)
2289         lea             0xc0($out),$out         # out+=64*3
2290         sub             \$192,$len              # len-=64*3
2291         vmovdqa         $xd1,0x20(%rsp)
2292         jmp             .Loop_tail8x
2293
2294 .align  32
2295 .L256_or_more8x:
2296         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2297         vpxor           0x20($inp),$xb0,$xb0
2298         vpxor           0x40($inp),$xc0,$xc0
2299         vpxor           0x60($inp),$xd0,$xd0
2300         vpxor           0x80($inp),$xa1,$xa1
2301         vpxor           0xa0($inp),$xb1,$xb1
2302         vpxor           0xc0($inp),$xc1,$xc1
2303         vpxor           0xe0($inp),$xd1,$xd1
2304         vmovdqu         $xa0,0x00($out)
2305         vmovdqu         $xb0,0x20($out)
2306         vmovdqu         $xc0,0x40($out)
2307         vmovdqu         $xd0,0x60($out)
2308         vmovdqu         $xa1,0x80($out)
2309         vmovdqu         $xb1,0xa0($out)
2310         vmovdqu         $xc1,0xc0($out)
2311         vmovdqu         $xd1,0xe0($out)
2312         je              .Ldone8x
2313
2314         lea             0x100($inp),$inp        # inp+=64*4
2315         xor             %r10,%r10
2316         vmovdqa         $xa2,0x00(%rsp)
2317         lea             0x100($out),$out        # out+=64*4
2318         sub             \$256,$len              # len-=64*4
2319         vmovdqa         $xb2,0x20(%rsp)
2320         jmp             .Loop_tail8x
2321
2322 .align  32
2323 .L320_or_more8x:
2324         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2325         vpxor           0x20($inp),$xb0,$xb0
2326         vpxor           0x40($inp),$xc0,$xc0
2327         vpxor           0x60($inp),$xd0,$xd0
2328         vpxor           0x80($inp),$xa1,$xa1
2329         vpxor           0xa0($inp),$xb1,$xb1
2330         vpxor           0xc0($inp),$xc1,$xc1
2331         vpxor           0xe0($inp),$xd1,$xd1
2332         vpxor           0x100($inp),$xa2,$xa2
2333         vpxor           0x120($inp),$xb2,$xb2
2334         vmovdqu         $xa0,0x00($out)
2335         vmovdqu         $xb0,0x20($out)
2336         vmovdqu         $xc0,0x40($out)
2337         vmovdqu         $xd0,0x60($out)
2338         vmovdqu         $xa1,0x80($out)
2339         vmovdqu         $xb1,0xa0($out)
2340         vmovdqu         $xc1,0xc0($out)
2341         vmovdqu         $xd1,0xe0($out)
2342         vmovdqu         $xa2,0x100($out)
2343         vmovdqu         $xb2,0x120($out)
2344         je              .Ldone8x
2345
2346         lea             0x140($inp),$inp        # inp+=64*5
2347         xor             %r10,%r10
2348         vmovdqa         $xc2,0x00(%rsp)
2349         lea             0x140($out),$out        # out+=64*5
2350         sub             \$320,$len              # len-=64*5
2351         vmovdqa         $xd2,0x20(%rsp)
2352         jmp             .Loop_tail8x
2353
2354 .align  32
2355 .L384_or_more8x:
2356         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2357         vpxor           0x20($inp),$xb0,$xb0
2358         vpxor           0x40($inp),$xc0,$xc0
2359         vpxor           0x60($inp),$xd0,$xd0
2360         vpxor           0x80($inp),$xa1,$xa1
2361         vpxor           0xa0($inp),$xb1,$xb1
2362         vpxor           0xc0($inp),$xc1,$xc1
2363         vpxor           0xe0($inp),$xd1,$xd1
2364         vpxor           0x100($inp),$xa2,$xa2
2365         vpxor           0x120($inp),$xb2,$xb2
2366         vpxor           0x140($inp),$xc2,$xc2
2367         vpxor           0x160($inp),$xd2,$xd2
2368         vmovdqu         $xa0,0x00($out)
2369         vmovdqu         $xb0,0x20($out)
2370         vmovdqu         $xc0,0x40($out)
2371         vmovdqu         $xd0,0x60($out)
2372         vmovdqu         $xa1,0x80($out)
2373         vmovdqu         $xb1,0xa0($out)
2374         vmovdqu         $xc1,0xc0($out)
2375         vmovdqu         $xd1,0xe0($out)
2376         vmovdqu         $xa2,0x100($out)
2377         vmovdqu         $xb2,0x120($out)
2378         vmovdqu         $xc2,0x140($out)
2379         vmovdqu         $xd2,0x160($out)
2380         je              .Ldone8x
2381
2382         lea             0x180($inp),$inp        # inp+=64*6
2383         xor             %r10,%r10
2384         vmovdqa         $xa3,0x00(%rsp)
2385         lea             0x180($out),$out        # out+=64*6
2386         sub             \$384,$len              # len-=64*6
2387         vmovdqa         $xb3,0x20(%rsp)
2388         jmp             .Loop_tail8x
2389
2390 .align  32
2391 .L448_or_more8x:
2392         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2393         vpxor           0x20($inp),$xb0,$xb0
2394         vpxor           0x40($inp),$xc0,$xc0
2395         vpxor           0x60($inp),$xd0,$xd0
2396         vpxor           0x80($inp),$xa1,$xa1
2397         vpxor           0xa0($inp),$xb1,$xb1
2398         vpxor           0xc0($inp),$xc1,$xc1
2399         vpxor           0xe0($inp),$xd1,$xd1
2400         vpxor           0x100($inp),$xa2,$xa2
2401         vpxor           0x120($inp),$xb2,$xb2
2402         vpxor           0x140($inp),$xc2,$xc2
2403         vpxor           0x160($inp),$xd2,$xd2
2404         vpxor           0x180($inp),$xa3,$xa3
2405         vpxor           0x1a0($inp),$xb3,$xb3
2406         vmovdqu         $xa0,0x00($out)
2407         vmovdqu         $xb0,0x20($out)
2408         vmovdqu         $xc0,0x40($out)
2409         vmovdqu         $xd0,0x60($out)
2410         vmovdqu         $xa1,0x80($out)
2411         vmovdqu         $xb1,0xa0($out)
2412         vmovdqu         $xc1,0xc0($out)
2413         vmovdqu         $xd1,0xe0($out)
2414         vmovdqu         $xa2,0x100($out)
2415         vmovdqu         $xb2,0x120($out)
2416         vmovdqu         $xc2,0x140($out)
2417         vmovdqu         $xd2,0x160($out)
2418         vmovdqu         $xa3,0x180($out)
2419         vmovdqu         $xb3,0x1a0($out)
2420         je              .Ldone8x
2421
2422         lea             0x1c0($inp),$inp        # inp+=64*7
2423         xor             %r10,%r10
2424         vmovdqa         $xc3,0x00(%rsp)
2425         lea             0x1c0($out),$out        # out+=64*7
2426         sub             \$448,$len              # len-=64*7
2427         vmovdqa         $xd3,0x20(%rsp)
2428
2429 .Loop_tail8x:
2430         movzb           ($inp,%r10),%eax
2431         movzb           (%rsp,%r10),%ecx
2432         lea             1(%r10),%r10
2433         xor             %ecx,%eax
2434         mov             %al,-1($out,%r10)
2435         dec             $len
2436         jnz             .Loop_tail8x
2437
2438 .Ldone8x:
2439         vzeroall
2440 ___
2441 $code.=<<___    if ($win64);
2442         movaps          -0xa8(%r9),%xmm6
2443         movaps          -0x98(%r9),%xmm7
2444         movaps          -0x88(%r9),%xmm8
2445         movaps          -0x78(%r9),%xmm9
2446         movaps          -0x68(%r9),%xmm10
2447         movaps          -0x58(%r9),%xmm11
2448         movaps          -0x48(%r9),%xmm12
2449         movaps          -0x38(%r9),%xmm13
2450         movaps          -0x28(%r9),%xmm14
2451         movaps          -0x18(%r9),%xmm15
2452 ___
2453 $code.=<<___;
2454         lea             (%r9),%rsp
2455 .cfi_def_cfa_register   %rsp
2456 .L8x_epilogue:
2457         ret
2458 .cfi_endproc
2459 .size   ChaCha20_8x,.-ChaCha20_8x
2460 ___
2461 }
2462
2463 ########################################################################
2464 # AVX512 code paths
2465 if ($avx>2) {
2466 # This one handles shorter inputs...
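# The idea: vbroadcasti32x4 replicates each 16-byte row of the state
# across a %zmm register, so every 128-bit lane carries an independent
# ChaCha block.  .Lzeroz and .Lfourz (loaded into $fourz) are assumed to
# give the four lanes block counters +0..+3 and to advance them by 4 per
# outer-loop iteration; finished blocks are then peeled off one 128-bit
# lane at a time with vextracti32x4.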
2467
2468 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2469 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2470
2471 sub vpxord()            # size optimization
2472 { my $opcode = "vpxor"; # adhere to vpxor when possible
2473
2474     foreach (@_) {
2475         if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
2476             $opcode = "vpxord";
2477             last;
2478         }
2479     }
2480
2481     $code .= "\t$opcode\t".join(',',reverse @_)."\n";
2482 }
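
# The point of the helper above: plain vpxor takes a 2- or 3-byte VEX
# prefix, whereas vpxord is EVEX-only and needs a 4-byte prefix.  The
# EVEX form is required only when a %zmm register or %ymm16..31 is
# involved, which is exactly what the pattern above checks for, so
# sticking to vpxor whenever it is legal saves a byte or two per
# instruction.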
2483
2484 sub AVX512ROUND {       # critical path is 14 "SIMD ticks" per round
2485         &vpaddd ($a,$a,$b);
2486         &vpxord ($d,$d,$a);
2487         &vprold ($d,$d,16);
2488
2489         &vpaddd ($c,$c,$d);
2490         &vpxord ($b,$b,$c);
2491         &vprold ($b,$b,12);
2492
2493         &vpaddd ($a,$a,$b);
2494         &vpxord ($d,$d,$a);
2495         &vprold ($d,$d,8);
2496
2497         &vpaddd ($c,$c,$d);
2498         &vpxord ($b,$b,$c);
2499         &vprold ($b,$b,7);
2500 }
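
# For reference, the scalar quarter-round that AVX512ROUND above applies
# to every 32-bit lane in parallel (vprold standing in for the vpshufb
# and shift/or rotates of the pre-AVX512 paths).  A minimal illustrative
# sketch only; nothing in the generator below calls it.
sub ChaCha_quarter_round_ref {
my ($a,$b,$c,$d)=@_;
my $rotl32 = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n)))&0xffffffff; };

	$a=($a+$b)&0xffffffff;	$d=$rotl32->($d^$a,16);
	$c=($c+$d)&0xffffffff;	$b=$rotl32->($b^$c,12);
	$a=($a+$b)&0xffffffff;	$d=$rotl32->($d^$a,8);
	$c=($c+$d)&0xffffffff;	$b=$rotl32->($b^$c,7);

	($a,$b,$c,$d);
}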
2501
2502 my $xframe = $win64 ? 32+8 : 8;
2503
2504 $code.=<<___;
2505 .type   ChaCha20_avx512,\@function,5
2506 .align  32
2507 ChaCha20_avx512:
2508 .cfi_startproc
2509 .LChaCha20_avx512:
2510         mov     %rsp,%r9                # frame pointer
2511 .cfi_def_cfa_register   %r9
2512         cmp     \$512,$len
2513         ja      .LChaCha20_16x
2514
2515         sub     \$64+$xframe,%rsp
2516 ___
2517 $code.=<<___    if ($win64);
2518         movaps  %xmm6,-0x28(%r9)
2519         movaps  %xmm7,-0x18(%r9)
2520 .Lavx512_body:
2521 ___
2522 $code.=<<___;
2523         vbroadcasti32x4 .Lsigma(%rip),$a
2524         vbroadcasti32x4 ($key),$b
2525         vbroadcasti32x4 16($key),$c
2526         vbroadcasti32x4 ($counter),$d
2527
2528         vmovdqa32       $a,$a_
2529         vmovdqa32       $b,$b_
2530         vmovdqa32       $c,$c_
2531         vpaddd          .Lzeroz(%rip),$d,$d
2532         vmovdqa32       .Lfourz(%rip),$fourz
2533         mov             \$10,$counter   # reuse $counter
2534         vmovdqa32       $d,$d_
2535         jmp             .Loop_avx512
2536
2537 .align  16
2538 .Loop_outer_avx512:
2539         vmovdqa32       $a_,$a
2540         vmovdqa32       $b_,$b
2541         vmovdqa32       $c_,$c
2542         vpaddd          $fourz,$d_,$d
2543         mov             \$10,$counter
2544         vmovdqa32       $d,$d_
2545         jmp             .Loop_avx512
2546
2547 .align  32
2548 .Loop_avx512:
2549 ___
2550         &AVX512ROUND();
2551         &vpshufd        ($c,$c,0b01001110);
2552         &vpshufd        ($b,$b,0b00111001);
2553         &vpshufd        ($d,$d,0b10010011);
2554
2555         &AVX512ROUND();
2556         &vpshufd        ($c,$c,0b01001110);
2557         &vpshufd        ($b,$b,0b10010011);
2558         &vpshufd        ($d,$d,0b00111001);
2559
2560         &dec            ($counter);
2561         &jnz            (".Loop_avx512");
2562
2563 $code.=<<___;
2564         vpaddd          $a_,$a,$a
2565         vpaddd          $b_,$b,$b
2566         vpaddd          $c_,$c,$c
2567         vpaddd          $d_,$d,$d
2568
2569         sub             \$64,$len
2570         jb              .Ltail64_avx512
2571
2572         vpxor           0x00($inp),%x#$a,$t0    # xor with input
2573         vpxor           0x10($inp),%x#$b,$t1
2574         vpxor           0x20($inp),%x#$c,$t2
2575         vpxor           0x30($inp),%x#$d,$t3
2576         lea             0x40($inp),$inp         # inp+=64
2577
2578         vmovdqu         $t0,0x00($out)          # write output
2579         vmovdqu         $t1,0x10($out)
2580         vmovdqu         $t2,0x20($out)
2581         vmovdqu         $t3,0x30($out)
2582         lea             0x40($out),$out         # out+=64
2583
2584         jz              .Ldone_avx512
2585
2586         vextracti32x4   \$1,$a,$t0
2587         vextracti32x4   \$1,$b,$t1
2588         vextracti32x4   \$1,$c,$t2
2589         vextracti32x4   \$1,$d,$t3
2590
2591         sub             \$64,$len
2592         jb              .Ltail_avx512
2593
2594         vpxor           0x00($inp),$t0,$t0      # xor with input
2595         vpxor           0x10($inp),$t1,$t1
2596         vpxor           0x20($inp),$t2,$t2
2597         vpxor           0x30($inp),$t3,$t3
2598         lea             0x40($inp),$inp         # inp+=64
2599
2600         vmovdqu         $t0,0x00($out)          # write output
2601         vmovdqu         $t1,0x10($out)
2602         vmovdqu         $t2,0x20($out)
2603         vmovdqu         $t3,0x30($out)
2604         lea             0x40($out),$out         # out+=64
2605
2606         jz              .Ldone_avx512
2607
2608         vextracti32x4   \$2,$a,$t0
2609         vextracti32x4   \$2,$b,$t1
2610         vextracti32x4   \$2,$c,$t2
2611         vextracti32x4   \$2,$d,$t3
2612
2613         sub             \$64,$len
2614         jb              .Ltail_avx512
2615
2616         vpxor           0x00($inp),$t0,$t0      # xor with input
2617         vpxor           0x10($inp),$t1,$t1
2618         vpxor           0x20($inp),$t2,$t2
2619         vpxor           0x30($inp),$t3,$t3
2620         lea             0x40($inp),$inp         # inp+=64
2621
2622         vmovdqu         $t0,0x00($out)          # write output
2623         vmovdqu         $t1,0x10($out)
2624         vmovdqu         $t2,0x20($out)
2625         vmovdqu         $t3,0x30($out)
2626         lea             0x40($out),$out         # out+=64
2627
2628         jz              .Ldone_avx512
2629
2630         vextracti32x4   \$3,$a,$t0
2631         vextracti32x4   \$3,$b,$t1
2632         vextracti32x4   \$3,$c,$t2
2633         vextracti32x4   \$3,$d,$t3
2634
2635         sub             \$64,$len
2636         jb              .Ltail_avx512
2637
2638         vpxor           0x00($inp),$t0,$t0      # xor with input
2639         vpxor           0x10($inp),$t1,$t1
2640         vpxor           0x20($inp),$t2,$t2
2641         vpxor           0x30($inp),$t3,$t3
2642         lea             0x40($inp),$inp         # inp+=64
2643
2644         vmovdqu         $t0,0x00($out)          # write output
2645         vmovdqu         $t1,0x10($out)
2646         vmovdqu         $t2,0x20($out)
2647         vmovdqu         $t3,0x30($out)
2648         lea             0x40($out),$out         # out+=64
2649
2650         jnz             .Loop_outer_avx512
2651
2652         jmp             .Ldone_avx512
2653
2654 .align  16
2655 .Ltail64_avx512:
2656         vmovdqa         %x#$a,0x00(%rsp)
2657         vmovdqa         %x#$b,0x10(%rsp)
2658         vmovdqa         %x#$c,0x20(%rsp)
2659         vmovdqa         %x#$d,0x30(%rsp)
2660         add             \$64,$len
2661         jmp             .Loop_tail_avx512
2662
2663 .align  16
2664 .Ltail_avx512:
2665         vmovdqa         $t0,0x00(%rsp)
2666         vmovdqa         $t1,0x10(%rsp)
2667         vmovdqa         $t2,0x20(%rsp)
2668         vmovdqa         $t3,0x30(%rsp)
2669         add             \$64,$len
2670
2671 .Loop_tail_avx512:
2672         movzb           ($inp,$counter),%eax
2673         movzb           (%rsp,$counter),%ecx
2674         lea             1($counter),$counter
2675         xor             %ecx,%eax
2676         mov             %al,-1($out,$counter)
2677         dec             $len
2678         jnz             .Loop_tail_avx512
2679
2680         vmovdqu32       $a_,0x00(%rsp)          # wipe keystream block stashed on stack
2681
2682 .Ldone_avx512:
2683         vzeroall
2684 ___
2685 $code.=<<___    if ($win64);
2686         movaps  -0x28(%r9),%xmm6
2687         movaps  -0x18(%r9),%xmm7
2688 ___
2689 $code.=<<___;
2690         lea     (%r9),%rsp
2691 .cfi_def_cfa_register   %rsp
2692 .Lavx512_epilogue:
2693         ret
2694 .cfi_endproc
2695 .size   ChaCha20_avx512,.-ChaCha20_avx512
2696 ___
2697
2698 map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
2699
2700 $code.=<<___;
2701 .type   ChaCha20_avx512vl,\@function,5
2702 .align  32
2703 ChaCha20_avx512vl:
2704 .cfi_startproc
2705 .LChaCha20_avx512vl:
2706         mov     %rsp,%r9                # frame pointer
2707 .cfi_def_cfa_register   %r9
2708         cmp     \$128,$len
2709         ja      .LChaCha20_8xvl
2710
2711         sub     \$64+$xframe,%rsp
2712 ___
2713 $code.=<<___    if ($win64);
2714         movaps  %xmm6,-0x28(%r9)
2715         movaps  %xmm7,-0x18(%r9)
2716 .Lavx512vl_body:
2717 ___
2718 $code.=<<___;
2719         vbroadcasti128  .Lsigma(%rip),$a
2720         vbroadcasti128  ($key),$b
2721         vbroadcasti128  16($key),$c
2722         vbroadcasti128  ($counter),$d
2723
2724         vmovdqa32       $a,$a_
2725         vmovdqa32       $b,$b_
2726         vmovdqa32       $c,$c_
2727         vpaddd          .Lzeroz(%rip),$d,$d
2728         vmovdqa32       .Ltwoy(%rip),$fourz
2729         mov             \$10,$counter   # reuse $counter
2730         vmovdqa32       $d,$d_
2731         jmp             .Loop_avx512vl
2732
2733 .align  16
2734 .Loop_outer_avx512vl:
2735         vmovdqa32       $c_,$c
2736         vpaddd          $fourz,$d_,$d
2737         mov             \$10,$counter
2738         vmovdqa32       $d,$d_
2739         jmp             .Loop_avx512vl
2740
2741 .align  32
2742 .Loop_avx512vl:
2743 ___
2744         &AVX512ROUND();
2745         &vpshufd        ($c,$c,0b01001110);
2746         &vpshufd        ($b,$b,0b00111001);
2747         &vpshufd        ($d,$d,0b10010011);
2748
2749         &AVX512ROUND();
2750         &vpshufd        ($c,$c,0b01001110);
2751         &vpshufd        ($b,$b,0b10010011);
2752         &vpshufd        ($d,$d,0b00111001);
2753
2754         &dec            ($counter);
2755         &jnz            (".Loop_avx512vl");
2756
2757 $code.=<<___;
2758         vpaddd          $a_,$a,$a
2759         vpaddd          $b_,$b,$b
2760         vpaddd          $c_,$c,$c
2761         vpaddd          $d_,$d,$d
2762
2763         sub             \$64,$len
2764         jb              .Ltail64_avx512vl
2765
2766         vpxor           0x00($inp),%x#$a,$t0    # xor with input
2767         vpxor           0x10($inp),%x#$b,$t1
2768         vpxor           0x20($inp),%x#$c,$t2
2769         vpxor           0x30($inp),%x#$d,$t3
2770         lea             0x40($inp),$inp         # inp+=64
2771
2772         vmovdqu         $t0,0x00($out)          # write output
2773         vmovdqu         $t1,0x10($out)
2774         vmovdqu         $t2,0x20($out)
2775         vmovdqu         $t3,0x30($out)
2776         lea             0x40($out),$out         # out+=64
2777
2778         jz              .Ldone_avx512vl
2779
2780         vextracti128    \$1,$a,$t0
2781         vextracti128    \$1,$b,$t1
2782         vextracti128    \$1,$c,$t2
2783         vextracti128    \$1,$d,$t3
2784
2785         sub             \$64,$len
2786         jb              .Ltail_avx512vl
2787
2788         vpxor           0x00($inp),$t0,$t0      # xor with input
2789         vpxor           0x10($inp),$t1,$t1
2790         vpxor           0x20($inp),$t2,$t2
2791         vpxor           0x30($inp),$t3,$t3
2792         lea             0x40($inp),$inp         # inp+=64
2793
2794         vmovdqu         $t0,0x00($out)          # write output
2795         vmovdqu         $t1,0x10($out)
2796         vmovdqu         $t2,0x20($out)
2797         vmovdqu         $t3,0x30($out)
2798         lea             0x40($out),$out         # out+=64
2799
2800         vmovdqa32       $a_,$a
2801         vmovdqa32       $b_,$b
2802         jnz             .Loop_outer_avx512vl
2803
2804         jmp             .Ldone_avx512vl
2805
2806 .align  16
2807 .Ltail64_avx512vl:
2808         vmovdqa         %x#$a,0x00(%rsp)
2809         vmovdqa         %x#$b,0x10(%rsp)
2810         vmovdqa         %x#$c,0x20(%rsp)
2811         vmovdqa         %x#$d,0x30(%rsp)
2812         add             \$64,$len
2813         jmp             .Loop_tail_avx512vl
2814
2815 .align  16
2816 .Ltail_avx512vl:
2817         vmovdqa         $t0,0x00(%rsp)
2818         vmovdqa         $t1,0x10(%rsp)
2819         vmovdqa         $t2,0x20(%rsp)
2820         vmovdqa         $t3,0x30(%rsp)
2821         add             \$64,$len
2822
2823 .Loop_tail_avx512vl:
2824         movzb           ($inp,$counter),%eax
2825         movzb           (%rsp,$counter),%ecx
2826         lea             1($counter),$counter
2827         xor             %ecx,%eax
2828         mov             %al,-1($out,$counter)
2829         dec             $len
2830         jnz             .Loop_tail_avx512vl
2831
2832         vmovdqu32       $a_,0x00(%rsp)          # wipe keystream block stashed on stack
2833         vmovdqu32       $a_,0x20(%rsp)          # ($a_ is 32 bytes wide here, hence two stores)
2834
2835 .Ldone_avx512vl:
2836         vzeroall
2837 ___
2838 $code.=<<___    if ($win64);
2839         movaps  -0x28(%r9),%xmm6
2840         movaps  -0x18(%r9),%xmm7
2841 ___
2842 $code.=<<___;
2843         lea     (%r9),%rsp
2844 .cfi_def_cfa_register   %rsp
2845 .Lavx512vl_epilogue:
2846         ret
2847 .cfi_endproc
2848 .size   ChaCha20_avx512vl,.-ChaCha20_avx512vl
2849 ___
2850 }
2851 if ($avx>2) {
2852 # This one handles longer inputs...
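# Here each of the 16 %zmm registers $xa0..$xd3 holds one of the 16
# state words for all 16 blocks, i.e. one block per 32-bit lane.  Unlike
# the 4x/8x paths above, the smashed key material stays resident in
# @key (%zmm16..31) instead of being spilled to the stack, hence the
# much smaller frame below.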
2853
2854 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2855     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2856 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2857          $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2858 my @key=map("%zmm$_",(16..31));
2859 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2860
2861 sub AVX512_lane_ROUND {
2862 my ($a0,$b0,$c0,$d0)=@_;
2863 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2864 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2865 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2866 my @x=map("\"$_\"",@xx);
2867
2868         (
2869         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
2870          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
2871           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
2872            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
2873         "&vpxord        (@x[$d0],@x[$d0],@x[$a0])",
2874          "&vpxord       (@x[$d1],@x[$d1],@x[$a1])",
2875           "&vpxord      (@x[$d2],@x[$d2],@x[$a2])",
2876            "&vpxord     (@x[$d3],@x[$d3],@x[$a3])",
2877         "&vprold        (@x[$d0],@x[$d0],16)",
2878          "&vprold       (@x[$d1],@x[$d1],16)",
2879           "&vprold      (@x[$d2],@x[$d2],16)",
2880            "&vprold     (@x[$d3],@x[$d3],16)",
2881
2882         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
2883          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
2884           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
2885            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
2886         "&vpxord        (@x[$b0],@x[$b0],@x[$c0])",
2887          "&vpxord       (@x[$b1],@x[$b1],@x[$c1])",
2888           "&vpxord      (@x[$b2],@x[$b2],@x[$c2])",
2889            "&vpxord     (@x[$b3],@x[$b3],@x[$c3])",
2890         "&vprold        (@x[$b0],@x[$b0],12)",
2891          "&vprold       (@x[$b1],@x[$b1],12)",
2892           "&vprold      (@x[$b2],@x[$b2],12)",
2893            "&vprold     (@x[$b3],@x[$b3],12)",
2894
2895         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
2896          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
2897           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
2898            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
2899         "&vpxord        (@x[$d0],@x[$d0],@x[$a0])",
2900          "&vpxord       (@x[$d1],@x[$d1],@x[$a1])",
2901           "&vpxord      (@x[$d2],@x[$d2],@x[$a2])",
2902            "&vpxord     (@x[$d3],@x[$d3],@x[$a3])",
2903         "&vprold        (@x[$d0],@x[$d0],8)",
2904          "&vprold       (@x[$d1],@x[$d1],8)",
2905           "&vprold      (@x[$d2],@x[$d2],8)",
2906            "&vprold     (@x[$d3],@x[$d3],8)",
2907
2908         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
2909          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
2910           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
2911            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
2912         "&vpxord        (@x[$b0],@x[$b0],@x[$c0])",
2913          "&vpxord       (@x[$b1],@x[$b1],@x[$c1])",
2914           "&vpxord      (@x[$b2],@x[$b2],@x[$c2])",
2915            "&vpxord     (@x[$b3],@x[$b3],@x[$c3])",
2916         "&vprold        (@x[$b0],@x[$b0],7)",
2917          "&vprold       (@x[$b1],@x[$b1],7)",
2918           "&vprold      (@x[$b2],@x[$b2],7)",
2919            "&vprold     (@x[$b3],@x[$b3],7)"
2920         );
2921 }
2922
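# frame: 64 bytes of 64-byte-aligned scratch at (%rsp) for the partial-block
# tail, plus room below %r9 for the Win64 %xmm6-%xmm15 spill area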
2923 my $xframe = $win64 ? 0xa8 : 8;
2924
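# ChaCha20_16x: AVX512F code path, generating and XOR-ing 16 ChaCha blocks
# (1024 bytes) per outer-loop iteration across the full %zmm register file.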
2925 $code.=<<___;
2926 .type   ChaCha20_16x,\@function,5
2927 .align  32
2928 ChaCha20_16x:
2929 .cfi_startproc
2930 .LChaCha20_16x:
2931         mov             %rsp,%r9                # frame register
2932 .cfi_def_cfa_register   %r9
2933         sub             \$64+$xframe,%rsp
2934         and             \$-64,%rsp
2935 ___
2936 $code.=<<___    if ($win64);
2937         movaps          %xmm6,-0xa8(%r9)
2938         movaps          %xmm7,-0x98(%r9)
2939         movaps          %xmm8,-0x88(%r9)
2940         movaps          %xmm9,-0x78(%r9)
2941         movaps          %xmm10,-0x68(%r9)
2942         movaps          %xmm11,-0x58(%r9)
2943         movaps          %xmm12,-0x48(%r9)
2944         movaps          %xmm13,-0x38(%r9)
2945         movaps          %xmm14,-0x28(%r9)
2946         movaps          %xmm15,-0x18(%r9)
2947 .L16x_body:
2948 ___
2949 $code.=<<___;
2950         vzeroupper
2951
2952         lea             .Lsigma(%rip),%r10
2953         vbroadcasti32x4 (%r10),$xa3             # key[0]
2954         vbroadcasti32x4 ($key),$xb3             # key[1]
2955         vbroadcasti32x4 16($key),$xc3           # key[2]
2956         vbroadcasti32x4 ($counter),$xd3         # key[3]
2957
2958         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
2959         vpshufd         \$0x55,$xa3,$xa1
2960         vpshufd         \$0xaa,$xa3,$xa2
2961         vpshufd         \$0xff,$xa3,$xa3
2962         vmovdqa64       $xa0,@key[0]
2963         vmovdqa64       $xa1,@key[1]
2964         vmovdqa64       $xa2,@key[2]
2965         vmovdqa64       $xa3,@key[3]
2966
2967         vpshufd         \$0x00,$xb3,$xb0
2968         vpshufd         \$0x55,$xb3,$xb1
2969         vpshufd         \$0xaa,$xb3,$xb2
2970         vpshufd         \$0xff,$xb3,$xb3
2971         vmovdqa64       $xb0,@key[4]
2972         vmovdqa64       $xb1,@key[5]
2973         vmovdqa64       $xb2,@key[6]
2974         vmovdqa64       $xb3,@key[7]
2975
2976         vpshufd         \$0x00,$xc3,$xc0
2977         vpshufd         \$0x55,$xc3,$xc1
2978         vpshufd         \$0xaa,$xc3,$xc2
2979         vpshufd         \$0xff,$xc3,$xc3
2980         vmovdqa64       $xc0,@key[8]
2981         vmovdqa64       $xc1,@key[9]
2982         vmovdqa64       $xc2,@key[10]
2983         vmovdqa64       $xc3,@key[11]
2984
2985         vpshufd         \$0x00,$xd3,$xd0
2986         vpshufd         \$0x55,$xd3,$xd1
2987         vpshufd         \$0xaa,$xd3,$xd2
2988         vpshufd         \$0xff,$xd3,$xd3
2989         vpaddd          .Lincz(%rip),$xd0,$xd0  # don't save counters yet
2990         vmovdqa64       $xd0,@key[12]
2991         vmovdqa64       $xd1,@key[13]
2992         vmovdqa64       $xd2,@key[14]
2993         vmovdqa64       $xd3,@key[15]
2994
2995         mov             \$10,%eax
2996         jmp             .Loop16x
2997
2998 .align  32
2999 .Loop_outer16x:
3000         vpbroadcastd    0(%r10),$xa0            # reload key
3001         vpbroadcastd    4(%r10),$xa1
3002         vpbroadcastd    8(%r10),$xa2
3003         vpbroadcastd    12(%r10),$xa3
3004         vpaddd          .Lsixteen(%rip),@key[12],@key[12]       # next SIMD counters
3005         vmovdqa64       @key[4],$xb0
3006         vmovdqa64       @key[5],$xb1
3007         vmovdqa64       @key[6],$xb2
3008         vmovdqa64       @key[7],$xb3
3009         vmovdqa64       @key[8],$xc0
3010         vmovdqa64       @key[9],$xc1
3011         vmovdqa64       @key[10],$xc2
3012         vmovdqa64       @key[11],$xc3
3013         vmovdqa64       @key[12],$xd0
3014         vmovdqa64       @key[13],$xd1
3015         vmovdqa64       @key[14],$xd2
3016         vmovdqa64       @key[15],$xd3
3017
3018         vmovdqa64       $xa0,@key[0]
3019         vmovdqa64       $xa1,@key[1]
3020         vmovdqa64       $xa2,@key[2]
3021         vmovdqa64       $xa3,@key[3]
3022
3023         mov             \$10,%eax
3024         jmp             .Loop16x
3025
3026 .align  32
3027 .Loop16x:
3028 ___
3029         foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3030         foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3031 $code.=<<___;
3032         dec             %eax
3033         jnz             .Loop16x
3034
3035         vpaddd          @key[0],$xa0,$xa0       # accumulate key
3036         vpaddd          @key[1],$xa1,$xa1
3037         vpaddd          @key[2],$xa2,$xa2
3038         vpaddd          @key[3],$xa3,$xa3
3039
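	# transpose so that each register ends up holding one whole 64-byte
	# block instead of one 32-bit word from each of the 16 blocks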
3040         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
3041         vpunpckldq      $xa3,$xa2,$xt3
3042         vpunpckhdq      $xa1,$xa0,$xa0
3043         vpunpckhdq      $xa3,$xa2,$xa2
3044         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
3045         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
3046         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
3047         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
3048 ___
3049         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3050 $code.=<<___;
3051         vpaddd          @key[4],$xb0,$xb0
3052         vpaddd          @key[5],$xb1,$xb1
3053         vpaddd          @key[6],$xb2,$xb2
3054         vpaddd          @key[7],$xb3,$xb3
3055
3056         vpunpckldq      $xb1,$xb0,$xt2
3057         vpunpckldq      $xb3,$xb2,$xt3
3058         vpunpckhdq      $xb1,$xb0,$xb0
3059         vpunpckhdq      $xb3,$xb2,$xb2
3060         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
3061         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
3062         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
3063         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
3064 ___
3065         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3066 $code.=<<___;
3067         vshufi32x4      \$0x44,$xb0,$xa0,$xt3   # "de-interlace" further
3068         vshufi32x4      \$0xee,$xb0,$xa0,$xb0
3069         vshufi32x4      \$0x44,$xb1,$xa1,$xa0
3070         vshufi32x4      \$0xee,$xb1,$xa1,$xb1
3071         vshufi32x4      \$0x44,$xb2,$xa2,$xa1
3072         vshufi32x4      \$0xee,$xb2,$xa2,$xb2
3073         vshufi32x4      \$0x44,$xb3,$xa3,$xa2
3074         vshufi32x4      \$0xee,$xb3,$xa3,$xb3
3075 ___
3076         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3077 $code.=<<___;
3078         vpaddd          @key[8],$xc0,$xc0
3079         vpaddd          @key[9],$xc1,$xc1
3080         vpaddd          @key[10],$xc2,$xc2
3081         vpaddd          @key[11],$xc3,$xc3
3082
3083         vpunpckldq      $xc1,$xc0,$xt2
3084         vpunpckldq      $xc3,$xc2,$xt3
3085         vpunpckhdq      $xc1,$xc0,$xc0
3086         vpunpckhdq      $xc3,$xc2,$xc2
3087         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
3088         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
3089         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
3090         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
3091 ___
3092         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3093 $code.=<<___;
3094         vpaddd          @key[12],$xd0,$xd0
3095         vpaddd          @key[13],$xd1,$xd1
3096         vpaddd          @key[14],$xd2,$xd2
3097         vpaddd          @key[15],$xd3,$xd3
3098
3099         vpunpckldq      $xd1,$xd0,$xt2
3100         vpunpckldq      $xd3,$xd2,$xt3
3101         vpunpckhdq      $xd1,$xd0,$xd0
3102         vpunpckhdq      $xd3,$xd2,$xd2
3103         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
3104         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
3105         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
3106         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
3107 ___
3108         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3109 $code.=<<___;
3110         vshufi32x4      \$0x44,$xd0,$xc0,$xt3   # "de-interlace" further
3111         vshufi32x4      \$0xee,$xd0,$xc0,$xd0
3112         vshufi32x4      \$0x44,$xd1,$xc1,$xc0
3113         vshufi32x4      \$0xee,$xd1,$xc1,$xd1
3114         vshufi32x4      \$0x44,$xd2,$xc2,$xc1
3115         vshufi32x4      \$0xee,$xd2,$xc2,$xd2
3116         vshufi32x4      \$0x44,$xd3,$xc3,$xc2
3117         vshufi32x4      \$0xee,$xd3,$xc3,$xd3
3118 ___
3119         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3120 $code.=<<___;
3121         vshufi32x4      \$0x88,$xc0,$xa0,$xt0   # "de-interlace" further
3122         vshufi32x4      \$0xdd,$xc0,$xa0,$xa0
3123          vshufi32x4     \$0x88,$xd0,$xb0,$xc0
3124          vshufi32x4     \$0xdd,$xd0,$xb0,$xd0
3125         vshufi32x4      \$0x88,$xc1,$xa1,$xt1
3126         vshufi32x4      \$0xdd,$xc1,$xa1,$xa1
3127          vshufi32x4     \$0x88,$xd1,$xb1,$xc1
3128          vshufi32x4     \$0xdd,$xd1,$xb1,$xd1
3129         vshufi32x4      \$0x88,$xc2,$xa2,$xt2
3130         vshufi32x4      \$0xdd,$xc2,$xa2,$xa2
3131          vshufi32x4     \$0x88,$xd2,$xb2,$xc2
3132          vshufi32x4     \$0xdd,$xd2,$xb2,$xd2
3133         vshufi32x4      \$0x88,$xc3,$xa3,$xt3
3134         vshufi32x4      \$0xdd,$xc3,$xa3,$xa3
3135          vshufi32x4     \$0x88,$xd3,$xb3,$xc3
3136          vshufi32x4     \$0xdd,$xd3,$xb3,$xd3
3137 ___
3138         ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
3139         ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
3140
3141         ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
3142          $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
3143         ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3144          $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
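# Each %zmm register now holds 64 contiguous bytes of keystream;
# ($xa0,$xb0,$xc0,$xd0,$xa1,...) is the output order.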
3145 $code.=<<___;
3146         cmp             \$64*16,$len
3147         jb              .Ltail16x
3148
3149         vpxord          0x00($inp),$xa0,$xa0    # xor with input
3150         vpxord          0x40($inp),$xb0,$xb0
3151         vpxord          0x80($inp),$xc0,$xc0
3152         vpxord          0xc0($inp),$xd0,$xd0
3153         vmovdqu32       $xa0,0x00($out)
3154         vmovdqu32       $xb0,0x40($out)
3155         vmovdqu32       $xc0,0x80($out)
3156         vmovdqu32       $xd0,0xc0($out)
3157
3158         vpxord          0x100($inp),$xa1,$xa1
3159         vpxord          0x140($inp),$xb1,$xb1
3160         vpxord          0x180($inp),$xc1,$xc1
3161         vpxord          0x1c0($inp),$xd1,$xd1
3162         vmovdqu32       $xa1,0x100($out)
3163         vmovdqu32       $xb1,0x140($out)
3164         vmovdqu32       $xc1,0x180($out)
3165         vmovdqu32       $xd1,0x1c0($out)
3166
3167         vpxord          0x200($inp),$xa2,$xa2
3168         vpxord          0x240($inp),$xb2,$xb2
3169         vpxord          0x280($inp),$xc2,$xc2
3170         vpxord          0x2c0($inp),$xd2,$xd2
3171         vmovdqu32       $xa2,0x200($out)
3172         vmovdqu32       $xb2,0x240($out)
3173         vmovdqu32       $xc2,0x280($out)
3174         vmovdqu32       $xd2,0x2c0($out)
3175
3176         vpxord          0x300($inp),$xa3,$xa3
3177         vpxord          0x340($inp),$xb3,$xb3
3178         vpxord          0x380($inp),$xc3,$xc3
3179         vpxord          0x3c0($inp),$xd3,$xd3
3180         lea             0x400($inp),$inp
3181         vmovdqu32       $xa3,0x300($out)
3182         vmovdqu32       $xb3,0x340($out)
3183         vmovdqu32       $xc3,0x380($out)
3184         vmovdqu32       $xd3,0x3c0($out)
3185         lea             0x400($out),$out
3186
3187         sub             \$64*16,$len
3188         jnz             .Loop_outer16x
3189
3190         jmp             .Ldone16x
3191
3192 .align  32
3193 .Ltail16x:
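	# store whatever whole 64-byte blocks remain, one at a time, then
	# bounce the final partial block through the scratch area at (%rsp)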
3194         xor             %r10,%r10
3195         sub             $inp,$out
3196         cmp             \$64*1,$len
3197         jb              .Less_than_64_16x
3198         vpxord          ($inp),$xa0,$xa0        # xor with input
3199         vmovdqu32       $xa0,($out,$inp)
3200         je              .Ldone16x
3201         vmovdqa32       $xb0,$xa0
3202         lea             64($inp),$inp
3203
3204         cmp             \$64*2,$len
3205         jb              .Less_than_64_16x
3206         vpxord          ($inp),$xb0,$xb0
3207         vmovdqu32       $xb0,($out,$inp)
3208         je              .Ldone16x
3209         vmovdqa32       $xc0,$xa0
3210         lea             64($inp),$inp
3211
3212         cmp             \$64*3,$len
3213         jb              .Less_than_64_16x
3214         vpxord          ($inp),$xc0,$xc0
3215         vmovdqu32       $xc0,($out,$inp)
3216         je              .Ldone16x
3217         vmovdqa32       $xd0,$xa0
3218         lea             64($inp),$inp
3219
3220         cmp             \$64*4,$len
3221         jb              .Less_than_64_16x
3222         vpxord          ($inp),$xd0,$xd0
3223         vmovdqu32       $xd0,($out,$inp)
3224         je              .Ldone16x
3225         vmovdqa32       $xa1,$xa0
3226         lea             64($inp),$inp
3227
3228         cmp             \$64*5,$len
3229         jb              .Less_than_64_16x
3230         vpxord          ($inp),$xa1,$xa1
3231         vmovdqu32       $xa1,($out,$inp)
3232         je              .Ldone16x
3233         vmovdqa32       $xb1,$xa0
3234         lea             64($inp),$inp
3235
3236         cmp             \$64*6,$len
3237         jb              .Less_than_64_16x
3238         vpxord          ($inp),$xb1,$xb1
3239         vmovdqu32       $xb1,($out,$inp)
3240         je              .Ldone16x
3241         vmovdqa32       $xc1,$xa0
3242         lea             64($inp),$inp
3243
3244         cmp             \$64*7,$len
3245         jb              .Less_than_64_16x
3246         vpxord          ($inp),$xc1,$xc1
3247         vmovdqu32       $xc1,($out,$inp)
3248         je              .Ldone16x
3249         vmovdqa32       $xd1,$xa0
3250         lea             64($inp),$inp
3251
3252         cmp             \$64*8,$len
3253         jb              .Less_than_64_16x
3254         vpxord          ($inp),$xd1,$xd1
3255         vmovdqu32       $xd1,($out,$inp)
3256         je              .Ldone16x
3257         vmovdqa32       $xa2,$xa0
3258         lea             64($inp),$inp
3259
3260         cmp             \$64*9,$len
3261         jb              .Less_than_64_16x
3262         vpxord          ($inp),$xa2,$xa2
3263         vmovdqu32       $xa2,($out,$inp)
3264         je              .Ldone16x
3265         vmovdqa32       $xb2,$xa0
3266         lea             64($inp),$inp
3267
3268         cmp             \$64*10,$len
3269         jb              .Less_than_64_16x
3270         vpxord          ($inp),$xb2,$xb2
3271         vmovdqu32       $xb2,($out,$inp)
3272         je              .Ldone16x
3273         vmovdqa32       $xc2,$xa0
3274         lea             64($inp),$inp
3275
3276         cmp             \$64*11,$len
3277         jb              .Less_than_64_16x
3278         vpxord          ($inp),$xc2,$xc2
3279         vmovdqu32       $xc2,($out,$inp)
3280         je              .Ldone16x
3281         vmovdqa32       $xd2,$xa0
3282         lea             64($inp),$inp
3283
3284         cmp             \$64*12,$len
3285         jb              .Less_than_64_16x
3286         vpxord          ($inp),$xd2,$xd2
3287         vmovdqu32       $xd2,($out,$inp)
3288         je              .Ldone16x
3289         vmovdqa32       $xa3,$xa0
3290         lea             64($inp),$inp
3291
3292         cmp             \$64*13,$len
3293         jb              .Less_than_64_16x
3294         vpxord          ($inp),$xa3,$xa3
3295         vmovdqu32       $xa3,($out,$inp)
3296         je              .Ldone16x
3297         vmovdqa32       $xb3,$xa0
3298         lea             64($inp),$inp
3299
3300         cmp             \$64*14,$len
3301         jb              .Less_than_64_16x
3302         vpxord          ($inp),$xb3,$xb3
3303         vmovdqu32       $xb3,($out,$inp)
3304         je              .Ldone16x
3305         vmovdqa32       $xc3,$xa0
3306         lea             64($inp),$inp
3307
3308         cmp             \$64*15,$len
3309         jb              .Less_than_64_16x
3310         vpxord          ($inp),$xc3,$xc3
3311         vmovdqu32       $xc3,($out,$inp)
3312         je              .Ldone16x
3313         vmovdqa32       $xd3,$xa0
3314         lea             64($inp),$inp
3315
3316 .Less_than_64_16x:
3317         vmovdqa32       $xa0,0x00(%rsp)
3318         lea             ($out,$inp),$out
3319         and             \$63,$len
3320
3321 .Loop_tail16x:
3322         movzb           ($inp,%r10),%eax
3323         movzb           (%rsp,%r10),%ecx
3324         lea             1(%r10),%r10
3325         xor             %ecx,%eax
3326         mov             %al,-1($out,%r10)
3327         dec             $len
3328         jnz             .Loop_tail16x
3329
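	# wipe the keystream copy left in the scratch area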
3330         vpxord          $xa0,$xa0,$xa0
3331         vmovdqa32       $xa0,0(%rsp)
3332
3333 .Ldone16x:
3334         vzeroall
3335 ___
3336 $code.=<<___    if ($win64);
3337         movaps          -0xa8(%r9),%xmm6
3338         movaps          -0x98(%r9),%xmm7
3339         movaps          -0x88(%r9),%xmm8
3340         movaps          -0x78(%r9),%xmm9
3341         movaps          -0x68(%r9),%xmm10
3342         movaps          -0x58(%r9),%xmm11
3343         movaps          -0x48(%r9),%xmm12
3344         movaps          -0x38(%r9),%xmm13
3345         movaps          -0x28(%r9),%xmm14
3346         movaps          -0x18(%r9),%xmm15
3347 ___
3348 $code.=<<___;
3349         lea             (%r9),%rsp
3350 .cfi_def_cfa_register   %rsp
3351 .L16x_epilogue:
3352         ret
3353 .cfi_endproc
3354 .size   ChaCha20_16x,.-ChaCha20_16x
3355 ___
3356
3357 # switch to %ymm domain
3358 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3359  $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
3360 @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3361      $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3362 @key=map("%ymm$_",(16..31));
3363 ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
3364
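# ChaCha20_8xvl: AVX512VL variant of the above in the 256-bit %ymm domain,
# 8 blocks (512 bytes) per outer-loop iteration.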
3365 $code.=<<___;
3366 .type   ChaCha20_8xvl,\@function,5
3367 .align  32
3368 ChaCha20_8xvl:
3369 .cfi_startproc
3370 .LChaCha20_8xvl:
3371         mov             %rsp,%r9                # frame register
3372 .cfi_def_cfa_register   %r9
3373         sub             \$64+$xframe,%rsp
3374         and             \$-64,%rsp
3375 ___
3376 $code.=<<___    if ($win64);
3377         movaps          %xmm6,-0xa8(%r9)
3378         movaps          %xmm7,-0x98(%r9)
3379         movaps          %xmm8,-0x88(%r9)
3380         movaps          %xmm9,-0x78(%r9)
3381         movaps          %xmm10,-0x68(%r9)
3382         movaps          %xmm11,-0x58(%r9)
3383         movaps          %xmm12,-0x48(%r9)
3384         movaps          %xmm13,-0x38(%r9)
3385         movaps          %xmm14,-0x28(%r9)
3386         movaps          %xmm15,-0x18(%r9)
3387 .L8xvl_body:
3388 ___
3389 $code.=<<___;
3390         vzeroupper
3391
3392         lea             .Lsigma(%rip),%r10
3393         vbroadcasti128  (%r10),$xa3             # key[0]
3394         vbroadcasti128  ($key),$xb3             # key[1]
3395         vbroadcasti128  16($key),$xc3           # key[2]
3396         vbroadcasti128  ($counter),$xd3         # key[3]
3397
3398         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
3399         vpshufd         \$0x55,$xa3,$xa1
3400         vpshufd         \$0xaa,$xa3,$xa2
3401         vpshufd         \$0xff,$xa3,$xa3
3402         vmovdqa64       $xa0,@key[0]
3403         vmovdqa64       $xa1,@key[1]
3404         vmovdqa64       $xa2,@key[2]
3405         vmovdqa64       $xa3,@key[3]
3406
3407         vpshufd         \$0x00,$xb3,$xb0
3408         vpshufd         \$0x55,$xb3,$xb1
3409         vpshufd         \$0xaa,$xb3,$xb2
3410         vpshufd         \$0xff,$xb3,$xb3
3411         vmovdqa64       $xb0,@key[4]
3412         vmovdqa64       $xb1,@key[5]
3413         vmovdqa64       $xb2,@key[6]
3414         vmovdqa64       $xb3,@key[7]
3415
3416         vpshufd         \$0x00,$xc3,$xc0
3417         vpshufd         \$0x55,$xc3,$xc1
3418         vpshufd         \$0xaa,$xc3,$xc2
3419         vpshufd         \$0xff,$xc3,$xc3
3420         vmovdqa64       $xc0,@key[8]
3421         vmovdqa64       $xc1,@key[9]
3422         vmovdqa64       $xc2,@key[10]
3423         vmovdqa64       $xc3,@key[11]
3424
3425         vpshufd         \$0x00,$xd3,$xd0
3426         vpshufd         \$0x55,$xd3,$xd1
3427         vpshufd         \$0xaa,$xd3,$xd2
3428         vpshufd         \$0xff,$xd3,$xd3
3429         vpaddd          .Lincy(%rip),$xd0,$xd0  # don't save counters yet
3430         vmovdqa64       $xd0,@key[12]
3431         vmovdqa64       $xd1,@key[13]
3432         vmovdqa64       $xd2,@key[14]
3433         vmovdqa64       $xd3,@key[15]
3434
3435         mov             \$10,%eax
3436         jmp             .Loop8xvl
3437
3438 .align  32
3439 .Loop_outer8xvl:
3440         #vpbroadcastd   0(%r10),$xa0            # reload key
3441         #vpbroadcastd   4(%r10),$xa1
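	# (key words 0 and 1 are reloaded into %ymm0/%ymm1, i.e. $xa0/$xa1,
	#  at the bottom of the store loop instead)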
3442         vpbroadcastd    8(%r10),$xa2
3443         vpbroadcastd    12(%r10),$xa3
3444         vpaddd          .Leight(%rip),@key[12],@key[12] # next SIMD counters
3445         vmovdqa64       @key[4],$xb0
3446         vmovdqa64       @key[5],$xb1
3447         vmovdqa64       @key[6],$xb2
3448         vmovdqa64       @key[7],$xb3
3449         vmovdqa64       @key[8],$xc0
3450         vmovdqa64       @key[9],$xc1
3451         vmovdqa64       @key[10],$xc2
3452         vmovdqa64       @key[11],$xc3
3453         vmovdqa64       @key[12],$xd0
3454         vmovdqa64       @key[13],$xd1
3455         vmovdqa64       @key[14],$xd2
3456         vmovdqa64       @key[15],$xd3
3457
3458         vmovdqa64       $xa0,@key[0]
3459         vmovdqa64       $xa1,@key[1]
3460         vmovdqa64       $xa2,@key[2]
3461         vmovdqa64       $xa3,@key[3]
3462
3463         mov             \$10,%eax
3464         jmp             .Loop8xvl
3465
3466 .align  32
3467 .Loop8xvl:
3468 ___
3469         foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3470         foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3471 $code.=<<___;
3472         dec             %eax
3473         jnz             .Loop8xvl
3474
3475         vpaddd          @key[0],$xa0,$xa0       # accumulate key
3476         vpaddd          @key[1],$xa1,$xa1
3477         vpaddd          @key[2],$xa2,$xa2
3478         vpaddd          @key[3],$xa3,$xa3
3479
3480         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
3481         vpunpckldq      $xa3,$xa2,$xt3
3482         vpunpckhdq      $xa1,$xa0,$xa0
3483         vpunpckhdq      $xa3,$xa2,$xa2
3484         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
3485         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
3486         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
3487         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
3488 ___
3489         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3490 $code.=<<___;
3491         vpaddd          @key[4],$xb0,$xb0
3492         vpaddd          @key[5],$xb1,$xb1
3493         vpaddd          @key[6],$xb2,$xb2
3494         vpaddd          @key[7],$xb3,$xb3
3495
3496         vpunpckldq      $xb1,$xb0,$xt2
3497         vpunpckldq      $xb3,$xb2,$xt3
3498         vpunpckhdq      $xb1,$xb0,$xb0
3499         vpunpckhdq      $xb3,$xb2,$xb2
3500         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
3501         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
3502         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
3503         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
3504 ___
3505         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3506 $code.=<<___;
3507         vshufi32x4      \$0,$xb0,$xa0,$xt3      # "de-interlace" further
3508         vshufi32x4      \$3,$xb0,$xa0,$xb0
3509         vshufi32x4      \$0,$xb1,$xa1,$xa0
3510         vshufi32x4      \$3,$xb1,$xa1,$xb1
3511         vshufi32x4      \$0,$xb2,$xa2,$xa1
3512         vshufi32x4      \$3,$xb2,$xa2,$xb2
3513         vshufi32x4      \$0,$xb3,$xa3,$xa2
3514         vshufi32x4      \$3,$xb3,$xa3,$xb3
3515 ___
3516         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3517 $code.=<<___;
3518         vpaddd          @key[8],$xc0,$xc0
3519         vpaddd          @key[9],$xc1,$xc1
3520         vpaddd          @key[10],$xc2,$xc2
3521         vpaddd          @key[11],$xc3,$xc3
3522
3523         vpunpckldq      $xc1,$xc0,$xt2
3524         vpunpckldq      $xc3,$xc2,$xt3
3525         vpunpckhdq      $xc1,$xc0,$xc0
3526         vpunpckhdq      $xc3,$xc2,$xc2
3527         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
3528         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
3529         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
3530         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
3531 ___
3532         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3533 $code.=<<___;
3534         vpaddd          @key[12],$xd0,$xd0
3535         vpaddd          @key[13],$xd1,$xd1
3536         vpaddd          @key[14],$xd2,$xd2
3537         vpaddd          @key[15],$xd3,$xd3
3538
3539         vpunpckldq      $xd1,$xd0,$xt2
3540         vpunpckldq      $xd3,$xd2,$xt3
3541         vpunpckhdq      $xd1,$xd0,$xd0
3542         vpunpckhdq      $xd3,$xd2,$xd2
3543         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
3544         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
3545         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
3546         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
3547 ___
3548         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3549 $code.=<<___;
3550         vperm2i128      \$0x20,$xd0,$xc0,$xt3   # "de-interlace" further
3551         vperm2i128      \$0x31,$xd0,$xc0,$xd0
3552         vperm2i128      \$0x20,$xd1,$xc1,$xc0
3553         vperm2i128      \$0x31,$xd1,$xc1,$xd1
3554         vperm2i128      \$0x20,$xd2,$xc2,$xc1
3555         vperm2i128      \$0x31,$xd2,$xc2,$xd2
3556         vperm2i128      \$0x20,$xd3,$xc3,$xc2
3557         vperm2i128      \$0x31,$xd3,$xc3,$xd3
3558 ___
3559         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3560         ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
3561         ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
3562 $code.=<<___;
3563         cmp             \$64*8,$len
3564         jb              .Ltail8xvl
3565
3566         mov             \$0x80,%eax             # size optimization
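	# keeping 0x80 in %rax lets the pointer bumps below use the
	# base+index lea form and avoid 4-byte displacements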
3567         vpxord          0x00($inp),$xa0,$xa0    # xor with input
3568         vpxor           0x20($inp),$xb0,$xb0
3569         vpxor           0x40($inp),$xc0,$xc0
3570         vpxor           0x60($inp),$xd0,$xd0
3571         lea             ($inp,%rax),$inp        # size optimization
3572         vmovdqu32       $xa0,0x00($out)
3573         vmovdqu         $xb0,0x20($out)
3574         vmovdqu         $xc0,0x40($out)
3575         vmovdqu         $xd0,0x60($out)
3576         lea             ($out,%rax),$out        # size optimization
3577
3578         vpxor           0x00($inp),$xa1,$xa1
3579         vpxor           0x20($inp),$xb1,$xb1
3580         vpxor           0x40($inp),$xc1,$xc1
3581         vpxor           0x60($inp),$xd1,$xd1
3582         lea             ($inp,%rax),$inp        # size optimization
3583         vmovdqu         $xa1,0x00($out)
3584         vmovdqu         $xb1,0x20($out)
3585         vmovdqu         $xc1,0x40($out)
3586         vmovdqu         $xd1,0x60($out)
3587         lea             ($out,%rax),$out        # size optimization
3588
3589         vpxord          0x00($inp),$xa2,$xa2
3590         vpxor           0x20($inp),$xb2,$xb2
3591         vpxor           0x40($inp),$xc2,$xc2
3592         vpxor           0x60($inp),$xd2,$xd2
3593         lea             ($inp,%rax),$inp        # size optimization
3594         vmovdqu32       $xa2,0x00($out)
3595         vmovdqu         $xb2,0x20($out)
3596         vmovdqu         $xc2,0x40($out)
3597         vmovdqu         $xd2,0x60($out)
3598         lea             ($out,%rax),$out        # size optimization
3599
3600         vpxor           0x00($inp),$xa3,$xa3
3601         vpxor           0x20($inp),$xb3,$xb3
3602         vpxor           0x40($inp),$xc3,$xc3
3603         vpxor           0x60($inp),$xd3,$xd3
3604         lea             ($inp,%rax),$inp        # size optimization
3605         vmovdqu         $xa3,0x00($out)
3606         vmovdqu         $xb3,0x20($out)
3607         vmovdqu         $xc3,0x40($out)
3608         vmovdqu         $xd3,0x60($out)
3609         lea             ($out,%rax),$out        # size optimization
3610
3611         vpbroadcastd    0(%r10),%ymm0           # reload key
3612         vpbroadcastd    4(%r10),%ymm1
3613
3614         sub             \$64*8,$len
3615         jnz             .Loop_outer8xvl
3616
3617         jmp             .Ldone8xvl
3618
3619 .align  32
3620 .Ltail8xvl:
3621         vmovdqa64       $xa0,%ymm8              # size optimization
3622 ___
3623 $xa0 = "%ymm8";
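# From here on $xa0 aliases %ymm8, so the tail can use the compact VEX-encoded
# vpxor/vmovdqu forms; $xa2 still lives in %ymm18, hence the vpxord/vmovdqa32/
# vmovdqu32 spellings below.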
3624 $code.=<<___;
3625         xor             %r10,%r10
3626         sub             $inp,$out
3627         cmp             \$64*1,$len
3628         jb              .Less_than_64_8xvl
3629         vpxor           0x00($inp),$xa0,$xa0    # xor with input
3630         vpxor           0x20($inp),$xb0,$xb0
3631         vmovdqu         $xa0,0x00($out,$inp)
3632         vmovdqu         $xb0,0x20($out,$inp)
3633         je              .Ldone8xvl
3634         vmovdqa         $xc0,$xa0
3635         vmovdqa         $xd0,$xb0
3636         lea             64($inp),$inp
3637
3638         cmp             \$64*2,$len
3639         jb              .Less_than_64_8xvl
3640         vpxor           0x00($inp),$xc0,$xc0
3641         vpxor           0x20($inp),$xd0,$xd0
3642         vmovdqu         $xc0,0x00($out,$inp)
3643         vmovdqu         $xd0,0x20($out,$inp)
3644         je              .Ldone8xvl
3645         vmovdqa         $xa1,$xa0
3646         vmovdqa         $xb1,$xb0
3647         lea             64($inp),$inp
3648
3649         cmp             \$64*3,$len
3650         jb              .Less_than_64_8xvl
3651         vpxor           0x00($inp),$xa1,$xa1
3652         vpxor           0x20($inp),$xb1,$xb1
3653         vmovdqu         $xa1,0x00($out,$inp)
3654         vmovdqu         $xb1,0x20($out,$inp)
3655         je              .Ldone8xvl
3656         vmovdqa         $xc1,$xa0
3657         vmovdqa         $xd1,$xb0
3658         lea             64($inp),$inp
3659
3660         cmp             \$64*4,$len
3661         jb              .Less_than_64_8xvl
3662         vpxor           0x00($inp),$xc1,$xc1
3663         vpxor           0x20($inp),$xd1,$xd1
3664         vmovdqu         $xc1,0x00($out,$inp)
3665         vmovdqu         $xd1,0x20($out,$inp)
3666         je              .Ldone8xvl
3667         vmovdqa32       $xa2,$xa0
3668         vmovdqa         $xb2,$xb0
3669         lea             64($inp),$inp
3670
3671         cmp             \$64*5,$len
3672         jb              .Less_than_64_8xvl
3673         vpxord          0x00($inp),$xa2,$xa2
3674         vpxor           0x20($inp),$xb2,$xb2
3675         vmovdqu32       $xa2,0x00($out,$inp)
3676         vmovdqu         $xb2,0x20($out,$inp)
3677         je              .Ldone8xvl
3678         vmovdqa         $xc2,$xa0
3679         vmovdqa         $xd2,$xb0
3680         lea             64($inp),$inp
3681
3682         cmp             \$64*6,$len
3683         jb              .Less_than_64_8xvl
3684         vpxor           0x00($inp),$xc2,$xc2
3685         vpxor           0x20($inp),$xd2,$xd2
3686         vmovdqu         $xc2,0x00($out,$inp)
3687         vmovdqu         $xd2,0x20($out,$inp)
3688         je              .Ldone8xvl
3689         vmovdqa         $xa3,$xa0
3690         vmovdqa         $xb3,$xb0
3691         lea             64($inp),$inp
3692
3693         cmp             \$64*7,$len
3694         jb              .Less_than_64_8xvl
3695         vpxor           0x00($inp),$xa3,$xa3
3696         vpxor           0x20($inp),$xb3,$xb3
3697         vmovdqu         $xa3,0x00($out,$inp)
3698         vmovdqu         $xb3,0x20($out,$inp)
3699         je              .Ldone8xvl
3700         vmovdqa         $xc3,$xa0
3701         vmovdqa         $xd3,$xb0
3702         lea             64($inp),$inp
3703
3704 .Less_than_64_8xvl:
3705         vmovdqa         $xa0,0x00(%rsp)
3706         vmovdqa         $xb0,0x20(%rsp)
3707         lea             ($out,$inp),$out
3708         and             \$63,$len
3709
3710 .Loop_tail8xvl:
3711         movzb           ($inp,%r10),%eax
3712         movzb           (%rsp,%r10),%ecx
3713         lea             1(%r10),%r10
3714         xor             %ecx,%eax
3715         mov             %al,-1($out,%r10)
3716         dec             $len
3717         jnz             .Loop_tail8xvl
3718
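	# wipe the keystream copy left in the scratch area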
3719         vpxor           $xa0,$xa0,$xa0
3720         vmovdqa         $xa0,0x00(%rsp)
3721         vmovdqa         $xa0,0x20(%rsp)
3722
3723 .Ldone8xvl:
3724         vzeroall
3725 ___
3726 $code.=<<___    if ($win64);
3727         movaps          -0xa8(%r9),%xmm6
3728         movaps          -0x98(%r9),%xmm7
3729         movaps          -0x88(%r9),%xmm8
3730         movaps          -0x78(%r9),%xmm9
3731         movaps          -0x68(%r9),%xmm10
3732         movaps          -0x58(%r9),%xmm11
3733         movaps          -0x48(%r9),%xmm12
3734         movaps          -0x38(%r9),%xmm13
3735         movaps          -0x28(%r9),%xmm14
3736         movaps          -0x18(%r9),%xmm15
3737 ___
3738 $code.=<<___;
3739         lea             (%r9),%rsp
3740 .cfi_def_cfa_register   %rsp
3741 .L8xvl_epilogue:
3742         ret
3743 .cfi_endproc
3744 .size   ChaCha20_8xvl,.-ChaCha20_8xvl
3745 ___
3746 }
3747
3748 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3749 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
3750 if ($win64) {
3751 $rec="%rcx";
3752 $frame="%rdx";
3753 $context="%r8";
3754 $disp="%r9";
3755
3756 $code.=<<___;
3757 .extern __imp_RtlVirtualUnwind
3758 .type   se_handler,\@abi-omnipotent
3759 .align  16
3760 se_handler:
3761         push    %rsi
3762         push    %rdi
3763         push    %rbx
3764         push    %rbp
3765         push    %r12
3766         push    %r13
3767         push    %r14
3768         push    %r15
3769         pushfq
3770         sub     \$64,%rsp
3771
3772         mov     120($context),%rax      # pull context->Rax
3773         mov     248($context),%rbx      # pull context->Rip
3774
3775         mov     8($disp),%rsi           # disp->ImageBase
3776         mov     56($disp),%r11          # disp->HandlerData
3777
3778         lea     .Lctr32_body(%rip),%r10
3779         cmp     %r10,%rbx               # context->Rip<.Lprologue
3780         jb      .Lcommon_seh_tail
3781
3782         mov     152($context),%rax      # pull context->Rsp
3783
3784         lea     .Lno_data(%rip),%r10    # epilogue label
3785         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
3786         jae     .Lcommon_seh_tail
3787
3788         lea     64+24+48(%rax),%rax
3789
3790         mov     -8(%rax),%rbx
3791         mov     -16(%rax),%rbp
3792         mov     -24(%rax),%r12
3793         mov     -32(%rax),%r13
3794         mov     -40(%rax),%r14
3795         mov     -48(%rax),%r15
3796         mov     %rbx,144($context)      # restore context->Rbx
3797         mov     %rbp,160($context)      # restore context->Rbp
3798         mov     %r12,216($context)      # restore context->R12
3799         mov     %r13,224($context)      # restore context->R13
3800         mov     %r14,232($context)      # restore context->R14
3801         mov     %r15,240($context)      # restore context->R15
3802
3803 .Lcommon_seh_tail:
3804         mov     8(%rax),%rdi
3805         mov     16(%rax),%rsi
3806         mov     %rax,152($context)      # restore context->Rsp
3807         mov     %rsi,168($context)      # restore context->Rsi
3808         mov     %rdi,176($context)      # restore context->Rdi
3809
3810         mov     40($disp),%rdi          # disp->ContextRecord
3811         mov     $context,%rsi           # context
3812         mov     \$154,%ecx              # sizeof(CONTEXT)
3813         .long   0xa548f3fc              # cld; rep movsq
3814
3815         mov     $disp,%rsi
3816         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
3817         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
3818         mov     0(%rsi),%r8             # arg3, disp->ControlPc
3819         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
3820         mov     40(%rsi),%r10           # disp->ContextRecord
3821         lea     56(%rsi),%r11           # &disp->HandlerData
3822         lea     24(%rsi),%r12           # &disp->EstablisherFrame
3823         mov     %r10,32(%rsp)           # arg5
3824         mov     %r11,40(%rsp)           # arg6
3825         mov     %r12,48(%rsp)           # arg7
3826         mov     %rcx,56(%rsp)           # arg8, (NULL)
3827         call    *__imp_RtlVirtualUnwind(%rip)
3828
3829         mov     \$1,%eax                # ExceptionContinueSearch
3830         add     \$64,%rsp
3831         popfq
3832         pop     %r15
3833         pop     %r14
3834         pop     %r13
3835         pop     %r12
3836         pop     %rbp
3837         pop     %rbx
3838         pop     %rdi
3839         pop     %rsi
3840         ret
3841 .size   se_handler,.-se_handler
3842
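# simd_handler restores the non-volatile %xmm registers spilled by the SIMD
# code paths; HandlerData[] supplies the prologue/epilogue labels and the size
# of the spill area kept below the frame pointer in %r9.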
3843 .type   simd_handler,\@abi-omnipotent
3844 .align  16
3845 simd_handler:
3846         push    %rsi
3847         push    %rdi
3848         push    %rbx
3849         push    %rbp
3850         push    %r12
3851         push    %r13
3852         push    %r14
3853         push    %r15
3854         pushfq
3855         sub     \$64,%rsp
3856
3857         mov     120($context),%rax      # pull context->Rax
3858         mov     248($context),%rbx      # pull context->Rip
3859
3860         mov     8($disp),%rsi           # disp->ImageBase
3861         mov     56($disp),%r11          # disp->HandlerData
3862
3863         mov     0(%r11),%r10d           # HandlerData[0]
3864         lea     (%rsi,%r10),%r10        # prologue label
3865         cmp     %r10,%rbx               # context->Rip<prologue label
3866         jb      .Lcommon_seh_tail
3867
3868         mov     192($context),%rax      # pull context->R9
3869
3870         mov     4(%r11),%r10d           # HandlerData[1]
3871         mov     8(%r11),%ecx            # HandlerData[2]
3872         lea     (%rsi,%r10),%r10        # epilogue label
3873         cmp     %r10,%rbx               # context->Rip>=epilogue label
3874         jae     .Lcommon_seh_tail
3875
3876         neg     %rcx
3877         lea     -8(%rax,%rcx),%rsi
3878         lea     512($context),%rdi      # &context.Xmm6
3879         neg     %ecx
3880         shr     \$3,%ecx
3881         .long   0xa548f3fc              # cld; rep movsq
3882
3883         jmp     .Lcommon_seh_tail
3884 .size   simd_handler,.-simd_handler
3885
3886 .section        .pdata
3887 .align  4
3888         .rva    .LSEH_begin_ChaCha20_ctr32
3889         .rva    .LSEH_end_ChaCha20_ctr32
3890         .rva    .LSEH_info_ChaCha20_ctr32
3891
3892         .rva    .LSEH_begin_ChaCha20_ssse3
3893         .rva    .LSEH_end_ChaCha20_ssse3
3894         .rva    .LSEH_info_ChaCha20_ssse3
3895
3896         .rva    .LSEH_begin_ChaCha20_128
3897         .rva    .LSEH_end_ChaCha20_128
3898         .rva    .LSEH_info_ChaCha20_128
3899
3900         .rva    .LSEH_begin_ChaCha20_4x
3901         .rva    .LSEH_end_ChaCha20_4x
3902         .rva    .LSEH_info_ChaCha20_4x
3903 ___
3904 $code.=<<___ if ($avx);
3905         .rva    .LSEH_begin_ChaCha20_4xop
3906         .rva    .LSEH_end_ChaCha20_4xop
3907         .rva    .LSEH_info_ChaCha20_4xop
3908 ___
3909 $code.=<<___ if ($avx>1);
3910         .rva    .LSEH_begin_ChaCha20_8x
3911         .rva    .LSEH_end_ChaCha20_8x
3912         .rva    .LSEH_info_ChaCha20_8x
3913 ___
3914 $code.=<<___ if ($avx>2);
3915         .rva    .LSEH_begin_ChaCha20_avx512
3916         .rva    .LSEH_end_ChaCha20_avx512
3917         .rva    .LSEH_info_ChaCha20_avx512
3918
3919         .rva    .LSEH_begin_ChaCha20_avx512vl
3920         .rva    .LSEH_end_ChaCha20_avx512vl
3921         .rva    .LSEH_info_ChaCha20_avx512vl
3922
3923         .rva    .LSEH_begin_ChaCha20_16x
3924         .rva    .LSEH_end_ChaCha20_16x
3925         .rva    .LSEH_info_ChaCha20_16x
3926
3927         .rva    .LSEH_begin_ChaCha20_8xvl
3928         .rva    .LSEH_end_ChaCha20_8xvl
3929         .rva    .LSEH_info_ChaCha20_8xvl
3930 ___
3931 $code.=<<___;
3932 .section        .xdata
3933 .align  8
3934 .LSEH_info_ChaCha20_ctr32:
3935         .byte   9,0,0,0
3936         .rva    se_handler
3937
3938 .LSEH_info_ChaCha20_ssse3:
3939         .byte   9,0,0,0
3940         .rva    simd_handler
3941         .rva    .Lssse3_body,.Lssse3_epilogue
3942         .long   0x20,0
3943
3944 .LSEH_info_ChaCha20_128:
3945         .byte   9,0,0,0
3946         .rva    simd_handler
3947         .rva    .L128_body,.L128_epilogue
3948         .long   0x60,0
3949
3950 .LSEH_info_ChaCha20_4x:
3951         .byte   9,0,0,0
3952         .rva    simd_handler
3953         .rva    .L4x_body,.L4x_epilogue
3954         .long   0xa0,0
3955 ___
3956 $code.=<<___ if ($avx);
3957 .LSEH_info_ChaCha20_4xop:
3958         .byte   9,0,0,0
3959         .rva    simd_handler
3960         .rva    .L4xop_body,.L4xop_epilogue             # HandlerData[]
3961         .long   0xa0,0
3962 ___
3963 $code.=<<___ if ($avx>1);
3964 .LSEH_info_ChaCha20_8x:
3965         .byte   9,0,0,0
3966         .rva    simd_handler
3967         .rva    .L8x_body,.L8x_epilogue                 # HandlerData[]
3968         .long   0xa0,0
3969 ___
3970 $code.=<<___ if ($avx>2);
3971 .LSEH_info_ChaCha20_avx512:
3972         .byte   9,0,0,0
3973         .rva    simd_handler
3974         .rva    .Lavx512_body,.Lavx512_epilogue         # HandlerData[]
3975         .long   0x20,0
3976
3977 .LSEH_info_ChaCha20_avx512vl:
3978         .byte   9,0,0,0
3979         .rva    simd_handler
3980         .rva    .Lavx512vl_body,.Lavx512vl_epilogue     # HandlerData[]
3981         .long   0x20,0
3982
3983 .LSEH_info_ChaCha20_16x:
3984         .byte   9,0,0,0
3985         .rva    simd_handler
3986         .rva    .L16x_body,.L16x_epilogue               # HandlerData[]
3987         .long   0xa0,0
3988
3989 .LSEH_info_ChaCha20_8xvl:
3990         .byte   9,0,0,0
3991         .rva    simd_handler
3992         .rva    .L8xvl_body,.L8xvl_epilogue             # HandlerData[]
3993         .long   0xa0,0
3994 ___
3995 }
3996
3997 foreach (split("\n",$code)) {
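	# evaluate compile-time expressions quoted in backticks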
3998         s/\`([^\`]*)\`/eval $1/ge;
3999
4000         s/%x#%[yz]/%x/g;        # "down-shift"
4001
4002         print $_,"\n";
4003 }
4004
4005 close STDOUT;