# chacha/asm/chacha-x86_64.pl: add AVX512VL code path.
# [openssl.git] / crypto / chacha / asm / chacha-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # November 2014
18 #
19 # ChaCha20 for x86_64.
20 #
21 # December 2016
22 #
23 # Add AVX512F code path.
24 #
25 # December 2017
26 #
27 # Add AVX512VL code path.
28 #
29 # Performance in cycles per byte out of large buffer.
30 #
31 #               IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     NxAVX(v)
32 #
33 # P4            9.48/+99%       -/22.7(ii)      -
34 # Core2         7.83/+55%       7.90/8.08       4.35
35 # Westmere      7.19/+50%       5.60/6.70       3.00
36 # Sandy Bridge  8.31/+42%       5.45/6.76       2.72
37 # Ivy Bridge    6.71/+46%       5.40/6.49       2.41
38 # Haswell       5.92/+43%       5.20/6.45       2.42        1.23
39 # Skylake[-X]   5.87/+39%       4.70/-          2.31        1.19[0.80(vi)]
40 # Silvermont    12.0/+33%       7.75/7.40       7.03(iii)
41 # Knights L     11.7/-          -               9.60(iii)   0.80
42 # Goldmont      10.6/+17%       5.10/-          3.28
43 # Sledgehammer  7.28/+52%       -/14.2(ii)      -
44 # Bulldozer     9.66/+28%       9.85/11.1       3.06(iv)
45 # Ryzen         5.96/+50%       5.19/-          2.40        2.09
46 # VIA Nano      10.5/+46%       6.72/8.60       6.05
47 #
48 # (i)   compared to older gcc 3.x one can observe >2x improvement on
49 #       most platforms;
50 # (ii)  as it can be seen, SSE2 performance is too low on legacy
51 #       processors; NxSSE2 results are naturally better, but not
52 #       impressively better than IALU ones, which is why you won't
53 #       find SSE2 code below;
54 # (iii) this is not optimal result for Atom because of MSROM
55 #       limitations, SSE2 can do better, but gain is considered too
56 #       low to justify the [maintenance] effort;
57 # (iv)  Bulldozer actually executes 4xXOP code path that delivers 2.20;
58 # (v)   8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
59 # (vi)  even though Skylake-X can execute AVX512F code and deliver 0.57
60 #       cpb in single thread, the corresponding capability is suppressed;
61
62 $flavour = shift;
63 $output  = shift;
64 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
65
66 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
67
68 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
69 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
70 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
71 die "can't locate x86_64-xlate.pl";
72
73 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
74                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
75         $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
76 }
77
78 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
80         $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
81         $avx += 1 if ($1==2.11 && $2>=8);
82 }
83
84 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
85            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
86         $avx = ($1>=10) + ($1>=11);
87 }
88
89 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
90         $avx = ($2>=3.0) + ($2>3.0);
91 }
92
93 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
94 *STDOUT=*OUT;
95
96 # input parameter block
97 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
98
99 $code.=<<___;
100 .text
101
102 .extern OPENSSL_ia32cap_P
103
104 .align  64
105 .Lzero:
106 .long   0,0,0,0
107 .Lone:
108 .long   1,0,0,0
109 .Linc:
110 .long   0,1,2,3
111 .Lfour:
112 .long   4,4,4,4
113 .Lincy:
114 .long   0,2,4,6,1,3,5,7
115 .Leight:
116 .long   8,8,8,8,8,8,8,8
117 .Lrot16:
118 .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
119 .Lrot24:
120 .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
121 .Ltwoy:
122 .long   2,0,0,0, 2,0,0,0
123 .align  64
124 .Lzeroz:
125 .long   0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
126 .Lfourz:
127 .long   4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
128 .Lincz:
129 .long   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
130 .Lsixteen:
131 .long   16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
132 .Lsigma:
133 .asciz  "expand 32-byte k"
134 .asciz  "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
135 ___
136
137 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
138 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
139   my $arg = pop;
140     $arg = "\$$arg" if ($arg*1 eq $arg);
141     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
142 }
143
144 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
145     "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
146 @t=("%esi","%edi");
147
148 sub ROUND {                     # critical path is 24 cycles per round
149 my ($a0,$b0,$c0,$d0)=@_;
150 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
151 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
152 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
153 my ($xc,$xc_)=map("\"$_\"",@t);
154 my @x=map("\"$_\"",@x);
155
156         # Consider order in which variables are addressed by their
157         # index:
158         #
159         #       a   b   c   d
160         #
161         #       0   4   8  12 < even round
162         #       1   5   9  13
163         #       2   6  10  14
164         #       3   7  11  15
165         #       0   5  10  15 < odd round
166         #       1   6  11  12
167         #       2   7   8  13
168         #       3   4   9  14
169         #
170         # 'a', 'b' and 'd's are permanently allocated in registers,
171         # @x[0..7,12..15], while 'c's are maintained in memory. If
172         # you observe 'c' column, you'll notice that pair of 'c's is
173         # invariant between rounds. This means that we have to reload
174         # them once per round, in the middle. This is why you'll see
175         # bunch of 'c' stores and loads in the middle, but none in
176         # the beginning or end.
177
178         # Normally instructions would be interleaved to favour in-order
179         # execution. Generally out-of-order cores manage it gracefully,
180         # but not this time for some reason. As in-order execution
181         # cores are dying breed, old Atom is the only one around,
182         # instructions are left uninterleaved. Besides, Atom is better
183         # off executing 1xSSSE3 code anyway...
184
185         (
186         "&add   (@x[$a0],@x[$b0])",     # Q1
187         "&xor   (@x[$d0],@x[$a0])",
188         "&rol   (@x[$d0],16)",
189          "&add  (@x[$a1],@x[$b1])",     # Q2
190          "&xor  (@x[$d1],@x[$a1])",
191          "&rol  (@x[$d1],16)",
192
193         "&add   ($xc,@x[$d0])",
194         "&xor   (@x[$b0],$xc)",
195         "&rol   (@x[$b0],12)",
196          "&add  ($xc_,@x[$d1])",
197          "&xor  (@x[$b1],$xc_)",
198          "&rol  (@x[$b1],12)",
199
200         "&add   (@x[$a0],@x[$b0])",
201         "&xor   (@x[$d0],@x[$a0])",
202         "&rol   (@x[$d0],8)",
203          "&add  (@x[$a1],@x[$b1])",
204          "&xor  (@x[$d1],@x[$a1])",
205          "&rol  (@x[$d1],8)",
206
207         "&add   ($xc,@x[$d0])",
208         "&xor   (@x[$b0],$xc)",
209         "&rol   (@x[$b0],7)",
210          "&add  ($xc_,@x[$d1])",
211          "&xor  (@x[$b1],$xc_)",
212          "&rol  (@x[$b1],7)",
213
214         "&mov   (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
215          "&mov  (\"4*$c1(%rsp)\",$xc_)",
216         "&mov   ($xc,\"4*$c2(%rsp)\")",
217          "&mov  ($xc_,\"4*$c3(%rsp)\")",
218
219         "&add   (@x[$a2],@x[$b2])",     # Q3
220         "&xor   (@x[$d2],@x[$a2])",
221         "&rol   (@x[$d2],16)",
222          "&add  (@x[$a3],@x[$b3])",     # Q4
223          "&xor  (@x[$d3],@x[$a3])",
224          "&rol  (@x[$d3],16)",
225
226         "&add   ($xc,@x[$d2])",
227         "&xor   (@x[$b2],$xc)",
228         "&rol   (@x[$b2],12)",
229          "&add  ($xc_,@x[$d3])",
230          "&xor  (@x[$b3],$xc_)",
231          "&rol  (@x[$b3],12)",
232
233         "&add   (@x[$a2],@x[$b2])",
234         "&xor   (@x[$d2],@x[$a2])",
235         "&rol   (@x[$d2],8)",
236          "&add  (@x[$a3],@x[$b3])",
237          "&xor  (@x[$d3],@x[$a3])",
238          "&rol  (@x[$d3],8)",
239
240         "&add   ($xc,@x[$d2])",
241         "&xor   (@x[$b2],$xc)",
242         "&rol   (@x[$b2],7)",
243          "&add  ($xc_,@x[$d3])",
244          "&xor  (@x[$b3],$xc_)",
245          "&rol  (@x[$b3],7)"
246         );
247 }
248
249 ########################################################################
250 # Generic code path that handles all lengths on pre-SSSE3 processors.
251 $code.=<<___;
252 .globl  ChaCha20_ctr32
253 .type   ChaCha20_ctr32,\@function,5
254 .align  64
255 ChaCha20_ctr32:
256 .cfi_startproc
257         cmp     \$0,$len
258         je      .Lno_data
259         mov     OPENSSL_ia32cap_P+4(%rip),%r10
260 ___
261 $code.=<<___    if ($avx>2);
262         bt      \$48,%r10               # check for AVX512F
263         jc      .LChaCha20_avx512
264         test    %r10,%r10               # check for AVX512VL
265         js      .LChaCha20_avx512vl
266 ___
267 $code.=<<___;
268         test    \$`1<<(41-32)`,%r10d
269         jnz     .LChaCha20_ssse3
270
271         push    %rbx
272 .cfi_push       %rbx
273         push    %rbp
274 .cfi_push       %rbp
275         push    %r12
276 .cfi_push       %r12
277         push    %r13
278 .cfi_push       %r13
279         push    %r14
280 .cfi_push       %r14
281         push    %r15
282 .cfi_push       %r15
283         sub     \$64+24,%rsp
284 .cfi_adjust_cfa_offset  64+24
285 .Lctr32_body:
286
287         #movdqa .Lsigma(%rip),%xmm0
288         movdqu  ($key),%xmm1
289         movdqu  16($key),%xmm2
290         movdqu  ($counter),%xmm3
291         movdqa  .Lone(%rip),%xmm4
292
293         #movdqa %xmm0,4*0(%rsp)         # key[0]
294         movdqa  %xmm1,4*4(%rsp)         # key[1]
295         movdqa  %xmm2,4*8(%rsp)         # key[2]
296         movdqa  %xmm3,4*12(%rsp)        # key[3]
297         mov     $len,%rbp               # reassign $len
298         jmp     .Loop_outer
299
300 .align  32
301 .Loop_outer:
302         mov     \$0x61707865,@x[0]      # 'expa'
303         mov     \$0x3320646e,@x[1]      # 'nd 3'
304         mov     \$0x79622d32,@x[2]      # '2-by'
305         mov     \$0x6b206574,@x[3]      # 'te k'
306         mov     4*4(%rsp),@x[4]
307         mov     4*5(%rsp),@x[5]
308         mov     4*6(%rsp),@x[6]
309         mov     4*7(%rsp),@x[7]
310         movd    %xmm3,@x[12]
311         mov     4*13(%rsp),@x[13]
312         mov     4*14(%rsp),@x[14]
313         mov     4*15(%rsp),@x[15]
314
315         mov     %rbp,64+0(%rsp)         # save len
316         mov     \$10,%ebp
317         mov     $inp,64+8(%rsp)         # save inp
318         movq    %xmm2,%rsi              # "@x[8]"
319         mov     $out,64+16(%rsp)        # save out
320         mov     %rsi,%rdi
321         shr     \$32,%rdi               # "@x[9]"
322         jmp     .Loop
323
324 .align  32
325 .Loop:
326 ___
327         foreach (&ROUND (0, 4, 8,12)) { eval; }
328         foreach (&ROUND (0, 5,10,15)) { eval; }
329         &dec    ("%ebp");
330         &jnz    (".Loop");
331
332 $code.=<<___;
333         mov     @t[1],4*9(%rsp)         # modulo-scheduled
334         mov     @t[0],4*8(%rsp)
335         mov     64(%rsp),%rbp           # load len
336         movdqa  %xmm2,%xmm1
337         mov     64+8(%rsp),$inp         # load inp
338         paddd   %xmm4,%xmm3             # increment counter
339         mov     64+16(%rsp),$out        # load out
340
341         add     \$0x61707865,@x[0]      # 'expa'
342         add     \$0x3320646e,@x[1]      # 'nd 3'
343         add     \$0x79622d32,@x[2]      # '2-by'
344         add     \$0x6b206574,@x[3]      # 'te k'
345         add     4*4(%rsp),@x[4]
346         add     4*5(%rsp),@x[5]
347         add     4*6(%rsp),@x[6]
348         add     4*7(%rsp),@x[7]
349         add     4*12(%rsp),@x[12]
350         add     4*13(%rsp),@x[13]
351         add     4*14(%rsp),@x[14]
352         add     4*15(%rsp),@x[15]
353         paddd   4*8(%rsp),%xmm1
354
355         cmp     \$64,%rbp
356         jb      .Ltail
357
358         xor     4*0($inp),@x[0]         # xor with input
359         xor     4*1($inp),@x[1]
360         xor     4*2($inp),@x[2]
361         xor     4*3($inp),@x[3]
362         xor     4*4($inp),@x[4]
363         xor     4*5($inp),@x[5]
364         xor     4*6($inp),@x[6]
365         xor     4*7($inp),@x[7]
366         movdqu  4*8($inp),%xmm0
367         xor     4*12($inp),@x[12]
368         xor     4*13($inp),@x[13]
369         xor     4*14($inp),@x[14]
370         xor     4*15($inp),@x[15]
371         lea     4*16($inp),$inp         # inp+=64
372         pxor    %xmm1,%xmm0
373
374         movdqa  %xmm2,4*8(%rsp)
375         movd    %xmm3,4*12(%rsp)
376
377         mov     @x[0],4*0($out)         # write output
378         mov     @x[1],4*1($out)
379         mov     @x[2],4*2($out)
380         mov     @x[3],4*3($out)
381         mov     @x[4],4*4($out)
382         mov     @x[5],4*5($out)
383         mov     @x[6],4*6($out)
384         mov     @x[7],4*7($out)
385         movdqu  %xmm0,4*8($out)
386         mov     @x[12],4*12($out)
387         mov     @x[13],4*13($out)
388         mov     @x[14],4*14($out)
389         mov     @x[15],4*15($out)
390         lea     4*16($out),$out         # out+=64
391
392         sub     \$64,%rbp
393         jnz     .Loop_outer
394
395         jmp     .Ldone
396
397 .align  16
398 .Ltail:
399         mov     @x[0],4*0(%rsp)
400         mov     @x[1],4*1(%rsp)
401         xor     %rbx,%rbx
402         mov     @x[2],4*2(%rsp)
403         mov     @x[3],4*3(%rsp)
404         mov     @x[4],4*4(%rsp)
405         mov     @x[5],4*5(%rsp)
406         mov     @x[6],4*6(%rsp)
407         mov     @x[7],4*7(%rsp)
408         movdqa  %xmm1,4*8(%rsp)
409         mov     @x[12],4*12(%rsp)
410         mov     @x[13],4*13(%rsp)
411         mov     @x[14],4*14(%rsp)
412         mov     @x[15],4*15(%rsp)
413
414 .Loop_tail:
415         movzb   ($inp,%rbx),%eax
416         movzb   (%rsp,%rbx),%edx
417         lea     1(%rbx),%rbx
418         xor     %edx,%eax
419         mov     %al,-1($out,%rbx)
420         dec     %rbp
421         jnz     .Loop_tail
422
423 .Ldone:
424         lea     64+24+48(%rsp),%rsi
425 .cfi_def_cfa    %rsi,8
426         mov     -48(%rsi),%r15
427 .cfi_restore    %r15
428         mov     -40(%rsi),%r14
429 .cfi_restore    %r14
430         mov     -32(%rsi),%r13
431 .cfi_restore    %r13
432         mov     -24(%rsi),%r12
433 .cfi_restore    %r12
434         mov     -16(%rsi),%rbp
435 .cfi_restore    %rbp
436         mov     -8(%rsi),%rbx
437 .cfi_restore    %rbx
438         lea     (%rsi),%rsp
439 .cfi_def_cfa_register   %rsp
440 .Lno_data:
441         ret
442 .cfi_endproc
443 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
444 ___
445
446 ########################################################################
447 # SSSE3 code path that handles shorter lengths
448 {
449 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
450
451 sub SSSE3ROUND {        # critical path is 20 "SIMD ticks" per round
452         &paddd  ($a,$b);
453         &pxor   ($d,$a);
454         &pshufb ($d,$rot16);
455
456         &paddd  ($c,$d);
457         &pxor   ($b,$c);
458         &movdqa ($t,$b);
459         &psrld  ($b,20);
460         &pslld  ($t,12);
461         &por    ($b,$t);
462
463         &paddd  ($a,$b);
464         &pxor   ($d,$a);
465         &pshufb ($d,$rot24);
466
467         &paddd  ($c,$d);
468         &pxor   ($b,$c);
469         &movdqa ($t,$b);
470         &psrld  ($b,25);
471         &pslld  ($t,7);
472         &por    ($b,$t);
473 }
474
475 my $xframe = $win64 ? 32+8 : 8;
476
477 $code.=<<___;
478 .type   ChaCha20_ssse3,\@function,5
479 .align  32
480 ChaCha20_ssse3:
481 .cfi_startproc
482 .LChaCha20_ssse3:
483         mov     %rsp,%r9                # frame pointer
484 .cfi_def_cfa_register   %r9
485 ___
486 $code.=<<___    if ($avx);
487         test    \$`1<<(43-32)`,%r10d
488         jnz     .LChaCha20_4xop         # XOP is fastest even if we use 1/4
489 ___
490 $code.=<<___;
491         cmp     \$128,$len              # we might throw away some data,
492         ja      .LChaCha20_4x           # but overall it won't be slower
493
494 .Ldo_sse3_after_all:
495         sub     \$64+$xframe,%rsp
496 ___
497 $code.=<<___    if ($win64);
498         movaps  %xmm6,-0x28(%r9)
499         movaps  %xmm7,-0x18(%r9)
500 .Lssse3_body:
501 ___
502 $code.=<<___;
503         movdqa  .Lsigma(%rip),$a
504         movdqu  ($key),$b
505         movdqu  16($key),$c
506         movdqu  ($counter),$d
507         movdqa  .Lrot16(%rip),$rot16
508         movdqa  .Lrot24(%rip),$rot24
509
510         movdqa  $a,0x00(%rsp)
511         movdqa  $b,0x10(%rsp)
512         movdqa  $c,0x20(%rsp)
513         movdqa  $d,0x30(%rsp)
514         mov     \$10,$counter           # reuse $counter
515         jmp     .Loop_ssse3
516
517 .align  32
518 .Loop_outer_ssse3:
519         movdqa  .Lone(%rip),$d
520         movdqa  0x00(%rsp),$a
521         movdqa  0x10(%rsp),$b
522         movdqa  0x20(%rsp),$c
523         paddd   0x30(%rsp),$d
524         mov     \$10,$counter
525         movdqa  $d,0x30(%rsp)
526         jmp     .Loop_ssse3
527
528 .align  32
529 .Loop_ssse3:
530 ___
531         &SSSE3ROUND();
532         &pshufd ($c,$c,0b01001110);
533         &pshufd ($b,$b,0b00111001);
534         &pshufd ($d,$d,0b10010011);
535         &nop    ();
536
537         &SSSE3ROUND();
538         &pshufd ($c,$c,0b01001110);
539         &pshufd ($b,$b,0b10010011);
540         &pshufd ($d,$d,0b00111001);
541
542         &dec    ($counter);
543         &jnz    (".Loop_ssse3");
544
545 $code.=<<___;
546         paddd   0x00(%rsp),$a
547         paddd   0x10(%rsp),$b
548         paddd   0x20(%rsp),$c
549         paddd   0x30(%rsp),$d
550
551         cmp     \$64,$len
552         jb      .Ltail_ssse3
553
554         movdqu  0x00($inp),$t
555         movdqu  0x10($inp),$t1
556         pxor    $t,$a                   # xor with input
557         movdqu  0x20($inp),$t
558         pxor    $t1,$b
559         movdqu  0x30($inp),$t1
560         lea     0x40($inp),$inp         # inp+=64
561         pxor    $t,$c
562         pxor    $t1,$d
563
564         movdqu  $a,0x00($out)           # write output
565         movdqu  $b,0x10($out)
566         movdqu  $c,0x20($out)
567         movdqu  $d,0x30($out)
568         lea     0x40($out),$out         # out+=64
569
570         sub     \$64,$len
571         jnz     .Loop_outer_ssse3
572
573         jmp     .Ldone_ssse3
574
575 .align  16
576 .Ltail_ssse3:
577         movdqa  $a,0x00(%rsp)
578         movdqa  $b,0x10(%rsp)
579         movdqa  $c,0x20(%rsp)
580         movdqa  $d,0x30(%rsp)
581         xor     $counter,$counter
582
583 .Loop_tail_ssse3:
584         movzb   ($inp,$counter),%eax
585         movzb   (%rsp,$counter),%ecx
586         lea     1($counter),$counter
587         xor     %ecx,%eax
588         mov     %al,-1($out,$counter)
589         dec     $len
590         jnz     .Loop_tail_ssse3
591
592 .Ldone_ssse3:
593 ___
594 $code.=<<___    if ($win64);
595         movaps  -0x28(%r9),%xmm6
596         movaps  -0x18(%r9),%xmm7
597 ___
598 $code.=<<___;
599         lea     (%r9),%rsp
600 .cfi_def_cfa_register   %rsp
601 .Lssse3_epilogue:
602         ret
603 .cfi_endproc
604 .size   ChaCha20_ssse3,.-ChaCha20_ssse3
605 ___
606 }
607
608 ########################################################################
609 # SSSE3 code path that handles longer messages.
610 {
611 # assign variables to favor Atom front-end
612 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
613     $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
614 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
615         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
616
617 sub SSSE3_lane_ROUND {
618 my ($a0,$b0,$c0,$d0)=@_;
619 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
620 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
621 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
622 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
623 my @x=map("\"$_\"",@xx);
624
625         # Consider order in which variables are addressed by their
626         # index:
627         #
628         #       a   b   c   d
629         #
630         #       0   4   8  12 < even round
631         #       1   5   9  13
632         #       2   6  10  14
633         #       3   7  11  15
634         #       0   5  10  15 < odd round
635         #       1   6  11  12
636         #       2   7   8  13
637         #       3   4   9  14
638         #
639         # 'a', 'b' and 'd's are permanently allocated in registers,
640         # @x[0..7,12..15], while 'c's are maintained in memory. If
641         # you observe 'c' column, you'll notice that pair of 'c's is
642         # invariant between rounds. This means that we have to reload
643         # them once per round, in the middle. This is why you'll see
644         # bunch of 'c' stores and loads in the middle, but none in
645         # the beginning or end.
646
647         (
648         "&paddd         (@x[$a0],@x[$b0])",     # Q1
649          "&paddd        (@x[$a1],@x[$b1])",     # Q2
650         "&pxor          (@x[$d0],@x[$a0])",
651          "&pxor         (@x[$d1],@x[$a1])",
652         "&pshufb        (@x[$d0],$t1)",
653          "&pshufb       (@x[$d1],$t1)",
654
655         "&paddd         ($xc,@x[$d0])",
656          "&paddd        ($xc_,@x[$d1])",
657         "&pxor          (@x[$b0],$xc)",
658          "&pxor         (@x[$b1],$xc_)",
659         "&movdqa        ($t0,@x[$b0])",
660         "&pslld         (@x[$b0],12)",
661         "&psrld         ($t0,20)",
662          "&movdqa       ($t1,@x[$b1])",
663          "&pslld        (@x[$b1],12)",
664         "&por           (@x[$b0],$t0)",
665          "&psrld        ($t1,20)",
666         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
667          "&por          (@x[$b1],$t1)",
668
669         "&paddd         (@x[$a0],@x[$b0])",
670          "&paddd        (@x[$a1],@x[$b1])",
671         "&pxor          (@x[$d0],@x[$a0])",
672          "&pxor         (@x[$d1],@x[$a1])",
673         "&pshufb        (@x[$d0],$t0)",
674          "&pshufb       (@x[$d1],$t0)",
675
676         "&paddd         ($xc,@x[$d0])",
677          "&paddd        ($xc_,@x[$d1])",
678         "&pxor          (@x[$b0],$xc)",
679          "&pxor         (@x[$b1],$xc_)",
680         "&movdqa        ($t1,@x[$b0])",
681         "&pslld         (@x[$b0],7)",
682         "&psrld         ($t1,25)",
683          "&movdqa       ($t0,@x[$b1])",
684          "&pslld        (@x[$b1],7)",
685         "&por           (@x[$b0],$t1)",
686          "&psrld        ($t0,25)",
687         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
688          "&por          (@x[$b1],$t0)",
689
690         "&movdqa        (\"`16*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
691          "&movdqa       (\"`16*($c1-8)`(%rsp)\",$xc_)",
692         "&movdqa        ($xc,\"`16*($c2-8)`(%rsp)\")",
693          "&movdqa       ($xc_,\"`16*($c3-8)`(%rsp)\")",
694
695         "&paddd         (@x[$a2],@x[$b2])",     # Q3
696          "&paddd        (@x[$a3],@x[$b3])",     # Q4
697         "&pxor          (@x[$d2],@x[$a2])",
698          "&pxor         (@x[$d3],@x[$a3])",
699         "&pshufb        (@x[$d2],$t1)",
700          "&pshufb       (@x[$d3],$t1)",
701
702         "&paddd         ($xc,@x[$d2])",
703          "&paddd        ($xc_,@x[$d3])",
704         "&pxor          (@x[$b2],$xc)",
705          "&pxor         (@x[$b3],$xc_)",
706         "&movdqa        ($t0,@x[$b2])",
707         "&pslld         (@x[$b2],12)",
708         "&psrld         ($t0,20)",
709          "&movdqa       ($t1,@x[$b3])",
710          "&pslld        (@x[$b3],12)",
711         "&por           (@x[$b2],$t0)",
712          "&psrld        ($t1,20)",
713         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
714          "&por          (@x[$b3],$t1)",
715
716         "&paddd         (@x[$a2],@x[$b2])",
717          "&paddd        (@x[$a3],@x[$b3])",
718         "&pxor          (@x[$d2],@x[$a2])",
719          "&pxor         (@x[$d3],@x[$a3])",
720         "&pshufb        (@x[$d2],$t0)",
721          "&pshufb       (@x[$d3],$t0)",
722
723         "&paddd         ($xc,@x[$d2])",
724          "&paddd        ($xc_,@x[$d3])",
725         "&pxor          (@x[$b2],$xc)",
726          "&pxor         (@x[$b3],$xc_)",
727         "&movdqa        ($t1,@x[$b2])",
728         "&pslld         (@x[$b2],7)",
729         "&psrld         ($t1,25)",
730          "&movdqa       ($t0,@x[$b3])",
731          "&pslld        (@x[$b3],7)",
732         "&por           (@x[$b2],$t1)",
733          "&psrld        ($t0,25)",
734         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
735          "&por          (@x[$b3],$t0)"
736         );
737 }
738
739 my $xframe = $win64 ? 0xa8 : 8;
740
741 $code.=<<___;
742 .type   ChaCha20_4x,\@function,5
743 .align  32
744 ChaCha20_4x:
745 .cfi_startproc
746 .LChaCha20_4x:
747         mov             %rsp,%r9                # frame pointer
748 .cfi_def_cfa_register   %r9
749         mov             %r10,%r11
750 ___
751 $code.=<<___    if ($avx>1);
752         shr             \$32,%r10               # OPENSSL_ia32cap_P+8
753         test            \$`1<<5`,%r10           # test AVX2
754         jnz             .LChaCha20_8x
755 ___
756 $code.=<<___;
757         cmp             \$192,$len
758         ja              .Lproceed4x
759
760         and             \$`1<<26|1<<22`,%r11    # isolate XSAVE+MOVBE
761         cmp             \$`1<<22`,%r11          # check for MOVBE without XSAVE
762         je              .Ldo_sse3_after_all     # to detect Atom
763
764 .Lproceed4x:
765         sub             \$0x140+$xframe,%rsp
766 ___
767         ################ stack layout
768         # +0x00         SIMD equivalent of @x[8-12]
769         # ...
770         # +0x40         constant copy of key[0-2] smashed by lanes
771         # ...
772         # +0x100        SIMD counters (with nonce smashed by lanes)
773         # ...
774         # +0x140
775 $code.=<<___    if ($win64);
776         movaps          %xmm6,-0xa8(%r9)
777         movaps          %xmm7,-0x98(%r9)
778         movaps          %xmm8,-0x88(%r9)
779         movaps          %xmm9,-0x78(%r9)
780         movaps          %xmm10,-0x68(%r9)
781         movaps          %xmm11,-0x58(%r9)
782         movaps          %xmm12,-0x48(%r9)
783         movaps          %xmm13,-0x38(%r9)
784         movaps          %xmm14,-0x28(%r9)
785         movaps          %xmm15,-0x18(%r9)
786 .L4x_body:
787 ___
788 $code.=<<___;
789         movdqa          .Lsigma(%rip),$xa3      # key[0]
790         movdqu          ($key),$xb3             # key[1]
791         movdqu          16($key),$xt3           # key[2]
792         movdqu          ($counter),$xd3         # key[3]
793         lea             0x100(%rsp),%rcx        # size optimization
794         lea             .Lrot16(%rip),%r10
795         lea             .Lrot24(%rip),%r11
796
797         pshufd          \$0x00,$xa3,$xa0        # smash key by lanes...
798         pshufd          \$0x55,$xa3,$xa1
799         movdqa          $xa0,0x40(%rsp)         # ... and offload
800         pshufd          \$0xaa,$xa3,$xa2
801         movdqa          $xa1,0x50(%rsp)
802         pshufd          \$0xff,$xa3,$xa3
803         movdqa          $xa2,0x60(%rsp)
804         movdqa          $xa3,0x70(%rsp)
805
806         pshufd          \$0x00,$xb3,$xb0
807         pshufd          \$0x55,$xb3,$xb1
808         movdqa          $xb0,0x80-0x100(%rcx)
809         pshufd          \$0xaa,$xb3,$xb2
810         movdqa          $xb1,0x90-0x100(%rcx)
811         pshufd          \$0xff,$xb3,$xb3
812         movdqa          $xb2,0xa0-0x100(%rcx)
813         movdqa          $xb3,0xb0-0x100(%rcx)
814
815         pshufd          \$0x00,$xt3,$xt0        # "$xc0"
816         pshufd          \$0x55,$xt3,$xt1        # "$xc1"
817         movdqa          $xt0,0xc0-0x100(%rcx)
818         pshufd          \$0xaa,$xt3,$xt2        # "$xc2"
819         movdqa          $xt1,0xd0-0x100(%rcx)
820         pshufd          \$0xff,$xt3,$xt3        # "$xc3"
821         movdqa          $xt2,0xe0-0x100(%rcx)
822         movdqa          $xt3,0xf0-0x100(%rcx)
823
824         pshufd          \$0x00,$xd3,$xd0
825         pshufd          \$0x55,$xd3,$xd1
826         paddd           .Linc(%rip),$xd0        # don't save counters yet
827         pshufd          \$0xaa,$xd3,$xd2
828         movdqa          $xd1,0x110-0x100(%rcx)
829         pshufd          \$0xff,$xd3,$xd3
830         movdqa          $xd2,0x120-0x100(%rcx)
831         movdqa          $xd3,0x130-0x100(%rcx)
832
833         jmp             .Loop_enter4x
834
835 .align  32
836 .Loop_outer4x:
837         movdqa          0x40(%rsp),$xa0         # re-load smashed key
838         movdqa          0x50(%rsp),$xa1
839         movdqa          0x60(%rsp),$xa2
840         movdqa          0x70(%rsp),$xa3
841         movdqa          0x80-0x100(%rcx),$xb0
842         movdqa          0x90-0x100(%rcx),$xb1
843         movdqa          0xa0-0x100(%rcx),$xb2
844         movdqa          0xb0-0x100(%rcx),$xb3
845         movdqa          0xc0-0x100(%rcx),$xt0   # "$xc0"
846         movdqa          0xd0-0x100(%rcx),$xt1   # "$xc1"
847         movdqa          0xe0-0x100(%rcx),$xt2   # "$xc2"
848         movdqa          0xf0-0x100(%rcx),$xt3   # "$xc3"
849         movdqa          0x100-0x100(%rcx),$xd0
850         movdqa          0x110-0x100(%rcx),$xd1
851         movdqa          0x120-0x100(%rcx),$xd2
852         movdqa          0x130-0x100(%rcx),$xd3
853         paddd           .Lfour(%rip),$xd0       # next SIMD counters
854
855 .Loop_enter4x:
856         movdqa          $xt2,0x20(%rsp)         # SIMD equivalent of "@x[10]"
857         movdqa          $xt3,0x30(%rsp)         # SIMD equivalent of "@x[11]"
858         movdqa          (%r10),$xt3             # .Lrot16(%rip)
859         mov             \$10,%eax
860         movdqa          $xd0,0x100-0x100(%rcx)  # save SIMD counters
861         jmp             .Loop4x
862
863 .align  32
864 .Loop4x:
865 ___
866         foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
867         foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
868 $code.=<<___;
869         dec             %eax
870         jnz             .Loop4x
871
872         paddd           0x40(%rsp),$xa0         # accumulate key material
873         paddd           0x50(%rsp),$xa1
874         paddd           0x60(%rsp),$xa2
875         paddd           0x70(%rsp),$xa3
876
877         movdqa          $xa0,$xt2               # "de-interlace" data
878         punpckldq       $xa1,$xa0
879         movdqa          $xa2,$xt3
880         punpckldq       $xa3,$xa2
881         punpckhdq       $xa1,$xt2
882         punpckhdq       $xa3,$xt3
883         movdqa          $xa0,$xa1
884         punpcklqdq      $xa2,$xa0               # "a0"
885         movdqa          $xt2,$xa3
886         punpcklqdq      $xt3,$xt2               # "a2"
887         punpckhqdq      $xa2,$xa1               # "a1"
888         punpckhqdq      $xt3,$xa3               # "a3"
889 ___
890         ($xa2,$xt2)=($xt2,$xa2);
891 $code.=<<___;
892         paddd           0x80-0x100(%rcx),$xb0
893         paddd           0x90-0x100(%rcx),$xb1
894         paddd           0xa0-0x100(%rcx),$xb2
895         paddd           0xb0-0x100(%rcx),$xb3
896
897         movdqa          $xa0,0x00(%rsp)         # offload $xaN
898         movdqa          $xa1,0x10(%rsp)
899         movdqa          0x20(%rsp),$xa0         # "xc2"
900         movdqa          0x30(%rsp),$xa1         # "xc3"
901
902         movdqa          $xb0,$xt2
903         punpckldq       $xb1,$xb0
904         movdqa          $xb2,$xt3
905         punpckldq       $xb3,$xb2
906         punpckhdq       $xb1,$xt2
907         punpckhdq       $xb3,$xt3
908         movdqa          $xb0,$xb1
909         punpcklqdq      $xb2,$xb0               # "b0"
910         movdqa          $xt2,$xb3
911         punpcklqdq      $xt3,$xt2               # "b2"
912         punpckhqdq      $xb2,$xb1               # "b1"
913         punpckhqdq      $xt3,$xb3               # "b3"
914 ___
915         ($xb2,$xt2)=($xt2,$xb2);
916         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
917 $code.=<<___;
918         paddd           0xc0-0x100(%rcx),$xc0
919         paddd           0xd0-0x100(%rcx),$xc1
920         paddd           0xe0-0x100(%rcx),$xc2
921         paddd           0xf0-0x100(%rcx),$xc3
922
923         movdqa          $xa2,0x20(%rsp)         # keep offloading $xaN
924         movdqa          $xa3,0x30(%rsp)
925
926         movdqa          $xc0,$xt2
927         punpckldq       $xc1,$xc0
928         movdqa          $xc2,$xt3
929         punpckldq       $xc3,$xc2
930         punpckhdq       $xc1,$xt2
931         punpckhdq       $xc3,$xt3
932         movdqa          $xc0,$xc1
933         punpcklqdq      $xc2,$xc0               # "c0"
934         movdqa          $xt2,$xc3
935         punpcklqdq      $xt3,$xt2               # "c2"
936         punpckhqdq      $xc2,$xc1               # "c1"
937         punpckhqdq      $xt3,$xc3               # "c3"
938 ___
939         ($xc2,$xt2)=($xt2,$xc2);
940         ($xt0,$xt1)=($xa2,$xa3);                # use $xaN as temporary
941 $code.=<<___;
942         paddd           0x100-0x100(%rcx),$xd0
943         paddd           0x110-0x100(%rcx),$xd1
944         paddd           0x120-0x100(%rcx),$xd2
945         paddd           0x130-0x100(%rcx),$xd3
946
947         movdqa          $xd0,$xt2
948         punpckldq       $xd1,$xd0
949         movdqa          $xd2,$xt3
950         punpckldq       $xd3,$xd2
951         punpckhdq       $xd1,$xt2
952         punpckhdq       $xd3,$xt3
953         movdqa          $xd0,$xd1
954         punpcklqdq      $xd2,$xd0               # "d0"
955         movdqa          $xt2,$xd3
956         punpcklqdq      $xt3,$xt2               # "d2"
957         punpckhqdq      $xd2,$xd1               # "d1"
958         punpckhqdq      $xt3,$xd3               # "d3"
959 ___
960         ($xd2,$xt2)=($xt2,$xd2);
961 $code.=<<___;
962         cmp             \$64*4,$len
963         jb              .Ltail4x
964
965         movdqu          0x00($inp),$xt0         # xor with input
966         movdqu          0x10($inp),$xt1
967         movdqu          0x20($inp),$xt2
968         movdqu          0x30($inp),$xt3
969         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
970         pxor            $xb0,$xt1
971         pxor            $xc0,$xt2
972         pxor            $xd0,$xt3
973
974          movdqu         $xt0,0x00($out)
975         movdqu          0x40($inp),$xt0
976          movdqu         $xt1,0x10($out)
977         movdqu          0x50($inp),$xt1
978          movdqu         $xt2,0x20($out)
979         movdqu          0x60($inp),$xt2
980          movdqu         $xt3,0x30($out)
981         movdqu          0x70($inp),$xt3
982         lea             0x80($inp),$inp         # size optimization
983         pxor            0x10(%rsp),$xt0
984         pxor            $xb1,$xt1
985         pxor            $xc1,$xt2
986         pxor            $xd1,$xt3
987
988          movdqu         $xt0,0x40($out)
989         movdqu          0x00($inp),$xt0
990          movdqu         $xt1,0x50($out)
991         movdqu          0x10($inp),$xt1
992          movdqu         $xt2,0x60($out)
993         movdqu          0x20($inp),$xt2
994          movdqu         $xt3,0x70($out)
995          lea            0x80($out),$out         # size optimization
996         movdqu          0x30($inp),$xt3
997         pxor            0x20(%rsp),$xt0
998         pxor            $xb2,$xt1
999         pxor            $xc2,$xt2
1000         pxor            $xd2,$xt3
1001
1002          movdqu         $xt0,0x00($out)
1003         movdqu          0x40($inp),$xt0
1004          movdqu         $xt1,0x10($out)
1005         movdqu          0x50($inp),$xt1
1006          movdqu         $xt2,0x20($out)
1007         movdqu          0x60($inp),$xt2
1008          movdqu         $xt3,0x30($out)
1009         movdqu          0x70($inp),$xt3
1010         lea             0x80($inp),$inp         # inp+=64*4
1011         pxor            0x30(%rsp),$xt0
1012         pxor            $xb3,$xt1
1013         pxor            $xc3,$xt2
1014         pxor            $xd3,$xt3
1015         movdqu          $xt0,0x40($out)
1016         movdqu          $xt1,0x50($out)
1017         movdqu          $xt2,0x60($out)
1018         movdqu          $xt3,0x70($out)
1019         lea             0x80($out),$out         # out+=64*4
1020
1021         sub             \$64*4,$len
1022         jnz             .Loop_outer4x
1023
1024         jmp             .Ldone4x
1025
1026 .Ltail4x:
1027         cmp             \$192,$len
1028         jae             .L192_or_more4x
1029         cmp             \$128,$len
1030         jae             .L128_or_more4x
1031         cmp             \$64,$len
1032         jae             .L64_or_more4x
1033
1034         #movdqa         0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1035         xor             %r10,%r10
1036         #movdqa         $xt0,0x00(%rsp)
1037         movdqa          $xb0,0x10(%rsp)
1038         movdqa          $xc0,0x20(%rsp)
1039         movdqa          $xd0,0x30(%rsp)
1040         jmp             .Loop_tail4x
1041
1042 .align  32
1043 .L64_or_more4x:
1044         movdqu          0x00($inp),$xt0         # xor with input
1045         movdqu          0x10($inp),$xt1
1046         movdqu          0x20($inp),$xt2
1047         movdqu          0x30($inp),$xt3
1048         pxor            0x00(%rsp),$xt0         # $xaxN is offloaded, remember?
1049         pxor            $xb0,$xt1
1050         pxor            $xc0,$xt2
1051         pxor            $xd0,$xt3
1052         movdqu          $xt0,0x00($out)
1053         movdqu          $xt1,0x10($out)
1054         movdqu          $xt2,0x20($out)
1055         movdqu          $xt3,0x30($out)
1056         je              .Ldone4x
1057
1058         movdqa          0x10(%rsp),$xt0         # $xaN is offloaded, remember?
1059         lea             0x40($inp),$inp         # inp+=64*1
1060         xor             %r10,%r10
1061         movdqa          $xt0,0x00(%rsp)
1062         movdqa          $xb1,0x10(%rsp)
1063         lea             0x40($out),$out         # out+=64*1
1064         movdqa          $xc1,0x20(%rsp)
1065         sub             \$64,$len               # len-=64*1
1066         movdqa          $xd1,0x30(%rsp)
1067         jmp             .Loop_tail4x
1068
1069 .align  32
1070 .L128_or_more4x:
1071         movdqu          0x00($inp),$xt0         # xor with input
1072         movdqu          0x10($inp),$xt1
1073         movdqu          0x20($inp),$xt2
1074         movdqu          0x30($inp),$xt3
1075         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1076         pxor            $xb0,$xt1
1077         pxor            $xc0,$xt2
1078         pxor            $xd0,$xt3
1079
1080          movdqu         $xt0,0x00($out)
1081         movdqu          0x40($inp),$xt0
1082          movdqu         $xt1,0x10($out)
1083         movdqu          0x50($inp),$xt1
1084          movdqu         $xt2,0x20($out)
1085         movdqu          0x60($inp),$xt2
1086          movdqu         $xt3,0x30($out)
1087         movdqu          0x70($inp),$xt3
1088         pxor            0x10(%rsp),$xt0
1089         pxor            $xb1,$xt1
1090         pxor            $xc1,$xt2
1091         pxor            $xd1,$xt3
1092         movdqu          $xt0,0x40($out)
1093         movdqu          $xt1,0x50($out)
1094         movdqu          $xt2,0x60($out)
1095         movdqu          $xt3,0x70($out)
1096         je              .Ldone4x
1097
1098         movdqa          0x20(%rsp),$xt0         # $xaN is offloaded, remember?
1099         lea             0x80($inp),$inp         # inp+=64*2
1100         xor             %r10,%r10
1101         movdqa          $xt0,0x00(%rsp)
1102         movdqa          $xb2,0x10(%rsp)
1103         lea             0x80($out),$out         # out+=64*2
1104         movdqa          $xc2,0x20(%rsp)
1105         sub             \$128,$len              # len-=64*2
1106         movdqa          $xd2,0x30(%rsp)
1107         jmp             .Loop_tail4x
1108
1109 .align  32
1110 .L192_or_more4x:
1111         movdqu          0x00($inp),$xt0         # xor with input
1112         movdqu          0x10($inp),$xt1
1113         movdqu          0x20($inp),$xt2
1114         movdqu          0x30($inp),$xt3
1115         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1116         pxor            $xb0,$xt1
1117         pxor            $xc0,$xt2
1118         pxor            $xd0,$xt3
1119
1120          movdqu         $xt0,0x00($out)
1121         movdqu          0x40($inp),$xt0
1122          movdqu         $xt1,0x10($out)
1123         movdqu          0x50($inp),$xt1
1124          movdqu         $xt2,0x20($out)
1125         movdqu          0x60($inp),$xt2
1126          movdqu         $xt3,0x30($out)
1127         movdqu          0x70($inp),$xt3
1128         lea             0x80($inp),$inp         # size optimization
1129         pxor            0x10(%rsp),$xt0
1130         pxor            $xb1,$xt1
1131         pxor            $xc1,$xt2
1132         pxor            $xd1,$xt3
1133
1134          movdqu         $xt0,0x40($out)
1135         movdqu          0x00($inp),$xt0
1136          movdqu         $xt1,0x50($out)
1137         movdqu          0x10($inp),$xt1
1138          movdqu         $xt2,0x60($out)
1139         movdqu          0x20($inp),$xt2
1140          movdqu         $xt3,0x70($out)
1141          lea            0x80($out),$out         # size optimization
1142         movdqu          0x30($inp),$xt3
1143         pxor            0x20(%rsp),$xt0
1144         pxor            $xb2,$xt1
1145         pxor            $xc2,$xt2
1146         pxor            $xd2,$xt3
1147         movdqu          $xt0,0x00($out)
1148         movdqu          $xt1,0x10($out)
1149         movdqu          $xt2,0x20($out)
1150         movdqu          $xt3,0x30($out)
1151         je              .Ldone4x
1152
1153         movdqa          0x30(%rsp),$xt0         # $xaN is offloaded, remember?
1154         lea             0x40($inp),$inp         # inp+=64*3
1155         xor             %r10,%r10
1156         movdqa          $xt0,0x00(%rsp)
1157         movdqa          $xb3,0x10(%rsp)
1158         lea             0x40($out),$out         # out+=64*3
1159         movdqa          $xc3,0x20(%rsp)
1160         sub             \$192,$len              # len-=64*3
1161         movdqa          $xd3,0x30(%rsp)
1162
1163 .Loop_tail4x:
1164         movzb           ($inp,%r10),%eax
1165         movzb           (%rsp,%r10),%ecx
1166         lea             1(%r10),%r10
1167         xor             %ecx,%eax
1168         mov             %al,-1($out,%r10)
1169         dec             $len
1170         jnz             .Loop_tail4x
1171
1172 .Ldone4x:
1173 ___
1174 $code.=<<___    if ($win64);
1175         movaps          -0xa8(%r9),%xmm6
1176         movaps          -0x98(%r9),%xmm7
1177         movaps          -0x88(%r9),%xmm8
1178         movaps          -0x78(%r9),%xmm9
1179         movaps          -0x68(%r9),%xmm10
1180         movaps          -0x58(%r9),%xmm11
1181         movaps          -0x48(%r9),%xmm12
1182         movaps          -0x38(%r9),%xmm13
1183         movaps          -0x28(%r9),%xmm14
1184         movaps          -0x18(%r9),%xmm15
1185 ___
1186 $code.=<<___;
1187         lea             (%r9),%rsp
1188 .cfi_def_cfa_register   %rsp
1189 .L4x_epilogue:
1190         ret
1191 .cfi_endproc
1192 .size   ChaCha20_4x,.-ChaCha20_4x
1193 ___
1194 }
1195
1196 ########################################################################
1197 # XOP code path that handles all lengths.
1198 if ($avx) {
1199 # There is some "anomaly" observed depending on instructions' size or
1200 # alignment. If you look closely at below code you'll notice that
1201 # sometimes argument order varies. The order affects instruction
1202 # encoding by making it larger, and such fiddling gives 5% performance
1203 # improvement. This is on FX-4100...
1204
# Map all 16 %xmm registers onto the 4x4 ChaCha state.  Each register
# holds one state-word position from four independent blocks ("smashed
# by lanes" via the vpshufd broadcasts below), so four blocks are
# processed in parallel.
1205 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1206     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
# @xx lists the registers in ChaCha state order 0..15 so that
# XOP_lane_ROUND below can address them by state index.
1207 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1208          $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1209
# Emit one ChaCha "lane round": four independent quarter-rounds, one per
# column (even rounds) or diagonal (odd rounds), interleaved
# instruction-by-instruction across the four quarter-rounds to hide
# latency.  Returns a list of Perl snippet strings which the caller
# eval()s to append the actual instructions to $code.
1210 sub XOP_lane_ROUND {
1211 my ($a0,$b0,$c0,$d0)=@_;
# Derive the remaining three quarter-rounds' state indices: keep the row
# ($_&~3) and step to the next position within it (($_+1)&3), so
# (0,4,8,12) yields (1,5,9,13), then (2,6,10,14), then (3,7,11,15).
1212 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1213 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1214 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Quote each register name so it survives the caller's eval as a string.
1215 my @x=map("\"$_\"",@xx);
1216
# XOP's vprotd rotates left by an immediate, so each add/xor/rotate step
# of the quarter-round is exactly three instructions (no shift+shift+or
# emulation needed).  Lines marked "flip" swap the commutative source
# operands on purpose -- see the encoding-size note at the top of this
# code path.
1217         (
1218         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1219          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1220           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1221            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1222         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1223          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1224           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1225            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1226         "&vprotd        (@x[$d0],@x[$d0],16)",
1227          "&vprotd       (@x[$d1],@x[$d1],16)",
1228           "&vprotd      (@x[$d2],@x[$d2],16)",
1229            "&vprotd     (@x[$d3],@x[$d3],16)",
1230
1231         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1232          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1233           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1234            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1235         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1236          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1237           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1238            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1239         "&vprotd        (@x[$b0],@x[$b0],12)",
1240          "&vprotd       (@x[$b1],@x[$b1],12)",
1241           "&vprotd      (@x[$b2],@x[$b2],12)",
1242            "&vprotd     (@x[$b3],@x[$b3],12)",
1243
1244         "&vpaddd        (@x[$a0],@x[$b0],@x[$a0])",     # flip
1245          "&vpaddd       (@x[$a1],@x[$b1],@x[$a1])",     # flip
1246           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
1247            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
1248         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1249          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1250           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1251            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1252         "&vprotd        (@x[$d0],@x[$d0],8)",
1253          "&vprotd       (@x[$d1],@x[$d1],8)",
1254           "&vprotd      (@x[$d2],@x[$d2],8)",
1255            "&vprotd     (@x[$d3],@x[$d3],8)",
1256
1257         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1258          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1259           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1260            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1261         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1262          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1263           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1264            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1265         "&vprotd        (@x[$b0],@x[$b0],7)",
1266          "&vprotd       (@x[$b1],@x[$b1],7)",
1267           "&vprotd      (@x[$b2],@x[$b2],7)",
1268            "&vprotd     (@x[$b3],@x[$b3],7)"
1269         );
1270 }
1271
# Extra frame space: room for ten 16-byte xmm saves on Win64, just the
# return-address slack otherwise.
1272 my $xframe = $win64 ? 0xa8 : 8;
1273
# Function prologue: %r9 doubles as the frame pointer so the epilogue
# can restore %rsp with a single lea.
1274 $code.=<<___;
1275 .type   ChaCha20_4xop,\@function,5
1276 .align  32
1277 ChaCha20_4xop:
1278 .cfi_startproc
1279 .LChaCha20_4xop:
1280         mov             %rsp,%r9                # frame pointer
1281 .cfi_def_cfa_register   %r9
1282         sub             \$0x140+$xframe,%rsp
1283 ___
1284         ################ stack layout
1285         # +0x00         SIMD equivalent of @x[8-12]
1286         # ...
1287         # +0x40         constant copy of key[0-2] smashed by lanes
1288         # ...
1289         # +0x100        SIMD counters (with nonce smashed by lanes)
1290         # ...
1291         # +0x140
# Windows build only: save xmm6-15 below the frame pointer; the
# epilogue at .Ldone4xop restores them from the same slots.
1292 $code.=<<___    if ($win64);
1293         movaps          %xmm6,-0xa8(%r9)
1294         movaps          %xmm7,-0x98(%r9)
1295         movaps          %xmm8,-0x88(%r9)
1296         movaps          %xmm9,-0x78(%r9)
1297         movaps          %xmm10,-0x68(%r9)
1298         movaps          %xmm11,-0x58(%r9)
1299         movaps          %xmm12,-0x48(%r9)
1300         movaps          %xmm13,-0x38(%r9)
1301         movaps          %xmm14,-0x28(%r9)
1302         movaps          %xmm15,-0x18(%r9)
1303 .L4xop_body:
1304 ___
# Load the four input rows (sigma constants, two key halves, counter+
# nonce), broadcast each 32-bit word across a register of its own
# ("smash by lanes"), and stash the smashed constants on the stack so
# .Loop_outer4xop can reload them for each 4-block batch.  The four
# per-lane counters get .Linc added so lanes encrypt consecutive blocks.
1305 $code.=<<___;
1306         vzeroupper
1307
1308         vmovdqa         .Lsigma(%rip),$xa3      # key[0]
1309         vmovdqu         ($key),$xb3             # key[1]
1310         vmovdqu         16($key),$xt3           # key[2]
1311         vmovdqu         ($counter),$xd3         # key[3]
1312         lea             0x100(%rsp),%rcx        # size optimization
1313
1314         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1315         vpshufd         \$0x55,$xa3,$xa1
1316         vmovdqa         $xa0,0x40(%rsp)         # ... and offload
1317         vpshufd         \$0xaa,$xa3,$xa2
1318         vmovdqa         $xa1,0x50(%rsp)
1319         vpshufd         \$0xff,$xa3,$xa3
1320         vmovdqa         $xa2,0x60(%rsp)
1321         vmovdqa         $xa3,0x70(%rsp)
1322
1323         vpshufd         \$0x00,$xb3,$xb0
1324         vpshufd         \$0x55,$xb3,$xb1
1325         vmovdqa         $xb0,0x80-0x100(%rcx)
1326         vpshufd         \$0xaa,$xb3,$xb2
1327         vmovdqa         $xb1,0x90-0x100(%rcx)
1328         vpshufd         \$0xff,$xb3,$xb3
1329         vmovdqa         $xb2,0xa0-0x100(%rcx)
1330         vmovdqa         $xb3,0xb0-0x100(%rcx)
1331
1332         vpshufd         \$0x00,$xt3,$xt0        # "$xc0"
1333         vpshufd         \$0x55,$xt3,$xt1        # "$xc1"
1334         vmovdqa         $xt0,0xc0-0x100(%rcx)
1335         vpshufd         \$0xaa,$xt3,$xt2        # "$xc2"
1336         vmovdqa         $xt1,0xd0-0x100(%rcx)
1337         vpshufd         \$0xff,$xt3,$xt3        # "$xc3"
1338         vmovdqa         $xt2,0xe0-0x100(%rcx)
1339         vmovdqa         $xt3,0xf0-0x100(%rcx)
1340
1341         vpshufd         \$0x00,$xd3,$xd0
1342         vpshufd         \$0x55,$xd3,$xd1
1343         vpaddd          .Linc(%rip),$xd0,$xd0   # don't save counters yet
1344         vpshufd         \$0xaa,$xd3,$xd2
1345         vmovdqa         $xd1,0x110-0x100(%rcx)
1346         vpshufd         \$0xff,$xd3,$xd3
1347         vmovdqa         $xd2,0x120-0x100(%rcx)
1348         vmovdqa         $xd3,0x130-0x100(%rcx)
1349
1350         jmp             .Loop_enter4xop
1351
1352 .align  32
1353 .Loop_outer4xop:
1354         vmovdqa         0x40(%rsp),$xa0         # re-load smashed key
1355         vmovdqa         0x50(%rsp),$xa1
1356         vmovdqa         0x60(%rsp),$xa2
1357         vmovdqa         0x70(%rsp),$xa3
1358         vmovdqa         0x80-0x100(%rcx),$xb0
1359         vmovdqa         0x90-0x100(%rcx),$xb1
1360         vmovdqa         0xa0-0x100(%rcx),$xb2
1361         vmovdqa         0xb0-0x100(%rcx),$xb3
1362         vmovdqa         0xc0-0x100(%rcx),$xt0   # "$xc0"
1363         vmovdqa         0xd0-0x100(%rcx),$xt1   # "$xc1"
1364         vmovdqa         0xe0-0x100(%rcx),$xt2   # "$xc2"
1365         vmovdqa         0xf0-0x100(%rcx),$xt3   # "$xc3"
1366         vmovdqa         0x100-0x100(%rcx),$xd0
1367         vmovdqa         0x110-0x100(%rcx),$xd1
1368         vmovdqa         0x120-0x100(%rcx),$xd2
1369         vmovdqa         0x130-0x100(%rcx),$xd3
1370         vpaddd          .Lfour(%rip),$xd0,$xd0  # next SIMD counters
1371
1372 .Loop_enter4xop:
1373         mov             \$10,%eax
1374         vmovdqa         $xd0,0x100-0x100(%rcx)  # save SIMD counters
1375         jmp             .Loop4xop
1376
1377 .align  32
1378 .Loop4xop:
1379 ___
# One double round per .Loop4xop iteration: the even round works the
# columns (0,4,8,12), the odd round the diagonals (0,5,10,15); %eax is
# preset to 10 above, giving the 20 rounds of ChaCha20.
1380         foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1381         foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1382 $code.=<<___;
1383         dec             %eax
1384         jnz             .Loop4xop
1385
1386         vpaddd          0x40(%rsp),$xa0,$xa0    # accumulate key material
1387         vpaddd          0x50(%rsp),$xa1,$xa1
1388         vpaddd          0x60(%rsp),$xa2,$xa2
1389         vpaddd          0x70(%rsp),$xa3,$xa3
1390
1391         vmovdqa         $xt2,0x20(%rsp)         # offload $xc2,3
1392         vmovdqa         $xt3,0x30(%rsp)
1393
1394         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1395         vpunpckldq      $xa3,$xa2,$xt3
1396         vpunpckhdq      $xa1,$xa0,$xa0
1397         vpunpckhdq      $xa3,$xa2,$xa2
1398         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1399         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1400         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1401         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1402 ___
# Rename the Perl variables to track which physical register now holds
# which logical value after the transpose above (see the "a0".."a3"
# annotations); the emitted instructions are unaffected.
1403         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1404 $code.=<<___;
1405         vpaddd          0x80-0x100(%rcx),$xb0,$xb0
1406         vpaddd          0x90-0x100(%rcx),$xb1,$xb1
1407         vpaddd          0xa0-0x100(%rcx),$xb2,$xb2
1408         vpaddd          0xb0-0x100(%rcx),$xb3,$xb3
1409
1410         vmovdqa         $xa0,0x00(%rsp)         # offload $xa0,1
1411         vmovdqa         $xa1,0x10(%rsp)
1412         vmovdqa         0x20(%rsp),$xa0         # "xc2"
1413         vmovdqa         0x30(%rsp),$xa1         # "xc3"
1414
1415         vpunpckldq      $xb1,$xb0,$xt2
1416         vpunpckldq      $xb3,$xb2,$xt3
1417         vpunpckhdq      $xb1,$xb0,$xb0
1418         vpunpckhdq      $xb3,$xb2,$xb2
1419         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1420         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1421         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1422         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1423 ___
# Same register-tracking rename for the "b" row, then alias the "c" row
# onto $xt0,$xt1 plus the two registers just reloaded from the stack.
1424         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1425         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1426 $code.=<<___;
1427         vpaddd          0xc0-0x100(%rcx),$xc0,$xc0
1428         vpaddd          0xd0-0x100(%rcx),$xc1,$xc1
1429         vpaddd          0xe0-0x100(%rcx),$xc2,$xc2
1430         vpaddd          0xf0-0x100(%rcx),$xc3,$xc3
1431
1432         vpunpckldq      $xc1,$xc0,$xt2
1433         vpunpckldq      $xc3,$xc2,$xt3
1434         vpunpckhdq      $xc1,$xc0,$xc0
1435         vpunpckhdq      $xc3,$xc2,$xc2
1436         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1437         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1438         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1439         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1440 ___
1441         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1442 $code.=<<___;
1443         vpaddd          0x100-0x100(%rcx),$xd0,$xd0
1444         vpaddd          0x110-0x100(%rcx),$xd1,$xd1
1445         vpaddd          0x120-0x100(%rcx),$xd2,$xd2
1446         vpaddd          0x130-0x100(%rcx),$xd3,$xd3
1447
1448         vpunpckldq      $xd1,$xd0,$xt2
1449         vpunpckldq      $xd3,$xd2,$xt3
1450         vpunpckhdq      $xd1,$xd0,$xd0
1451         vpunpckhdq      $xd3,$xd2,$xd2
1452         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1453         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1454         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1455         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1456 ___
# Final renames: track the "d" row registers, then point $xa0,$xa1 at
# the scratch registers about to be refilled from the stack at
# 0x00/0x10(%rsp) (the offloaded "a" values).
1457         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1458         ($xa0,$xa1)=($xt2,$xt3);
# Output stage: when a full 4-block batch (256 bytes) remains, xor the
# keystream registers with the input and store, then loop; otherwise
# fall into .Ltail4xop, which handles 192/128/64-byte prefixes and
# finishes any sub-64-byte remainder one byte at a time via
# .Loop_tail4xop (keystream parked on the stack, %r10 as byte index).
1459 $code.=<<___;
1460         vmovdqa         0x00(%rsp),$xa0         # restore $xa0,1
1461         vmovdqa         0x10(%rsp),$xa1
1462
1463         cmp             \$64*4,$len
1464         jb              .Ltail4xop
1465
1466         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1467         vpxor           0x10($inp),$xb0,$xb0
1468         vpxor           0x20($inp),$xc0,$xc0
1469         vpxor           0x30($inp),$xd0,$xd0
1470         vpxor           0x40($inp),$xa1,$xa1
1471         vpxor           0x50($inp),$xb1,$xb1
1472         vpxor           0x60($inp),$xc1,$xc1
1473         vpxor           0x70($inp),$xd1,$xd1
1474         lea             0x80($inp),$inp         # size optimization
1475         vpxor           0x00($inp),$xa2,$xa2
1476         vpxor           0x10($inp),$xb2,$xb2
1477         vpxor           0x20($inp),$xc2,$xc2
1478         vpxor           0x30($inp),$xd2,$xd2
1479         vpxor           0x40($inp),$xa3,$xa3
1480         vpxor           0x50($inp),$xb3,$xb3
1481         vpxor           0x60($inp),$xc3,$xc3
1482         vpxor           0x70($inp),$xd3,$xd3
1483         lea             0x80($inp),$inp         # inp+=64*4
1484
1485         vmovdqu         $xa0,0x00($out)
1486         vmovdqu         $xb0,0x10($out)
1487         vmovdqu         $xc0,0x20($out)
1488         vmovdqu         $xd0,0x30($out)
1489         vmovdqu         $xa1,0x40($out)
1490         vmovdqu         $xb1,0x50($out)
1491         vmovdqu         $xc1,0x60($out)
1492         vmovdqu         $xd1,0x70($out)
1493         lea             0x80($out),$out         # size optimization
1494         vmovdqu         $xa2,0x00($out)
1495         vmovdqu         $xb2,0x10($out)
1496         vmovdqu         $xc2,0x20($out)
1497         vmovdqu         $xd2,0x30($out)
1498         vmovdqu         $xa3,0x40($out)
1499         vmovdqu         $xb3,0x50($out)
1500         vmovdqu         $xc3,0x60($out)
1501         vmovdqu         $xd3,0x70($out)
1502         lea             0x80($out),$out         # out+=64*4
1503
1504         sub             \$64*4,$len
1505         jnz             .Loop_outer4xop
1506
1507         jmp             .Ldone4xop
1508
1509 .align  32
1510 .Ltail4xop:
1511         cmp             \$192,$len
1512         jae             .L192_or_more4xop
1513         cmp             \$128,$len
1514         jae             .L128_or_more4xop
1515         cmp             \$64,$len
1516         jae             .L64_or_more4xop
1517
1518         xor             %r10,%r10
1519         vmovdqa         $xa0,0x00(%rsp)
1520         vmovdqa         $xb0,0x10(%rsp)
1521         vmovdqa         $xc0,0x20(%rsp)
1522         vmovdqa         $xd0,0x30(%rsp)
1523         jmp             .Loop_tail4xop
1524
1525 .align  32
1526 .L64_or_more4xop:
1527         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1528         vpxor           0x10($inp),$xb0,$xb0
1529         vpxor           0x20($inp),$xc0,$xc0
1530         vpxor           0x30($inp),$xd0,$xd0
1531         vmovdqu         $xa0,0x00($out)
1532         vmovdqu         $xb0,0x10($out)
1533         vmovdqu         $xc0,0x20($out)
1534         vmovdqu         $xd0,0x30($out)
1535         je              .Ldone4xop
1536
1537         lea             0x40($inp),$inp         # inp+=64*1
1538         vmovdqa         $xa1,0x00(%rsp)
1539         xor             %r10,%r10
1540         vmovdqa         $xb1,0x10(%rsp)
1541         lea             0x40($out),$out         # out+=64*1
1542         vmovdqa         $xc1,0x20(%rsp)
1543         sub             \$64,$len               # len-=64*1
1544         vmovdqa         $xd1,0x30(%rsp)
1545         jmp             .Loop_tail4xop
1546
1547 .align  32
1548 .L128_or_more4xop:
1549         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1550         vpxor           0x10($inp),$xb0,$xb0
1551         vpxor           0x20($inp),$xc0,$xc0
1552         vpxor           0x30($inp),$xd0,$xd0
1553         vpxor           0x40($inp),$xa1,$xa1
1554         vpxor           0x50($inp),$xb1,$xb1
1555         vpxor           0x60($inp),$xc1,$xc1
1556         vpxor           0x70($inp),$xd1,$xd1
1557
1558         vmovdqu         $xa0,0x00($out)
1559         vmovdqu         $xb0,0x10($out)
1560         vmovdqu         $xc0,0x20($out)
1561         vmovdqu         $xd0,0x30($out)
1562         vmovdqu         $xa1,0x40($out)
1563         vmovdqu         $xb1,0x50($out)
1564         vmovdqu         $xc1,0x60($out)
1565         vmovdqu         $xd1,0x70($out)
1566         je              .Ldone4xop
1567
1568         lea             0x80($inp),$inp         # inp+=64*2
1569         vmovdqa         $xa2,0x00(%rsp)
1570         xor             %r10,%r10
1571         vmovdqa         $xb2,0x10(%rsp)
1572         lea             0x80($out),$out         # out+=64*2
1573         vmovdqa         $xc2,0x20(%rsp)
1574         sub             \$128,$len              # len-=64*2
1575         vmovdqa         $xd2,0x30(%rsp)
1576         jmp             .Loop_tail4xop
1577
1578 .align  32
1579 .L192_or_more4xop:
1580         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1581         vpxor           0x10($inp),$xb0,$xb0
1582         vpxor           0x20($inp),$xc0,$xc0
1583         vpxor           0x30($inp),$xd0,$xd0
1584         vpxor           0x40($inp),$xa1,$xa1
1585         vpxor           0x50($inp),$xb1,$xb1
1586         vpxor           0x60($inp),$xc1,$xc1
1587         vpxor           0x70($inp),$xd1,$xd1
1588         lea             0x80($inp),$inp         # size optimization
1589         vpxor           0x00($inp),$xa2,$xa2
1590         vpxor           0x10($inp),$xb2,$xb2
1591         vpxor           0x20($inp),$xc2,$xc2
1592         vpxor           0x30($inp),$xd2,$xd2
1593
1594         vmovdqu         $xa0,0x00($out)
1595         vmovdqu         $xb0,0x10($out)
1596         vmovdqu         $xc0,0x20($out)
1597         vmovdqu         $xd0,0x30($out)
1598         vmovdqu         $xa1,0x40($out)
1599         vmovdqu         $xb1,0x50($out)
1600         vmovdqu         $xc1,0x60($out)
1601         vmovdqu         $xd1,0x70($out)
1602         lea             0x80($out),$out         # size optimization
1603         vmovdqu         $xa2,0x00($out)
1604         vmovdqu         $xb2,0x10($out)
1605         vmovdqu         $xc2,0x20($out)
1606         vmovdqu         $xd2,0x30($out)
1607         je              .Ldone4xop
1608
1609         lea             0x40($inp),$inp         # inp+=64*3
1610         vmovdqa         $xa3,0x00(%rsp)
1611         xor             %r10,%r10
1612         vmovdqa         $xb3,0x10(%rsp)
1613         lea             0x40($out),$out         # out+=64*3
1614         vmovdqa         $xc3,0x20(%rsp)
1615         sub             \$192,$len              # len-=64*3
1616         vmovdqa         $xd3,0x30(%rsp)
1617
1618 .Loop_tail4xop:
1619         movzb           ($inp,%r10),%eax
1620         movzb           (%rsp,%r10),%ecx
1621         lea             1(%r10),%r10
1622         xor             %ecx,%eax
1623         mov             %al,-1($out,%r10)
1624         dec             $len
1625         jnz             .Loop_tail4xop
1626
1627 .Ldone4xop:
1628         vzeroupper
1629 ___
1630 $code.=<<___    if ($win64);
1631         movaps          -0xa8(%r9),%xmm6
1632         movaps          -0x98(%r9),%xmm7
1633         movaps          -0x88(%r9),%xmm8
1634         movaps          -0x78(%r9),%xmm9
1635         movaps          -0x68(%r9),%xmm10
1636         movaps          -0x58(%r9),%xmm11
1637         movaps          -0x48(%r9),%xmm12
1638         movaps          -0x38(%r9),%xmm13
1639         movaps          -0x28(%r9),%xmm14
1640         movaps          -0x18(%r9),%xmm15
1641 ___
1642 $code.=<<___;
1643         lea             (%r9),%rsp
1644 .cfi_def_cfa_register   %rsp
1645 .L4xop_epilogue:
1646         ret
1647 .cfi_endproc
1648 .size   ChaCha20_4xop,.-ChaCha20_4xop
1649 ___
1650 }
1651
1652 ########################################################################
1653 # AVX2 code path
1654 if ($avx>1) {
1655 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1656     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1657 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1658         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1659
# Emit one ChaCha20 "lane round" (four quarter-rounds, software-pipelined
# in pairs) as a list of strings; the caller evals each string to append
# the instruction to $code.  Arguments are the state indices (a,b,c,d) of
# the FIRST quarter-round; the other three are derived by rotating the
# index within each group of four (see the map() lines below).
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
# ($_&~3) keeps the group-of-four base, (($_+1)&3) steps within the
# group — this turns (0,4,8,12) into (1,5,9,13) etc., and likewise for
# the odd-round diagonal pattern.
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Quote register names so the returned strings interpolate them at
# eval time rather than now.
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	 "&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",
	"&vpsrld	(@x[$b0],@x[$b0],20)",
	"&vpor		(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d1])",
	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
	 "&vpslld	($t1,@x[$b1],12)",
	 "&vpsrld	(@x[$b1],@x[$b1],20)",
	 "&vpor		(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	 "&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor		(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d1])",
	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
	 "&vpslld	($t0,@x[$b1],7)",
	 "&vpsrld	(@x[$b1],@x[$b1],25)",
	 "&vpor		(@x[$b1],$t0,@x[$b1])",

	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	 "&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	 "&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor		(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d3])",
	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
	 "&vpslld	($t1,@x[$b3],12)",
	 "&vpsrld	(@x[$b3],@x[$b3],20)",
	 "&vpor		(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	 "&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor		(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d3])",
	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
	 "&vpslld	($t0,@x[$b3],7)",
	 "&vpsrld	(@x[$b3],@x[$b3],25)",
	 "&vpor		(@x[$b3],$t0,@x[$b3])"
	);
}
1773
# Extra frame beyond the 0x280-byte working area: room to spill the ten
# non-volatile xmm registers on Win64, just the alignment slack elsewhere.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_8x,\@function,5
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub		\$0x280+$xframe,%rsp
	and		\$-32,%rsp
___
# Win64 ABI: xmm6-15 are callee-saved; spill them relative to the frame
# register %r9 so the epilogue (and SEH handler) can restore them.
$code.=<<___	if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L8x_body:
___
# Broadcast the key material and "smash" each state word across all eight
# lanes; constants are offloaded to the stack (addressed via %rcx/%rax to
# keep displacements short), counters get the per-lane increments.
$code.=<<___;
	vzeroupper

	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x80		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x200	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x280

	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xt3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]
	lea		0x100(%rsp),%rcx	# size optimization
	lea		0x200(%rsp),%rax	# size optimization
	lea		.Lrot16(%rip),%r10
	lea		.Lrot24(%rip),%r11

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vmovdqa		$xa0,0x80-0x100(%rcx)	# ... and offload
	vpshufd		\$0xaa,$xa3,$xa2
	vmovdqa		$xa1,0xa0-0x100(%rcx)
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa		$xa2,0xc0-0x100(%rcx)
	vmovdqa		$xa3,0xe0-0x100(%rcx)

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vmovdqa		$xb0,0x100-0x100(%rcx)
	vpshufd		\$0xaa,$xb3,$xb2
	vmovdqa		$xb1,0x120-0x100(%rcx)
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa		$xb2,0x140-0x100(%rcx)
	vmovdqa		$xb3,0x160-0x100(%rcx)

	vpshufd		\$0x00,$xt3,$xt0	# "xc0"
	vpshufd		\$0x55,$xt3,$xt1	# "xc1"
	vmovdqa		$xt0,0x180-0x200(%rax)
	vpshufd		\$0xaa,$xt3,$xt2	# "xc2"
	vmovdqa		$xt1,0x1a0-0x200(%rax)
	vpshufd		\$0xff,$xt3,$xt3	# "xc3"
	vmovdqa		$xt2,0x1c0-0x200(%rax)
	vmovdqa		$xt3,0x1e0-0x200(%rax)

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd		\$0xaa,$xd3,$xd2
	vmovdqa		$xd1,0x220-0x200(%rax)
	vpshufd		\$0xff,$xd3,$xd3
	vmovdqa		$xd2,0x240-0x200(%rax)
	vmovdqa		$xd3,0x260-0x200(%rax)

	jmp		.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa		0x80-0x100(%rcx),$xa0	# re-load smashed key
	vmovdqa		0xa0-0x100(%rcx),$xa1
	vmovdqa		0xc0-0x100(%rcx),$xa2
	vmovdqa		0xe0-0x100(%rcx),$xa3
	vmovdqa		0x100-0x100(%rcx),$xb0
	vmovdqa		0x120-0x100(%rcx),$xb1
	vmovdqa		0x140-0x100(%rcx),$xb2
	vmovdqa		0x160-0x100(%rcx),$xb3
	vmovdqa		0x180-0x200(%rax),$xt0	# "xc0"
	vmovdqa		0x1a0-0x200(%rax),$xt1	# "xc1"
	vmovdqa		0x1c0-0x200(%rax),$xt2	# "xc2"
	vmovdqa		0x1e0-0x200(%rax),$xt3	# "xc3"
	vmovdqa		0x200-0x200(%rax),$xd0
	vmovdqa		0x220-0x200(%rax),$xd1
	vmovdqa		0x240-0x200(%rax),$xd2
	vmovdqa		0x260-0x200(%rax),$xd3
	vpaddd		.Leight(%rip),$xd0,$xd0	# next SIMD counters

.Loop_enter8x:
	vmovdqa		$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
	vmovdqa		$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
	vbroadcasti128	(%r10),$xt3
	vmovdqa		$xd0,0x200-0x200(%rax)	# save SIMD counters
	mov		\$10,%eax
	jmp		.Loop8x

.align	32
.Loop8x:
___
	# One even ("column") round on indices (0,4,8,12) and one odd
	# ("diagonal") round on (0,5,10,15); each eval appends one
	# instruction to $code.  The loop body runs 10 times (%eax).
	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
# Accumulate the original key into the 'a' and 'b' rows, then transpose
# ("de-interlace") so that each register holds 64 consecutive output
# bytes of one lane rather than one word of all eight lanes.
$code.=<<___;
	dec		%eax
	jnz		.Loop8x

	lea		0x200(%rsp),%rax	# size optimization
	vpaddd		0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
	vpaddd		0xa0-0x100(%rcx),$xa1,$xa1
	vpaddd		0xc0-0x100(%rcx),$xa2,$xa2
	vpaddd		0xe0-0x100(%rcx),$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	# Perl-level rename: track which register now holds which logical
	# value after the transpose, so the following heredocs stay readable.
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd		0x100-0x100(%rcx),$xb0,$xb0
	vpaddd		0x120-0x100(%rcx),$xb1,$xb1
	vpaddd		0x140-0x100(%rcx),$xb2,$xb2
	vpaddd		0x160-0x100(%rcx),$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xb0,$xa0,$xb0
	vperm2i128	\$0x20,$xb1,$xa1,$xa0
	vperm2i128	\$0x31,$xb1,$xa1,$xb1
	vperm2i128	\$0x20,$xb2,$xa2,$xa1
	vperm2i128	\$0x31,$xb2,$xa2,$xb2
	vperm2i128	\$0x20,$xb3,$xa3,$xa2
	vperm2i128	\$0x31,$xb3,$xa3,$xb3
___
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
# Same accumulate-and-transpose treatment for the 'c' row (reloaded from
# its stack slots) and the 'd' row (counters).  $xa0/$xa1 are parked on
# the stack while their registers serve as "c2"/"c3".
$code.=<<___;
	vmovdqa		$xa0,0x00(%rsp)		# offload $xaN
	vmovdqa		$xa1,0x20(%rsp)
	vmovdqa		0x40(%rsp),$xc2		# $xa0
	vmovdqa		0x60(%rsp),$xc3		# $xa1

	vpaddd		0x180-0x200(%rax),$xc0,$xc0
	vpaddd		0x1a0-0x200(%rax),$xc1,$xc1
	vpaddd		0x1c0-0x200(%rax),$xc2,$xc2
	vpaddd		0x1e0-0x200(%rax),$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd		0x200-0x200(%rax),$xd0,$xd0
	vpaddd		0x220-0x200(%rax),$xd1,$xd1
	vpaddd		0x240-0x200(%rax),$xd2,$xd2
	vpaddd		0x260-0x200(%rax),$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3
___
	# Final renames: swap the logical 'b' and 'c' banks (their physical
	# registers traded places during the transposes) and reclaim two
	# temporaries as "a0"/"a1".
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
	($xa0,$xa1)=($xt2,$xt3);
# Bulk store path (full 512-byte blocks) followed by the tail ladder:
# each .LNNN_or_more8x label xors/stores as many whole 64-byte units as
# fit, then the sub-64-byte remainder is handled byte-by-byte via the
# keystream copy parked at (%rsp) in .Loop_tail8x.
$code.=<<___;
	vmovdqa		0x00(%rsp),$xa0		# $xaN was offloaded, remember?
	vmovdqa		0x20(%rsp),$xa1

	cmp		\$64*8,$len
	jb		.Ltail8x

	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	lea		0x80($out),$out		# size optimization

	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vpxor		0x40($inp),$xc1,$xc1
	vpxor		0x60($inp),$xd1,$xd1
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa1,0x00($out)
	vmovdqu		$xb1,0x20($out)
	vmovdqu		$xc1,0x40($out)
	vmovdqu		$xd1,0x60($out)
	lea		0x80($out),$out		# size optimization

	vpxor		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vpxor		0x40($inp),$xc2,$xc2
	vpxor		0x60($inp),$xd2,$xd2
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa2,0x00($out)
	vmovdqu		$xb2,0x20($out)
	vmovdqu		$xc2,0x40($out)
	vmovdqu		$xd2,0x60($out)
	lea		0x80($out),$out		# size optimization

	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vpxor		0x40($inp),$xc3,$xc3
	vpxor		0x60($inp),$xd3,$xd3
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa3,0x00($out)
	vmovdqu		$xb3,0x20($out)
	vmovdqu		$xc3,0x40($out)
	vmovdqu		$xd3,0x60($out)
	lea		0x80($out),$out		# size optimization

	sub		\$64*8,$len
	jnz		.Loop_outer8x

	jmp		.Ldone8x

.Ltail8x:
	cmp		\$448,$len
	jae		.L448_or_more8x
	cmp		\$384,$len
	jae		.L384_or_more8x
	cmp		\$320,$len
	jae		.L320_or_more8x
	cmp		\$256,$len
	jae		.L256_or_more8x
	cmp		\$192,$len
	jae		.L192_or_more8x
	cmp		\$128,$len
	jae		.L128_or_more8x
	cmp		\$64,$len
	jae		.L64_or_more8x

	xor		%r10,%r10
	vmovdqa		$xa0,0x00(%rsp)
	vmovdqa		$xb0,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	je		.Ldone8x

	lea		0x40($inp),$inp		# inp+=64*1
	xor		%r10,%r10
	vmovdqa		$xc0,0x00(%rsp)
	lea		0x40($out),$out		# out+=64*1
	sub		\$64,$len		# len-=64*1
	vmovdqa		$xd0,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	je		.Ldone8x

	lea		0x80($inp),$inp		# inp+=64*2
	xor		%r10,%r10
	vmovdqa		$xa1,0x00(%rsp)
	lea		0x80($out),$out		# out+=64*2
	sub		\$128,$len		# len-=64*2
	vmovdqa		$xb1,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	je		.Ldone8x

	lea		0xc0($inp),$inp		# inp+=64*3
	xor		%r10,%r10
	vmovdqa		$xc1,0x00(%rsp)
	lea		0xc0($out),$out		# out+=64*3
	sub		\$192,$len		# len-=64*3
	vmovdqa		$xd1,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	je		.Ldone8x

	lea		0x100($inp),$inp	# inp+=64*4
	xor		%r10,%r10
	vmovdqa		$xa2,0x00(%rsp)
	lea		0x100($out),$out	# out+=64*4
	sub		\$256,$len		# len-=64*4
	vmovdqa		$xb2,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vpxor		0x100($inp),$xa2,$xa2
	vpxor		0x120($inp),$xb2,$xb2
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	vmovdqu		$xa2,0x100($out)
	vmovdqu		$xb2,0x120($out)
	je		.Ldone8x

	lea		0x140($inp),$inp	# inp+=64*5
	xor		%r10,%r10
	vmovdqa		$xc2,0x00(%rsp)
	lea		0x140($out),$out	# out+=64*5
	sub		\$320,$len		# len-=64*5
	vmovdqa		$xd2,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vpxor		0x100($inp),$xa2,$xa2
	vpxor		0x120($inp),$xb2,$xb2
	vpxor		0x140($inp),$xc2,$xc2
	vpxor		0x160($inp),$xd2,$xd2
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	vmovdqu		$xa2,0x100($out)
	vmovdqu		$xb2,0x120($out)
	vmovdqu		$xc2,0x140($out)
	vmovdqu		$xd2,0x160($out)
	je		.Ldone8x

	lea		0x180($inp),$inp	# inp+=64*6
	xor		%r10,%r10
	vmovdqa		$xa3,0x00(%rsp)
	lea		0x180($out),$out	# out+=64*6
	sub		\$384,$len		# len-=64*6
	vmovdqa		$xb3,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vpxor		0x100($inp),$xa2,$xa2
	vpxor		0x120($inp),$xb2,$xb2
	vpxor		0x140($inp),$xc2,$xc2
	vpxor		0x160($inp),$xd2,$xd2
	vpxor		0x180($inp),$xa3,$xa3
	vpxor		0x1a0($inp),$xb3,$xb3
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	vmovdqu		$xa2,0x100($out)
	vmovdqu		$xb2,0x120($out)
	vmovdqu		$xc2,0x140($out)
	vmovdqu		$xd2,0x160($out)
	vmovdqu		$xa3,0x180($out)
	vmovdqu		$xb3,0x1a0($out)
	je		.Ldone8x

	lea		0x1c0($inp),$inp	# inp+=64*7
	xor		%r10,%r10
	vmovdqa		$xc3,0x00(%rsp)
	lea		0x1c0($out),$out	# out+=64*7
	sub		\$448,$len		# len-=64*7
	vmovdqa		$xd3,0x20(%rsp)

.Loop_tail8x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail8x

.Ldone8x:
	vzeroall
___
# Epilogue: restore Win64 callee-saved xmm registers from the frame
# register, then restore %rsp from %r9 (matches the prologue's CFA note).
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
___
2295 }
2296
2297 ########################################################################
2298 # AVX512 code paths
2299 if ($avx>2) {
2300 # This one handles shorter inputs...
2301
2302 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2303 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2304
sub vpxord()            # size optimization
{
    # Prefer the shorter VEX-encoded "vpxor"; the EVEX-only "vpxord" is
    # required as soon as any operand is a zmm register or a register
    # numbered 16 or above.
    my $needs_evex =
        grep { m/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2 >= 16) } @_;
    my $opcode = $needs_evex ? "vpxord" : "vpxor";

    # Operands arrive destination-first; AT&T syntax wants them reversed.
    $code .= "\t$opcode\t" . join(',', reverse @_) . "\n";
}
2317
# Emit one ChaCha quarter-round applied to whole rows (a,b,c,d), using
# AVX512 vprold for the rotates instead of shift/or pairs:
#   a+=b; d^=a; d<<<=16;  c+=d; b^=c; b<<<=12;
#   a+=b; d^=a; d<<<=8;   c+=d; b^=c; b<<<=7;
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,16);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,12);

	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,8);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,7);
}
2335
# Short-input AVX512 path: only xmm6/xmm7 need spilling on Win64 (32
# bytes plus alignment slack).
my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type	ChaCha20_avx512,\@function,5
.align	32
ChaCha20_avx512:
.cfi_startproc
.LChaCha20_avx512:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$512,$len
	ja	.LChaCha20_16x

	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512_body:
___
$code.=<<___;
	vbroadcasti32x4	.Lsigma(%rip),$a
	vbroadcasti32x4	($key),$b
	vbroadcasti32x4	16($key),$c
	vbroadcasti32x4	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd		.Lzeroz(%rip),$d,$d
	vmovdqa32	.Lfourz(%rip),$fourz
	mov		\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512

.align	16
.Loop_outer_avx512:
	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	vmovdqa32	$c_,$c
	vpaddd		$fourz,$d_,$d
	mov		\$10,$counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
___
	# Double round: quarter-round on columns, shuffle rows into
	# diagonal position, quarter-round again, then shuffle back.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		($counter);
	&jnz		(".Loop_avx512");
2396
2397 $code.=<<___;
2398         vpaddd          $a_,$a,$a
2399         vpaddd          $b_,$b,$b
2400         vpaddd          $c_,$c,$c
2401         vpaddd          $d_,$d,$d
2402
2403         sub             \$64,$len
2404         jb              .Ltail64_avx512
2405
2406         vpxor           0x00($inp),%x#$a,$t0    # xor with input
2407         vpxor           0x10($inp),%x#$b,$t1
2408         vpxor           0x20($inp),%x#$c,$t2
2409         vpxor           0x30($inp),%x#$d,$t3
2410         lea             0x40($inp),$inp         # inp+=64
2411
2412         vmovdqu         $t0,0x00($out)          # write output
2413         vmovdqu         $t1,0x10($out)
2414         vmovdqu         $t2,0x20($out)
2415         vmovdqu         $t3,0x30($out)
2416         lea             0x40($out),$out         # out+=64
2417
2418         jz              .Ldone_avx512
2419
2420         vextracti32x4   \$1,$a,$t0
2421         vextracti32x4   \$1,$b,$t1
2422         vextracti32x4   \$1,$c,$t2
2423         vextracti32x4   \$1,$d,$t3
2424
2425         sub             \$64,$len
2426         jb              .Ltail_avx512
2427
2428         vpxor           0x00($inp),$t0,$t0      # xor with input
2429         vpxor           0x10($inp),$t1,$t1
2430         vpxor           0x20($inp),$t2,$t2
2431         vpxor           0x30($inp),$t3,$t3
2432         lea             0x40($inp),$inp         # inp+=64
2433
2434         vmovdqu         $t0,0x00($out)          # write output
2435         vmovdqu         $t1,0x10($out)
2436         vmovdqu         $t2,0x20($out)
2437         vmovdqu         $t3,0x30($out)
2438         lea             0x40($out),$out         # out+=64
2439
2440         jz              .Ldone_avx512
2441
2442         vextracti32x4   \$2,$a,$t0
2443         vextracti32x4   \$2,$b,$t1
2444         vextracti32x4   \$2,$c,$t2
2445         vextracti32x4   \$2,$d,$t3
2446
2447         sub             \$64,$len
2448         jb              .Ltail_avx512
2449
2450         vpxor           0x00($inp),$t0,$t0      # xor with input
2451         vpxor           0x10($inp),$t1,$t1
2452         vpxor           0x20($inp),$t2,$t2
2453         vpxor           0x30($inp),$t3,$t3
2454         lea             0x40($inp),$inp         # inp+=64
2455
2456         vmovdqu         $t0,0x00($out)          # write output
2457         vmovdqu         $t1,0x10($out)
2458         vmovdqu         $t2,0x20($out)
2459         vmovdqu         $t3,0x30($out)
2460         lea             0x40($out),$out         # out+=64
2461
2462         jz              .Ldone_avx512
2463
2464         vextracti32x4   \$3,$a,$t0
2465         vextracti32x4   \$3,$b,$t1
2466         vextracti32x4   \$3,$c,$t2
2467         vextracti32x4   \$3,$d,$t3
2468
2469         sub             \$64,$len
2470         jb              .Ltail_avx512
2471
2472         vpxor           0x00($inp),$t0,$t0      # xor with input
2473         vpxor           0x10($inp),$t1,$t1
2474         vpxor           0x20($inp),$t2,$t2
2475         vpxor           0x30($inp),$t3,$t3
2476         lea             0x40($inp),$inp         # inp+=64
2477
2478         vmovdqu         $t0,0x00($out)          # write output
2479         vmovdqu         $t1,0x10($out)
2480         vmovdqu         $t2,0x20($out)
2481         vmovdqu         $t3,0x30($out)
2482         lea             0x40($out),$out         # out+=64
2483
2484         jnz             .Loop_outer_avx512
2485
2486         jmp             .Ldone_avx512
2487
2488 .align  16
2489 .Ltail64_avx512:
2490         vmovdqa         %x#$a,0x00(%rsp)
2491         vmovdqa         %x#$b,0x10(%rsp)
2492         vmovdqa         %x#$c,0x20(%rsp)
2493         vmovdqa         %x#$d,0x30(%rsp)
2494         add             \$64,$len
2495         jmp             .Loop_tail_avx512
2496
2497 .align  16
2498 .Ltail_avx512:
2499         vmovdqa         $t0,0x00(%rsp)
2500         vmovdqa         $t1,0x10(%rsp)
2501         vmovdqa         $t2,0x20(%rsp)
2502         vmovdqa         $t3,0x30(%rsp)
2503         add             \$64,$len
2504
2505 .Loop_tail_avx512:
2506         movzb           ($inp,$counter),%eax
2507         movzb           (%rsp,$counter),%ecx
2508         lea             1($counter),$counter
2509         xor             %ecx,%eax
2510         mov             %al,-1($out,$counter)
2511         dec             $len
2512         jnz             .Loop_tail_avx512
2513
2514         vmovdqu32       $a_,0x00(%rsp)
2515
2516 .Ldone_avx512:
2517         vzeroall
2518 ___
2519 $code.=<<___    if ($win64);
2520         movaps  -0x28(%r9),%xmm6
2521         movaps  -0x18(%r9),%xmm7
2522 ___
2523 $code.=<<___;
2524         lea     (%r9),%rsp
2525 .cfi_def_cfa_register   %rsp
2526 .Lavx512_epilogue:
2527         ret
2528 .cfi_endproc
2529 .size   ChaCha20_avx512,.-ChaCha20_avx512
2530 ___
2531
# From here on the same register variables name %ymm registers: the
# AVX512VL flavour below works on 256-bit vectors, i.e. two 64-byte
# blocks per iteration in the two 128-bit lanes.
map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);

# ChaCha20_avx512vl: handles inputs of up to 128 bytes; larger inputs
# branch to the 8-way ChaCha20_8xvl path.
$code.=<<___;
.type	ChaCha20_avx512vl,\@function,5
.align	32
ChaCha20_avx512vl:
.cfi_startproc
.LChaCha20_avx512vl:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$128,$len
	ja	.LChaCha20_8xvl

	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512vl_body:
___
# Same state setup as the avx512 path, but note that despite its name
# $fourz here is loaded from .Ltwoy: the per-iteration counter increment
# is 2, matching the two blocks carried per %ymm register.
$code.=<<___;
	vbroadcasti128	.Lsigma(%rip),$a
	vbroadcasti128	($key),$b
	vbroadcasti128	16($key),$c
	vbroadcasti128	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd		.Lzeroz(%rip),$d,$d
	vmovdqa32	.Ltwoy(%rip),$fourz
	mov		\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512vl

.align	16
.Loop_outer_avx512vl:
	vmovdqa32	$c_,$c
	vpaddd		$fourz,$d_,$d
	mov		\$10,$counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512vl

.align	32
.Loop_avx512vl:
___
	# Same double-round/shuffle pattern as the avx512 path above,
	# emitted for %ymm registers.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		($counter);
	&jnz		(".Loop_avx512vl");

# Up to two 64-byte blocks are emitted: lane 0 through the %xmm views,
# lane 1 via vextracti128.  $a and $b are refreshed from $a_/$b_ at the
# bottom of the copy code, which is why .Loop_outer_avx512vl above
# reloads only $c and $d.  After the byte-wise tail loop the 64-byte
# stack buffer is overwritten with two copies of $a_ (the public sigma
# constants, 32 bytes each) so no keystream is left behind.
$code.=<<___;
	vpaddd		$a_,$a,$a
	vpaddd		$b_,$b,$b
	vpaddd		$c_,$c,$c
	vpaddd		$d_,$d,$d

	sub		\$64,$len
	jb		.Ltail64_avx512vl

	vpxor		0x00($inp),%x#$a,$t0	# xor with input
	vpxor		0x10($inp),%x#$b,$t1
	vpxor		0x20($inp),%x#$c,$t2
	vpxor		0x30($inp),%x#$d,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512vl

	vextracti128	\$1,$a,$t0
	vextracti128	\$1,$b,$t1
	vextracti128	\$1,$c,$t2
	vextracti128	\$1,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512vl

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	jnz		.Loop_outer_avx512vl

	jmp		.Ldone_avx512vl

.align	16
.Ltail64_avx512vl:
	vmovdqa		%x#$a,0x00(%rsp)
	vmovdqa		%x#$b,0x10(%rsp)
	vmovdqa		%x#$c,0x20(%rsp)
	vmovdqa		%x#$d,0x30(%rsp)
	add		\$64,$len
	jmp		.Loop_tail_avx512vl

.align	16
.Ltail_avx512vl:
	vmovdqa		$t0,0x00(%rsp)
	vmovdqa		$t1,0x10(%rsp)
	vmovdqa		$t2,0x20(%rsp)
	vmovdqa		$t3,0x30(%rsp)
	add		\$64,$len

.Loop_tail_avx512vl:
	movzb		($inp,$counter),%eax
	movzb		(%rsp,$counter),%ecx
	lea		1($counter),$counter
	xor		%ecx,%eax
	mov		%al,-1($out,$counter)
	dec		$len
	jnz		.Loop_tail_avx512vl

	vmovdqu32	$a_,0x00(%rsp)
	vmovdqu32	$a_,0x20(%rsp)

.Ldone_avx512vl:
	vzeroall
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl
___
2684 }
2685 if ($avx>2) {
2686 # This one handles longer inputs...
2687
# Register layout for the 16x code path: $xa0..$xd3 (collected in @xx)
# hold the working state one 32-bit word position per register, smashed
# across all blocks; @key (%zmm16-31) keeps the replicated input state
# for the final accumulation, with its first four entries doubling as
# transpose scratch ($xt0..$xt3).
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2694
sub AVX512_lane_ROUND {
# Emit four independent ChaCha quarter-rounds (Q1..Q4), interleaved
# instruction by instruction to hide latency.  The caller passes the
# @xx indices of Q1's (a,b,c,d); the other three quadruples are derived
# by stepping each index to the next register within its aligned group
# of four ((i&~3)+((i+1)&3)).  Returns a list of Perl code strings,
# which the caller evals in order to emit the instructions.
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);		# quoted register names for eval

my @a=($a0,$a1,$a2,$a3);		# per-quarter-round index vectors
my @b=($b0,$b1,$b2,$b3);
my @c=($c0,$c1,$c2,$c3);
my @d=($d0,$d1,$d2,$d3);
my @ret;

# Each step is x += y; z ^= x; z <<<= $rot, performed for Q1..Q4 in
# turn: four vpaddd, then four vpxord, then four vprold per step.
foreach my $step ([\@a,\@b,\@d,16],	# a += b; d ^= a; d <<<= 16
		  [\@c,\@d,\@b,12],	# c += d; b ^= c; b <<<= 12
		  [\@a,\@b,\@d, 8],	# a += b; d ^= a; d <<<=  8
		  [\@c,\@d,\@b, 7]) {	# c += d; b ^= c; b <<<=  7
	my ($p,$q,$r,$rot)=@$step;
	push @ret,map("&vpaddd\t(@x[$p->[$_]],@x[$p->[$_]],@x[$q->[$_]])",(0..3));
	push @ret,map("&vpxord\t(@x[$r->[$_]],@x[$r->[$_]],@x[$p->[$_]])",(0..3));
	push @ret,map("&vprold\t(@x[$r->[$_]],@x[$r->[$_]],$rot)",(0..3));
}
return @ret;
}
2756
# Frame: 64 bytes of keystream scratch at 0(%rsp) (64-byte aligned); on
# Win64 a further 0xa0 bytes hold the ten non-volatile xmm saves below.
my $xframe = $win64 ? 0xa8 : 8;

# ChaCha20_16x: AVX-512 path processing 16 64-byte blocks (1024 bytes)
# per outer iteration.  State is kept "by lanes": each of the 16 working
# registers holds one of the 16 state words, replicated across/counted
# for all 16 blocks, so the rounds are pure 16-way SIMD; the blocks are
# transposed back into byte order only after the rounds.
$code.=<<___;
.type	ChaCha20_16x,\@function,5
.align	32
ChaCha20_16x:
.cfi_startproc
.LChaCha20_16x:
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub		\$64+$xframe,%rsp
	and		\$-64,%rsp
___
$code.=<<___	if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L16x_body:
___
# Broadcast the input state, then vpshufd-splat each 32-bit word into its
# own register (one word per register, all lanes equal).  The block
# counters in @key[12] are staggered 0..15 via .Lincz and stepped by 16
# (.Lsixteen) per outer iteration.
$code.=<<___;
	vzeroupper

	lea		.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3		# key[0]
	vbroadcasti32x4	($key),$xb3		# key[1]
	vbroadcasti32x4	16($key),$xc3		# key[2]
	vbroadcasti32x4	($counter),$xd3		# key[3]

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vpshufd		\$0xaa,$xa3,$xa2
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vpshufd		\$0xaa,$xb3,$xb2
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd		\$0x00,$xc3,$xc0
	vpshufd		\$0x55,$xc3,$xc1
	vpshufd		\$0xaa,$xc3,$xc2
	vpshufd		\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpshufd		\$0xaa,$xd3,$xd2
	vpshufd		\$0xff,$xd3,$xd3
	vpaddd		.Lincz(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	mov		\$10,%eax
	jmp		.Loop16x

.align	32
.Loop_outer16x:
	vpbroadcastd	0(%r10),$xa0		# reload key
	vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd		.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	mov		\$10,%eax
	jmp		.Loop16x

.align	32
.Loop16x:
___
	# Ten double rounds: indices (0,4,8,12) select the column
	# quadruple, (0,5,10,15) the diagonal one.
	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
# After the rounds, add the saved input state back in and transpose the
# word-sliced state back to byte order.  Each vpunpck/vshufi32x4 cascade
# below works on one row group; the Perl-level list assignments that
# follow each cascade merely rename the register variables to match the
# "a0".."d3" results — no code is emitted for them.
$code.=<<___;
	dec		%eax
	jnz		.Loop16x

	vpaddd		@key[0],$xa0,$xa0	# accumulate key
	vpaddd		@key[1],$xa1,$xa1
	vpaddd		@key[2],$xa2,$xa2
	vpaddd		@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);	# rename only
$code.=<<___;
	vpaddd		@key[4],$xb0,$xb0
	vpaddd		@key[5],$xb1,$xb1
	vpaddd		@key[6],$xb2,$xb2
	vpaddd		@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);	# rename only
$code.=<<___;
	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
___
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);	# rename only
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);	# rename only
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);	# rename only
$code.=<<___;
	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
___
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);	# rename only
$code.=<<___;
	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
___
	# Final renames: after the transpose, ($xa0,$xb0,$xc0,$xd0),
	# ($xa1,...),... each name four consecutive 64-byte blocks in
	# memory order.  No code is emitted for these assignments.
	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# Full 1024-byte iterations XOR/store all 16 blocks and loop.  The tail
# (.Ltail16x) emits whole 64-byte blocks one register at a time — note
# $out is rebased to an offset from $inp so one register ($inp) indexes
# both — copying the next-in-line register into $xa0 before each step;
# the final partial block goes through the stack buffer and a byte-wise
# XOR loop, after which the buffer is cleared with a zeroed register.
$code.=<<___;
	cmp		\$64*16,$len
	jb		.Ltail16x

	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxord		0x40($inp),$xb0,$xb0
	vpxord		0x80($inp),$xc0,$xc0
	vpxord		0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord		0x100($inp),$xa1,$xa1
	vpxord		0x140($inp),$xb1,$xb1
	vpxord		0x180($inp),$xc1,$xc1
	vpxord		0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord		0x200($inp),$xa2,$xa2
	vpxord		0x240($inp),$xb2,$xb2
	vpxord		0x280($inp),$xc2,$xc2
	vpxord		0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord		0x300($inp),$xa3,$xa3
	vpxord		0x340($inp),$xb3,$xb3
	vpxord		0x380($inp),$xc3,$xc3
	vpxord		0x3c0($inp),$xd3,$xd3
	lea		0x400($inp),$inp
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea		0x400($out),$out

	sub		\$64*16,$len
	jnz		.Loop_outer16x

	jmp		.Ldone16x

.align	32
.Ltail16x:
	xor		%r10,%r10
	sub		$inp,$out
	cmp		\$64*1,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb0,$xa0
	lea		64($inp),$inp

	cmp		\$64*2,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc0,$xa0
	lea		64($inp),$inp

	cmp		\$64*3,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd0,$xa0
	lea		64($inp),$inp

	cmp		\$64*4,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa1,$xa0
	lea		64($inp),$inp

	cmp		\$64*5,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb1,$xa0
	lea		64($inp),$inp

	cmp		\$64*6,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc1,$xa0
	lea		64($inp),$inp

	cmp		\$64*7,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd1,$xa0
	lea		64($inp),$inp

	cmp		\$64*8,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa2,$xa0
	lea		64($inp),$inp

	cmp		\$64*9,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb2,$xa0
	lea		64($inp),$inp

	cmp		\$64*10,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc2,$xa0
	lea		64($inp),$inp

	cmp		\$64*11,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd2,$xa0
	lea		64($inp),$inp

	cmp		\$64*12,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa3,$xa0
	lea		64($inp),$inp

	cmp		\$64*13,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb3,$xa0
	lea		64($inp),$inp

	cmp		\$64*14,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc3,$xa0
	lea		64($inp),$inp

	cmp		\$64*15,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd3,$xa0
	lea		64($inp),$inp

.Less_than_64_16x:
	vmovdqa32	$xa0,0x00(%rsp)
	lea		($out,$inp),$out
	and		\$63,$len

.Loop_tail16x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail16x

	vpxord		$xa0,$xa0,$xa0
	vmovdqa32	$xa0,0(%rsp)

.Ldone16x:
	vzeroall
___
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L16x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_16x,.-ChaCha20_16x
___
# Switch the 16x register variables to the %ymm domain: the 8-block
# AVX512VL flavour (ChaCha20_8xvl) below reuses the same emitter code on
# 256-bit registers, eight state words per register.
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
@key=map("%ymm$_",(16..31));
($xt0,$xt1,$xt2,$xt3)=@key[0..3];
3198
3199 $code.=<<___;
3200 .type   ChaCha20_8xvl,\@function,5
3201 .align  32
3202 ChaCha20_8xvl:
3203 .cfi_startproc
3204 .LChaCha20_8xvl:
3205         mov             %rsp,%r9                # frame register
3206 .cfi_def_cfa_register   %r9
3207         sub             \$64+$xframe,%rsp
3208         and             \$-64,%rsp
3209 ___
3210 $code.=<<___    if ($win64);
3211         movaps          %xmm6,-0xa8(%r9)