b854e2e332e53901c3ef285bf69cb8d380cab35a
[openssl.git] / crypto / chacha / asm / chacha-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # November 2014
18 #
19 # ChaCha20 for x86_64.
20 #
21 # December 2016
22 #
23 # Add AVX512F code path.
24 #
25 # December 2017
26 #
27 # Add AVX512VL code path.
28 #
29 # Performance in cycles per byte out of large buffer.
30 #
31 #               IALU/gcc 4.8(i) 1x/2xSSSE3(ii)  4xSSSE3     NxAVX(v)
32 #
33 # P4            9.48/+99%       -               -
34 # Core2         7.83/+55%       7.90/5.76       4.35
35 # Westmere      7.19/+50%       5.60/4.50       3.00
36 # Sandy Bridge  8.31/+42%       5.45/4.00       2.72
37 # Ivy Bridge    6.71/+46%       5.40/?          2.41
38 # Haswell       5.92/+43%       5.20/3.45       2.42        1.23
39 # Skylake[-X]   5.87/+39%       4.70/3.22       2.31        1.19[0.80(vi)]
40 # Silvermont    12.0/+33%       7.75/6.90       7.03(iii)
41 # Knights L     11.7/-          ?               9.60(iii)   0.80
42 # Goldmont      10.6/+17%       5.10/3.52       3.28
43 # Sledgehammer  7.28/+52%       -               -
44 # Bulldozer     9.66/+28%       9.85/5.35(iv)   3.06(iv)
45 # Ryzen         5.96/+50%       5.19/3.00       2.40        2.09
46 # VIA Nano      10.5/+46%       6.72/6.88       6.05
47 #
48 # (i)   compared to older gcc 3.x one can observe >2x improvement on
49 #       most platforms;
50 # (ii)  2xSSSE3 is code path optimized specifically for 128 bytes used
51 #       by chacha20_poly1305_tls_cipher, results are EVP-free;
52 # (iii) this is not optimal result for Atom because of MSROM
53 #       limitations, SSE2 can do better, but gain is considered too
54 #       low to justify the [maintenance] effort;
55 # (iv)  Bulldozer actually executes 4xXOP code path that delivers 2.20
56 #       and 4.85 for 128-byte inputs;
57 # (v)   8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
58 # (vi)  even though Skylake-X can execute AVX512F code and deliver 0.57
59 #       cpb in single thread, the corresponding capability is suppressed;
60
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows is inferred from the perlasm flavour or an .asm output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator either next to this script or in
# the sibling ../../perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler for instruction-set support.  $avx is a capability
# level that gates which SIMD code paths get emitted below: truthy enables
# the AVX/XOP path, >1 the AVX2 path, >2 the AVX512 paths (see the
# "$avx", "$avx>1" and "$avx>2" guards further down).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);	# 2.11.8+ gets one extra level
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# All generated code is printed to STDOUT, which is redirected into the
# translator pipeline; the translator produces the final assembly file.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
96
# input parameter block: ChaCha20_ctr32(out,inp,len,key,counter) —
# argument registers as seen by the generator (the xlate layer handles
# the Win64 calling-convention differences; see \@function,5 below).
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

# Constant pool: counter increment vectors for the 1x/4x/8x/16x lane
# layouts, pshufb masks for the dword rotations by 16 (.Lrot16) and by
# 8 bits (.Lrot24), and the "expand 32-byte k" sigma constant.
$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
137
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{
    # Any call to an undefined sub (e.g. &dec("%ebp")) lands here: the
    # sub's name becomes the mnemonic and the arguments become operands
    # in reversed (AT&T) order.  A purely numeric last argument is
    # turned into an immediate by prefixing '$'.
    my $mnemonic = $AUTOLOAD;
    $mnemonic =~ s/.*:://;			# strip package qualifier
    my $last = pop;
    $last = "\$$last" if ($last*1 eq $last);	# number => immediate
    $code .= "\t$mnemonic\t" . join(',', $last, reverse @_) . "\n";
}
144
# Register map for the scalar path: @x[0..15] is the 4x4 ChaCha state.
# Rows 0..7 and 12..15 live in the listed GPRs; rows 8..11 ("%nox"
# placeholders) live on the stack except for the pair currently cached
# in @t (%esi/%edi) — see the commentary inside ROUND below.
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");
148
sub ROUND {			# critical path is 24 cycles per round
# Takes the state indices (a,b,c,d) of the first quarter-round column;
# the other three columns are derived by stepping the low two bits of
# each index.  Returns a list of perlasm strings (eval'ed by the caller
# via AUTOLOAD) implementing one full round over all four columns.
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are dying breed, old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...

	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	 "&add	(@x[$a1],@x[$b1])",	# Q2
	 "&xor	(@x[$d1],@x[$a1])",
	 "&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	 "&add	($xc_,@x[$d1])",
	 "&xor	(@x[$b1],$xc_)",
	 "&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	 "&add	(@x[$a1],@x[$b1])",
	 "&xor	(@x[$d1],@x[$a1])",
	 "&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	 "&add	($xc_,@x[$d1])",
	 "&xor	(@x[$b1],$xc_)",
	 "&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
	 "&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	 "&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	 "&add	(@x[$a3],@x[$b3])",	# Q4
	 "&xor	(@x[$d3],@x[$a3])",
	 "&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	 "&add	($xc_,@x[$d3])",
	 "&xor	(@x[$b3],$xc_)",
	 "&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	 "&add	(@x[$a3],@x[$b3])",
	 "&xor	(@x[$d3],@x[$a3])",
	 "&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	 "&add	($xc_,@x[$d3])",
	 "&xor	(@x[$b3],$xc_)",
	 "&rol	(@x[$b3],7)"
	);
}
249
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
# Entry point ChaCha20_ctr32 also dispatches to the SIMD paths based on
# OPENSSL_ia32cap_P capability bits.  One 64-byte block per outer
# iteration; the tail is produced by xoring byte-by-byte from the stack.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	# 20 ChaCha rounds: 10 x (column round + diagonal round),
	# emitted via ROUND and the AUTOLOAD thunk.
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND (0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]	# 'expa'
	add	\$0x3320646e,@x[1]	# 'nd 3'
	add	\$0x79622d32,@x[2]	# '2-by'
	add	\$0x6b206574,@x[3]	# 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
446
########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

# One quarter-round applied to all four columns at once: pshufb masks
# implement the 16- and 8-bit rotations, shift+or the 12- and 7-bit ones.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}

my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	# Column round, then pshufd the lanes so the same column code
	# performs the diagonal round; the second set of shuffles
	# restores the original lane order.
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}
609
########################################################################
# SSSE3 code path that handles 128-byte inputs
{
# Two independent ChaCha states processed in parallel: ($a..$d) for the
# first 64-byte block, ($a1..$d1) for the second (counter+1).
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

# Same quarter-round as SSSE3ROUND above, interleaved across the two
# states to hide latency; the indented calls belong to the second state.
sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	 &pshufb($d1,$rot16);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	 &movdqa($t1,$b1);
	&pslld	($t,12);
	 &psrld	($b1,20);
	&por	($b,$t);
	 &pslld	($t1,12);
	 &por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	 &pshufb($d1,$rot24);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	 &movdqa($t1,$b1);
	&pslld	($t,7);
	 &psrld	($b1,25);
	&por	($b,$t);
	 &pslld	($t1,7);
	 &por	($b1,$t1);
}

my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	# Column round + diagonal round for both states, with pshufd
	# lane rotations as in the 1x SSSE3 loop.
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}
775
776 ########################################################################
777 # SSSE3 code path that handles longer messages.
778 {
779 # assign variables to favor Atom front-end
780 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
781     $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
782 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
783         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
784
sub SSSE3_lane_ROUND {
# Four-blocks-in-parallel version of ROUND: each xmm register holds one
# state word across four lanes.  Returns a list of perlasm strings the
# caller evals.  Assumes %r10/%r11 point at .Lrot16/.Lrot24 (the caller
# sets them up; see the lea instructions in the 4x setup code).
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&paddd		(@x[$a0],@x[$b0])",	# Q1
	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
	"&pxor		(@x[$d0],@x[$a0])",
	 "&pxor		(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t1)",
	 "&pshufb	(@x[$d1],$t1)",

	"&paddd		($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor		(@x[$b0],$xc)",
	 "&pxor		(@x[$b1],$xc_)",
	"&movdqa	($t0,@x[$b0])",
	"&pslld		(@x[$b0],12)",
	"&psrld		($t0,20)",
	 "&movdqa	($t1,@x[$b1])",
	 "&pslld	(@x[$b1],12)",
	"&por		(@x[$b0],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por		(@x[$b1],$t1)",

	"&paddd		(@x[$a0],@x[$b0])",
	 "&paddd	(@x[$a1],@x[$b1])",
	"&pxor		(@x[$d0],@x[$a0])",
	 "&pxor		(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t0)",
	 "&pshufb	(@x[$d1],$t0)",

	"&paddd		($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor		(@x[$b0],$xc)",
	 "&pxor		(@x[$b1],$xc_)",
	"&movdqa	($t1,@x[$b0])",
	"&pslld		(@x[$b0],7)",
	"&psrld		($t1,25)",
	 "&movdqa	($t0,@x[$b1])",
	 "&pslld	(@x[$b1],7)",
	"&por		(@x[$b0],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por		(@x[$b1],$t0)",

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd		(@x[$a2],@x[$b2])",	# Q3
	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
	"&pxor		(@x[$d2],@x[$a2])",
	 "&pxor		(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t1)",
	 "&pshufb	(@x[$d3],$t1)",

	"&paddd		($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor		(@x[$b2],$xc)",
	 "&pxor		(@x[$b3],$xc_)",
	"&movdqa	($t0,@x[$b2])",
	"&pslld		(@x[$b2],12)",
	"&psrld		($t0,20)",
	 "&movdqa	($t1,@x[$b3])",
	 "&pslld	(@x[$b3],12)",
	"&por		(@x[$b2],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por		(@x[$b3],$t1)",

	"&paddd		(@x[$a2],@x[$b2])",
	 "&paddd	(@x[$a3],@x[$b3])",
	"&pxor		(@x[$d2],@x[$a2])",
	 "&pxor		(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t0)",
	 "&pshufb	(@x[$d3],$t0)",

	"&paddd		($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor		(@x[$b2],$xc)",
	 "&pxor		(@x[$b3],$xc_)",
	"&movdqa	($t1,@x[$b2])",
	"&pslld		(@x[$b2],7)",
	"&psrld		($t1,25)",
	 "&movdqa	($t0,@x[$b3])",
	 "&pslld	(@x[$b3],7)",
	"&por		(@x[$b2],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por		(@x[$b3],$t0)"
	);
}
906
# ChaCha20_4x: 4-way SSSE3 block function, ABI signature (out,inp,len,key,counter).
# Frame slack: Win64 must preserve callee-saved xmm6-xmm15 (10 x 16 bytes,
# stored at -0xa8..-0x18 off the frame pointer below); SysV needs only 8 bytes.
907 my $xframe = $win64 ? 0xa8 : 8;
908
909 $code.=<<___;
910 .type   ChaCha20_4x,\@function,5
911 .align  32
912 ChaCha20_4x:
913 .cfi_startproc
914 .LChaCha20_4x:
915         mov             %rsp,%r9                # frame pointer
916 .cfi_def_cfa_register   %r9
917         mov             %r10,%r11
918 ___
# Only when built with $avx>1: if the CPU capability word in %r10 has the
# AVX2 bit (1<<5 of OPENSSL_ia32cap_P+8) set, divert to the 8x code path.
919 $code.=<<___    if ($avx>1);
920         shr             \$32,%r10               # OPENSSL_ia32cap_P+8
921         test            \$`1<<5`,%r10           # test AVX2
922         jnz             .LChaCha20_8x
923 ___
# Short inputs (<=192 bytes) on CPUs showing MOVBE without XSAVE (the Atom
# signature per the inline comments) are redirected to the 1x/2x SSSE3 path.
924 $code.=<<___;
925         cmp             \$192,$len
926         ja              .Lproceed4x
927
928         and             \$`1<<26|1<<22`,%r11    # isolate XSAVE+MOVBE
929         cmp             \$`1<<22`,%r11          # check for MOVBE without XSAVE
930         je              .Ldo_sse3_after_all     # to detect Atom
931
932 .Lproceed4x:
933         sub             \$0x140+$xframe,%rsp
934 ___
935         ################ stack layout
936         # +0x00         SIMD equivalent of @x[8-12]
937         # ...
938         # +0x40         constant copy of key[0-2] smashed by lanes
939         # ...
940         # +0x100        SIMD counters (with nonce smashed by lanes)
941         # ...
942         # +0x140
# Win64 ABI: xmm6-xmm15 are non-volatile, save them below the frame pointer.
943 $code.=<<___    if ($win64);
944         movaps          %xmm6,-0xa8(%r9)
945         movaps          %xmm7,-0x98(%r9)
946         movaps          %xmm8,-0x88(%r9)
947         movaps          %xmm9,-0x78(%r9)
948         movaps          %xmm10,-0x68(%r9)
949         movaps          %xmm11,-0x58(%r9)
950         movaps          %xmm12,-0x48(%r9)
951         movaps          %xmm13,-0x38(%r9)
952         movaps          %xmm14,-0x28(%r9)
953         movaps          %xmm15,-0x18(%r9)
954 .L4x_body:
955 ___
# Load sigma constant, 256-bit key and counter/nonce, then broadcast ("smash")
# each 32-bit state word across all four lanes of an xmm register so four
# blocks are processed in parallel.  Invariant words are offloaded to the
# stack (0x40.. via %rsp, 0x80.. via %rcx = %rsp+0x100 for shorter encodings);
# lane counters get .Linc added once here, and the outer loop reloads the
# smashed state and advances the counters by .Lfour per 256-byte iteration.
956 $code.=<<___;
957         movdqa          .Lsigma(%rip),$xa3      # key[0]
958         movdqu          ($key),$xb3             # key[1]
959         movdqu          16($key),$xt3           # key[2]
960         movdqu          ($counter),$xd3         # key[3]
961         lea             0x100(%rsp),%rcx        # size optimization
962         lea             .Lrot16(%rip),%r10
963         lea             .Lrot24(%rip),%r11
964
965         pshufd          \$0x00,$xa3,$xa0        # smash key by lanes...
966         pshufd          \$0x55,$xa3,$xa1
967         movdqa          $xa0,0x40(%rsp)         # ... and offload
968         pshufd          \$0xaa,$xa3,$xa2
969         movdqa          $xa1,0x50(%rsp)
970         pshufd          \$0xff,$xa3,$xa3
971         movdqa          $xa2,0x60(%rsp)
972         movdqa          $xa3,0x70(%rsp)
973
974         pshufd          \$0x00,$xb3,$xb0
975         pshufd          \$0x55,$xb3,$xb1
976         movdqa          $xb0,0x80-0x100(%rcx)
977         pshufd          \$0xaa,$xb3,$xb2
978         movdqa          $xb1,0x90-0x100(%rcx)
979         pshufd          \$0xff,$xb3,$xb3
980         movdqa          $xb2,0xa0-0x100(%rcx)
981         movdqa          $xb3,0xb0-0x100(%rcx)
982
983         pshufd          \$0x00,$xt3,$xt0        # "$xc0"
984         pshufd          \$0x55,$xt3,$xt1        # "$xc1"
985         movdqa          $xt0,0xc0-0x100(%rcx)
986         pshufd          \$0xaa,$xt3,$xt2        # "$xc2"
987         movdqa          $xt1,0xd0-0x100(%rcx)
988         pshufd          \$0xff,$xt3,$xt3        # "$xc3"
989         movdqa          $xt2,0xe0-0x100(%rcx)
990         movdqa          $xt3,0xf0-0x100(%rcx)
991
992         pshufd          \$0x00,$xd3,$xd0
993         pshufd          \$0x55,$xd3,$xd1
994         paddd           .Linc(%rip),$xd0        # don't save counters yet
995         pshufd          \$0xaa,$xd3,$xd2
996         movdqa          $xd1,0x110-0x100(%rcx)
997         pshufd          \$0xff,$xd3,$xd3
998         movdqa          $xd2,0x120-0x100(%rcx)
999         movdqa          $xd3,0x130-0x100(%rcx)
1000
1001         jmp             .Loop_enter4x
1002
1003 .align  32
1004 .Loop_outer4x:
1005         movdqa          0x40(%rsp),$xa0         # re-load smashed key
1006         movdqa          0x50(%rsp),$xa1
1007         movdqa          0x60(%rsp),$xa2
1008         movdqa          0x70(%rsp),$xa3
1009         movdqa          0x80-0x100(%rcx),$xb0
1010         movdqa          0x90-0x100(%rcx),$xb1
1011         movdqa          0xa0-0x100(%rcx),$xb2
1012         movdqa          0xb0-0x100(%rcx),$xb3
1013         movdqa          0xc0-0x100(%rcx),$xt0   # "$xc0"
1014         movdqa          0xd0-0x100(%rcx),$xt1   # "$xc1"
1015         movdqa          0xe0-0x100(%rcx),$xt2   # "$xc2"
1016         movdqa          0xf0-0x100(%rcx),$xt3   # "$xc3"
1017         movdqa          0x100-0x100(%rcx),$xd0
1018         movdqa          0x110-0x100(%rcx),$xd1
1019         movdqa          0x120-0x100(%rcx),$xd2
1020         movdqa          0x130-0x100(%rcx),$xd3
1021         paddd           .Lfour(%rip),$xd0       # next SIMD counters
1022
1023 .Loop_enter4x:
1024         movdqa          $xt2,0x20(%rsp)         # SIMD equivalent of "@x[10]"
1025         movdqa          $xt3,0x30(%rsp)         # SIMD equivalent of "@x[11]"
1026         movdqa          (%r10),$xt3             # .Lrot16(%rip)
1027         mov             \$10,%eax
1028         movdqa          $xd0,0x100-0x100(%rcx)  # save SIMD counters
1029         jmp             .Loop4x
1030
1031 .align  32
1032 .Loop4x:
1033 ___
# One pass of .Loop4x is a ChaCha "double round": a column round on indices
# (0,4,8,12) then a diagonal round on (0,5,10,15); %eax (preloaded with 10)
# counts passes, giving the full 20 rounds.
1034         foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
1035         foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
# After the rounds, add back the smashed key material and transpose each
# group of four lane-registers (punpck*dq pairs) so every register again
# holds 16 contiguous output bytes of one block.
1036 $code.=<<___;
1037         dec             %eax
1038         jnz             .Loop4x
1039
1040         paddd           0x40(%rsp),$xa0         # accumulate key material
1041         paddd           0x50(%rsp),$xa1
1042         paddd           0x60(%rsp),$xa2
1043         paddd           0x70(%rsp),$xa3
1044
1045         movdqa          $xa0,$xt2               # "de-interlace" data
1046         punpckldq       $xa1,$xa0
1047         movdqa          $xa2,$xt3
1048         punpckldq       $xa3,$xa2
1049         punpckhdq       $xa1,$xt2
1050         punpckhdq       $xa3,$xt3
1051         movdqa          $xa0,$xa1
1052         punpcklqdq      $xa2,$xa0               # "a0"
1053         movdqa          $xt2,$xa3
1054         punpcklqdq      $xt3,$xt2               # "a2"
1055         punpckhqdq      $xa2,$xa1               # "a1"
1056         punpckhqdq      $xt3,$xa3               # "a3"
1057 ___
# The transpose left logical "a2" in $xt2 (see comments above); swap the
# Perl-side names so $xa2 refers to the right physical register below.
1058         ($xa2,$xt2)=($xt2,$xa2);
1059 $code.=<<___;
1060         paddd           0x80-0x100(%rcx),$xb0
1061         paddd           0x90-0x100(%rcx),$xb1
1062         paddd           0xa0-0x100(%rcx),$xb2
1063         paddd           0xb0-0x100(%rcx),$xb3
1064
1065         movdqa          $xa0,0x00(%rsp)         # offload $xaN
1066         movdqa          $xa1,0x10(%rsp)
1067         movdqa          0x20(%rsp),$xa0         # "xc2"
1068         movdqa          0x30(%rsp),$xa1         # "xc3"
1069
1070         movdqa          $xb0,$xt2
1071         punpckldq       $xb1,$xb0
1072         movdqa          $xb2,$xt3
1073         punpckldq       $xb3,$xb2
1074         punpckhdq       $xb1,$xt2
1075         punpckhdq       $xb3,$xt3
1076         movdqa          $xb0,$xb1
1077         punpcklqdq      $xb2,$xb0               # "b0"
1078         movdqa          $xt2,$xb3
1079         punpcklqdq      $xt3,$xt2               # "b2"
1080         punpckhqdq      $xb2,$xb1               # "b1"
1081         punpckhqdq      $xt3,$xb3               # "b3"
1082 ___
# Same name swap for the b row; then name the c row: c0/c1 live in $xt0/$xt1
# since round computation, c2/c3 were just reloaded into $xa0/$xa1.
1083         ($xb2,$xt2)=($xt2,$xb2);
1084         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1085 $code.=<<___;
1086         paddd           0xc0-0x100(%rcx),$xc0
1087         paddd           0xd0-0x100(%rcx),$xc1
1088         paddd           0xe0-0x100(%rcx),$xc2
1089         paddd           0xf0-0x100(%rcx),$xc3
1090
1091         movdqa          $xa2,0x20(%rsp)         # keep offloading $xaN
1092         movdqa          $xa3,0x30(%rsp)
1093
1094         movdqa          $xc0,$xt2
1095         punpckldq       $xc1,$xc0
1096         movdqa          $xc2,$xt3
1097         punpckldq       $xc3,$xc2
1098         punpckhdq       $xc1,$xt2
1099         punpckhdq       $xc3,$xt3
1100         movdqa          $xc0,$xc1
1101         punpcklqdq      $xc2,$xc0               # "c0"
1102         movdqa          $xt2,$xc3
1103         punpcklqdq      $xt3,$xt2               # "c2"
1104         punpckhqdq      $xc2,$xc1               # "c1"
1105         punpckhqdq      $xt3,$xc3               # "c3"
1106 ___
1107         ($xc2,$xt2)=($xt2,$xc2);
1108         ($xt0,$xt1)=($xa2,$xa3);                # use $xaN as temporary
1109 $code.=<<___;
1110         paddd           0x100-0x100(%rcx),$xd0
1111         paddd           0x110-0x100(%rcx),$xd1
1112         paddd           0x120-0x100(%rcx),$xd2
1113         paddd           0x130-0x100(%rcx),$xd3
1114
1115         movdqa          $xd0,$xt2
1116         punpckldq       $xd1,$xd0
1117         movdqa          $xd2,$xt3
1118         punpckldq       $xd3,$xd2
1119         punpckhdq       $xd1,$xt2
1120         punpckhdq       $xd3,$xt3
1121         movdqa          $xd0,$xd1
1122         punpcklqdq      $xd2,$xd0               # "d0"
1123         movdqa          $xt2,$xd3
1124         punpcklqdq      $xt3,$xt2               # "d2"
1125         punpckhqdq      $xd2,$xd1               # "d1"
1126         punpckhqdq      $xt3,$xd3               # "d3"
1127 ___
1128         ($xd2,$xt2)=($xt2,$xd2);
# Bulk path: with >=256 bytes left, XOR four 64-byte keystream blocks with
# input and store, interleaving loads/stores (note the one-space indentation
# marking the interleaved stream), then loop via .Loop_outer4x.  .Ltail4x
# dispatches on the remaining length (<64 / 64..127 / 128..191 / 192..255):
# each arm XORs the whole 64-byte blocks it can, then stages the next block's
# keystream at 0x00-0x30(%rsp) for the byte-granular .Loop_tail4x, which
# XORs one byte at a time with %r10 as the byte index.
1129 $code.=<<___;
1130         cmp             \$64*4,$len
1131         jb              .Ltail4x
1132
1133         movdqu          0x00($inp),$xt0         # xor with input
1134         movdqu          0x10($inp),$xt1
1135         movdqu          0x20($inp),$xt2
1136         movdqu          0x30($inp),$xt3
1137         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1138         pxor            $xb0,$xt1
1139         pxor            $xc0,$xt2
1140         pxor            $xd0,$xt3
1141
1142          movdqu         $xt0,0x00($out)
1143         movdqu          0x40($inp),$xt0
1144          movdqu         $xt1,0x10($out)
1145         movdqu          0x50($inp),$xt1
1146          movdqu         $xt2,0x20($out)
1147         movdqu          0x60($inp),$xt2
1148          movdqu         $xt3,0x30($out)
1149         movdqu          0x70($inp),$xt3
1150         lea             0x80($inp),$inp         # size optimization
1151         pxor            0x10(%rsp),$xt0
1152         pxor            $xb1,$xt1
1153         pxor            $xc1,$xt2
1154         pxor            $xd1,$xt3
1155
1156          movdqu         $xt0,0x40($out)
1157         movdqu          0x00($inp),$xt0
1158          movdqu         $xt1,0x50($out)
1159         movdqu          0x10($inp),$xt1
1160          movdqu         $xt2,0x60($out)
1161         movdqu          0x20($inp),$xt2
1162          movdqu         $xt3,0x70($out)
1163          lea            0x80($out),$out         # size optimization
1164         movdqu          0x30($inp),$xt3
1165         pxor            0x20(%rsp),$xt0
1166         pxor            $xb2,$xt1
1167         pxor            $xc2,$xt2
1168         pxor            $xd2,$xt3
1169
1170          movdqu         $xt0,0x00($out)
1171         movdqu          0x40($inp),$xt0
1172          movdqu         $xt1,0x10($out)
1173         movdqu          0x50($inp),$xt1
1174          movdqu         $xt2,0x20($out)
1175         movdqu          0x60($inp),$xt2
1176          movdqu         $xt3,0x30($out)
1177         movdqu          0x70($inp),$xt3
1178         lea             0x80($inp),$inp         # inp+=64*4
1179         pxor            0x30(%rsp),$xt0
1180         pxor            $xb3,$xt1
1181         pxor            $xc3,$xt2
1182         pxor            $xd3,$xt3
1183         movdqu          $xt0,0x40($out)
1184         movdqu          $xt1,0x50($out)
1185         movdqu          $xt2,0x60($out)
1186         movdqu          $xt3,0x70($out)
1187         lea             0x80($out),$out         # out+=64*4
1188
1189         sub             \$64*4,$len
1190         jnz             .Loop_outer4x
1191
1192         jmp             .Ldone4x
1193
1194 .Ltail4x:
1195         cmp             \$192,$len
1196         jae             .L192_or_more4x
1197         cmp             \$128,$len
1198         jae             .L128_or_more4x
1199         cmp             \$64,$len
1200         jae             .L64_or_more4x
1201
1202         #movdqa         0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1203         xor             %r10,%r10
1204         #movdqa         $xt0,0x00(%rsp)
1205         movdqa          $xb0,0x10(%rsp)
1206         movdqa          $xc0,0x20(%rsp)
1207         movdqa          $xd0,0x30(%rsp)
1208         jmp             .Loop_tail4x
1209
1210 .align  32
1211 .L64_or_more4x:
1212         movdqu          0x00($inp),$xt0         # xor with input
1213         movdqu          0x10($inp),$xt1
1214         movdqu          0x20($inp),$xt2
1215         movdqu          0x30($inp),$xt3
1216         pxor            0x00(%rsp),$xt0         # $xaxN is offloaded, remember?
1217         pxor            $xb0,$xt1
1218         pxor            $xc0,$xt2
1219         pxor            $xd0,$xt3
1220         movdqu          $xt0,0x00($out)
1221         movdqu          $xt1,0x10($out)
1222         movdqu          $xt2,0x20($out)
1223         movdqu          $xt3,0x30($out)
1224         je              .Ldone4x
1225
1226         movdqa          0x10(%rsp),$xt0         # $xaN is offloaded, remember?
1227         lea             0x40($inp),$inp         # inp+=64*1
1228         xor             %r10,%r10
1229         movdqa          $xt0,0x00(%rsp)
1230         movdqa          $xb1,0x10(%rsp)
1231         lea             0x40($out),$out         # out+=64*1
1232         movdqa          $xc1,0x20(%rsp)
1233         sub             \$64,$len               # len-=64*1
1234         movdqa          $xd1,0x30(%rsp)
1235         jmp             .Loop_tail4x
1236
1237 .align  32
1238 .L128_or_more4x:
1239         movdqu          0x00($inp),$xt0         # xor with input
1240         movdqu          0x10($inp),$xt1
1241         movdqu          0x20($inp),$xt2
1242         movdqu          0x30($inp),$xt3
1243         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1244         pxor            $xb0,$xt1
1245         pxor            $xc0,$xt2
1246         pxor            $xd0,$xt3
1247
1248          movdqu         $xt0,0x00($out)
1249         movdqu          0x40($inp),$xt0
1250          movdqu         $xt1,0x10($out)
1251         movdqu          0x50($inp),$xt1
1252          movdqu         $xt2,0x20($out)
1253         movdqu          0x60($inp),$xt2
1254          movdqu         $xt3,0x30($out)
1255         movdqu          0x70($inp),$xt3
1256         pxor            0x10(%rsp),$xt0
1257         pxor            $xb1,$xt1
1258         pxor            $xc1,$xt2
1259         pxor            $xd1,$xt3
1260         movdqu          $xt0,0x40($out)
1261         movdqu          $xt1,0x50($out)
1262         movdqu          $xt2,0x60($out)
1263         movdqu          $xt3,0x70($out)
1264         je              .Ldone4x
1265
1266         movdqa          0x20(%rsp),$xt0         # $xaN is offloaded, remember?
1267         lea             0x80($inp),$inp         # inp+=64*2
1268         xor             %r10,%r10
1269         movdqa          $xt0,0x00(%rsp)
1270         movdqa          $xb2,0x10(%rsp)
1271         lea             0x80($out),$out         # out+=64*2
1272         movdqa          $xc2,0x20(%rsp)
1273         sub             \$128,$len              # len-=64*2
1274         movdqa          $xd2,0x30(%rsp)
1275         jmp             .Loop_tail4x
1276
1277 .align  32
1278 .L192_or_more4x:
1279         movdqu          0x00($inp),$xt0         # xor with input
1280         movdqu          0x10($inp),$xt1
1281         movdqu          0x20($inp),$xt2
1282         movdqu          0x30($inp),$xt3
1283         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1284         pxor            $xb0,$xt1
1285         pxor            $xc0,$xt2
1286         pxor            $xd0,$xt3
1287
1288          movdqu         $xt0,0x00($out)
1289         movdqu          0x40($inp),$xt0
1290          movdqu         $xt1,0x10($out)
1291         movdqu          0x50($inp),$xt1
1292          movdqu         $xt2,0x20($out)
1293         movdqu          0x60($inp),$xt2
1294          movdqu         $xt3,0x30($out)
1295         movdqu          0x70($inp),$xt3
1296         lea             0x80($inp),$inp         # size optimization
1297         pxor            0x10(%rsp),$xt0
1298         pxor            $xb1,$xt1
1299         pxor            $xc1,$xt2
1300         pxor            $xd1,$xt3
1301
1302          movdqu         $xt0,0x40($out)
1303         movdqu          0x00($inp),$xt0
1304          movdqu         $xt1,0x50($out)
1305         movdqu          0x10($inp),$xt1
1306          movdqu         $xt2,0x60($out)
1307         movdqu          0x20($inp),$xt2
1308          movdqu         $xt3,0x70($out)
1309          lea            0x80($out),$out         # size optimization
1310         movdqu          0x30($inp),$xt3
1311         pxor            0x20(%rsp),$xt0
1312         pxor            $xb2,$xt1
1313         pxor            $xc2,$xt2
1314         pxor            $xd2,$xt3
1315         movdqu          $xt0,0x00($out)
1316         movdqu          $xt1,0x10($out)
1317         movdqu          $xt2,0x20($out)
1318         movdqu          $xt3,0x30($out)
1319         je              .Ldone4x
1320
1321         movdqa          0x30(%rsp),$xt0         # $xaN is offloaded, remember?
1322         lea             0x40($inp),$inp         # inp+=64*3
1323         xor             %r10,%r10
1324         movdqa          $xt0,0x00(%rsp)
1325         movdqa          $xb3,0x10(%rsp)
1326         lea             0x40($out),$out         # out+=64*3
1327         movdqa          $xc3,0x20(%rsp)
1328         sub             \$192,$len              # len-=64*3
1329         movdqa          $xd3,0x30(%rsp)
1330
1331 .Loop_tail4x:
1332         movzb           ($inp,%r10),%eax
1333         movzb           (%rsp,%r10),%ecx
1334         lea             1(%r10),%r10
1335         xor             %ecx,%eax
1336         mov             %al,-1($out,%r10)
1337         dec             $len
1338         jnz             .Loop_tail4x
1339
1340 .Ldone4x:
1341 ___
# Epilogue: restore the Win64 callee-saved xmm registers from below the
# frame pointer, then restore the caller's %rsp from %r9 and return.
1342 $code.=<<___    if ($win64);
1343         movaps          -0xa8(%r9),%xmm6
1344         movaps          -0x98(%r9),%xmm7
1345         movaps          -0x88(%r9),%xmm8
1346         movaps          -0x78(%r9),%xmm9
1347         movaps          -0x68(%r9),%xmm10
1348         movaps          -0x58(%r9),%xmm11
1349         movaps          -0x48(%r9),%xmm12
1350         movaps          -0x38(%r9),%xmm13
1351         movaps          -0x28(%r9),%xmm14
1352         movaps          -0x18(%r9),%xmm15
1353 ___
1354 $code.=<<___;
1355         lea             (%r9),%rsp
1356 .cfi_def_cfa_register   %rsp
1357 .L4x_epilogue:
1358         ret
1359 .cfi_endproc
1360 .size   ChaCha20_4x,.-ChaCha20_4x
1361 ___
1362 }
1363
1364 ########################################################################
1365 # XOP code path that handles all lengths.
1366 if ($avx) {
1367 # There is some "anomaly" observed depending on instructions' size or
1368 # alignment. If you look closely at below code you'll notice that
1369 # sometimes argument order varies. The order affects instruction
1370 # encoding by making it larger, and such fiddling gives 5% performance
1371 # improvement. This is on FX-4100...
1372
# XOP-path register allocation: the entire 4x4 state occupies all sixteen
# xmm registers (b row in xmm0-3, d row in xmm4-7, a row in xmm8-11,
# c/temp row in xmm12-15); @xx lists them in logical a,b,c(t),d order
# so XOP_lane_ROUND can index the state by position.
1373 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1374     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1375 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1376          $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1377
# Build the instruction strings for four interleaved ChaCha quarter-rounds
# using XOP's vprotd (rotate) instruction.  Takes the state indices of one
# quarter-round; the other three are derived by rotating each index within
# its row group.  Returns a list of strings that the caller evals; the
# staggered indentation and occasional operand-order "flip"s are deliberate
# (see the encoding-size note above this section) — do not normalize them.
1378 sub XOP_lane_ROUND {
1379 my ($a0,$b0,$c0,$d0)=@_;
# ($_&~3)+(($_+1)&3): keep the row (high bits), advance the lane mod 4.
1380 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1381 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1382 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Quote each register name so it interpolates literally into the strings.
1383 my @x=map("\"$_\"",@xx);
1384
1385         (
        # a += b; d ^= a; d <<<= 16
1386         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1387          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1388           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1389            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1390         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1391          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1392           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1393            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1394         "&vprotd        (@x[$d0],@x[$d0],16)",
1395          "&vprotd       (@x[$d1],@x[$d1],16)",
1396           "&vprotd      (@x[$d2],@x[$d2],16)",
1397            "&vprotd     (@x[$d3],@x[$d3],16)",
1398
        # c += d; b ^= c; b <<<= 12
1399         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1400          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1401           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1402            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1403         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1404          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1405           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1406            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1407         "&vprotd        (@x[$b0],@x[$b0],12)",
1408          "&vprotd       (@x[$b1],@x[$b1],12)",
1409           "&vprotd      (@x[$b2],@x[$b2],12)",
1410            "&vprotd     (@x[$b3],@x[$b3],12)",
1411
        # a += b; d ^= a; d <<<= 8
1412         "&vpaddd        (@x[$a0],@x[$b0],@x[$a0])",     # flip
1413          "&vpaddd       (@x[$a1],@x[$b1],@x[$a1])",     # flip
1414           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
1415            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
1416         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1417          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1418           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1419            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1420         "&vprotd        (@x[$d0],@x[$d0],8)",
1421          "&vprotd       (@x[$d1],@x[$d1],8)",
1422           "&vprotd      (@x[$d2],@x[$d2],8)",
1423            "&vprotd     (@x[$d3],@x[$d3],8)",
1424
        # c += d; b ^= c; b <<<= 7
1425         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1426          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1427           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1428            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1429         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1430          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1431           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1432            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1433         "&vprotd        (@x[$b0],@x[$b0],7)",
1434          "&vprotd       (@x[$b1],@x[$b1],7)",
1435           "&vprotd      (@x[$b2],@x[$b2],7)",
1436            "&vprotd     (@x[$b3],@x[$b3],7)"
1437         );
1438 }
1439
# ChaCha20_4xop: 4-way XOP block function; same frame layout and Win64
# xmm6-xmm15 save area as ChaCha20_4x above.
1440 my $xframe = $win64 ? 0xa8 : 8;
1441
1442 $code.=<<___;
1443 .type   ChaCha20_4xop,\@function,5
1444 .align  32
1445 ChaCha20_4xop:
1446 .cfi_startproc
1447 .LChaCha20_4xop:
1448         mov             %rsp,%r9                # frame pointer
1449 .cfi_def_cfa_register   %r9
1450         sub             \$0x140+$xframe,%rsp
1451 ___
1452         ################ stack layout
1453         # +0x00         SIMD equivalent of @x[8-12]
1454         # ...
1455         # +0x40         constant copy of key[0-2] smashed by lanes
1456         # ...
1457         # +0x100        SIMD counters (with nonce smashed by lanes)
1458         # ...
1459         # +0x140
# Win64 ABI: xmm6-xmm15 are non-volatile, save them below the frame pointer.
1460 $code.=<<___    if ($win64);
1461         movaps          %xmm6,-0xa8(%r9)
1462         movaps          %xmm7,-0x98(%r9)
1463         movaps          %xmm8,-0x88(%r9)
1464         movaps          %xmm9,-0x78(%r9)
1465         movaps          %xmm10,-0x68(%r9)
1466         movaps          %xmm11,-0x58(%r9)
1467         movaps          %xmm12,-0x48(%r9)
1468         movaps          %xmm13,-0x38(%r9)
1469         movaps          %xmm14,-0x28(%r9)
1470         movaps          %xmm15,-0x18(%r9)
1471 .L4xop_body:
1472 ___
# Same key/counter "smash by lanes" setup as ChaCha20_4x, but with AVX/XOP
# three-operand forms; vzeroupper clears the upper ymm state on entry.
# No .Lrot16/.Lrot24 pointers are needed — vprotd rotates directly.
1473 $code.=<<___;
1474         vzeroupper
1475
1476         vmovdqa         .Lsigma(%rip),$xa3      # key[0]
1477         vmovdqu         ($key),$xb3             # key[1]
1478         vmovdqu         16($key),$xt3           # key[2]
1479         vmovdqu         ($counter),$xd3         # key[3]
1480         lea             0x100(%rsp),%rcx        # size optimization
1481
1482         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1483         vpshufd         \$0x55,$xa3,$xa1
1484         vmovdqa         $xa0,0x40(%rsp)         # ... and offload
1485         vpshufd         \$0xaa,$xa3,$xa2
1486         vmovdqa         $xa1,0x50(%rsp)
1487         vpshufd         \$0xff,$xa3,$xa3
1488         vmovdqa         $xa2,0x60(%rsp)
1489         vmovdqa         $xa3,0x70(%rsp)
1490
1491         vpshufd         \$0x00,$xb3,$xb0
1492         vpshufd         \$0x55,$xb3,$xb1
1493         vmovdqa         $xb0,0x80-0x100(%rcx)
1494         vpshufd         \$0xaa,$xb3,$xb2
1495         vmovdqa         $xb1,0x90-0x100(%rcx)
1496         vpshufd         \$0xff,$xb3,$xb3
1497         vmovdqa         $xb2,0xa0-0x100(%rcx)
1498         vmovdqa         $xb3,0xb0-0x100(%rcx)
1499
1500         vpshufd         \$0x00,$xt3,$xt0        # "$xc0"
1501         vpshufd         \$0x55,$xt3,$xt1        # "$xc1"
1502         vmovdqa         $xt0,0xc0-0x100(%rcx)
1503         vpshufd         \$0xaa,$xt3,$xt2        # "$xc2"
1504         vmovdqa         $xt1,0xd0-0x100(%rcx)
1505         vpshufd         \$0xff,$xt3,$xt3        # "$xc3"
1506         vmovdqa         $xt2,0xe0-0x100(%rcx)
1507         vmovdqa         $xt3,0xf0-0x100(%rcx)
1508
1509         vpshufd         \$0x00,$xd3,$xd0
1510         vpshufd         \$0x55,$xd3,$xd1
1511         vpaddd          .Linc(%rip),$xd0,$xd0   # don't save counters yet
1512         vpshufd         \$0xaa,$xd3,$xd2
1513         vmovdqa         $xd1,0x110-0x100(%rcx)
1514         vpshufd         \$0xff,$xd3,$xd3
1515         vmovdqa         $xd2,0x120-0x100(%rcx)
1516         vmovdqa         $xd3,0x130-0x100(%rcx)
1517
1518         jmp             .Loop_enter4xop
1519
1520 .align  32
1521 .Loop_outer4xop:
1522         vmovdqa         0x40(%rsp),$xa0         # re-load smashed key
1523         vmovdqa         0x50(%rsp),$xa1
1524         vmovdqa         0x60(%rsp),$xa2
1525         vmovdqa         0x70(%rsp),$xa3
1526         vmovdqa         0x80-0x100(%rcx),$xb0
1527         vmovdqa         0x90-0x100(%rcx),$xb1
1528         vmovdqa         0xa0-0x100(%rcx),$xb2
1529         vmovdqa         0xb0-0x100(%rcx),$xb3
1530         vmovdqa         0xc0-0x100(%rcx),$xt0   # "$xc0"
1531         vmovdqa         0xd0-0x100(%rcx),$xt1   # "$xc1"
1532         vmovdqa         0xe0-0x100(%rcx),$xt2   # "$xc2"
1533         vmovdqa         0xf0-0x100(%rcx),$xt3   # "$xc3"
1534         vmovdqa         0x100-0x100(%rcx),$xd0
1535         vmovdqa         0x110-0x100(%rcx),$xd1
1536         vmovdqa         0x120-0x100(%rcx),$xd2
1537         vmovdqa         0x130-0x100(%rcx),$xd3
1538         vpaddd          .Lfour(%rip),$xd0,$xd0  # next SIMD counters
1539
1540 .Loop_enter4xop:
1541         mov             \$10,%eax
1542         vmovdqa         $xd0,0x100-0x100(%rcx)  # save SIMD counters
1543         jmp             .Loop4xop
1544
1545 .align  32
1546 .Loop4xop:
1547 ___
# One pass of .Loop4xop is a double round: column round (0,4,8,12) then
# diagonal round (0,5,10,15); %eax (preloaded with 10) gives 20 rounds.
1548         foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1549         foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
# Accumulate key material and transpose each 4-register row group with
# vpunpck* so each register holds 16 contiguous output bytes of one block.
1550 $code.=<<___;
1551         dec             %eax
1552         jnz             .Loop4xop
1553
1554         vpaddd          0x40(%rsp),$xa0,$xa0    # accumulate key material
1555         vpaddd          0x50(%rsp),$xa1,$xa1
1556         vpaddd          0x60(%rsp),$xa2,$xa2
1557         vpaddd          0x70(%rsp),$xa3,$xa3
1558
1559         vmovdqa         $xt2,0x20(%rsp)         # offload $xc2,3
1560         vmovdqa         $xt3,0x30(%rsp)
1561
1562         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1563         vpunpckldq      $xa3,$xa2,$xt3
1564         vpunpckhdq      $xa1,$xa0,$xa0
1565         vpunpckhdq      $xa3,$xa2,$xa2
1566         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1567         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1568         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1569         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1570 ___
# Rename the Perl-side variables to follow where each logical value landed
# after the transpose (see the "a0".."a3" comments above).
1571         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1572 $code.=<<___;
1573         vpaddd          0x80-0x100(%rcx),$xb0,$xb0
1574         vpaddd          0x90-0x100(%rcx),$xb1,$xb1
1575         vpaddd          0xa0-0x100(%rcx),$xb2,$xb2
1576         vpaddd          0xb0-0x100(%rcx),$xb3,$xb3
1577
1578         vmovdqa         $xa0,0x00(%rsp)         # offload $xa0,1
1579         vmovdqa         $xa1,0x10(%rsp)
1580         vmovdqa         0x20(%rsp),$xa0         # "xc2"
1581         vmovdqa         0x30(%rsp),$xa1         # "xc3"
1582
1583         vpunpckldq      $xb1,$xb0,$xt2
1584         vpunpckldq      $xb3,$xb2,$xt3
1585         vpunpckhdq      $xb1,$xb0,$xb0
1586         vpunpckhdq      $xb3,$xb2,$xb2
1587         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1588         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1589         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1590         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1591 ___
# Same renaming for the b row; then name the c row: c0/c1 are still in
# $xt0/$xt1, c2/c3 were just reloaded from the stack into $xa0/$xa1.
1592         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1593         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1594 $code.=<<___;
1595         vpaddd          0xc0-0x100(%rcx),$xc0,$xc0
1596         vpaddd          0xd0-0x100(%rcx),$xc1,$xc1
1597         vpaddd          0xe0-0x100(%rcx),$xc2,$xc2
1598         vpaddd          0xf0-0x100(%rcx),$xc3,$xc3
1599
1600         vpunpckldq      $xc1,$xc0,$xt2
1601         vpunpckldq      $xc3,$xc2,$xt3
1602         vpunpckhdq      $xc1,$xc0,$xc0
1603         vpunpckhdq      $xc3,$xc2,$xc2
1604         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1605         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1606         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1607         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1608 ___
1609         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1610 $code.=<<___;
1611         vpaddd          0x100-0x100(%rcx),$xd0,$xd0
1612         vpaddd          0x110-0x100(%rcx),$xd1,$xd1
1613         vpaddd          0x120-0x100(%rcx),$xd2,$xd2
1614         vpaddd          0x130-0x100(%rcx),$xd3,$xd3
1615
1616         vpunpckldq      $xd1,$xd0,$xt2
1617         vpunpckldq      $xd3,$xd2,$xt3
1618         vpunpckhdq      $xd1,$xd0,$xd0
1619         vpunpckhdq      $xd3,$xd2,$xd2
1620         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1621         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1622         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1623         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1624 ___
# Final renames: d row as above, and point $xa0/$xa1 at the now-free
# $xt2/$xt3 registers so the offloaded a0/a1 can be reloaded into them.
1625         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1626         ($xa0,$xa1)=($xt2,$xt3);
1627 $code.=<<___;
1628         vmovdqa         0x00(%rsp),$xa0         # restore $xa0,1
1629         vmovdqa         0x10(%rsp),$xa1
1630
1631         cmp             \$64*4,$len
1632         jb              .Ltail4xop
1633
1634         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1635         vpxor           0x10($inp),$xb0,$xb0
1636         vpxor           0x20($inp),$xc0,$xc0
1637         vpxor           0x30($inp),$xd0,$xd0
1638         vpxor           0x40($inp),$xa1,$xa1
1639         vpxor           0x50($inp),$xb1,$xb1
1640         vpxor           0x60($inp),$xc1,$xc1
1641         vpxor           0x70($inp),$xd1,$xd1
1642         lea             0x80($inp),$inp         # size optimization
1643         vpxor           0x00($inp),$xa2,$xa2
1644         vpxor           0x10($inp),$xb2,$xb2
1645         vpxor           0x20($inp),$xc2,$xc2
1646         vpxor           0x30($inp),$xd2,$xd2
1647         vpxor           0x40($inp),$xa3,$xa3
1648         vpxor           0x50($inp),$xb3,$xb3
1649         vpxor           0x60($inp),$xc3,$xc3
1650         vpxor           0x70($inp),$xd3,$xd3
1651         lea             0x80($inp),$inp         # inp+=64*4
1652
1653         vmovdqu         $xa0,0x00($out)
1654         vmovdqu         $xb0,0x10($out)
1655         vmovdqu         $xc0,0x20($out)
1656         vmovdqu         $xd0,0x30($out)
1657         vmovdqu         $xa1,0x40($out)
1658         vmovdqu         $xb1,0x50($out)
1659         vmovdqu         $xc1,0x60($out)
1660         vmovdqu         $xd1,0x70($out)
1661         lea             0x80($out),$out         # size optimization
1662         vmovdqu         $xa2,0x00($out)
1663         vmovdqu         $xb2,0x10($out)
1664         vmovdqu         $xc2,0x20($out)
1665         vmovdqu         $xd2,0x30($out)
1666         vmovdqu         $xa3,0x40($out)
1667         vmovdqu         $xb3,0x50($out)
1668         vmovdqu         $xc3,0x60($out)
1669         vmovdqu         $xd3,0x70($out)
1670         lea             0x80($out),$out         # out+=64*4
1671
1672         sub             \$64*4,$len
1673         jnz             .Loop_outer4xop
1674
1675         jmp             .Ldone4xop
1676
1677 .align  32
1678 .Ltail4xop:
1679         cmp             \$192,$len
1680         jae             .L192_or_more4xop
1681         cmp             \$128,$len
1682         jae             .L128_or_more4xop
1683         cmp             \$64,$len
1684         jae             .L64_or_more4xop
1685
1686         xor             %r10,%r10
1687         vmovdqa         $xa0,0x00(%rsp)
1688         vmovdqa         $xb0,0x10(%rsp)
1689         vmovdqa         $xc0,0x20(%rsp)
1690         vmovdqa         $xd0,0x30(%rsp)
1691         jmp             .Loop_tail4xop
1692
1693 .align  32
1694 .L64_or_more4xop:
1695         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1696         vpxor           0x10($inp),$xb0,$xb0
1697         vpxor           0x20($inp),$xc0,$xc0
1698         vpxor           0x30($inp),$xd0,$xd0
1699         vmovdqu         $xa0,0x00($out)
1700         vmovdqu         $xb0,0x10($out)
1701         vmovdqu         $xc0,0x20($out)
1702         vmovdqu         $xd0,0x30($out)
1703         je              .Ldone4xop
1704
1705         lea             0x40($inp),$inp         # inp+=64*1
1706         vmovdqa         $xa1,0x00(%rsp)
1707         xor             %r10,%r10
1708         vmovdqa         $xb1,0x10(%rsp)
1709         lea             0x40($out),$out         # out+=64*1
1710         vmovdqa         $xc1,0x20(%rsp)
1711         sub             \$64,$len               # len-=64*1
1712         vmovdqa         $xd1,0x30(%rsp)
1713         jmp             .Loop_tail4xop
1714
1715 .align  32
1716 .L128_or_more4xop:
1717         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1718         vpxor           0x10($inp),$xb0,$xb0
1719         vpxor           0x20($inp),$xc0,$xc0
1720         vpxor           0x30($inp),$xd0,$xd0
1721         vpxor           0x40($inp),$xa1,$xa1
1722         vpxor           0x50($inp),$xb1,$xb1
1723         vpxor           0x60($inp),$xc1,$xc1
1724         vpxor           0x70($inp),$xd1,$xd1
1725
1726         vmovdqu         $xa0,0x00($out)
1727         vmovdqu         $xb0,0x10($out)
1728         vmovdqu         $xc0,0x20($out)
1729         vmovdqu         $xd0,0x30($out)
1730         vmovdqu         $xa1,0x40($out)
1731         vmovdqu         $xb1,0x50($out)
1732         vmovdqu         $xc1,0x60($out)
1733         vmovdqu         $xd1,0x70($out)
1734         je              .Ldone4xop
1735
1736         lea             0x80($inp),$inp         # inp+=64*2
1737         vmovdqa         $xa2,0x00(%rsp)
1738         xor             %r10,%r10
1739         vmovdqa         $xb2,0x10(%rsp)
1740         lea             0x80($out),$out         # out+=64*2
1741         vmovdqa         $xc2,0x20(%rsp)
1742         sub             \$128,$len              # len-=64*2
1743         vmovdqa         $xd2,0x30(%rsp)
1744         jmp             .Loop_tail4xop
1745
1746 .align  32
1747 .L192_or_more4xop:
1748         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1749         vpxor           0x10($inp),$xb0,$xb0
1750         vpxor           0x20($inp),$xc0,$xc0
1751         vpxor           0x30($inp),$xd0,$xd0
1752         vpxor           0x40($inp),$xa1,$xa1
1753         vpxor           0x50($inp),$xb1,$xb1
1754         vpxor           0x60($inp),$xc1,$xc1
1755         vpxor           0x70($inp),$xd1,$xd1
1756         lea             0x80($inp),$inp         # size optimization
1757         vpxor           0x00($inp),$xa2,$xa2
1758         vpxor           0x10($inp),$xb2,$xb2
1759         vpxor           0x20($inp),$xc2,$xc2
1760         vpxor           0x30($inp),$xd2,$xd2
1761
1762         vmovdqu         $xa0,0x00($out)
1763         vmovdqu         $xb0,0x10($out)
1764         vmovdqu         $xc0,0x20($out)
1765         vmovdqu         $xd0,0x30($out)
1766         vmovdqu         $xa1,0x40($out)
1767         vmovdqu         $xb1,0x50($out)
1768         vmovdqu         $xc1,0x60($out)
1769         vmovdqu         $xd1,0x70($out)
1770         lea             0x80($out),$out         # size optimization
1771         vmovdqu         $xa2,0x00($out)
1772         vmovdqu         $xb2,0x10($out)
1773         vmovdqu         $xc2,0x20($out)
1774         vmovdqu         $xd2,0x30($out)
1775         je              .Ldone4xop
1776
1777         lea             0x40($inp),$inp         # inp+=64*3
1778         vmovdqa         $xa3,0x00(%rsp)
1779         xor             %r10,%r10
1780         vmovdqa         $xb3,0x10(%rsp)
1781         lea             0x40($out),$out         # out+=64*3
1782         vmovdqa         $xc3,0x20(%rsp)
1783         sub             \$192,$len              # len-=64*3
1784         vmovdqa         $xd3,0x30(%rsp)
1785
1786 .Loop_tail4xop:
1787         movzb           ($inp,%r10),%eax
1788         movzb           (%rsp,%r10),%ecx
1789         lea             1(%r10),%r10
1790         xor             %ecx,%eax
1791         mov             %al,-1($out,%r10)
1792         dec             $len
1793         jnz             .Loop_tail4xop
1794
1795 .Ldone4xop:
1796         vzeroupper
1797 ___
1798 $code.=<<___    if ($win64);
1799         movaps          -0xa8(%r9),%xmm6
1800         movaps          -0x98(%r9),%xmm7
1801         movaps          -0x88(%r9),%xmm8
1802         movaps          -0x78(%r9),%xmm9
1803         movaps          -0x68(%r9),%xmm10
1804         movaps          -0x58(%r9),%xmm11
1805         movaps          -0x48(%r9),%xmm12
1806         movaps          -0x38(%r9),%xmm13
1807         movaps          -0x28(%r9),%xmm14
1808         movaps          -0x18(%r9),%xmm15
1809 ___
1810 $code.=<<___;
1811         lea             (%r9),%rsp
1812 .cfi_def_cfa_register   %rsp
1813 .L4xop_epilogue:
1814         ret
1815 .cfi_endproc
1816 .size   ChaCha20_4xop,.-ChaCha20_4xop
1817 ___
1818 }
1819
1820 ########################################################################
1821 # AVX2 code path
1822 if ($avx>1) {
# Allocate all sixteen %ymm registers: the 'b', 'd' and 'a' rows of the
# ChaCha state (four lanes each, 8 blocks wide) plus four temporaries
# $xt0..$xt3.  The 'c' row gets no registers of its own — it is kept on
# the stack and cycled through $xt0/$xt1 (see AVX2_lane_ROUND below).
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
# @xx maps ChaCha state-word indices 0..15 to registers.  Slots 8..11
# (the 'c' words) are deliberately invalid placeholders ("%nox") so that
# any accidental use of a memory-resident 'c' word as a register fails
# to assemble instead of silently producing wrong code.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
        "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1827
# Emit one full ChaCha round (four independent quarter-rounds Q1..Q4,
# eight blocks wide) for the AVX2 path.  Called with the state indices
# of the *first* quarter-round — (0,4,8,12) for an even/column round,
# (0,5,10,15) for an odd/diagonal round; the indices of Q2..Q4 are
# derived below.  Returns a list of strings, each a perlasm instruction
# call, which the caller evals in order (see the .Loop8x generation) —
# so the exact ordering here *is* the instruction schedule: Q1/Q2 and
# Q3/Q4 are software-interleaved for ILP.
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
# Advance each index to the next column within its row group, i.e.
# increment the low two bits mod 4 while keeping the high bits: this
# turns Q1's indices into Q2's, Q2's into Q3's, etc. (matches the
# index table in the comment below).
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Quote register names so they survive as literals inside the emitted
# "&insn(...)" strings; $xc/$xc_ carry the two in-register 'c' words.
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Each quarter-round below is the standard ChaCha sequence:
	#   a+=b; d^=a; d<<<=16;   c+=d; b^=c; b<<<=12;
	#   a+=b; d^=a; d<<<=8;    c+=d; b^=c; b<<<=7;
	# The 16- and 8-bit rotates use vpshufb with the .Lrot16/.Lrot24
	# tables (addressed via %r10/%r11); the 12- and 7-bit rotates use
	# the shift/shift/or idiom via $t0/$t1.
	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",		# d <<<= 16
	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	 "&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",		# b <<<= 12
	"&vpsrld	(@x[$b0],@x[$b0],20)",
	"&vpor		(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d1])",
	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
	 "&vpslld	($t1,@x[$b1],12)",
	 "&vpsrld	(@x[$b1],@x[$b1],20)",
	 "&vpor		(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",		# d <<<= 8
	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	 "&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",		# b <<<= 7
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor		(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d1])",
	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
	 "&vpslld	($t0,@x[$b1],7)",
	 "&vpsrld	(@x[$b1],@x[$b1],25)",
	 "&vpor		(@x[$b1],$t0,@x[$b1])",

	# Mid-round: spill Q1/Q2's finished 'c' words and load Q3/Q4's.
	# Stack slot for state word N is 32*(N-8)(%rsp), N in 8..11.
	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	 "&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",		# d <<<= 16
	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	 "&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",		# b <<<= 12
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor		(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d3])",
	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
	 "&vpslld	($t1,@x[$b3],12)",
	 "&vpsrld	(@x[$b3],@x[$b3],20)",
	 "&vpor		(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",		# d <<<= 8
	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	 "&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",		# b <<<= 7
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor		(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d3])",
	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
	 "&vpslld	($t0,@x[$b3],7)",
	 "&vpsrld	(@x[$b3],@x[$b3],25)",
	 "&vpor		(@x[$b3],$t0,@x[$b3])"
	);
}
1941
1942 my $xframe = $win64 ? 0xa8 : 8;
1943
1944 $code.=<<___;
1945 .type   ChaCha20_8x,\@function,5
1946 .align  32
1947 ChaCha20_8x:
1948 .cfi_startproc
1949 .LChaCha20_8x:
1950         mov             %rsp,%r9                # frame register
1951 .cfi_def_cfa_register   %r9
1952         sub             \$0x280+$xframe,%rsp
1953         and             \$-32,%rsp
1954 ___
1955 $code.=<<___    if ($win64);
1956         movaps          %xmm6,-0xa8(%r9)
1957         movaps          %xmm7,-0x98(%r9)
1958         movaps          %xmm8,-0x88(%r9)
1959         movaps          %xmm9,-0x78(%r9)
1960         movaps          %xmm10,-0x68(%r9)
1961         movaps          %xmm11,-0x58(%r9)
1962         movaps          %xmm12,-0x48(%r9)
1963         movaps          %xmm13,-0x38(%r9)
1964         movaps          %xmm14,-0x28(%r9)
1965         movaps          %xmm15,-0x18(%r9)
1966 .L8x_body:
1967 ___
1968 $code.=<<___;
1969         vzeroupper
1970
1971         ################ stack layout
1972         # +0x00         SIMD equivalent of @x[8-12]
1973         # ...
1974         # +0x80         constant copy of key[0-2] smashed by lanes
1975         # ...
1976         # +0x200        SIMD counters (with nonce smashed by lanes)
1977         # ...
1978         # +0x280
1979
1980         vbroadcasti128  .Lsigma(%rip),$xa3      # key[0]
1981         vbroadcasti128  ($key),$xb3             # key[1]
1982         vbroadcasti128  16($key),$xt3           # key[2]
1983         vbroadcasti128  ($counter),$xd3         # key[3]
1984         lea             0x100(%rsp),%rcx        # size optimization
1985         lea             0x200(%rsp),%rax        # size optimization
1986         lea             .Lrot16(%rip),%r10
1987         lea             .Lrot24(%rip),%r11
1988
1989         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1990         vpshufd         \$0x55,$xa3,$xa1
1991         vmovdqa         $xa0,0x80-0x100(%rcx)   # ... and offload
1992         vpshufd         \$0xaa,$xa3,$xa2
1993         vmovdqa         $xa1,0xa0-0x100(%rcx)
1994         vpshufd         \$0xff,$xa3,$xa3
1995         vmovdqa         $xa2,0xc0-0x100(%rcx)
1996         vmovdqa         $xa3,0xe0-0x100(%rcx)
1997
1998         vpshufd         \$0x00,$xb3,$xb0
1999         vpshufd         \$0x55,$xb3,$xb1
2000         vmovdqa         $xb0,0x100-0x100(%rcx)
2001         vpshufd         \$0xaa,$xb3,$xb2
2002         vmovdqa         $xb1,0x120-0x100(%rcx)
2003         vpshufd         \$0xff,$xb3,$xb3
2004         vmovdqa         $xb2,0x140-0x100(%rcx)
2005         vmovdqa         $xb3,0x160-0x100(%rcx)
2006
2007         vpshufd         \$0x00,$xt3,$xt0        # "xc0"
2008         vpshufd         \$0x55,$xt3,$xt1        # "xc1"
2009         vmovdqa         $xt0,0x180-0x200(%rax)
2010         vpshufd         \$0xaa,$xt3,$xt2        # "xc2"
2011         vmovdqa         $xt1,0x1a0-0x200(%rax)
2012         vpshufd         \$0xff,$xt3,$xt3        # "xc3"
2013         vmovdqa         $xt2,0x1c0-0x200(%rax)
2014         vmovdqa         $xt3,0x1e0-0x200(%rax)
2015
2016         vpshufd         \$0x00,$xd3,$xd0
2017         vpshufd         \$0x55,$xd3,$xd1
2018         vpaddd          .Lincy(%rip),$xd0,$xd0  # don't save counters yet
2019         vpshufd         \$0xaa,$xd3,$xd2
2020         vmovdqa         $xd1,0x220-0x200(%rax)
2021         vpshufd         \$0xff,$xd3,$xd3
2022         vmovdqa         $xd2,0x240-0x200(%rax)
2023         vmovdqa         $xd3,0x260-0x200(%rax)
2024
2025         jmp             .Loop_enter8x
2026
2027 .align  32
2028 .Loop_outer8x:
2029         vmovdqa         0x80-0x100(%rcx),$xa0   # re-load smashed key
2030         vmovdqa         0xa0-0x100(%rcx),$xa1
2031         vmovdqa         0xc0-0x100(%rcx),$xa2
2032         vmovdqa         0xe0-0x100(%rcx),$xa3
2033         vmovdqa         0x100-0x100(%rcx),$xb0
2034         vmovdqa         0x120-0x100(%rcx),$xb1
2035         vmovdqa         0x140-0x100(%rcx),$xb2
2036         vmovdqa         0x160-0x100(%rcx),$xb3
2037         vmovdqa         0x180-0x200(%rax),$xt0  # "xc0"
2038         vmovdqa         0x1a0-0x200(%rax),$xt1  # "xc1"
2039         vmovdqa         0x1c0-0x200(%rax),$xt2  # "xc2"
2040         vmovdqa         0x1e0-0x200(%rax),$xt3  # "xc3"
2041         vmovdqa         0x200-0x200(%rax),$xd0
2042         vmovdqa         0x220-0x200(%rax),$xd1
2043         vmovdqa         0x240-0x200(%rax),$xd2
2044         vmovdqa         0x260-0x200(%rax),$xd3
2045         vpaddd          .Leight(%rip),$xd0,$xd0 # next SIMD counters
2046
2047 .Loop_enter8x:
2048         vmovdqa         $xt2,0x40(%rsp)         # SIMD equivalent of "@x[10]"
2049         vmovdqa         $xt3,0x60(%rsp)         # SIMD equivalent of "@x[11]"
2050         vbroadcasti128  (%r10),$xt3
2051         vmovdqa         $xd0,0x200-0x200(%rax)  # save SIMD counters
2052         mov             \$10,%eax
2053         jmp             .Loop8x
2054
2055 .align  32
2056 .Loop8x:
2057 ___
2058         foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2059         foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
2060 $code.=<<___;
2061         dec             %eax
2062         jnz             .Loop8x
2063
2064         lea             0x200(%rsp),%rax        # size optimization
2065         vpaddd          0x80-0x100(%rcx),$xa0,$xa0      # accumulate key
2066         vpaddd          0xa0-0x100(%rcx),$xa1,$xa1
2067         vpaddd          0xc0-0x100(%rcx),$xa2,$xa2
2068         vpaddd          0xe0-0x100(%rcx),$xa3,$xa3
2069
2070         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
2071         vpunpckldq      $xa3,$xa2,$xt3
2072         vpunpckhdq      $xa1,$xa0,$xa0
2073         vpunpckhdq      $xa3,$xa2,$xa2
2074         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
2075         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
2076         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
2077         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
2078 ___
2079         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2080 $code.=<<___;
2081         vpaddd          0x100-0x100(%rcx),$xb0,$xb0
2082         vpaddd          0x120-0x100(%rcx),$xb1,$xb1
2083         vpaddd          0x140-0x100(%rcx),$xb2,$xb2
2084         vpaddd          0x160-0x100(%rcx),$xb3,$xb3
2085
2086         vpunpckldq      $xb1,$xb0,$xt2
2087         vpunpckldq      $xb3,$xb2,$xt3
2088         vpunpckhdq      $xb1,$xb0,$xb0
2089         vpunpckhdq      $xb3,$xb2,$xb2
2090         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
2091         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
2092         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
2093         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
2094 ___
2095         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2096 $code.=<<___;
2097         vperm2i128      \$0x20,$xb0,$xa0,$xt3   # "de-interlace" further
2098         vperm2i128      \$0x31,$xb0,$xa0,$xb0
2099         vperm2i128      \$0x20,$xb1,$xa1,$xa0
2100         vperm2i128      \$0x31,$xb1,$xa1,$xb1
2101         vperm2i128      \$0x20,$xb2,$xa2,$xa1
2102         vperm2i128      \$0x31,$xb2,$xa2,$xb2
2103         vperm2i128      \$0x20,$xb3,$xa3,$xa2
2104         vperm2i128      \$0x31,$xb3,$xa3,$xb3
2105 ___
2106         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2107         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
2108 $code.=<<___;
2109         vmovdqa         $xa0,0x00(%rsp)         # offload $xaN
2110         vmovdqa         $xa1,0x20(%rsp)
2111         vmovdqa         0x40(%rsp),$xc2         # $xa0
2112         vmovdqa         0x60(%rsp),$xc3         # $xa1
2113
2114         vpaddd          0x180-0x200(%rax),$xc0,$xc0
2115         vpaddd          0x1a0-0x200(%rax),$xc1,$xc1
2116         vpaddd          0x1c0-0x200(%rax),$xc2,$xc2
2117         vpaddd          0x1e0-0x200(%rax),$xc3,$xc3
2118
2119         vpunpckldq      $xc1,$xc0,$xt2
2120         vpunpckldq      $xc3,$xc2,$xt3
2121         vpunpckhdq      $xc1,$xc0,$xc0
2122         vpunpckhdq      $xc3,$xc2,$xc2
2123         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
2124         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
2125         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
2126         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
2127 ___
2128         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2129 $code.=<<___;
2130         vpaddd          0x200-0x200(%rax),$xd0,$xd0
2131         vpaddd          0x220-0x200(%rax),$xd1,$xd1
2132         vpaddd          0x240-0x200(%rax),$xd2,$xd2
2133         vpaddd          0x260-0x200(%rax),$xd3,$xd3
2134
2135         vpunpckldq      $xd1,$xd0,$xt2
2136         vpunpckldq      $xd3,$xd2,$xt3
2137         vpunpckhdq      $xd1,$xd0,$xd0
2138         vpunpckhdq      $xd3,$xd2,$xd2
2139         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
2140         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
2141         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
2142         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
2143 ___
2144         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2145 $code.=<<___;
2146         vperm2i128      \$0x20,$xd0,$xc0,$xt3   # "de-interlace" further
2147         vperm2i128      \$0x31,$xd0,$xc0,$xd0
2148         vperm2i128      \$0x20,$xd1,$xc1,$xc0
2149         vperm2i128      \$0x31,$xd1,$xc1,$xd1
2150         vperm2i128      \$0x20,$xd2,$xc2,$xc1
2151         vperm2i128      \$0x31,$xd2,$xc2,$xd2
2152         vperm2i128      \$0x20,$xd3,$xc3,$xc2
2153         vperm2i128      \$0x31,$xd3,$xc3,$xd3
2154 ___
2155         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2156         ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
2157         ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
2158         ($xa0,$xa1)=($xt2,$xt3);
2159 $code.=<<___;
2160         vmovdqa         0x00(%rsp),$xa0         # $xaN was offloaded, remember?
2161         vmovdqa         0x20(%rsp),$xa1
2162
2163         cmp             \$64*8,$len
2164         jb              .Ltail8x
2165
2166         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2167         vpxor           0x20($inp),$xb0,$xb0
2168         vpxor           0x40($inp),$xc0,$xc0
2169         vpxor           0x60($inp),$xd0,$xd0
2170         lea             0x80($inp),$inp         # size optimization
2171         vmovdqu         $xa0,0x00($out)
2172         vmovdqu         $xb0,0x20($out)
2173         vmovdqu         $xc0,0x40($out)
2174         vmovdqu         $xd0,0x60($out)
2175         lea             0x80($out),$out         # size optimization
2176
2177         vpxor           0x00($inp),$xa1,$xa1
2178         vpxor           0x20($inp),$xb1,$xb1
2179         vpxor           0x40($inp),$xc1,$xc1
2180         vpxor           0x60($inp),$xd1,$xd1
2181         lea             0x80($inp),$inp         # size optimization
2182         vmovdqu         $xa1,0x00($out)
2183         vmovdqu         $xb1,0x20($out)
2184         vmovdqu         $xc1,0x40($out)
2185         vmovdqu         $xd1,0x60($out)
2186         lea             0x80($out),$out         # size optimization
2187
2188         vpxor           0x00($inp),$xa2,$xa2
2189         vpxor           0x20($inp),$xb2,$xb2
2190         vpxor           0x40($inp),$xc2,$xc2
2191         vpxor           0x60($inp),$xd2,$xd2
2192         lea             0x80($inp),$inp         # size optimization
2193         vmovdqu         $xa2,0x00($out)
2194         vmovdqu         $xb2,0x20($out)
2195         vmovdqu         $xc2,0x40($out)
2196         vmovdqu         $xd2,0x60($out)
2197         lea             0x80($out),$out         # size optimization
2198
2199         vpxor           0x00($inp),$xa3,$xa3
2200         vpxor           0x20($inp),$xb3,$xb3
2201         vpxor           0x40($inp),$xc3,$xc3
2202         vpxor           0x60($inp),$xd3,$xd3
2203         lea             0x80($inp),$inp         # size optimization
2204         vmovdqu         $xa3,0x00($out)
2205         vmovdqu         $xb3,0x20($out)
2206         vmovdqu         $xc3,0x40($out)
2207         vmovdqu         $xd3,0x60($out)
2208         lea             0x80($out),$out         # size optimization
2209
2210         sub             \$64*8,$len
2211         jnz             .Loop_outer8x
2212
2213         jmp             .Ldone8x
2214
2215 .Ltail8x:
2216         cmp             \$448,$len
2217         jae             .L448_or_more8x
2218         cmp             \$384,$len
2219         jae             .L384_or_more8x
2220         cmp             \$320,$len
2221         jae             .L320_or_more8x
2222         cmp             \$256,$len
2223         jae             .L256_or_more8x
2224         cmp             \$192,$len
2225         jae             .L192_or_more8x
2226         cmp             \$128,$len
2227         jae             .L128_or_more8x
2228         cmp             \$64,$len
2229         jae             .L64_or_more8x
2230
2231         xor             %r10,%r10
2232         vmovdqa         $xa0,0x00(%rsp)
2233         vmovdqa         $xb0,0x20(%rsp)
2234         jmp             .Loop_tail8x
2235
2236 .align  32
2237 .L64_or_more8x:
2238         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2239         vpxor           0x20($inp),$xb0,$xb0
2240         vmovdqu         $xa0,0x00($out)
2241         vmovdqu         $xb0,0x20($out)
2242         je              .Ldone8x
2243
2244         lea             0x40($inp),$inp         # inp+=64*1
2245         xor             %r10,%r10
2246         vmovdqa         $xc0,0x00(%rsp)
2247         lea             0x40($out),$out         # out+=64*1
2248         sub             \$64,$len               # len-=64*1
2249         vmovdqa         $xd0,0x20(%rsp)
2250         jmp             .Loop_tail8x
2251
2252 .align  32
2253 .L128_or_more8x:
2254         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2255         vpxor           0x20($inp),$xb0,$xb0
2256         vpxor           0x40($inp),$xc0,$xc0
2257         vpxor           0x60($inp),$xd0,$xd0
2258         vmovdqu         $xa0,0x00($out)
2259         vmovdqu         $xb0,0x20($out)
2260         vmovdqu         $xc0,0x40($out)
2261         vmovdqu         $xd0,0x60($out)
2262         je              .Ldone8x
2263
2264         lea             0x80($inp),$inp         # inp+=64*2
2265         xor             %r10,%r10
2266         vmovdqa         $xa1,0x00(%rsp)
2267         lea             0x80($out),$out         # out+=64*2
2268         sub             \$128,$len              # len-=64*2
2269         vmovdqa         $xb1,0x20(%rsp)
2270         jmp             .Loop_tail8x
2271
2272 .align  32
2273 .L192_or_more8x:
2274         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2275         vpxor           0x20($inp),$xb0,$xb0
2276         vpxor           0x40($inp),$xc0,$xc0
2277         vpxor           0x60($inp),$xd0,$xd0
2278         vpxor           0x80($inp),$xa1,$xa1
2279         vpxor           0xa0($inp),$xb1,$xb1
2280         vmovdqu         $xa0,0x00($out)
2281         vmovdqu         $xb0,0x20($out)
2282         vmovdqu         $xc0,0x40($out)
2283         vmovdqu         $xd0,0x60($out)
2284         vmovdqu         $xa1,0x80($out)
2285         vmovdqu         $xb1,0xa0($out)
2286         je              .Ldone8x
2287
2288         lea             0xc0($inp),$inp         # inp+=64*3
2289         xor             %r10,%r10
2290         vmovdqa         $xc1,0x00(%rsp)
2291         lea             0xc0($out),$out         # out+=64*3
2292         sub             \$192,$len              # len-=64*3
2293         vmovdqa         $xd1,0x20(%rsp)
2294         jmp             .Loop_tail8x
2295
2296 .align  32
2297 .L256_or_more8x:
2298         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2299         vpxor           0x20($inp),$xb0,$xb0
2300         vpxor           0x40($inp),$xc0,$xc0
2301         vpxor           0x60($inp),$xd0,$xd0
2302         vpxor           0x80($inp),$xa1,$xa1
2303         vpxor           0xa0($inp),$xb1,$xb1
2304         vpxor           0xc0($inp),$xc1,$xc1
2305         vpxor           0xe0($inp),$xd1,$xd1
2306         vmovdqu         $xa0,0x00($out)
2307         vmovdqu         $xb0,0x20($out)
2308         vmovdqu         $xc0,0x40($out)
2309         vmovdqu         $xd0,0x60($out)
2310         vmovdqu         $xa1,0x80($out)
2311         vmovdqu         $xb1,0xa0($out)
2312         vmovdqu         $xc1,0xc0($out)
2313         vmovdqu         $xd1,0xe0($out)
2314         je              .Ldone8x
2315
2316         lea             0x100($inp),$inp        # inp+=64*4
2317         xor             %r10,%r10
2318         vmovdqa         $xa2,0x00(%rsp)
2319         lea             0x100($out),$out        # out+=64*4
2320         sub             \$256,$len              # len-=64*4
2321         vmovdqa         $xb2,0x20(%rsp)
2322         jmp             .Loop_tail8x
2323
2324 .align  32
2325 .L320_or_more8x:
2326         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2327         vpxor           0x20($inp),$xb0,$xb0
2328         vpxor           0x40($inp),$xc0,$xc0
2329         vpxor           0x60($inp),$xd0,$xd0
2330         vpxor           0x80($inp),$xa1,$xa1
2331         vpxor           0xa0($inp),$xb1,$xb1
2332         vpxor           0xc0($inp),$xc1,$xc1
2333         vpxor           0xe0($inp),$xd1,$xd1
2334         vpxor           0x100($inp),$xa2,$xa2
2335         vpxor           0x120($inp),$xb2,$xb2
2336         vmovdqu         $xa0,0x00($out)
2337         vmovdqu         $xb0,0x20($out)
2338         vmovdqu         $xc0,0x40($out)
2339         vmovdqu         $xd0,0x60($out)
2340         vmovdqu         $xa1,0x80($out)
2341         vmovdqu         $xb1,0xa0($out)
2342         vmovdqu         $xc1,0xc0($out)
2343         vmovdqu         $xd1,0xe0($out)
2344         vmovdqu         $xa2,0x100($out)
2345         vmovdqu         $xb2,0x120($out)
2346         je              .Ldone8x
2347
2348         lea             0x140($inp),$inp        # inp+=64*5
2349         xor             %r10,%r10
2350         vmovdqa         $xc2,0x00(%rsp)
2351         lea             0x140($out),$out        # out+=64*5
2352         sub             \$320,$len              # len-=64*5
2353         vmovdqa         $xd2,0x20(%rsp)
2354         jmp             .Loop_tail8x
2355
2356 .align  32
2357 .L384_or_more8x:
2358         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2359         vpxor           0x20($inp),$xb0,$xb0
2360         vpxor           0x40($inp),$xc0,$xc0
2361         vpxor           0x60($inp),$xd0,$xd0
2362         vpxor           0x80($inp),$xa1,$xa1
2363         vpxor           0xa0($inp),$xb1,$xb1
2364         vpxor           0xc0($inp),$xc1,$xc1
2365         vpxor           0xe0($inp),$xd1,$xd1
2366         vpxor           0x100($inp),$xa2,$xa2
2367         vpxor           0x120($inp),$xb2,$xb2
2368         vpxor           0x140($inp),$xc2,$xc2
2369         vpxor           0x160($inp),$xd2,$xd2
2370         vmovdqu         $xa0,0x00($out)
2371         vmovdqu         $xb0,0x20($out)
2372         vmovdqu         $xc0,0x40($out)
2373         vmovdqu         $xd0,0x60($out)
2374         vmovdqu         $xa1,0x80($out)
2375         vmovdqu         $xb1,0xa0($out)
2376         vmovdqu         $xc1,0xc0($out)
2377         vmovdqu         $xd1,0xe0($out)
2378         vmovdqu         $xa2,0x100($out)
2379         vmovdqu         $xb2,0x120($out)
2380         vmovdqu         $xc2,0x140($out)
2381         vmovdqu         $xd2,0x160($out)
2382         je              .Ldone8x
2383
2384         lea             0x180($inp),$inp        # inp+=64*6
2385         xor             %r10,%r10
2386         vmovdqa         $xa3,0x00(%rsp)
2387         lea             0x180($out),$out        # out+=64*6
2388         sub             \$384,$len              # len-=64*6
2389         vmovdqa         $xb3,0x20(%rsp)
2390         jmp             .Loop_tail8x
2391
2392 .align  32
2393 .L448_or_more8x:
2394         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2395         vpxor           0x20($inp),$xb0,$xb0
2396         vpxor           0x40($inp),$xc0,$xc0
2397         vpxor           0x60($inp),$xd0,$xd0
2398         vpxor           0x80($inp),$xa1,$xa1
2399         vpxor           0xa0($inp),$xb1,$xb1
2400         vpxor           0xc0($inp),$xc1,$xc1
2401         vpxor           0xe0($inp),$xd1,$xd1
2402         vpxor           0x100($inp),$xa2,$xa2
2403         vpxor           0x120($inp),$xb2,$xb2
2404         vpxor           0x140($inp),$xc2,$xc2
2405         vpxor           0x160($inp),$xd2,$xd2
2406         vpxor           0x180($inp),$xa3,$xa3
2407         vpxor           0x1a0($inp),$xb3,$xb3
2408         vmovdqu         $xa0,0x00($out)
2409         vmovdqu         $xb0,0x20($out)
2410         vmovdqu         $xc0,0x40($out)
2411         vmovdqu         $xd0,0x60($out)
2412         vmovdqu         $xa1,0x80($out)
2413         vmovdqu         $xb1,0xa0($out)
2414         vmovdqu         $xc1,0xc0($out)
2415         vmovdqu         $xd1,0xe0($out)
2416         vmovdqu         $xa2,0x100($out)
2417         vmovdqu         $xb2,0x120($out)
2418         vmovdqu         $xc2,0x140($out)
2419         vmovdqu         $xd2,0x160($out)
2420         vmovdqu         $xa3,0x180($out)
2421         vmovdqu         $xb3,0x1a0($out)
2422         je              .Ldone8x
2423
2424         lea             0x1c0($inp),$inp        # inp+=64*7
2425         xor             %r10,%r10
2426         vmovdqa         $xc3,0x00(%rsp)
2427         lea             0x1c0($out),$out        # out+=64*7
2428         sub             \$448,$len              # len-=64*7
2429         vmovdqa         $xd3,0x20(%rsp)
2430
2431 .Loop_tail8x:
2432         movzb           ($inp,%r10),%eax
2433         movzb           (%rsp,%r10),%ecx
2434         lea             1(%r10),%r10
2435         xor             %ecx,%eax
2436         mov             %al,-1($out,%r10)
2437         dec             $len
2438         jnz             .Loop_tail8x
2439
2440 .Ldone8x:
2441         vzeroall
2442 ___
2443 $code.=<<___    if ($win64);
2444         movaps          -0xa8(%r9),%xmm6
2445         movaps          -0x98(%r9),%xmm7
2446         movaps          -0x88(%r9),%xmm8
2447         movaps          -0x78(%r9),%xmm9
2448         movaps          -0x68(%r9),%xmm10
2449         movaps          -0x58(%r9),%xmm11
2450         movaps          -0x48(%r9),%xmm12
2451         movaps          -0x38(%r9),%xmm13
2452         movaps          -0x28(%r9),%xmm14
2453         movaps          -0x18(%r9),%xmm15
2454 ___
2455 $code.=<<___;
2456         lea             (%r9),%rsp
2457 .cfi_def_cfa_register   %rsp
2458 .L8x_epilogue:
2459         ret
2460 .cfi_endproc
2461 .size   ChaCha20_8x,.-ChaCha20_8x
2462 ___
2463 }
2464
2465 ########################################################################
2466 # AVX512 code paths
2467 if ($avx>2) {
2468 # This one handles shorter inputs...
2469
2470 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2471 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2472
sub vpxord()            # size optimization
{
    # Stick with the shorter "vpxor" form whenever every operand allows
    # it; switch to the EVEX-only "vpxord" as soon as any operand is a
    # %zmm register or an extended (%[zy]mm16+) register, neither of
    # which plain vpxor can encode.
    my $needs_evex =
        grep { /%([zy])mm([0-9]+)/ && ($1 eq "z" || $2 >= 16) } @_;
    my $mnemonic = $needs_evex ? "vpxord" : "vpxor";

    # Arguments arrive (dst,src1,src2); AT&T syntax wants them reversed.
    $code .= "\t$mnemonic\t" . join(',', reverse @_) . "\n";
}
2485
sub AVX512ROUND {       # critical path is 14 "SIMD ticks" per round
    # Emit one ChaCha quarter-round over the whole-vector state
    # ($a,$b,$c,$d) as two structurally identical add/xor/rotate passes.
    # The rotate counts pair up as (16,12) then (8,7); AVX512's vprold
    # performs each rotate in a single instruction.
    foreach my $pair ([16, 12], [8, 7]) {
        my ($d_bits, $b_bits) = @$pair;

        &vpaddd ($a,$a,$b);             # a += b
        &vpxord ($d,$d,$a);             # d ^= a
        &vprold ($d,$d,$d_bits);        # d <<<= 16 (then 8)

        &vpaddd ($c,$c,$d);             # c += d
        &vpxord ($b,$b,$c);             # b ^= c
        &vprold ($b,$b,$b_bits);        # b <<<= 12 (then 7)
    }
}
2503
# Scratch below the saved stack pointer: 64 bytes that the byte-at-a-time
# tail loop reads back through %rsp, plus 32 bytes on Win64 to preserve the
# non-volatile %xmm6/%xmm7 (the extra 8 is alignment/bookkeeping slack).
my $xframe = $win64 ? 32+8 : 8;

# ChaCha20_avx512(out, inp, len, key, counter): AVX512F path for inputs of
# at most 512 bytes; longer inputs branch to ChaCha20_16x.  %r9 doubles as
# the frame pointer so %rsp can be restored with a single lea on exit.
$code.=<<___;
.type   ChaCha20_avx512,\@function,5
.align  32
ChaCha20_avx512:
.cfi_startproc
.LChaCha20_avx512:
        mov     %rsp,%r9                # frame pointer
.cfi_def_cfa_register   %r9
        cmp     \$512,$len
        ja      .LChaCha20_16x

        sub     \$64+$xframe,%rsp
___
# Win64 ABI: %xmm6 and %xmm7 are callee-saved, spill them above the frame.
$code.=<<___    if ($win64);
        movaps  %xmm6,-0x28(%r9)
        movaps  %xmm7,-0x18(%r9)
.Lavx512_body:
___
# Load the state broadcast across all four 128-bit lanes: sigma constants,
# the two key halves and the counter/nonce row.  .Lzeroz staggers the block
# counter per lane, and $fourz (.Lfourz) advances it in the outer loop --
# presumably by 4, one 64-byte block per lane (constants not in view).
$code.=<<___;
        vbroadcasti32x4 .Lsigma(%rip),$a
        vbroadcasti32x4 ($key),$b
        vbroadcasti32x4 16($key),$c
        vbroadcasti32x4 ($counter),$d

        vmovdqa32       $a,$a_
        vmovdqa32       $b,$b_
        vmovdqa32       $c,$c_
        vpaddd          .Lzeroz(%rip),$d,$d
        vmovdqa32       .Lfourz(%rip),$fourz
        mov             \$10,$counter   # reuse $counter
        vmovdqa32       $d,$d_
        jmp             .Loop_avx512

.align  16
.Loop_outer_avx512:
        vmovdqa32       $a_,$a
        vmovdqa32       $b_,$b
        vmovdqa32       $c_,$c
        vpaddd          $fourz,$d_,$d
        mov             \$10,$counter
        vmovdqa32       $d,$d_
        jmp             .Loop_avx512

.align  32
.Loop_avx512:
___
        # 10 iterations x 2 rounds = 20 ChaCha rounds.  vpshufd shuffles
        # 32-bit words within each 128-bit lane: the first set rotates
        # b/c/d so the second AVX512ROUND operates on the diagonals, the
        # second (inverse) set restores column order.
        &AVX512ROUND();
        &vpshufd        ($c,$c,0b01001110);
        &vpshufd        ($b,$b,0b00111001);
        &vpshufd        ($d,$d,0b10010011);

        &AVX512ROUND();
        &vpshufd        ($c,$c,0b01001110);
        &vpshufd        ($b,$b,0b10010011);
        &vpshufd        ($d,$d,0b00111001);

        &dec            ($counter);
        &jnz            (".Loop_avx512");
2564
2565 $code.=<<___;
2566         vpaddd          $a_,$a,$a
2567         vpaddd          $b_,$b,$b
2568         vpaddd          $c_,$c,$c
2569         vpaddd          $d_,$d,$d
2570
2571         sub             \$64,$len
2572         jb              .Ltail64_avx512
2573
2574         vpxor           0x00($inp),%x#$a,$t0    # xor with input
2575         vpxor           0x10($inp),%x#$b,$t1
2576         vpxor           0x20($inp),%x#$c,$t2
2577         vpxor           0x30($inp),%x#$d,$t3
2578         lea             0x40($inp),$inp         # inp+=64
2579
2580         vmovdqu         $t0,0x00($out)          # write output
2581         vmovdqu         $t1,0x10($out)
2582         vmovdqu         $t2,0x20($out)
2583         vmovdqu         $t3,0x30($out)
2584         lea             0x40($out),$out         # out+=64
2585
2586         jz              .Ldone_avx512
2587
2588         vextracti32x4   \$1,$a,$t0
2589         vextracti32x4   \$1,$b,$t1
2590         vextracti32x4   \$1,$c,$t2
2591         vextracti32x4   \$1,$d,$t3
2592
2593         sub             \$64,$len
2594         jb              .Ltail_avx512
2595
2596         vpxor           0x00($inp),$t0,$t0      # xor with input
2597         vpxor           0x10($inp),$t1,$t1
2598         vpxor           0x20($inp),$t2,$t2
2599         vpxor           0x30($inp),$t3,$t3
2600         lea             0x40($inp),$inp         # inp+=64
2601
2602         vmovdqu         $t0,0x00($out)          # write output
2603         vmovdqu         $t1,0x10($out)
2604         vmovdqu         $t2,0x20($out)
2605         vmovdqu         $t3,0x30($out)
2606         lea             0x40($out),$out         # out+=64
2607
2608         jz              .Ldone_avx512
2609
2610         vextracti32x4   \$2,$a,$t0
2611         vextracti32x4   \$2,$b,$t1
2612         vextracti32x4   \$2,$c,$t2
2613         vextracti32x4   \$2,$d,$t3
2614
2615         sub             \$64,$len
2616         jb              .Ltail_avx512
2617
2618         vpxor           0x00($inp),$t0,$t0      # xor with input
2619         vpxor           0x10($inp),$t1,$t1
2620         vpxor           0x20($inp),$t2,$t2
2621         vpxor           0x30($inp),$t3,$t3
2622         lea             0x40($inp),$inp         # inp+=64
2623
2624         vmovdqu         $t0,0x00($out)          # write output
2625         vmovdqu         $t1,0x10($out)
2626         vmovdqu         $t2,0x20($out)
2627         vmovdqu         $t3,0x30($out)
2628         lea             0x40($out),$out         # out+=64
2629
2630         jz              .Ldone_avx512
2631
2632         vextracti32x4   \$3,$a,$t0
2633         vextracti32x4   \$3,$b,$t1
2634         vextracti32x4   \$3,$c,$t2
2635         vextracti32x4   \$3,$d,$t3
2636
2637         sub             \$64,$len
2638         jb              .Ltail_avx512
2639
2640         vpxor           0x00($inp),$t0,$t0      # xor with input
2641         vpxor           0x10($inp),$t1,$t1
2642         vpxor           0x20($inp),$t2,$t2
2643         vpxor           0x30($inp),$t3,$t3
2644         lea             0x40($inp),$inp         # inp+=64
2645
2646         vmovdqu         $t0,0x00($out)          # write output
2647         vmovdqu         $t1,0x10($out)
2648         vmovdqu         $t2,0x20($out)
2649         vmovdqu         $t3,0x30($out)
2650         lea             0x40($out),$out         # out+=64
2651
2652         jnz             .Loop_outer_avx512
2653
2654         jmp             .Ldone_avx512
2655
2656 .align  16
2657 .Ltail64_avx512:
2658         vmovdqa         %x#$a,0x00(%rsp)
2659         vmovdqa         %x#$b,0x10(%rsp)
2660         vmovdqa         %x#$c,0x20(%rsp)
2661         vmovdqa         %x#$d,0x30(%rsp)
2662         add             \$64,$len
2663         jmp             .Loop_tail_avx512
2664
2665 .align  16
2666 .Ltail_avx512:
2667         vmovdqa         $t0,0x00(%rsp)
2668         vmovdqa         $t1,0x10(%rsp)
2669         vmovdqa         $t2,0x20(%rsp)
2670         vmovdqa         $t3,0x30(%rsp)
2671         add             \$64,$len
2672
2673 .Loop_tail_avx512:
2674         movzb           ($inp,$counter),%eax
2675         movzb           (%rsp,$counter),%ecx
2676         lea             1($counter),$counter
2677         xor             %ecx,%eax
2678         mov             %al,-1($out,$counter)
2679         dec             $len
2680         jnz             .Loop_tail_avx512
2681
2682         vmovdqu32       $a_,0x00(%rsp)
2683
2684 .Ldone_avx512:
2685         vzeroall
2686 ___
2687 $code.=<<___    if ($win64);
2688         movaps  -0x28(%r9),%xmm6
2689         movaps  -0x18(%r9),%xmm7
2690 ___
2691 $code.=<<___;
2692         lea     (%r9),%rsp
2693 .cfi_def_cfa_register   %rsp
2694 .Lavx512_epilogue:
2695         ret
2696 .cfi_endproc
2697 .size   ChaCha20_avx512,.-ChaCha20_avx512
2698 ___
2699
# The AVX512VL variant below reuses the same code template with 256-bit
# registers: rewrite every cached register name from %zmm to %ymm in place
# (map used for its s/// side effect on the aliased variables).
map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);

# ChaCha20_avx512vl(out, inp, len, key, counter): AVX512VL path for inputs
# of at most 128 bytes; longer inputs branch to ChaCha20_8xvl.
$code.=<<___;
.type   ChaCha20_avx512vl,\@function,5
.align  32
ChaCha20_avx512vl:
.cfi_startproc
.LChaCha20_avx512vl:
        mov     %rsp,%r9                # frame pointer
.cfi_def_cfa_register   %r9
        cmp     \$128,$len
        ja      .LChaCha20_8xvl

        sub     \$64+$xframe,%rsp
___
# Win64 ABI: preserve the callee-saved %xmm6/%xmm7.
$code.=<<___    if ($win64);
        movaps  %xmm6,-0x28(%r9)
        movaps  %xmm7,-0x18(%r9)
.Lavx512vl_body:
___
# Same broadcast setup as the 512-bit path, but with two 128-bit lanes per
# register; $fourz now holds .Ltwoy (the variable keeps its name from the
# zmm path) -- presumably a counter advance of 2, one per lane.
$code.=<<___;
        vbroadcasti128  .Lsigma(%rip),$a
        vbroadcasti128  ($key),$b
        vbroadcasti128  16($key),$c
        vbroadcasti128  ($counter),$d

        vmovdqa32       $a,$a_
        vmovdqa32       $b,$b_
        vmovdqa32       $c,$c_
        vpaddd          .Lzeroz(%rip),$d,$d
        vmovdqa32       .Ltwoy(%rip),$fourz
        mov             \$10,$counter   # reuse $counter
        vmovdqa32       $d,$d_
        jmp             .Loop_avx512vl

.align  16
.Loop_outer_avx512vl:
        vmovdqa32       $c_,$c
        vpaddd          $fourz,$d_,$d
        mov             \$10,$counter
        vmovdqa32       $d,$d_
        jmp             .Loop_avx512vl

.align  32
.Loop_avx512vl:
___
        # 10 x 2 = 20 rounds, same lane-rotation scheme as the avx512 path.
        # Note the slimmed-down .Loop_outer_avx512vl above: only $c_ and $d
        # are refreshed there -- the $a/$b reloads are scheduled further
        # down, just before the loop-back branch, to spread out the work.
        &AVX512ROUND();
        &vpshufd        ($c,$c,0b01001110);
        &vpshufd        ($b,$b,0b00111001);
        &vpshufd        ($d,$d,0b10010011);

        &AVX512ROUND();
        &vpshufd        ($c,$c,0b01001110);
        &vpshufd        ($b,$b,0b10010011);
        &vpshufd        ($d,$d,0b00111001);

        &dec            ($counter);
        &jnz            (".Loop_avx512vl");
2758
2759 $code.=<<___;
2760         vpaddd          $a_,$a,$a
2761         vpaddd          $b_,$b,$b
2762         vpaddd          $c_,$c,$c
2763         vpaddd          $d_,$d,$d
2764
2765         sub             \$64,$len
2766         jb              .Ltail64_avx512vl
2767
2768         vpxor           0x00($inp),%x#$a,$t0    # xor with input
2769         vpxor           0x10($inp),%x#$b,$t1
2770         vpxor           0x20($inp),%x#$c,$t2
2771         vpxor           0x30($inp),%x#$d,$t3
2772         lea             0x40($inp),$inp         # inp+=64
2773
2774         vmovdqu         $t0,0x00($out)          # write output
2775         vmovdqu         $t1,0x10($out)
2776         vmovdqu         $t2,0x20($out)
2777         vmovdqu         $t3,0x30($out)
2778         lea             0x40($out),$out         # out+=64
2779
2780         jz              .Ldone_avx512vl
2781
2782         vextracti128    \$1,$a,$t0
2783         vextracti128    \$1,$b,$t1
2784         vextracti128    \$1,$c,$t2
2785         vextracti128    \$1,$d,$t3
2786
2787         sub             \$64,$len
2788         jb              .Ltail_avx512vl
2789
2790         vpxor           0x00($inp),$t0,$t0      # xor with input
2791         vpxor           0x10($inp),$t1,$t1
2792         vpxor           0x20($inp),$t2,$t2
2793         vpxor           0x30($inp),$t3,$t3
2794         lea             0x40($inp),$inp         # inp+=64
2795
2796         vmovdqu         $t0,0x00($out)          # write output
2797         vmovdqu         $t1,0x10($out)
2798         vmovdqu         $t2,0x20($out)
2799         vmovdqu         $t3,0x30($out)
2800         lea             0x40($out),$out         # out+=64
2801
2802         vmovdqa32       $a_,$a
2803         vmovdqa32       $b_,$b
2804         jnz             .Loop_outer_avx512vl
2805
2806         jmp             .Ldone_avx512vl
2807
2808 .align  16
2809 .Ltail64_avx512vl:
2810         vmovdqa         %x#$a,0x00(%rsp)
2811         vmovdqa         %x#$b,0x10(%rsp)
2812         vmovdqa         %x#$c,0x20(%rsp)
2813         vmovdqa         %x#$d,0x30(%rsp)
2814         add             \$64,$len
2815         jmp             .Loop_tail_avx512vl
2816
2817 .align  16
2818 .Ltail_avx512vl:
2819         vmovdqa         $t0,0x00(%rsp)
2820         vmovdqa         $t1,0x10(%rsp)
2821         vmovdqa         $t2,0x20(%rsp)
2822         vmovdqa         $t3,0x30(%rsp)
2823         add             \$64,$len
2824
2825 .Loop_tail_avx512vl:
2826         movzb           ($inp,$counter),%eax
2827         movzb           (%rsp,$counter),%ecx
2828         lea             1($counter),$counter
2829         xor             %ecx,%eax
2830         mov             %al,-1($out,$counter)
2831         dec             $len
2832         jnz             .Loop_tail_avx512vl
2833
2834         vmovdqu32       $a_,0x00(%rsp)
2835         vmovdqu32       $a_,0x20(%rsp)
2836
2837 .Ldone_avx512vl:
2838         vzeroall
2839 ___
2840 $code.=<<___    if ($win64);
2841         movaps  -0x28(%r9),%xmm6
2842         movaps  -0x18(%r9),%xmm7
2843 ___
2844 $code.=<<___;
2845         lea     (%r9),%rsp
2846 .cfi_def_cfa_register   %rsp
2847 .Lavx512vl_epilogue:
2848         ret
2849 .cfi_endproc
2850 .size   ChaCha20_avx512vl,.-ChaCha20_avx512vl
2851 ___
2852 }
2853 if ($avx>2) {
2854 # This one handles longer inputs...
2855
# 16x state layout: one %zmm register per ChaCha state word, each holding
# that word for 16 independent blocks (lane i = block i).
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
         $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# %zmm16..31 cache the expanded key/counter material; the first four also
# double as temporaries ($xt0..$xt3) during the post-round transposition.
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

# Returns a list of code strings -- one instruction per string -- for four
# interleaved ChaCha quarter-rounds (columns Q1..Q4) over @xx.  The caller
# evals the strings inside .Loop16x, keeping the four independent
# quarter-rounds interleaved for instruction-level parallelism.
sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
# ($_&~3)+(($_+1)&3) rotates an index within its aligned group of four,
# deriving the register indices for the other three quarter-rounds.
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Wrap the register names in literal quotes so they interpolate as quoted
# strings inside the deferred-eval code fragments below.
my @x=map("\"$_\"",@xx);

        (
        "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
         "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
          "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
           "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
        "&vpxord        (@x[$d0],@x[$d0],@x[$a0])",
         "&vpxord       (@x[$d1],@x[$d1],@x[$a1])",
          "&vpxord      (@x[$d2],@x[$d2],@x[$a2])",
           "&vpxord     (@x[$d3],@x[$d3],@x[$a3])",
        "&vprold        (@x[$d0],@x[$d0],16)",
         "&vprold       (@x[$d1],@x[$d1],16)",
          "&vprold      (@x[$d2],@x[$d2],16)",
           "&vprold     (@x[$d3],@x[$d3],16)",

        "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
         "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
          "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
           "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
        "&vpxord        (@x[$b0],@x[$b0],@x[$c0])",
         "&vpxord       (@x[$b1],@x[$b1],@x[$c1])",
          "&vpxord      (@x[$b2],@x[$b2],@x[$c2])",
           "&vpxord     (@x[$b3],@x[$b3],@x[$c3])",
        "&vprold        (@x[$b0],@x[$b0],12)",
         "&vprold       (@x[$b1],@x[$b1],12)",
          "&vprold      (@x[$b2],@x[$b2],12)",
           "&vprold     (@x[$b3],@x[$b3],12)",

        "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
         "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
          "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
           "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
        "&vpxord        (@x[$d0],@x[$d0],@x[$a0])",
         "&vpxord       (@x[$d1],@x[$d1],@x[$a1])",
          "&vpxord      (@x[$d2],@x[$d2],@x[$a2])",
           "&vpxord     (@x[$d3],@x[$d3],@x[$a3])",
        "&vprold        (@x[$d0],@x[$d0],8)",
         "&vprold       (@x[$d1],@x[$d1],8)",
          "&vprold      (@x[$d2],@x[$d2],8)",
           "&vprold     (@x[$d3],@x[$d3],8)",

        "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
         "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
          "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
           "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
        "&vpxord        (@x[$b0],@x[$b0],@x[$c0])",
         "&vpxord       (@x[$b1],@x[$b1],@x[$c1])",
          "&vpxord      (@x[$b2],@x[$b2],@x[$c2])",
           "&vpxord     (@x[$b3],@x[$b3],@x[$c3])",
        "&vprold        (@x[$b0],@x[$b0],7)",
         "&vprold       (@x[$b1],@x[$b1],7)",
          "&vprold      (@x[$b2],@x[$b2],7)",
           "&vprold     (@x[$b3],@x[$b3],7)"
        );
}
2924
# Frame scratch for the 16x path: on Win64, 0xa8 bytes cover the ten
# callee-saved %xmm6..%xmm15 spill slots at -0xa8(%r9)..-0x18(%r9)
# (10 x 16 bytes plus 8 of alignment slack); elsewhere just 8 bytes.
my $xframe = $win64 ? 0xa8 : 8;
2926
2927 $code.=<<___;
2928 .type   ChaCha20_16x,\@function,5
2929 .align  32
2930 ChaCha20_16x:
2931 .cfi_startproc
2932 .LChaCha20_16x:
2933         mov             %rsp,%r9                # frame register
2934 .cfi_def_cfa_register   %r9
2935         sub             \$64+$xframe,%rsp
2936         and             \$-64,%rsp
2937 ___
2938 $code.=<<___    if ($win64);
2939         movaps          %xmm6,-0xa8(%r9)
2940         movaps          %xmm7,-0x98(%r9)
2941         movaps          %xmm8,-0x88(%r9)
2942         movaps          %xmm9,-0x78(%r9)
2943         movaps          %xmm10,-0x68(%r9)
2944         movaps          %xmm11,-0x58(%r9)
2945         movaps          %xmm12,-0x48(%r9)
2946         movaps          %xmm13,-0x38(%r9)
2947         movaps          %xmm14,-0x28(%r9)
2948         movaps          %xmm15,-0x18(%r9)
2949 .L16x_body:
2950 ___
2951 $code.=<<___;
2952         vzeroupper
2953
2954         lea             .Lsigma(%rip),%r10
2955         vbroadcasti32x4 (%r10),$xa3             # key[0]
2956         vbroadcasti32x4 ($key),$xb3             # key[1]
2957         vbroadcasti32x4 16($key),$xc3           # key[2]
2958         vbroadcasti32x4 ($counter),$xd3         # key[3]
2959
2960         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
2961         vpshufd         \$0x55,$xa3,$xa1
2962         vpshufd         \$0xaa,$xa3,$xa2
2963         vpshufd         \$0xff,$xa3,$xa3
2964         vmovdqa64       $xa0,@key[0]
2965         vmovdqa64       $xa1,@key[1]
2966         vmovdqa64       $xa2,@key[2]
2967         vmovdqa64       $xa3,@key[3]
2968
2969         vpshufd         \$0x00,$xb3,$xb0
2970         vpshufd         \$0x55,$xb3,$xb1
2971         vpshufd         \$0xaa,$xb3,$xb2
2972         vpshufd         \$0xff,$xb3,$xb3
2973         vmovdqa64       $xb0,@key[4]
2974         vmovdqa64       $xb1,@key[5]
2975         vmovdqa64       $xb2,@key[6]
2976         vmovdqa64       $xb3,@key[7]
2977
2978         vpshufd         \$0x00,$xc3,$xc0
2979         vpshufd         \$0x55,$xc3,$xc1
2980         vpshufd         \$0xaa,$xc3,$xc2
2981         vpshufd         \$0xff,$xc3,$xc3
2982         vmovdqa64       $xc0,@key[8]
2983         vmovdqa64       $xc1,@key[9]
2984         vmovdqa64       $xc2,@key[10]
2985         vmovdqa64       $xc3,@key[11]
2986
2987         vpshufd         \$0x00,$xd3,$xd0
2988         vpshufd         \$0x55,$xd3,$xd1
2989         vpshufd         \$0xaa,$xd3,$xd2
2990         vpshufd         \$0xff,$xd3,$xd3
2991         vpaddd          .Lincz(%rip),$xd0,$xd0  # don't save counters yet
2992         vmovdqa64       $xd0,@key[12]
2993         vmovdqa64       $xd1,@key[13]
2994         vmovdqa64       $xd2,@key[14]
2995         vmovdqa64       $xd3,@key[15]
2996
2997         mov             \$10,%eax
2998         jmp             .Loop16x
2999
3000 .align  32
3001 .Loop_outer16x:
3002         vpbroadcastd    0(%r10),$xa0            # reload key
3003         vpbroadcastd    4(%r10),$xa1
3004         vpbroadcastd    8(%r10),$xa2
3005         vpbroadcastd    12(%r10),$xa3
3006         vpaddd          .Lsixteen(%rip),@key[12],@key[12]       # next SIMD counters
3007         vmovdqa64       @key[4],$xb0
3008         vmovdqa64       @key[5],$xb1
3009         vmovdqa64       @key[6],$xb2
3010         vmovdqa64       @key[7],$xb3
3011         vmovdqa64       @key[8],$xc0
3012         vmovdqa64       @key[9],$xc1
3013         vmovdqa64       @key[10],$xc2
3014         vmovdqa64       @key[11],$xc3
3015         vmovdqa64       @key[12],$xd0
3016         vmovdqa64       @key[13],$xd1
3017         vmovdqa64       @key[14],$xd2
3018         vmovdqa64       @key[15],$xd3
3019
3020         vmovdqa64       $xa0,@key[0]
3021         vmovdqa64       $xa1,@key[1]
3022         vmovdqa64       $xa2,@key[2]
3023         vmovdqa64       $xa3,@key[3]
3024
3025         mov             \$10,%eax
3026         jmp             .Loop16x
3027
3028 .align  32
3029 .Loop16x:
3030 ___
3031         foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3032         foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3033 $code.=<<___;
3034         dec             %eax
3035         jnz             .Loop16x
3036
3037         vpaddd          @key[0],$xa0,$xa0       # accumulate key
3038         vpaddd          @key[1],$xa1,$xa1
3039         vpaddd          @key[2],$xa2,$xa2
3040         vpaddd          @key[3],$xa3,$xa3
3041
3042         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
3043         vpunpckldq      $xa3,$xa2,$xt3
3044         vpunpckhdq      $xa1,$xa0,$xa0
3045         vpunpckhdq      $xa3,$xa2,$xa2
3046         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
3047         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
3048         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
3049         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
3050 ___
3051         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3052 $code.=<<___;
3053         vpaddd          @key[4],$xb0,$xb0
3054         vpaddd          @key[5],$xb1,$xb1
3055         vpaddd          @key[6],$xb2,$xb2
3056         vpaddd          @key[7],$xb3,$xb3
3057
3058         vpunpckldq      $xb1,$xb0,$xt2
3059         vpunpckldq      $xb3,$xb2,$xt3
3060         vpunpckhdq      $xb1,$xb0,$xb0
3061         vpunpckhdq      $xb3,$xb2,$xb2
3062         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
3063         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
3064         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
3065         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
3066 ___
3067         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3068 $code.=<<___;
3069         vshufi32x4      \$0x44,$xb0,$xa0,$xt3   # "de-interlace" further
3070         vshufi32x4      \$0xee,$xb0,$xa0,$xb0
3071         vshufi32x4      \$0x44,$xb1,$xa1,$xa0
3072         vshufi32x4      \$0xee,$xb1,$xa1,$xb1
3073         vshufi32x4      \$0x44,$xb2,$xa2,$xa1
3074         vshufi32x4      \$0xee,$xb2,$xa2,$xb2
3075         vshufi32x4      \$0x44,$xb3,$xa3,$xa2
3076         vshufi32x4      \$0xee,$xb3,$xa3,$xb3
3077 ___
3078         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3079 $code.=<<___;
3080         vpaddd          @key[8],$xc0,$xc0
3081         vpaddd          @key[9],$xc1,$xc1
3082         vpaddd          @key[10],$xc2,$xc2
3083         vpaddd          @key[11],$xc3,$xc3
3084
3085         vpunpckldq      $xc1,$xc0,$xt2
3086         vpunpckldq      $xc3,$xc2,$xt3
3087         vpunpckhdq      $xc1,$xc0,$xc0
3088         vpunpckhdq      $xc3,$xc2,$xc2
3089         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
3090         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
3091         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
3092         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
3093 ___
3094         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3095 $code.=<<___;
3096         vpaddd          @key[12],$xd0,$xd0
3097         vpaddd          @key[13],$xd1,$xd1
3098         vpaddd          @key[14],$xd2,$xd2
3099         vpaddd          @key[15],$xd3,$xd3
3100
3101         vpunpckldq      $xd1,$xd0,$xt2
3102         vpunpckldq      $xd3,$xd2,$xt3
3103         vpunpckhdq      $xd1,$xd0,$xd0
3104         vpunpckhdq      $xd3,$xd2,$xd2
3105         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
3106         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
3107         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
3108         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
3109 ___
3110         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3111 $code.=<<___;
3112         vshufi32x4      \$0x44,$xd0,$xc0,$xt3   # "de-interlace" further
3113         vshufi32x4      \$0xee,$xd0,$xc0,$xd0
3114         vshufi32x4      \$0x44,$xd1,$xc1,$xc0
3115         vshufi32x4      \$0xee,$xd1,$xc1,$xd1
3116         vshufi32x4      \$0x44,$xd2,$xc2,$xc1
3117         vshufi32x4      \$0xee,$xd2,$xc2,$xd2
3118         vshufi32x4      \$0x44,$xd3,$xc3,$xc2
3119         vshufi32x4      \$0xee,$xd3,$xc3,$xd3
3120 ___
3121         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3122 $code.=<<___;
3123         vshufi32x4      \$0x88,$xc0,$xa0,$xt0   # "de-interlace" further
3124         vshufi32x4      \$0xdd,$xc0,$xa0,$xa0
3125          vshufi32x4     \$0x88,$xd0,$xb0,$xc0
3126          vshufi32x4     \$0xdd,$xd0,$xb0,$xd0
3127         vshufi32x4      \$0x88,$xc1,$xa1,$xt1
3128         vshufi32x4      \$0xdd,$xc1,$xa1,$xa1
3129          vshufi32x4     \$0x88,$xd1,$xb1,$xc1
3130          vshufi32x4     \$0xdd,$xd1,$xb1,$xd1
3131         vshufi32x4      \$0x88,$xc2,$xa2,$xt2
3132         vshufi32x4      \$0xdd,$xc2,$xa2,$xa2
3133          vshufi32x4     \$0x88,$xd2,$xb2,$xc2
3134          vshufi32x4     \$0xdd,$xd2,$xb2,$xd2
3135         vshufi32x4      \$0x88,$xc3,$xa3,$xt3
3136         vshufi32x4      \$0xdd,$xc3,$xa3,$xa3
3137          vshufi32x4     \$0x88,$xd3,$xb3,$xc3
3138          vshufi32x4     \$0xdd,$xd3,$xb3,$xd3
3139 ___
3140         ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
3141         ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
3142
3143         ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
3144          $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
3145         ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3146          $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3147 $code.=<<___;
3148         cmp             \$64*16,$len
3149         jb              .Ltail16x
3150
3151         vpxord          0x00($inp),$xa0,$xa0    # xor with input
3152         vpxord          0x40($inp),$xb0,$xb0
3153         vpxord          0x80($inp),$xc0,$xc0
3154         vpxord          0xc0($inp),$xd0,$xd0
3155         vmovdqu32       $xa0,0x00($out)
3156         vmovdqu32       $xb0,0x40($out)
3157         vmovdqu32       $xc0,0x80($out)
3158         vmovdqu32       $xd0,0xc0($out)
3159
3160         vpxord          0x100($inp),$xa1,$xa1
3161         vpxord          0x140($inp),$xb1,$xb1
3162         vpxord          0x180($inp),$xc1,$xc1
3163         vpxord          0x1c0($inp),$xd1,$xd1
3164         vmovdqu32       $xa1,0x100($out)
3165         vmovdqu32       $xb1,0x140($out)
3166         vmovdqu32       $xc1,0x180($out)
3167         vmovdqu32       $xd1,0x1c0($out)
3168
3169         vpxord          0x200($inp),$xa2,$xa2
3170         vpxord          0x240($inp),$xb2,$xb2
3171         vpxord          0x280($inp),$xc2,$xc2
3172         vpxord          0x2c0($inp),$xd2,$xd2
3173         vmovdqu32       $xa2,0x200($out)
3174         vmovdqu32       $xb2,0x240($out)
3175         vmovdqu32       $xc2,0x280($out)
3176         vmovdqu32       $xd2,0x2c0($out)
3177
3178         vpxord          0x300($inp),$xa3,$xa3
3179         vpxord          0x340($inp),$xb3,$xb3
3180         vpxord          0x380($inp),$xc3,$xc3
3181         vpxord          0x3c0($inp),$xd3,$xd3
3182         lea             0x400($inp),$inp
3183         vmovdqu32       $xa3,0x300($out)
3184         vmovdqu32       $xb3,0x340($out)
3185         vmovdqu32       $xc3,0x380($out)
3186         vmovdqu32       $xd3,0x3c0($out)
3187         lea             0x400($out),$out
3188
3189         sub             \$64*16,$len
3190         jnz             .Loop_outer16x
3191
3192         jmp             .Ldone16x
3193
3194 .align  32
3195 .Ltail16x:
3196         xor             %r10,%r10
3197         sub             $inp,$out
3198         cmp             \$64*1,$len
3199         jb              .Less_than_64_16x
3200         vpxord          ($inp),$xa0,$xa0        # xor with input
3201         vmovdqu32       $xa0,($out,$inp)
3202         je              .Ldone16x
3203         vmovdqa32       $xb0,$xa0
3204         lea             64($inp),$inp
3205
3206         cmp             \$64*2,$len
3207         jb              .Less_than_64_16x
3208         vpxord          ($inp),$xb0,$xb0
3209         vmovdqu32       $xb0,($out,$inp)
3210         je              .Ldone16x
3211         vmovdqa32       $xc0,$xa0
3212         lea             64($inp),$inp
3213
3214         cmp             \$64*3,$len
3215         jb              .Less_than_64_16x
3216         vpxord          ($inp),$xc0,$xc0
3217         vmovdqu32       $xc0,($out,$inp)
3218         je              .Ldone16x
3219         vmovdqa32       $xd0,$xa0
3220         lea             64($inp),$inp
3221
3222         cmp             \$64*4,$len
3223         jb              .Less_than_64_16x
3224         vpxord          ($inp),$xd0,$xd0
3225         vmovdqu32       $xd0,($out,$inp)
3226         je              .Ldone16x
3227         vmovdqa32       $xa1,$xa0
3228         lea             64($inp),$inp
3229
3230         cmp             \$64*5,$len
3231         jb              .Less_than_64_16x
3232         vpxord          ($inp),$xa1,$xa1
3233         vmovdqu32       $xa1,($out,$inp)
3234         je              .Ldone16x
3235         vmovdqa32       $xb1,$xa0
3236         lea             64($inp),$inp
3237
3238         cmp             \$64*6,$len
3239         jb              .Less_than_64_16x
3240         vpxord          ($inp),$xb1,$xb1
3241         vmovdqu32       $xb1,($out,$inp)
3242         je              .Ldone16x
3243         vmovdqa32       $xc1,$xa0
3244         lea             64($inp),$inp
3245
3246         cmp             \$64*7,$len
3247         jb              .Less_than_64_16x
3248         vpxord          ($inp),$xc1,$xc1
3249         vmovdqu32       $xc1,($out,$inp)
3250         je              .Ldone16x
3251         vmovdqa32       $xd1,$xa0
3252         lea             64($inp),$inp
3253
3254         cmp             \$64*8,$len
3255         jb              .Less_than_64_16x
3256         vpxord          ($inp),$xd1,$xd1
3257         vmovdqu32       $xd1,($out,$inp)
3258         je              .Ldone16x
3259         vmovdqa32       $xa2,$xa0
3260         lea             64($inp),$inp
3261
3262         cmp             \$64*9,$len
3263         jb              .Less_than_64_16x
3264         vpxord          ($inp),$xa2,$xa2
3265         vmovdqu32       $xa2,($out,$inp)
3266         je              .Ldone16x
3267         vmovdqa32       $xb2,$xa0
3268         lea             64($inp),$inp
3269
3270         cmp             \$64*10,$len
3271         jb              .Less_than_64_16x
3272         vpxord          ($inp),$xb2,$xb2
3273         vmovdqu32       $xb2,($out,$inp)
3274         je              .Ldone16x
3275         vmovdqa32       $xc2,$xa0
3276         lea             64($inp),$inp
3277
3278         cmp             \$64*11,$len
3279         jb              .Less_than_64_16x
3280         vpxord          ($inp),$xc2,$xc2
3281         vmovdqu32       $xc2,($out,$inp)
3282         je              .Ldone16x
3283         vmovdqa32       $xd2,$xa0
3284         lea             64($inp),$inp
3285
3286         cmp             \$64*12,$len
3287         jb              .Less_than_64_16x
3288         vpxord          ($inp),$xd2,$xd2
3289         vmovdqu32       $xd2,($out,$inp)
3290         je              .Ldone16x
3291         vmovdqa32       $xa3,$xa0
3292         lea             64($inp),$inp
3293
3294         cmp             \$64*13,$len
3295         jb              .Less_than_64_16x
3296         vpxord          ($inp),$xa3,$xa3
3297         vmovdqu32       $xa3,($out,$inp)
3298         je              .Ldone16x
3299         vmovdqa32       $xb3,$xa0
3300         lea             64($inp),$inp
3301
3302         cmp             \$64*14,$len
3303         jb              .Less_than_64_16x
3304         vpxord          ($inp),$xb3,$xb3
3305         vmovdqu32       $xb3,($out,$inp)
3306         je              .Ldone16x
3307         vmovdqa32       $xc3,$xa0
3308         lea             64($inp),$inp
3309
3310         cmp             \$64*15,$len
3311         jb              .Less_than_64_16x
3312         vpxord          ($inp),$xc3,$xc3
3313         vmovdqu32       $xc3,($out,$inp)
3314         je              .Ldone16x
3315         vmovdqa32       $xd3,$xa0
3316         lea             64($inp),$inp
3317
3318 .Less_than_64_16x:
3319         vmovdqa32       $xa0,0x00(%rsp)
3320         lea             ($out,$inp),$out
3321         and             \$63,$len
3322
3323 .Loop_tail16x:
3324         movzb           ($inp,%r10),%eax
3325         movzb           (%rsp,%r10),%ecx
3326         lea             1(%r10),%r10
3327         xor             %ecx,%eax
3328         mov             %al,-1($out,%r10)
3329         dec             $len
3330         jnz             .Loop_tail16x
3331
3332         vpxord          $xa0,$xa0,$xa0
3333         vmovdqa32       $xa0,0(%rsp)
3334
3335 .Ldone16x:
3336         vzeroall
3337 ___
# Win64 ABI: xmm6-xmm15 are non-volatile; restore the copies the prologue
# spilled just below the saved stack pointer (%r9 is the frame register).
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
# Common epilogue: %r9 still holds the caller's %rsp, so a single lea tears
# down the 64-byte-aligned frame before returning.
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L16x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_16x,.-ChaCha20_16x
___

# Switch the register domain to 256-bit %ymm for the AVX512VL path below:
# ymm0-ymm15 carry the sixteen state lanes, ymm16-ymm31 the pre-smashed
# key material (the first four of which double as scratch registers).
@xx = map("%ymm$_", (0..15));
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @xx;
@key = map("%ymm$_", (16..31));
($xt0,$xt1,$xt2,$xt3) = @key[0..3];
# ChaCha20_8xvl: AVX512VL code path that keeps 8 independent 64-byte blocks
# of state in 256-bit %ymm registers.  Declared with 5 arguments (out, inp,
# len, key, counter) per the .type line.
$code.=<<___;
.type	ChaCha20_8xvl,\@function,5
.align	32
ChaCha20_8xvl:
.cfi_startproc
.LChaCha20_8xvl:
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub		\$64+$xframe,%rsp
	and		\$-64,%rsp		# 64-byte-align the frame
___
# Win64 ABI: spill the non-volatile xmm6-xmm15 below the frame pointer;
# the epilogue restores from the same -0xa8(%r9).. offsets.
$code.=<<___	if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L8xvl_body:
___
# Load sigma/key/counter, then "smash" each 32-bit word across all 8 lanes
# with vpshufd and park the 16 broadcast words in @key (ymm16-ymm31).
# .Lincy adds the per-lane block-counter offsets (presumably 0..7 — table is
# defined earlier in this file) so each lane encrypts a distinct block.
$code.=<<___;
	vzeroupper

	lea		.Lsigma(%rip),%r10
	vbroadcasti128	(%r10),$xa3		# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xc3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vpshufd		\$0xaa,$xa3,$xa2
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vpshufd		\$0xaa,$xb3,$xb2
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd		\$0x00,$xc3,$xc0
	vpshufd		\$0x55,$xc3,$xc1
	vpshufd		\$0xaa,$xc3,$xc2
	vpshufd		\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpshufd		\$0xaa,$xd3,$xd2
	vpshufd		\$0xff,$xd3,$xd3
	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	mov		\$10,%eax
	jmp		.Loop8xvl

.align	32
.Loop_outer8xvl:
	#vpbroadcastd	0(%r10),$xa0		# reload key
	#vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd		.Leight(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	mov		\$10,%eax
	jmp		.Loop8xvl

.align	32
.Loop8xvl:
___
	# Emit one ChaCha double round per loop iteration: the column round
	# (lanes 0,4,8,12) followed by the diagonal round (0,5,10,15).
	# AVX512_lane_ROUND (defined earlier in this file) returns perl
	# snippets which, when eval'ed, append the instructions to $code.
	# With %eax = 10 iterations that yields the standard 20 rounds.
	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
# Close the round loop, add the saved key material back into the state
# (ChaCha's final addition), then transpose the 8 interleaved blocks back
# into sequential byte order ("de-interlace"): 32-bit and 64-bit unpacks
# within 128-bit lanes first, then cross-lane 128-bit shuffles.
$code.=<<___;
	dec		%eax
	jnz		.Loop8xvl

	vpaddd		@key[0],$xa0,$xa0	# accumulate key
	vpaddd		@key[1],$xa1,$xa1
	vpaddd		@key[2],$xa2,$xa2
	vpaddd		@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	# Perl-level register rename tracking the unpack outputs "a0".."a3"
	# labelled in the assembly comments above.
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd		@key[4],$xb0,$xb0
	vpaddd		@key[5],$xb1,$xb1
	vpaddd		@key[6],$xb2,$xb2
	vpaddd		@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	# Same rename for the b-row.
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
# 128-bit lane shuffles pair each a-register with its b-counterpart
# (immediate 0 selects the low lanes of both sources, 3 the high lanes).
$code.=<<___;
	vshufi32x4	\$0,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$3,$xb0,$xa0,$xb0
	vshufi32x4	\$0,$xb1,$xa1,$xa0
	vshufi32x4	\$3,$xb1,$xa1,$xb1
	vshufi32x4	\$0,$xb2,$xa2,$xa1
	vshufi32x4	\$3,$xb2,$xa2,$xb2
	vshufi32x4	\$0,$xb3,$xa3,$xa2
	vshufi32x4	\$3,$xb3,$xa3,$xb3
___
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
# Pair c- and d-registers across 128-bit lanes (vperm2i128: 0x20 = low
# halves of both sources, 0x31 = high halves).
$code.=<<___;
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3
___
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	# Swap the b- and c-rows so the registers come out in the sequence
	# the store code below consumes (a,b,c,d per 128-byte group).
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
# Fast path: while at least 8 full blocks (512 bytes) remain, XOR the
# keystream with the input 128 bytes at a time and store, advancing both
# pointers via %rax (= 0x80) to keep instruction encodings short ("size
# optimization").  Otherwise fall through to the partial-output tail.
$code.=<<___;
	cmp		\$64*8,$len
	jb		.Ltail8xvl

	mov		\$0x80,%eax		# size optimization
	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vpxor		0x40($inp),$xc1,$xc1
	vpxor		0x60($inp),$xd1,$xd1
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu		$xa1,0x00($out)
	vmovdqu		$xb1,0x20($out)
	vmovdqu		$xc1,0x40($out)
	vmovdqu		$xd1,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxord		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vpxor		0x40($inp),$xc2,$xc2
	vpxor		0x60($inp),$xd2,$xd2
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa2,0x00($out)
	vmovdqu		$xb2,0x20($out)
	vmovdqu		$xc2,0x40($out)
	vmovdqu		$xd2,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vpxor		0x40($inp),$xc3,$xc3
	vpxor		0x60($inp),$xd3,$xd3
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu		$xa3,0x00($out)
	vmovdqu		$xb3,0x20($out)
	vmovdqu		$xc3,0x40($out)
	vmovdqu		$xd3,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpbroadcastd	0(%r10),%ymm0		# reload key
	vpbroadcastd	4(%r10),%ymm1

	sub		\$64*8,$len
	jnz		.Loop_outer8xvl

	jmp		.Ldone8xvl

.align	32
.Ltail8xvl:
	vmovdqa64	$xa0,%ymm8		# size optimization
___
# The tail entry just moved the a0 block into %ymm8 (freeing ymm0/ymm1 for
# the key reloads above); track that rename at the perl level too.
$xa0 = "%ymm8";
3626 $code.=<<___;
3627         xor             %r10,%r10
3628         sub             $inp,$out
3629         cmp             \$64*1,$len
3630         jb              .Less_than_64_8xvl
3631         vpxor           0x00($inp),$xa0,$xa0    # xor with input
3632         vpxor           0x20($inp),$xb0,$xb0
3633         vmovdqu         $xa0,0x00($out,$inp)
3634         vmovdqu         $xb0,0x20($out,$inp)
3635         je              .Ldone8xvl
3636         vmovdqa         $xc0,$xa0
3637         vmovdqa         $xd0,$xb0
3638         lea             64($inp),$inp
3639
3640         cmp             \$64*2,$len
3641         jb              .