1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # November 2014
18 #
19 # ChaCha20 for x86_64.
20 #
21 # December 2016
22 #
23 # Add AVX512F code path.
24 #
25 # Performance in cycles per byte out of large buffer.
26 #
27 #               IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     8xAVX2
28 #
29 # P4            9.48/+99%       -/22.7(ii)      -
30 # Core2         7.83/+55%       7.90/8.08       4.35
31 # Westmere      7.19/+50%       5.60/6.70       3.00
32 # Sandy Bridge  8.31/+42%       5.45/6.76       2.72
33 # Ivy Bridge    6.71/+46%       5.40/6.49       2.41
34 # Haswell       5.92/+43%       5.20/6.45       2.42        1.23
35 # Skylake       5.87/+39%       4.70/-          2.31        1.19
36 # Silvermont    12.0/+33%       7.75/7.40       7.03(iii)
37 # Goldmont      10.6/+17%       5.10/-          3.28
38 # Sledgehammer  7.28/+52%       -/14.2(ii)      -
39 # Bulldozer     9.66/+28%       9.85/11.1       3.06(iv)
40 # VIA Nano      10.5/+46%       6.72/8.60       6.05
41 #
42 # (i)   compared to older gcc 3.x one can observe >2x improvement on
43 #       most platforms;
44 # (ii)  as can be seen, SSE2 performance is too low on legacy
45 #       processors; NxSSE2 results are naturally better, but not
46 #       impressively better than IALU ones, which is why you won't
47 #       find SSE2 code below;
48 # (iii) this is not an optimal result for Atom because of MSROM
49 #       limitations; SSE2 can do better, but the gain is considered too
50 #       low to justify the [maintenance] effort;
51 # (iv)  Bulldozer actually executes the 4xXOP code path, which delivers 2.20.
52
53 $flavour = shift;
54 $output  = shift;
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
56
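# Usage sketch (an assumption based on the $flavour/$output handling above
# and the perlasm convention used elsewhere in OpenSSL; not prescriptive):
#
#       perl chacha-x86_64.pl elf  chacha-x86_64.s      # GNU as flavour
#       perl chacha-x86_64.pl nasm chacha-x86_64.asm    # Win64/NASM flavour
#
# (A single argument containing a '.' is taken to be the output file name.)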
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
58
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
63
64 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66         $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
67 }
68
69 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
71         $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
72         $avx += 1 if ($1==2.11 && $2>=8);
73 }
74
75 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
76            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
77         $avx = ($1>=10) + ($1>=11);
78 }
79
80 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
81         $avx = ($2>=3.0) + ($2>3.0);
82 }
83
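# Summary of how $avx is consumed below: any non-zero value enables the
# AVX/XOP branches, $avx>1 additionally enables the AVX2 (8x) dispatch and
# $avx>2 the AVX512F dispatch. It reflects assembler/toolchain capability
# as detected above; run-time CPU features are checked separately via
# OPENSSL_ia32cap_P.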
84 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
85 *STDOUT=*OUT;
86
87 # input parameter block
88 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
89
90 $code.=<<___;
91 .text
92
93 .extern OPENSSL_ia32cap_P
94
95 .align  64
96 .Lzero:
97 .long   0,0,0,0
98 .Lone:
99 .long   1,0,0,0
100 .Linc:
101 .long   0,1,2,3
102 .Lfour:
103 .long   4,4,4,4
104 .Lincy:
105 .long   0,2,4,6,1,3,5,7
106 .Leight:
107 .long   8,8,8,8,8,8,8,8
108 .Lrot16:
109 .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
110 .Lrot24:
111 .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
112 .Lsigma:
113 .asciz  "expand 32-byte k"
114 .align  64
115 .Lzeroz:
116 .long   0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
117 .Lfourz:
118 .long   4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
119 .Lincz:
120 .long   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
121 .Lsixteen:
122 .long   16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
123 .asciz  "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
124 ___
125
126 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
127 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
128   my $arg = pop;
129     $arg = "\$$arg" if ($arg*1 eq $arg);
130     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
131 }
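# For instance (illustrative of the thunk above): &add("%eax","%r8d") appends
#       add     %r8d,%eax
# to $code (arguments are emitted in reverse, AT&T src,dst order), while a
# numeric last argument gains a '$' prefix, so &rol("%r12d",16) appends
#       rol     $16,%r12d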
132
133 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
134     "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
135 @t=("%esi","%edi");
136
137 sub ROUND {                     # critical path is 24 cycles per round
138 my ($a0,$b0,$c0,$d0)=@_;
139 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
140 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
141 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
142 my ($xc,$xc_)=map("\"$_\"",@t);
143 my @x=map("\"$_\"",@x);
144
145         # Consider the order in which variables are addressed by their
146         # index:
147         #
148         #       a   b   c   d
149         #
150         #       0   4   8  12 < even round
151         #       1   5   9  13
152         #       2   6  10  14
153         #       3   7  11  15
154         #       0   5  10  15 < odd round
155         #       1   6  11  12
156         #       2   7   8  13
157         #       3   4   9  14
158         #
159         # 'a', 'b' and 'd' values are permanently allocated in registers,
160         # @x[0..7,12..15], while 'c' values are maintained in memory. If
161         # you observe the 'c' column, you'll notice that the pair of 'c's
162         # in use at the end of one round is the same pair needed at the
163         # start of the next, so 'c's have to be reloaded only once per
164         # round, in the middle. This is why you'll see a bunch of 'c'
165         # stores and loads in the middle, but none at the beginning or end.
166
167         # Normally instructions would be interleaved to favour in-order
168         # execution. Generally out-of-order cores manage it gracefully,
169         # but not this time for some reason. As in-order execution
170         # cores are a dying breed, with old Atom the only one around,
171         # instructions are left uninterleaved. Besides, Atom is better
172         # off executing 1xSSSE3 code anyway...
173
174         (
175         "&add   (@x[$a0],@x[$b0])",     # Q1
176         "&xor   (@x[$d0],@x[$a0])",
177         "&rol   (@x[$d0],16)",
178          "&add  (@x[$a1],@x[$b1])",     # Q2
179          "&xor  (@x[$d1],@x[$a1])",
180          "&rol  (@x[$d1],16)",
181
182         "&add   ($xc,@x[$d0])",
183         "&xor   (@x[$b0],$xc)",
184         "&rol   (@x[$b0],12)",
185          "&add  ($xc_,@x[$d1])",
186          "&xor  (@x[$b1],$xc_)",
187          "&rol  (@x[$b1],12)",
188
189         "&add   (@x[$a0],@x[$b0])",
190         "&xor   (@x[$d0],@x[$a0])",
191         "&rol   (@x[$d0],8)",
192          "&add  (@x[$a1],@x[$b1])",
193          "&xor  (@x[$d1],@x[$a1])",
194          "&rol  (@x[$d1],8)",
195
196         "&add   ($xc,@x[$d0])",
197         "&xor   (@x[$b0],$xc)",
198         "&rol   (@x[$b0],7)",
199          "&add  ($xc_,@x[$d1])",
200          "&xor  (@x[$b1],$xc_)",
201          "&rol  (@x[$b1],7)",
202
203         "&mov   (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
204          "&mov  (\"4*$c1(%rsp)\",$xc_)",
205         "&mov   ($xc,\"4*$c2(%rsp)\")",
206          "&mov  ($xc_,\"4*$c3(%rsp)\")",
207
208         "&add   (@x[$a2],@x[$b2])",     # Q3
209         "&xor   (@x[$d2],@x[$a2])",
210         "&rol   (@x[$d2],16)",
211          "&add  (@x[$a3],@x[$b3])",     # Q4
212          "&xor  (@x[$d3],@x[$a3])",
213          "&rol  (@x[$d3],16)",
214
215         "&add   ($xc,@x[$d2])",
216         "&xor   (@x[$b2],$xc)",
217         "&rol   (@x[$b2],12)",
218          "&add  ($xc_,@x[$d3])",
219          "&xor  (@x[$b3],$xc_)",
220          "&rol  (@x[$b3],12)",
221
222         "&add   (@x[$a2],@x[$b2])",
223         "&xor   (@x[$d2],@x[$a2])",
224         "&rol   (@x[$d2],8)",
225          "&add  (@x[$a3],@x[$b3])",
226          "&xor  (@x[$d3],@x[$a3])",
227          "&rol  (@x[$d3],8)",
228
229         "&add   ($xc,@x[$d2])",
230         "&xor   (@x[$b2],$xc)",
231         "&rol   (@x[$b2],7)",
232          "&add  ($xc_,@x[$d3])",
233          "&xor  (@x[$b3],$xc_)",
234          "&rol  (@x[$b3],7)"
235         );
236 }
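# The lists returned by ROUND above interleave four instances of the standard
# ChaCha20 quarter-round (rotations by 16, 12, 8 and 7). As a reference, here
# is a minimal scalar sketch of one quarter-round in plain Perl; it is purely
# illustrative, is not called anywhere, and assumes 64-bit-integer Perl:
sub quarter_round_ref {
        my ($a,$b,$c,$d) = @_;
        my $rotl32 = sub { (($_[0] << $_[1]) | ($_[0] >> (32 - $_[1]))) & 0xffffffff };

        $a = ($a + $b) & 0xffffffff;    $d = $rotl32->($d ^ $a, 16);
        $c = ($c + $d) & 0xffffffff;    $b = $rotl32->($b ^ $c, 12);
        $a = ($a + $b) & 0xffffffff;    $d = $rotl32->($d ^ $a,  8);
        $c = ($c + $d) & 0xffffffff;    $b = $rotl32->($b ^ $c,  7);

        return ($a,$b,$c,$d);
}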
237
238 ########################################################################
239 # Generic code path that handles all lengths on pre-SSSE3 processors.
240 $code.=<<___;
241 .globl  ChaCha20_ctr32
242 .type   ChaCha20_ctr32,\@function,5
243 .align  64
244 ChaCha20_ctr32:
245         cmp     \$0,$len
246         je      .Lno_data
247         mov     OPENSSL_ia32cap_P+4(%rip),%r10
248 ___
249 $code.=<<___    if ($avx>2);
250         bt      \$48,%r10               # check for AVX512F
251         jc      .LChaCha20_avx512
252 ___
253 $code.=<<___;
254         test    \$`1<<(41-32)`,%r10d
255         jnz     .LChaCha20_ssse3
256
257         push    %rbx
258         push    %rbp
259         push    %r12
260         push    %r13
261         push    %r14
262         push    %r15
263         sub     \$64+24,%rsp
264
265         #movdqa .Lsigma(%rip),%xmm0
266         movdqu  ($key),%xmm1
267         movdqu  16($key),%xmm2
268         movdqu  ($counter),%xmm3
269         movdqa  .Lone(%rip),%xmm4
270
271         #movdqa %xmm0,4*0(%rsp)         # key[0]
272         movdqa  %xmm1,4*4(%rsp)         # key[1]
273         movdqa  %xmm2,4*8(%rsp)         # key[2]
274         movdqa  %xmm3,4*12(%rsp)        # key[3]
275         mov     $len,%rbp               # reassign $len
276         jmp     .Loop_outer
277
278 .align  32
279 .Loop_outer:
280         mov     \$0x61707865,@x[0]      # 'expa'
281         mov     \$0x3320646e,@x[1]      # 'nd 3'
282         mov     \$0x79622d32,@x[2]      # '2-by'
283         mov     \$0x6b206574,@x[3]      # 'te k'
284         mov     4*4(%rsp),@x[4]
285         mov     4*5(%rsp),@x[5]
286         mov     4*6(%rsp),@x[6]
287         mov     4*7(%rsp),@x[7]
288         movd    %xmm3,@x[12]
289         mov     4*13(%rsp),@x[13]
290         mov     4*14(%rsp),@x[14]
291         mov     4*15(%rsp),@x[15]
292
293         mov     %rbp,64+0(%rsp)         # save len
294         mov     \$10,%ebp
295         mov     $inp,64+8(%rsp)         # save inp
296         movq    %xmm2,%rsi              # "@x[8]"
297         mov     $out,64+16(%rsp)        # save out
298         mov     %rsi,%rdi
299         shr     \$32,%rdi               # "@x[9]"
300         jmp     .Loop
301
302 .align  32
303 .Loop:
304 ___
305         foreach (&ROUND (0, 4, 8,12)) { eval; }
306         foreach (&ROUND (0, 5,10,15)) { eval; }
307         &dec    ("%ebp");
308         &jnz    (".Loop");
309
310 $code.=<<___;
311         mov     @t[1],4*9(%rsp)         # modulo-scheduled
312         mov     @t[0],4*8(%rsp)
313         mov     64(%rsp),%rbp           # load len
314         movdqa  %xmm2,%xmm1
315         mov     64+8(%rsp),$inp         # load inp
316         paddd   %xmm4,%xmm3             # increment counter
317         mov     64+16(%rsp),$out        # load out
318
319         add     \$0x61707865,@x[0]      # 'expa'
320         add     \$0x3320646e,@x[1]      # 'nd 3'
321         add     \$0x79622d32,@x[2]      # '2-by'
322         add     \$0x6b206574,@x[3]      # 'te k'
323         add     4*4(%rsp),@x[4]
324         add     4*5(%rsp),@x[5]
325         add     4*6(%rsp),@x[6]
326         add     4*7(%rsp),@x[7]
327         add     4*12(%rsp),@x[12]
328         add     4*13(%rsp),@x[13]
329         add     4*14(%rsp),@x[14]
330         add     4*15(%rsp),@x[15]
331         paddd   4*8(%rsp),%xmm1
332
333         cmp     \$64,%rbp
334         jb      .Ltail
335
336         xor     4*0($inp),@x[0]         # xor with input
337         xor     4*1($inp),@x[1]
338         xor     4*2($inp),@x[2]
339         xor     4*3($inp),@x[3]
340         xor     4*4($inp),@x[4]
341         xor     4*5($inp),@x[5]
342         xor     4*6($inp),@x[6]
343         xor     4*7($inp),@x[7]
344         movdqu  4*8($inp),%xmm0
345         xor     4*12($inp),@x[12]
346         xor     4*13($inp),@x[13]
347         xor     4*14($inp),@x[14]
348         xor     4*15($inp),@x[15]
349         lea     4*16($inp),$inp         # inp+=64
350         pxor    %xmm1,%xmm0
351
352         movdqa  %xmm2,4*8(%rsp)
353         movd    %xmm3,4*12(%rsp)
354
355         mov     @x[0],4*0($out)         # write output
356         mov     @x[1],4*1($out)
357         mov     @x[2],4*2($out)
358         mov     @x[3],4*3($out)
359         mov     @x[4],4*4($out)
360         mov     @x[5],4*5($out)
361         mov     @x[6],4*6($out)
362         mov     @x[7],4*7($out)
363         movdqu  %xmm0,4*8($out)
364         mov     @x[12],4*12($out)
365         mov     @x[13],4*13($out)
366         mov     @x[14],4*14($out)
367         mov     @x[15],4*15($out)
368         lea     4*16($out),$out         # out+=64
369
370         sub     \$64,%rbp
371         jnz     .Loop_outer
372
373         jmp     .Ldone
374
375 .align  16
376 .Ltail:
377         mov     @x[0],4*0(%rsp)
378         mov     @x[1],4*1(%rsp)
379         xor     %rbx,%rbx
380         mov     @x[2],4*2(%rsp)
381         mov     @x[3],4*3(%rsp)
382         mov     @x[4],4*4(%rsp)
383         mov     @x[5],4*5(%rsp)
384         mov     @x[6],4*6(%rsp)
385         mov     @x[7],4*7(%rsp)
386         movdqa  %xmm1,4*8(%rsp)
387         mov     @x[12],4*12(%rsp)
388         mov     @x[13],4*13(%rsp)
389         mov     @x[14],4*14(%rsp)
390         mov     @x[15],4*15(%rsp)
391
392 .Loop_tail:
393         movzb   ($inp,%rbx),%eax
394         movzb   (%rsp,%rbx),%edx
395         lea     1(%rbx),%rbx
396         xor     %edx,%eax
397         mov     %al,-1($out,%rbx)
398         dec     %rbp
399         jnz     .Loop_tail
400
401 .Ldone:
402         add     \$64+24,%rsp
403         pop     %r15
404         pop     %r14
405         pop     %r13
406         pop     %r12
407         pop     %rbp
408         pop     %rbx
409 .Lno_data:
410         ret
411 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
412 ___
413
414 ########################################################################
415 # SSSE3 code path that handles shorter lengths
416 {
417 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
418
419 sub SSSE3ROUND {        # critical path is 20 "SIMD ticks" per round
420         &paddd  ($a,$b);
421         &pxor   ($d,$a);
422         &pshufb ($d,$rot16);
423
424         &paddd  ($c,$d);
425         &pxor   ($b,$c);
426         &movdqa ($t,$b);
427         &psrld  ($b,20);
428         &pslld  ($t,12);
429         &por    ($b,$t);
430
431         &paddd  ($a,$b);
432         &pxor   ($d,$a);
433         &pshufb ($d,$rot24);
434
435         &paddd  ($c,$d);
436         &pxor   ($b,$c);
437         &movdqa ($t,$b);
438         &psrld  ($b,25);
439         &pslld  ($t,7);
440         &por    ($b,$t);
441 }
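# Note on the rotations above: the 16- and 8-bit rotations are byte-granular,
# so a single pshufb against the .Lrot16/.Lrot24 masks performs each of them
# (the .Lrot16 mask 2,3,0,1 swaps the 16-bit halves of every dword, and
# .Lrot24 performs the remaining 8-bit left rotation, i.e. a 24-bit right
# rotation, as a byte shuffle), while the 12- and 7-bit rotations still need
# the psrld/pslld/por triplets.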
442
443 my $xframe = $win64 ? 32+32+8 : 24;
444
445 $code.=<<___;
446 .type   ChaCha20_ssse3,\@function,5
447 .align  32
448 ChaCha20_ssse3:
449 .LChaCha20_ssse3:
450 ___
451 $code.=<<___    if ($avx);
452         test    \$`1<<(43-32)`,%r10d
453         jnz     .LChaCha20_4xop         # XOP is fastest even if we use 1/4
454 ___
455 $code.=<<___;
456         cmp     \$128,$len              # we might throw away some data,
457         ja      .LChaCha20_4x           # but overall it won't be slower
458
459 .Ldo_sse3_after_all:
460         push    %rbx                    # just to share SEH handler, no pops
461         push    %rbp
462         push    %r12
463         push    %r13
464         push    %r14
465         push    %r15
466
467         sub     \$64+$xframe,%rsp
468 ___
469 $code.=<<___    if ($win64);
470         movaps  %xmm6,64+32(%rsp)
471         movaps  %xmm7,64+48(%rsp)
472 ___
473 $code.=<<___;
474         movdqa  .Lsigma(%rip),$a
475         movdqu  ($key),$b
476         movdqu  16($key),$c
477         movdqu  ($counter),$d
478         movdqa  .Lrot16(%rip),$rot16
479         movdqa  .Lrot24(%rip),$rot24
480
481         movdqa  $a,0x00(%rsp)
482         movdqa  $b,0x10(%rsp)
483         movdqa  $c,0x20(%rsp)
484         movdqa  $d,0x30(%rsp)
485         mov     \$10,$counter           # reuse $counter
486         jmp     .Loop_ssse3
487
488 .align  32
489 .Loop_outer_ssse3:
490         movdqa  .Lone(%rip),$d
491         movdqa  0x00(%rsp),$a
492         movdqa  0x10(%rsp),$b
493         movdqa  0x20(%rsp),$c
494         paddd   0x30(%rsp),$d
495         mov     \$10,$counter
496         movdqa  $d,0x30(%rsp)
497         jmp     .Loop_ssse3
498
499 .align  32
500 .Loop_ssse3:
501 ___
502         &SSSE3ROUND();
503         &pshufd ($c,$c,0b01001110);
504         &pshufd ($b,$b,0b00111001);
505         &pshufd ($d,$d,0b10010011);
506         &nop    ();
507
508         &SSSE3ROUND();
509         &pshufd ($c,$c,0b01001110);
510         &pshufd ($b,$b,0b10010011);
511         &pshufd ($d,$d,0b00111001);
512
513         &dec    ($counter);
514         &jnz    (".Loop_ssse3");
515
516 $code.=<<___;
517         paddd   0x00(%rsp),$a
518         paddd   0x10(%rsp),$b
519         paddd   0x20(%rsp),$c
520         paddd   0x30(%rsp),$d
521
522         cmp     \$64,$len
523         jb      .Ltail_ssse3
524
525         movdqu  0x00($inp),$t
526         movdqu  0x10($inp),$t1
527         pxor    $t,$a                   # xor with input
528         movdqu  0x20($inp),$t
529         pxor    $t1,$b
530         movdqu  0x30($inp),$t1
531         lea     0x40($inp),$inp         # inp+=64
532         pxor    $t,$c
533         pxor    $t1,$d
534
535         movdqu  $a,0x00($out)           # write output
536         movdqu  $b,0x10($out)
537         movdqu  $c,0x20($out)
538         movdqu  $d,0x30($out)
539         lea     0x40($out),$out         # out+=64
540
541         sub     \$64,$len
542         jnz     .Loop_outer_ssse3
543
544         jmp     .Ldone_ssse3
545
546 .align  16
547 .Ltail_ssse3:
548         movdqa  $a,0x00(%rsp)
549         movdqa  $b,0x10(%rsp)
550         movdqa  $c,0x20(%rsp)
551         movdqa  $d,0x30(%rsp)
552         xor     $counter,$counter
553
554 .Loop_tail_ssse3:
555         movzb   ($inp,$counter),%eax
556         movzb   (%rsp,$counter),%ecx
557         lea     1($counter),$counter
558         xor     %ecx,%eax
559         mov     %al,-1($out,$counter)
560         dec     $len
561         jnz     .Loop_tail_ssse3
562
563 .Ldone_ssse3:
564 ___
565 $code.=<<___    if ($win64);
566         movaps  64+32(%rsp),%xmm6
567         movaps  64+48(%rsp),%xmm7
568 ___
569 $code.=<<___;
570         add     \$64+$xframe+48,%rsp
571         ret
572 .size   ChaCha20_ssse3,.-ChaCha20_ssse3
573 ___
574 }
575
576 ########################################################################
577 # SSSE3 code path that handles longer messages.
578 {
579 # assign variables to favor Atom front-end
580 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
581     $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
582 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
583         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
584
585 sub SSSE3_lane_ROUND {
586 my ($a0,$b0,$c0,$d0)=@_;
587 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
588 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
589 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
590 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
591 my @x=map("\"$_\"",@xx);
592
593         # Consider the order in which variables are addressed by their
594         # index:
595         #
596         #       a   b   c   d
597         #
598         #       0   4   8  12 < even round
599         #       1   5   9  13
600         #       2   6  10  14
601         #       3   7  11  15
602         #       0   5  10  15 < odd round
603         #       1   6  11  12
604         #       2   7   8  13
605         #       3   4   9  14
606         #
607         # 'a', 'b' and 'd' values are permanently allocated in registers,
608         # @x[0..7,12..15], while 'c' values are maintained in memory. If
609         # you observe the 'c' column, you'll notice that the pair of 'c's
610         # in use at the end of one round is the same pair needed at the
611         # start of the next, so 'c's have to be reloaded only once per
612         # round, in the middle. This is why you'll see a bunch of 'c'
613         # stores and loads in the middle, but none at the beginning or end.
614
615         (
616         "&paddd         (@x[$a0],@x[$b0])",     # Q1
617          "&paddd        (@x[$a1],@x[$b1])",     # Q2
618         "&pxor          (@x[$d0],@x[$a0])",
619          "&pxor         (@x[$d1],@x[$a1])",
620         "&pshufb        (@x[$d0],$t1)",
621          "&pshufb       (@x[$d1],$t1)",
622
623         "&paddd         ($xc,@x[$d0])",
624          "&paddd        ($xc_,@x[$d1])",
625         "&pxor          (@x[$b0],$xc)",
626          "&pxor         (@x[$b1],$xc_)",
627         "&movdqa        ($t0,@x[$b0])",
628         "&pslld         (@x[$b0],12)",
629         "&psrld         ($t0,20)",
630          "&movdqa       ($t1,@x[$b1])",
631          "&pslld        (@x[$b1],12)",
632         "&por           (@x[$b0],$t0)",
633          "&psrld        ($t1,20)",
634         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
635          "&por          (@x[$b1],$t1)",
636
637         "&paddd         (@x[$a0],@x[$b0])",
638          "&paddd        (@x[$a1],@x[$b1])",
639         "&pxor          (@x[$d0],@x[$a0])",
640          "&pxor         (@x[$d1],@x[$a1])",
641         "&pshufb        (@x[$d0],$t0)",
642          "&pshufb       (@x[$d1],$t0)",
643
644         "&paddd         ($xc,@x[$d0])",
645          "&paddd        ($xc_,@x[$d1])",
646         "&pxor          (@x[$b0],$xc)",
647          "&pxor         (@x[$b1],$xc_)",
648         "&movdqa        ($t1,@x[$b0])",
649         "&pslld         (@x[$b0],7)",
650         "&psrld         ($t1,25)",
651          "&movdqa       ($t0,@x[$b1])",
652          "&pslld        (@x[$b1],7)",
653         "&por           (@x[$b0],$t1)",
654          "&psrld        ($t0,25)",
655         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
656          "&por          (@x[$b1],$t0)",
657
658         "&movdqa        (\"`16*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
659          "&movdqa       (\"`16*($c1-8)`(%rsp)\",$xc_)",
660         "&movdqa        ($xc,\"`16*($c2-8)`(%rsp)\")",
661          "&movdqa       ($xc_,\"`16*($c3-8)`(%rsp)\")",
662
663         "&paddd         (@x[$a2],@x[$b2])",     # Q3
664          "&paddd        (@x[$a3],@x[$b3])",     # Q4
665         "&pxor          (@x[$d2],@x[$a2])",
666          "&pxor         (@x[$d3],@x[$a3])",
667         "&pshufb        (@x[$d2],$t1)",
668          "&pshufb       (@x[$d3],$t1)",
669
670         "&paddd         ($xc,@x[$d2])",
671          "&paddd        ($xc_,@x[$d3])",
672         "&pxor          (@x[$b2],$xc)",
673          "&pxor         (@x[$b3],$xc_)",
674         "&movdqa        ($t0,@x[$b2])",
675         "&pslld         (@x[$b2],12)",
676         "&psrld         ($t0,20)",
677          "&movdqa       ($t1,@x[$b3])",
678          "&pslld        (@x[$b3],12)",
679         "&por           (@x[$b2],$t0)",
680          "&psrld        ($t1,20)",
681         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
682          "&por          (@x[$b3],$t1)",
683
684         "&paddd         (@x[$a2],@x[$b2])",
685          "&paddd        (@x[$a3],@x[$b3])",
686         "&pxor          (@x[$d2],@x[$a2])",
687          "&pxor         (@x[$d3],@x[$a3])",
688         "&pshufb        (@x[$d2],$t0)",
689          "&pshufb       (@x[$d3],$t0)",
690
691         "&paddd         ($xc,@x[$d2])",
692          "&paddd        ($xc_,@x[$d3])",
693         "&pxor          (@x[$b2],$xc)",
694          "&pxor         (@x[$b3],$xc_)",
695         "&movdqa        ($t1,@x[$b2])",
696         "&pslld         (@x[$b2],7)",
697         "&psrld         ($t1,25)",
698          "&movdqa       ($t0,@x[$b3])",
699          "&pslld        (@x[$b3],7)",
700         "&por           (@x[$b2],$t1)",
701          "&psrld        ($t0,25)",
702         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
703          "&por          (@x[$b3],$t0)"
704         );
705 }
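# In this 4x path each xmm register carries the same state word from four
# independent 64-byte blocks, one per 32-bit lane; only the counters differ
# (.Linc at setup, .Lfour per outer iteration). Once the rounds are done, the
# punpck{l,h}dq/punpck{l,h}qdq sequences below transpose the lane-sliced
# registers back into four contiguous key-stream blocks before the xor with
# the input.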
706
707 my $xframe = $win64 ? 0xa0 : 0;
708
709 $code.=<<___;
710 .type   ChaCha20_4x,\@function,5
711 .align  32
712 ChaCha20_4x:
713 .LChaCha20_4x:
714         mov             %r10,%r11
715 ___
716 $code.=<<___    if ($avx>1);
717         shr             \$32,%r10               # OPENSSL_ia32cap_P+8
718         test            \$`1<<5`,%r10           # test AVX2
719         jnz             .LChaCha20_8x
720 ___
721 $code.=<<___;
722         cmp             \$192,$len
723         ja              .Lproceed4x
724
725         and             \$`1<<26|1<<22`,%r11    # isolate XSAVE+MOVBE
726         cmp             \$`1<<22`,%r11          # check for MOVBE without XSAVE
727         je              .Ldo_sse3_after_all     # to detect Atom
728
729 .Lproceed4x:
730         lea             -0x78(%rsp),%r11
731         sub             \$0x148+$xframe,%rsp
732 ___
733         ################ stack layout
734         # +0x00         SIMD equivalent of @x[8-12]
735         # ...
736         # +0x40         constant copy of key[0-2] smashed by lanes
737         # ...
738         # +0x100        SIMD counters (with nonce smashed by lanes)
739         # ...
740         # +0x140
741 $code.=<<___    if ($win64);
742         movaps          %xmm6,-0x30(%r11)
743         movaps          %xmm7,-0x20(%r11)
744         movaps          %xmm8,-0x10(%r11)
745         movaps          %xmm9,0x00(%r11)
746         movaps          %xmm10,0x10(%r11)
747         movaps          %xmm11,0x20(%r11)
748         movaps          %xmm12,0x30(%r11)
749         movaps          %xmm13,0x40(%r11)
750         movaps          %xmm14,0x50(%r11)
751         movaps          %xmm15,0x60(%r11)
752 ___
753 $code.=<<___;
754         movdqa          .Lsigma(%rip),$xa3      # key[0]
755         movdqu          ($key),$xb3             # key[1]
756         movdqu          16($key),$xt3           # key[2]
757         movdqu          ($counter),$xd3         # key[3]
758         lea             0x100(%rsp),%rcx        # size optimization
759         lea             .Lrot16(%rip),%r10
760         lea             .Lrot24(%rip),%r11
761
762         pshufd          \$0x00,$xa3,$xa0        # smash key by lanes...
763         pshufd          \$0x55,$xa3,$xa1
764         movdqa          $xa0,0x40(%rsp)         # ... and offload
765         pshufd          \$0xaa,$xa3,$xa2
766         movdqa          $xa1,0x50(%rsp)
767         pshufd          \$0xff,$xa3,$xa3
768         movdqa          $xa2,0x60(%rsp)
769         movdqa          $xa3,0x70(%rsp)
770
771         pshufd          \$0x00,$xb3,$xb0
772         pshufd          \$0x55,$xb3,$xb1
773         movdqa          $xb0,0x80-0x100(%rcx)
774         pshufd          \$0xaa,$xb3,$xb2
775         movdqa          $xb1,0x90-0x100(%rcx)
776         pshufd          \$0xff,$xb3,$xb3
777         movdqa          $xb2,0xa0-0x100(%rcx)
778         movdqa          $xb3,0xb0-0x100(%rcx)
779
780         pshufd          \$0x00,$xt3,$xt0        # "$xc0"
781         pshufd          \$0x55,$xt3,$xt1        # "$xc1"
782         movdqa          $xt0,0xc0-0x100(%rcx)
783         pshufd          \$0xaa,$xt3,$xt2        # "$xc2"
784         movdqa          $xt1,0xd0-0x100(%rcx)
785         pshufd          \$0xff,$xt3,$xt3        # "$xc3"
786         movdqa          $xt2,0xe0-0x100(%rcx)
787         movdqa          $xt3,0xf0-0x100(%rcx)
788
789         pshufd          \$0x00,$xd3,$xd0
790         pshufd          \$0x55,$xd3,$xd1
791         paddd           .Linc(%rip),$xd0        # don't save counters yet
792         pshufd          \$0xaa,$xd3,$xd2
793         movdqa          $xd1,0x110-0x100(%rcx)
794         pshufd          \$0xff,$xd3,$xd3
795         movdqa          $xd2,0x120-0x100(%rcx)
796         movdqa          $xd3,0x130-0x100(%rcx)
797
798         jmp             .Loop_enter4x
799
800 .align  32
801 .Loop_outer4x:
802         movdqa          0x40(%rsp),$xa0         # re-load smashed key
803         movdqa          0x50(%rsp),$xa1
804         movdqa          0x60(%rsp),$xa2
805         movdqa          0x70(%rsp),$xa3
806         movdqa          0x80-0x100(%rcx),$xb0
807         movdqa          0x90-0x100(%rcx),$xb1
808         movdqa          0xa0-0x100(%rcx),$xb2
809         movdqa          0xb0-0x100(%rcx),$xb3
810         movdqa          0xc0-0x100(%rcx),$xt0   # "$xc0"
811         movdqa          0xd0-0x100(%rcx),$xt1   # "$xc1"
812         movdqa          0xe0-0x100(%rcx),$xt2   # "$xc2"
813         movdqa          0xf0-0x100(%rcx),$xt3   # "$xc3"
814         movdqa          0x100-0x100(%rcx),$xd0
815         movdqa          0x110-0x100(%rcx),$xd1
816         movdqa          0x120-0x100(%rcx),$xd2
817         movdqa          0x130-0x100(%rcx),$xd3
818         paddd           .Lfour(%rip),$xd0       # next SIMD counters
819
820 .Loop_enter4x:
821         movdqa          $xt2,0x20(%rsp)         # SIMD equivalent of "@x[10]"
822         movdqa          $xt3,0x30(%rsp)         # SIMD equivalent of "@x[11]"
823         movdqa          (%r10),$xt3             # .Lrot16(%rip)
824         mov             \$10,%eax
825         movdqa          $xd0,0x100-0x100(%rcx)  # save SIMD counters
826         jmp             .Loop4x
827
828 .align  32
829 .Loop4x:
830 ___
831         foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
832         foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
833 $code.=<<___;
834         dec             %eax
835         jnz             .Loop4x
836
837         paddd           0x40(%rsp),$xa0         # accumulate key material
838         paddd           0x50(%rsp),$xa1
839         paddd           0x60(%rsp),$xa2
840         paddd           0x70(%rsp),$xa3
841
842         movdqa          $xa0,$xt2               # "de-interlace" data
843         punpckldq       $xa1,$xa0
844         movdqa          $xa2,$xt3
845         punpckldq       $xa3,$xa2
846         punpckhdq       $xa1,$xt2
847         punpckhdq       $xa3,$xt3
848         movdqa          $xa0,$xa1
849         punpcklqdq      $xa2,$xa0               # "a0"
850         movdqa          $xt2,$xa3
851         punpcklqdq      $xt3,$xt2               # "a2"
852         punpckhqdq      $xa2,$xa1               # "a1"
853         punpckhqdq      $xt3,$xa3               # "a3"
854 ___
855         ($xa2,$xt2)=($xt2,$xa2);
856 $code.=<<___;
857         paddd           0x80-0x100(%rcx),$xb0
858         paddd           0x90-0x100(%rcx),$xb1
859         paddd           0xa0-0x100(%rcx),$xb2
860         paddd           0xb0-0x100(%rcx),$xb3
861
862         movdqa          $xa0,0x00(%rsp)         # offload $xaN
863         movdqa          $xa1,0x10(%rsp)
864         movdqa          0x20(%rsp),$xa0         # "xc2"
865         movdqa          0x30(%rsp),$xa1         # "xc3"
866
867         movdqa          $xb0,$xt2
868         punpckldq       $xb1,$xb0
869         movdqa          $xb2,$xt3
870         punpckldq       $xb3,$xb2
871         punpckhdq       $xb1,$xt2
872         punpckhdq       $xb3,$xt3
873         movdqa          $xb0,$xb1
874         punpcklqdq      $xb2,$xb0               # "b0"
875         movdqa          $xt2,$xb3
876         punpcklqdq      $xt3,$xt2               # "b2"
877         punpckhqdq      $xb2,$xb1               # "b1"
878         punpckhqdq      $xt3,$xb3               # "b3"
879 ___
880         ($xb2,$xt2)=($xt2,$xb2);
881         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
882 $code.=<<___;
883         paddd           0xc0-0x100(%rcx),$xc0
884         paddd           0xd0-0x100(%rcx),$xc1
885         paddd           0xe0-0x100(%rcx),$xc2
886         paddd           0xf0-0x100(%rcx),$xc3
887
888         movdqa          $xa2,0x20(%rsp)         # keep offloading $xaN
889         movdqa          $xa3,0x30(%rsp)
890
891         movdqa          $xc0,$xt2
892         punpckldq       $xc1,$xc0
893         movdqa          $xc2,$xt3
894         punpckldq       $xc3,$xc2
895         punpckhdq       $xc1,$xt2
896         punpckhdq       $xc3,$xt3
897         movdqa          $xc0,$xc1
898         punpcklqdq      $xc2,$xc0               # "c0"
899         movdqa          $xt2,$xc3
900         punpcklqdq      $xt3,$xt2               # "c2"
901         punpckhqdq      $xc2,$xc1               # "c1"
902         punpckhqdq      $xt3,$xc3               # "c3"
903 ___
904         ($xc2,$xt2)=($xt2,$xc2);
905         ($xt0,$xt1)=($xa2,$xa3);                # use $xaN as temporary
906 $code.=<<___;
907         paddd           0x100-0x100(%rcx),$xd0
908         paddd           0x110-0x100(%rcx),$xd1
909         paddd           0x120-0x100(%rcx),$xd2
910         paddd           0x130-0x100(%rcx),$xd3
911
912         movdqa          $xd0,$xt2
913         punpckldq       $xd1,$xd0
914         movdqa          $xd2,$xt3
915         punpckldq       $xd3,$xd2
916         punpckhdq       $xd1,$xt2
917         punpckhdq       $xd3,$xt3
918         movdqa          $xd0,$xd1
919         punpcklqdq      $xd2,$xd0               # "d0"
920         movdqa          $xt2,$xd3
921         punpcklqdq      $xt3,$xt2               # "d2"
922         punpckhqdq      $xd2,$xd1               # "d1"
923         punpckhqdq      $xt3,$xd3               # "d3"
924 ___
925         ($xd2,$xt2)=($xt2,$xd2);
926 $code.=<<___;
927         cmp             \$64*4,$len
928         jb              .Ltail4x
929
930         movdqu          0x00($inp),$xt0         # xor with input
931         movdqu          0x10($inp),$xt1
932         movdqu          0x20($inp),$xt2
933         movdqu          0x30($inp),$xt3
934         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
935         pxor            $xb0,$xt1
936         pxor            $xc0,$xt2
937         pxor            $xd0,$xt3
938
939          movdqu         $xt0,0x00($out)
940         movdqu          0x40($inp),$xt0
941          movdqu         $xt1,0x10($out)
942         movdqu          0x50($inp),$xt1
943          movdqu         $xt2,0x20($out)
944         movdqu          0x60($inp),$xt2
945          movdqu         $xt3,0x30($out)
946         movdqu          0x70($inp),$xt3
947         lea             0x80($inp),$inp         # size optimization
948         pxor            0x10(%rsp),$xt0
949         pxor            $xb1,$xt1
950         pxor            $xc1,$xt2
951         pxor            $xd1,$xt3
952
953          movdqu         $xt0,0x40($out)
954         movdqu          0x00($inp),$xt0
955          movdqu         $xt1,0x50($out)
956         movdqu          0x10($inp),$xt1
957          movdqu         $xt2,0x60($out)
958         movdqu          0x20($inp),$xt2
959          movdqu         $xt3,0x70($out)
960          lea            0x80($out),$out         # size optimization
961         movdqu          0x30($inp),$xt3
962         pxor            0x20(%rsp),$xt0
963         pxor            $xb2,$xt1
964         pxor            $xc2,$xt2
965         pxor            $xd2,$xt3
966
967          movdqu         $xt0,0x00($out)
968         movdqu          0x40($inp),$xt0
969          movdqu         $xt1,0x10($out)
970         movdqu          0x50($inp),$xt1
971          movdqu         $xt2,0x20($out)
972         movdqu          0x60($inp),$xt2
973          movdqu         $xt3,0x30($out)
974         movdqu          0x70($inp),$xt3
975         lea             0x80($inp),$inp         # inp+=64*4
976         pxor            0x30(%rsp),$xt0
977         pxor            $xb3,$xt1
978         pxor            $xc3,$xt2
979         pxor            $xd3,$xt3
980         movdqu          $xt0,0x40($out)
981         movdqu          $xt1,0x50($out)
982         movdqu          $xt2,0x60($out)
983         movdqu          $xt3,0x70($out)
984         lea             0x80($out),$out         # out+=64*4
985
986         sub             \$64*4,$len
987         jnz             .Loop_outer4x
988
989         jmp             .Ldone4x
990
991 .Ltail4x:
992         cmp             \$192,$len
993         jae             .L192_or_more4x
994         cmp             \$128,$len
995         jae             .L128_or_more4x
996         cmp             \$64,$len
997         jae             .L64_or_more4x
998
999         #movdqa         0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1000         xor             %r10,%r10
1001         #movdqa         $xt0,0x00(%rsp)
1002         movdqa          $xb0,0x10(%rsp)
1003         movdqa          $xc0,0x20(%rsp)
1004         movdqa          $xd0,0x30(%rsp)
1005         jmp             .Loop_tail4x
1006
1007 .align  32
1008 .L64_or_more4x:
1009         movdqu          0x00($inp),$xt0         # xor with input
1010         movdqu          0x10($inp),$xt1
1011         movdqu          0x20($inp),$xt2
1012         movdqu          0x30($inp),$xt3
1013         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1014         pxor            $xb0,$xt1
1015         pxor            $xc0,$xt2
1016         pxor            $xd0,$xt3
1017         movdqu          $xt0,0x00($out)
1018         movdqu          $xt1,0x10($out)
1019         movdqu          $xt2,0x20($out)
1020         movdqu          $xt3,0x30($out)
1021         je              .Ldone4x
1022
1023         movdqa          0x10(%rsp),$xt0         # $xaN is offloaded, remember?
1024         lea             0x40($inp),$inp         # inp+=64*1
1025         xor             %r10,%r10
1026         movdqa          $xt0,0x00(%rsp)
1027         movdqa          $xb1,0x10(%rsp)
1028         lea             0x40($out),$out         # out+=64*1
1029         movdqa          $xc1,0x20(%rsp)
1030         sub             \$64,$len               # len-=64*1
1031         movdqa          $xd1,0x30(%rsp)
1032         jmp             .Loop_tail4x
1033
1034 .align  32
1035 .L128_or_more4x:
1036         movdqu          0x00($inp),$xt0         # xor with input
1037         movdqu          0x10($inp),$xt1
1038         movdqu          0x20($inp),$xt2
1039         movdqu          0x30($inp),$xt3
1040         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1041         pxor            $xb0,$xt1
1042         pxor            $xc0,$xt2
1043         pxor            $xd0,$xt3
1044
1045          movdqu         $xt0,0x00($out)
1046         movdqu          0x40($inp),$xt0
1047          movdqu         $xt1,0x10($out)
1048         movdqu          0x50($inp),$xt1
1049          movdqu         $xt2,0x20($out)
1050         movdqu          0x60($inp),$xt2
1051          movdqu         $xt3,0x30($out)
1052         movdqu          0x70($inp),$xt3
1053         pxor            0x10(%rsp),$xt0
1054         pxor            $xb1,$xt1
1055         pxor            $xc1,$xt2
1056         pxor            $xd1,$xt3
1057         movdqu          $xt0,0x40($out)
1058         movdqu          $xt1,0x50($out)
1059         movdqu          $xt2,0x60($out)
1060         movdqu          $xt3,0x70($out)
1061         je              .Ldone4x
1062
1063         movdqa          0x20(%rsp),$xt0         # $xaN is offloaded, remember?
1064         lea             0x80($inp),$inp         # inp+=64*2
1065         xor             %r10,%r10
1066         movdqa          $xt0,0x00(%rsp)
1067         movdqa          $xb2,0x10(%rsp)
1068         lea             0x80($out),$out         # out+=64*2
1069         movdqa          $xc2,0x20(%rsp)
1070         sub             \$128,$len              # len-=64*2
1071         movdqa          $xd2,0x30(%rsp)
1072         jmp             .Loop_tail4x
1073
1074 .align  32
1075 .L192_or_more4x:
1076         movdqu          0x00($inp),$xt0         # xor with input
1077         movdqu          0x10($inp),$xt1
1078         movdqu          0x20($inp),$xt2
1079         movdqu          0x30($inp),$xt3
1080         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1081         pxor            $xb0,$xt1
1082         pxor            $xc0,$xt2
1083         pxor            $xd0,$xt3
1084
1085          movdqu         $xt0,0x00($out)
1086         movdqu          0x40($inp),$xt0
1087          movdqu         $xt1,0x10($out)
1088         movdqu          0x50($inp),$xt1
1089          movdqu         $xt2,0x20($out)
1090         movdqu          0x60($inp),$xt2
1091          movdqu         $xt3,0x30($out)
1092         movdqu          0x70($inp),$xt3
1093         lea             0x80($inp),$inp         # size optimization
1094         pxor            0x10(%rsp),$xt0
1095         pxor            $xb1,$xt1
1096         pxor            $xc1,$xt2
1097         pxor            $xd1,$xt3
1098
1099          movdqu         $xt0,0x40($out)
1100         movdqu          0x00($inp),$xt0
1101          movdqu         $xt1,0x50($out)
1102         movdqu          0x10($inp),$xt1
1103          movdqu         $xt2,0x60($out)
1104         movdqu          0x20($inp),$xt2
1105          movdqu         $xt3,0x70($out)
1106          lea            0x80($out),$out         # size optimization
1107         movdqu          0x30($inp),$xt3
1108         pxor            0x20(%rsp),$xt0
1109         pxor            $xb2,$xt1
1110         pxor            $xc2,$xt2
1111         pxor            $xd2,$xt3
1112         movdqu          $xt0,0x00($out)
1113         movdqu          $xt1,0x10($out)
1114         movdqu          $xt2,0x20($out)
1115         movdqu          $xt3,0x30($out)
1116         je              .Ldone4x
1117
1118         movdqa          0x30(%rsp),$xt0         # $xaN is offloaded, remember?
1119         lea             0x40($inp),$inp         # inp+=64*3
1120         xor             %r10,%r10
1121         movdqa          $xt0,0x00(%rsp)
1122         movdqa          $xb3,0x10(%rsp)
1123         lea             0x40($out),$out         # out+=64*3
1124         movdqa          $xc3,0x20(%rsp)
1125         sub             \$192,$len              # len-=64*3
1126         movdqa          $xd3,0x30(%rsp)
1127
1128 .Loop_tail4x:
1129         movzb           ($inp,%r10),%eax
1130         movzb           (%rsp,%r10),%ecx
1131         lea             1(%r10),%r10
1132         xor             %ecx,%eax
1133         mov             %al,-1($out,%r10)
1134         dec             $len
1135         jnz             .Loop_tail4x
1136
1137 .Ldone4x:
1138 ___
1139 $code.=<<___    if ($win64);
1140         lea             0x140+0x30(%rsp),%r11
1141         movaps          -0x30(%r11),%xmm6
1142         movaps          -0x20(%r11),%xmm7
1143         movaps          -0x10(%r11),%xmm8
1144         movaps          0x00(%r11),%xmm9
1145         movaps          0x10(%r11),%xmm10
1146         movaps          0x20(%r11),%xmm11
1147         movaps          0x30(%r11),%xmm12
1148         movaps          0x40(%r11),%xmm13
1149         movaps          0x50(%r11),%xmm14
1150         movaps          0x60(%r11),%xmm15
1151 ___
1152 $code.=<<___;
1153         add             \$0x148+$xframe,%rsp
1154         ret
1155 .size   ChaCha20_4x,.-ChaCha20_4x
1156 ___
1157 }
1158
1159 ########################################################################
1160 # XOP code path that handles all lengths.
1161 if ($avx) {
1162 # There is some "anomaly" observed depending on instruction size or
1163 # alignment. If you look closely at the code below you'll notice that
1164 # the argument order sometimes varies. The order affects instruction
1165 # encoding by making it larger, and such fiddling gives a 5% performance
1166 # improvement. This was measured on an FX-4100...
1167
1168 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1169     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1170 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1171          $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1172
1173 sub XOP_lane_ROUND {
1174 my ($a0,$b0,$c0,$d0)=@_;
1175 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1176 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1177 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1178 my @x=map("\"$_\"",@xx);
1179
1180         (
1181         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1182          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1183           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1184            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1185         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1186          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1187           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1188            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1189         "&vprotd        (@x[$d0],@x[$d0],16)",
1190          "&vprotd       (@x[$d1],@x[$d1],16)",
1191           "&vprotd      (@x[$d2],@x[$d2],16)",
1192            "&vprotd     (@x[$d3],@x[$d3],16)",
1193
1194         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1195          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1196           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1197            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1198         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1199          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1200           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1201            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1202         "&vprotd        (@x[$b0],@x[$b0],12)",
1203          "&vprotd       (@x[$b1],@x[$b1],12)",
1204           "&vprotd      (@x[$b2],@x[$b2],12)",
1205            "&vprotd     (@x[$b3],@x[$b3],12)",
1206
1207         "&vpaddd        (@x[$a0],@x[$b0],@x[$a0])",     # flip
1208          "&vpaddd       (@x[$a1],@x[$b1],@x[$a1])",     # flip
1209           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
1210            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
1211         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1212          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1213           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1214            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1215         "&vprotd        (@x[$d0],@x[$d0],8)",
1216          "&vprotd       (@x[$d1],@x[$d1],8)",
1217           "&vprotd      (@x[$d2],@x[$d2],8)",
1218            "&vprotd     (@x[$d3],@x[$d3],8)",
1219
1220         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1221          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1222           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1223            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1224         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1225          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1226           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1227            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1228         "&vprotd        (@x[$b0],@x[$b0],7)",
1229          "&vprotd       (@x[$b1],@x[$b1],7)",
1230           "&vprotd      (@x[$b2],@x[$b2],7)",
1231            "&vprotd     (@x[$b3],@x[$b3],7)"
1232         );
1233 }
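# Unlike the SSSE3 path, XOP's vprotd rotates each dword in one instruction,
# so no pshufb masks or pslld/psrld/por pairs are needed here. The operand
# swaps marked "flip" only change instruction encoding, per the size/alignment
# note at the top of this block.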
1234
1235 my $xframe = $win64 ? 0xa0 : 0;
1236
1237 $code.=<<___;
1238 .type   ChaCha20_4xop,\@function,5
1239 .align  32
1240 ChaCha20_4xop:
1241 .LChaCha20_4xop:
1242         lea             -0x78(%rsp),%r11
1243         sub             \$0x148+$xframe,%rsp
1244 ___
1245         ################ stack layout
1246         # +0x00         SIMD equivalent of @x[8-12]
1247         # ...
1248         # +0x40         constant copy of key[0-2] smashed by lanes
1249         # ...
1250         # +0x100        SIMD counters (with nonce smashed by lanes)
1251         # ...
1252         # +0x140
1253 $code.=<<___    if ($win64);
1254         movaps          %xmm6,-0x30(%r11)
1255         movaps          %xmm7,-0x20(%r11)
1256         movaps          %xmm8,-0x10(%r11)
1257         movaps          %xmm9,0x00(%r11)
1258         movaps          %xmm10,0x10(%r11)
1259         movaps          %xmm11,0x20(%r11)
1260         movaps          %xmm12,0x30(%r11)
1261         movaps          %xmm13,0x40(%r11)
1262         movaps          %xmm14,0x50(%r11)
1263         movaps          %xmm15,0x60(%r11)
1264 ___
1265 $code.=<<___;
1266         vzeroupper
1267
1268         vmovdqa         .Lsigma(%rip),$xa3      # key[0]
1269         vmovdqu         ($key),$xb3             # key[1]
1270         vmovdqu         16($key),$xt3           # key[2]
1271         vmovdqu         ($counter),$xd3         # key[3]
1272         lea             0x100(%rsp),%rcx        # size optimization
1273
1274         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1275         vpshufd         \$0x55,$xa3,$xa1
1276         vmovdqa         $xa0,0x40(%rsp)         # ... and offload
1277         vpshufd         \$0xaa,$xa3,$xa2
1278         vmovdqa         $xa1,0x50(%rsp)
1279         vpshufd         \$0xff,$xa3,$xa3
1280         vmovdqa         $xa2,0x60(%rsp)
1281         vmovdqa         $xa3,0x70(%rsp)
1282
1283         vpshufd         \$0x00,$xb3,$xb0
1284         vpshufd         \$0x55,$xb3,$xb1
1285         vmovdqa         $xb0,0x80-0x100(%rcx)
1286         vpshufd         \$0xaa,$xb3,$xb2
1287         vmovdqa         $xb1,0x90-0x100(%rcx)
1288         vpshufd         \$0xff,$xb3,$xb3
1289         vmovdqa         $xb2,0xa0-0x100(%rcx)
1290         vmovdqa         $xb3,0xb0-0x100(%rcx)
1291
1292         vpshufd         \$0x00,$xt3,$xt0        # "$xc0"
1293         vpshufd         \$0x55,$xt3,$xt1        # "$xc1"
1294         vmovdqa         $xt0,0xc0-0x100(%rcx)
1295         vpshufd         \$0xaa,$xt3,$xt2        # "$xc2"
1296         vmovdqa         $xt1,0xd0-0x100(%rcx)
1297         vpshufd         \$0xff,$xt3,$xt3        # "$xc3"
1298         vmovdqa         $xt2,0xe0-0x100(%rcx)
1299         vmovdqa         $xt3,0xf0-0x100(%rcx)
1300
1301         vpshufd         \$0x00,$xd3,$xd0
1302         vpshufd         \$0x55,$xd3,$xd1
1303         vpaddd          .Linc(%rip),$xd0,$xd0   # don't save counters yet
1304         vpshufd         \$0xaa,$xd3,$xd2
1305         vmovdqa         $xd1,0x110-0x100(%rcx)
1306         vpshufd         \$0xff,$xd3,$xd3
1307         vmovdqa         $xd2,0x120-0x100(%rcx)
1308         vmovdqa         $xd3,0x130-0x100(%rcx)
1309
1310         jmp             .Loop_enter4xop
1311
1312 .align  32
1313 .Loop_outer4xop:
1314         vmovdqa         0x40(%rsp),$xa0         # re-load smashed key
1315         vmovdqa         0x50(%rsp),$xa1
1316         vmovdqa         0x60(%rsp),$xa2
1317         vmovdqa         0x70(%rsp),$xa3
1318         vmovdqa         0x80-0x100(%rcx),$xb0
1319         vmovdqa         0x90-0x100(%rcx),$xb1
1320         vmovdqa         0xa0-0x100(%rcx),$xb2
1321         vmovdqa         0xb0-0x100(%rcx),$xb3
1322         vmovdqa         0xc0-0x100(%rcx),$xt0   # "$xc0"
1323         vmovdqa         0xd0-0x100(%rcx),$xt1   # "$xc1"
1324         vmovdqa         0xe0-0x100(%rcx),$xt2   # "$xc2"
1325         vmovdqa         0xf0-0x100(%rcx),$xt3   # "$xc3"
1326         vmovdqa         0x100-0x100(%rcx),$xd0
1327         vmovdqa         0x110-0x100(%rcx),$xd1
1328         vmovdqa         0x120-0x100(%rcx),$xd2
1329         vmovdqa         0x130-0x100(%rcx),$xd3
1330         vpaddd          .Lfour(%rip),$xd0,$xd0  # next SIMD counters
1331
1332 .Loop_enter4xop:
1333         mov             \$10,%eax
1334         vmovdqa         $xd0,0x100-0x100(%rcx)  # save SIMD counters
1335         jmp             .Loop4xop
1336
1337 .align  32
1338 .Loop4xop:
1339 ___
1340         foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1341         foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1342 $code.=<<___;
1343         dec             %eax
1344         jnz             .Loop4xop
1345
1346         vpaddd          0x40(%rsp),$xa0,$xa0    # accumulate key material
1347         vpaddd          0x50(%rsp),$xa1,$xa1
1348         vpaddd          0x60(%rsp),$xa2,$xa2
1349         vpaddd          0x70(%rsp),$xa3,$xa3
1350
1351         vmovdqa         $xt2,0x20(%rsp)         # offload $xc2,3
1352         vmovdqa         $xt3,0x30(%rsp)
1353
1354         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1355         vpunpckldq      $xa3,$xa2,$xt3
1356         vpunpckhdq      $xa1,$xa0,$xa0
1357         vpunpckhdq      $xa3,$xa2,$xa2
1358         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1359         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1360         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1361         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1362 ___
1363         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1364 $code.=<<___;
1365         vpaddd          0x80-0x100(%rcx),$xb0,$xb0
1366         vpaddd          0x90-0x100(%rcx),$xb1,$xb1
1367         vpaddd          0xa0-0x100(%rcx),$xb2,$xb2
1368         vpaddd          0xb0-0x100(%rcx),$xb3,$xb3
1369
1370         vmovdqa         $xa0,0x00(%rsp)         # offload $xa0,1
1371         vmovdqa         $xa1,0x10(%rsp)
1372         vmovdqa         0x20(%rsp),$xa0         # "xc2"
1373         vmovdqa         0x30(%rsp),$xa1         # "xc3"
1374
1375         vpunpckldq      $xb1,$xb0,$xt2
1376         vpunpckldq      $xb3,$xb2,$xt3
1377         vpunpckhdq      $xb1,$xb0,$xb0
1378         vpunpckhdq      $xb3,$xb2,$xb2
1379         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1380         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1381         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1382         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1383 ___
1384         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1385         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1386 $code.=<<___;
1387         vpaddd          0xc0-0x100(%rcx),$xc0,$xc0
1388         vpaddd          0xd0-0x100(%rcx),$xc1,$xc1
1389         vpaddd          0xe0-0x100(%rcx),$xc2,$xc2
1390         vpaddd          0xf0-0x100(%rcx),$xc3,$xc3
1391
1392         vpunpckldq      $xc1,$xc0,$xt2
1393         vpunpckldq      $xc3,$xc2,$xt3
1394         vpunpckhdq      $xc1,$xc0,$xc0
1395         vpunpckhdq      $xc3,$xc2,$xc2
1396         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1397         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1398         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1399         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1400 ___
1401         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1402 $code.=<<___;
1403         vpaddd          0x100-0x100(%rcx),$xd0,$xd0
1404         vpaddd          0x110-0x100(%rcx),$xd1,$xd1
1405         vpaddd          0x120-0x100(%rcx),$xd2,$xd2
1406         vpaddd          0x130-0x100(%rcx),$xd3,$xd3
1407
1408         vpunpckldq      $xd1,$xd0,$xt2
1409         vpunpckldq      $xd3,$xd2,$xt3
1410         vpunpckhdq      $xd1,$xd0,$xd0
1411         vpunpckhdq      $xd3,$xd2,$xd2
1412         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1413         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1414         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1415         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1416 ___
1417         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1418         ($xa0,$xa1)=($xt2,$xt3);
1419 $code.=<<___;
1420         vmovdqa         0x00(%rsp),$xa0         # restore $xa0,1
1421         vmovdqa         0x10(%rsp),$xa1
1422
1423         cmp             \$64*4,$len
1424         jb              .Ltail4xop
1425
1426         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1427         vpxor           0x10($inp),$xb0,$xb0
1428         vpxor           0x20($inp),$xc0,$xc0
1429         vpxor           0x30($inp),$xd0,$xd0
1430         vpxor           0x40($inp),$xa1,$xa1
1431         vpxor           0x50($inp),$xb1,$xb1
1432         vpxor           0x60($inp),$xc1,$xc1
1433         vpxor           0x70($inp),$xd1,$xd1
1434         lea             0x80($inp),$inp         # size optimization
1435         vpxor           0x00($inp),$xa2,$xa2
1436         vpxor           0x10($inp),$xb2,$xb2
1437         vpxor           0x20($inp),$xc2,$xc2
1438         vpxor           0x30($inp),$xd2,$xd2
1439         vpxor           0x40($inp),$xa3,$xa3
1440         vpxor           0x50($inp),$xb3,$xb3
1441         vpxor           0x60($inp),$xc3,$xc3
1442         vpxor           0x70($inp),$xd3,$xd3
1443         lea             0x80($inp),$inp         # inp+=64*4
1444
1445         vmovdqu         $xa0,0x00($out)
1446         vmovdqu         $xb0,0x10($out)
1447         vmovdqu         $xc0,0x20($out)
1448         vmovdqu         $xd0,0x30($out)
1449         vmovdqu         $xa1,0x40($out)
1450         vmovdqu         $xb1,0x50($out)
1451         vmovdqu         $xc1,0x60($out)
1452         vmovdqu         $xd1,0x70($out)
1453         lea             0x80($out),$out         # size optimization
1454         vmovdqu         $xa2,0x00($out)
1455         vmovdqu         $xb2,0x10($out)
1456         vmovdqu         $xc2,0x20($out)
1457         vmovdqu         $xd2,0x30($out)
1458         vmovdqu         $xa3,0x40($out)
1459         vmovdqu         $xb3,0x50($out)
1460         vmovdqu         $xc3,0x60($out)
1461         vmovdqu         $xd3,0x70($out)
1462         lea             0x80($out),$out         # out+=64*4
1463
1464         sub             \$64*4,$len
1465         jnz             .Loop_outer4xop
1466
1467         jmp             .Ldone4xop
1468
1469 .align  32
1470 .Ltail4xop:
1471         cmp             \$192,$len
1472         jae             .L192_or_more4xop
1473         cmp             \$128,$len
1474         jae             .L128_or_more4xop
1475         cmp             \$64,$len
1476         jae             .L64_or_more4xop
1477
1478         xor             %r10,%r10
1479         vmovdqa         $xa0,0x00(%rsp)
1480         vmovdqa         $xb0,0x10(%rsp)
1481         vmovdqa         $xc0,0x20(%rsp)
1482         vmovdqa         $xd0,0x30(%rsp)
1483         jmp             .Loop_tail4xop
1484
1485 .align  32
1486 .L64_or_more4xop:
1487         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1488         vpxor           0x10($inp),$xb0,$xb0
1489         vpxor           0x20($inp),$xc0,$xc0
1490         vpxor           0x30($inp),$xd0,$xd0
1491         vmovdqu         $xa0,0x00($out)
1492         vmovdqu         $xb0,0x10($out)
1493         vmovdqu         $xc0,0x20($out)
1494         vmovdqu         $xd0,0x30($out)
1495         je              .Ldone4xop
1496
1497         lea             0x40($inp),$inp         # inp+=64*1
1498         vmovdqa         $xa1,0x00(%rsp)
1499         xor             %r10,%r10
1500         vmovdqa         $xb1,0x10(%rsp)
1501         lea             0x40($out),$out         # out+=64*1
1502         vmovdqa         $xc1,0x20(%rsp)
1503         sub             \$64,$len               # len-=64*1
1504         vmovdqa         $xd1,0x30(%rsp)
1505         jmp             .Loop_tail4xop
1506
1507 .align  32
1508 .L128_or_more4xop:
1509         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1510         vpxor           0x10($inp),$xb0,$xb0
1511         vpxor           0x20($inp),$xc0,$xc0
1512         vpxor           0x30($inp),$xd0,$xd0
1513         vpxor           0x40($inp),$xa1,$xa1
1514         vpxor           0x50($inp),$xb1,$xb1
1515         vpxor           0x60($inp),$xc1,$xc1
1516         vpxor           0x70($inp),$xd1,$xd1
1517
1518         vmovdqu         $xa0,0x00($out)
1519         vmovdqu         $xb0,0x10($out)
1520         vmovdqu         $xc0,0x20($out)
1521         vmovdqu         $xd0,0x30($out)
1522         vmovdqu         $xa1,0x40($out)
1523         vmovdqu         $xb1,0x50($out)
1524         vmovdqu         $xc1,0x60($out)
1525         vmovdqu         $xd1,0x70($out)
1526         je              .Ldone4xop
1527
1528         lea             0x80($inp),$inp         # inp+=64*2
1529         vmovdqa         $xa2,0x00(%rsp)
1530         xor             %r10,%r10
1531         vmovdqa         $xb2,0x10(%rsp)
1532         lea             0x80($out),$out         # out+=64*2
1533         vmovdqa         $xc2,0x20(%rsp)
1534         sub             \$128,$len              # len-=64*2
1535         vmovdqa         $xd2,0x30(%rsp)
1536         jmp             .Loop_tail4xop
1537
1538 .align  32
1539 .L192_or_more4xop:
1540         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1541         vpxor           0x10($inp),$xb0,$xb0
1542         vpxor           0x20($inp),$xc0,$xc0
1543         vpxor           0x30($inp),$xd0,$xd0
1544         vpxor           0x40($inp),$xa1,$xa1
1545         vpxor           0x50($inp),$xb1,$xb1
1546         vpxor           0x60($inp),$xc1,$xc1
1547         vpxor           0x70($inp),$xd1,$xd1
1548         lea             0x80($inp),$inp         # size optimization
1549         vpxor           0x00($inp),$xa2,$xa2
1550         vpxor           0x10($inp),$xb2,$xb2
1551         vpxor           0x20($inp),$xc2,$xc2
1552         vpxor           0x30($inp),$xd2,$xd2
1553
1554         vmovdqu         $xa0,0x00($out)
1555         vmovdqu         $xb0,0x10($out)
1556         vmovdqu         $xc0,0x20($out)
1557         vmovdqu         $xd0,0x30($out)
1558         vmovdqu         $xa1,0x40($out)
1559         vmovdqu         $xb1,0x50($out)
1560         vmovdqu         $xc1,0x60($out)
1561         vmovdqu         $xd1,0x70($out)
1562         lea             0x80($out),$out         # size optimization
1563         vmovdqu         $xa2,0x00($out)
1564         vmovdqu         $xb2,0x10($out)
1565         vmovdqu         $xc2,0x20($out)
1566         vmovdqu         $xd2,0x30($out)
1567         je              .Ldone4xop
1568
1569         lea             0x40($inp),$inp         # inp+=64*3
1570         vmovdqa         $xa3,0x00(%rsp)
1571         xor             %r10,%r10
1572         vmovdqa         $xb3,0x10(%rsp)
1573         lea             0x40($out),$out         # out+=64*3
1574         vmovdqa         $xc3,0x20(%rsp)
1575         sub             \$192,$len              # len-=64*3
1576         vmovdqa         $xd3,0x30(%rsp)
1577
1578 .Loop_tail4xop:                                 # xor remaining 1..63 bytes with the keystream block at (%rsp)
1579         movzb           ($inp,%r10),%eax
1580         movzb           (%rsp,%r10),%ecx
1581         lea             1(%r10),%r10
1582         xor             %ecx,%eax
1583         mov             %al,-1($out,%r10)
1584         dec             $len
1585         jnz             .Loop_tail4xop
1586
1587 .Ldone4xop:
1588         vzeroupper
1589 ___
1590 $code.=<<___    if ($win64);
1591         lea             0x140+0x30(%rsp),%r11
1592         movaps          -0x30(%r11),%xmm6
1593         movaps          -0x20(%r11),%xmm7
1594         movaps          -0x10(%r11),%xmm8
1595         movaps          0x00(%r11),%xmm9
1596         movaps          0x10(%r11),%xmm10
1597         movaps          0x20(%r11),%xmm11
1598         movaps          0x30(%r11),%xmm12
1599         movaps          0x40(%r11),%xmm13
1600         movaps          0x50(%r11),%xmm14
1601         movaps          0x60(%r11),%xmm15
1602 ___
1603 $code.=<<___;
1604         add             \$0x148+$xframe,%rsp
1605         ret
1606 .size   ChaCha20_4xop,.-ChaCha20_4xop
1607 ___
1608 }
1609
1610 ########################################################################
1611 # AVX2 code path
1612 if ($avx>1) {
1613 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1614     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1615 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1616         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);      # 'c' words live in memory, see AVX2_lane_ROUND
1617
1618 sub AVX2_lane_ROUND {
1619 my ($a0,$b0,$c0,$d0)=@_;
1620 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1621 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1622 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1623 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1624 my @x=map("\"$_\"",@xx);
1625
1626         # Consider order in which variables are addressed by their
1627         # index:
1628         #
1629         #       a   b   c   d
1630         #
1631         #       0   4   8  12 < even round
1632         #       1   5   9  13
1633         #       2   6  10  14
1634         #       3   7  11  15
1635         #       0   5  10  15 < odd round
1636         #       1   6  11  12
1637         #       2   7   8  13
1638         #       3   4   9  14
1639         #
1640         # 'a', 'b' and 'd's are permanently allocated in registers,
1641         # @x[0..7,12..15], while 'c's are maintained in memory. If you
1642         # look at the 'c' column, you'll notice that the pair of 'c's
1643         # in use at the end of one round is the same pair needed at the
1644         # start of the next, so they only have to be reloaded once per
1645         # round, in the middle. This is why you'll see a bunch of 'c'
1646         # stores and loads in the middle, but none at the beginning or end.
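        #
        # A quick sanity check (plain Perl, nothing here is emitted into
        # $code): the successive rows of either half of the table are just
        # what the index map at the top of this sub produces when iterated,
        # e.g.
        #
        #       my @q  = (0,4,8,12);                    # first even row
        #       my @q1 = map(($_&~3)+(($_+1)&3), @q);   # (1,5,9,13)
        #
        # ($_&~3) keeps an index inside its 'a'/'b'/'c'/'d' group of four,
        # while (($_+1)&3) steps to the next element of that group.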
1647
1648         (
1649         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1650         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1651         "&vpshufb       (@x[$d0],@x[$d0],$t1)",
1652          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1653          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1654          "&vpshufb      (@x[$d1],@x[$d1],$t1)",
1655
1656         "&vpaddd        ($xc,$xc,@x[$d0])",
1657         "&vpxor         (@x[$b0],$xc,@x[$b0])",
1658         "&vpslld        ($t0,@x[$b0],12)",
1659         "&vpsrld        (@x[$b0],@x[$b0],20)",
1660         "&vpor          (@x[$b0],$t0,@x[$b0])",
1661         "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
1662          "&vpaddd       ($xc_,$xc_,@x[$d1])",
1663          "&vpxor        (@x[$b1],$xc_,@x[$b1])",
1664          "&vpslld       ($t1,@x[$b1],12)",
1665          "&vpsrld       (@x[$b1],@x[$b1],20)",
1666          "&vpor         (@x[$b1],$t1,@x[$b1])",
1667
1668         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
1669         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1670         "&vpshufb       (@x[$d0],@x[$d0],$t0)",
1671          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
1672          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1673          "&vpshufb      (@x[$d1],@x[$d1],$t0)",
1674
1675         "&vpaddd        ($xc,$xc,@x[$d0])",
1676         "&vpxor         (@x[$b0],$xc,@x[$b0])",
1677         "&vpslld        ($t1,@x[$b0],7)",
1678         "&vpsrld        (@x[$b0],@x[$b0],25)",
1679         "&vpor          (@x[$b0],$t1,@x[$b0])",
1680         "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
1681          "&vpaddd       ($xc_,$xc_,@x[$d1])",
1682          "&vpxor        (@x[$b1],$xc_,@x[$b1])",
1683          "&vpslld       ($t0,@x[$b1],7)",
1684          "&vpsrld       (@x[$b1],@x[$b1],25)",
1685          "&vpor         (@x[$b1],$t0,@x[$b1])",
1686
1687         "&vmovdqa       (\"`32*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
1688          "&vmovdqa      (\"`32*($c1-8)`(%rsp)\",$xc_)",
1689         "&vmovdqa       ($xc,\"`32*($c2-8)`(%rsp)\")",
1690          "&vmovdqa      ($xc_,\"`32*($c3-8)`(%rsp)\")",
1691
1692         "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1693         "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
1694         "&vpshufb       (@x[$d2],@x[$d2],$t1)",
1695          "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1696          "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
1697          "&vpshufb      (@x[$d3],@x[$d3],$t1)",
1698
1699         "&vpaddd        ($xc,$xc,@x[$d2])",
1700         "&vpxor         (@x[$b2],$xc,@x[$b2])",
1701         "&vpslld        ($t0,@x[$b2],12)",
1702         "&vpsrld        (@x[$b2],@x[$b2],20)",
1703         "&vpor          (@x[$b2],$t0,@x[$b2])",
1704         "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
1705          "&vpaddd       ($xc_,$xc_,@x[$d3])",
1706          "&vpxor        (@x[$b3],$xc_,@x[$b3])",
1707          "&vpslld       ($t1,@x[$b3],12)",
1708          "&vpsrld       (@x[$b3],@x[$b3],20)",
1709          "&vpor         (@x[$b3],$t1,@x[$b3])",
1710
1711         "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",
1712         "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
1713         "&vpshufb       (@x[$d2],@x[$d2],$t0)",
1714          "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",
1715          "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
1716          "&vpshufb      (@x[$d3],@x[$d3],$t0)",
1717
1718         "&vpaddd        ($xc,$xc,@x[$d2])",
1719         "&vpxor         (@x[$b2],$xc,@x[$b2])",
1720         "&vpslld        ($t1,@x[$b2],7)",
1721         "&vpsrld        (@x[$b2],@x[$b2],25)",
1722         "&vpor          (@x[$b2],$t1,@x[$b2])",
1723         "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
1724          "&vpaddd       ($xc_,$xc_,@x[$d3])",
1725          "&vpxor        (@x[$b3],$xc_,@x[$b3])",
1726          "&vpslld       ($t0,@x[$b3],7)",
1727          "&vpsrld       (@x[$b3],@x[$b3],25)",
1728          "&vpor         (@x[$b3],$t0,@x[$b3])"
1729         );
1730 }
1731
1732 my $xframe = $win64 ? 0xb0 : 8;
1733
1734 $code.=<<___;
1735 .type   ChaCha20_8x,\@function,5
1736 .align  32
1737 ChaCha20_8x:
1738 .LChaCha20_8x:
1739         mov             %rsp,%r10
1740         sub             \$0x280+$xframe,%rsp
1741         and             \$-32,%rsp
1742 ___
1743 $code.=<<___    if ($win64);
1744         lea             0x290+0x30(%rsp),%r11
1745         movaps          %xmm6,-0x30(%r11)
1746         movaps          %xmm7,-0x20(%r11)
1747         movaps          %xmm8,-0x10(%r11)
1748         movaps          %xmm9,0x00(%r11)
1749         movaps          %xmm10,0x10(%r11)
1750         movaps          %xmm11,0x20(%r11)
1751         movaps          %xmm12,0x30(%r11)
1752         movaps          %xmm13,0x40(%r11)
1753         movaps          %xmm14,0x50(%r11)
1754         movaps          %xmm15,0x60(%r11)
1755 ___
1756 $code.=<<___;
1757         vzeroupper
1758         mov             %r10,0x280(%rsp)
1759
1760         ################ stack layout
1761         # +0x00         SIMD equivalent of @x[8-11]
1762         # ...
1763         # +0x80         constant copy of key[0-2] smashed by lanes
1764         # ...
1765         # +0x200        SIMD counters (with nonce smashed by lanes)
1766         # ...
1767         # +0x280        saved %rsp
1768
1769         vbroadcasti128  .Lsigma(%rip),$xa3      # key[0]
1770         vbroadcasti128  ($key),$xb3             # key[1]
1771         vbroadcasti128  16($key),$xt3           # key[2]
1772         vbroadcasti128  ($counter),$xd3         # key[3]
1773         lea             0x100(%rsp),%rcx        # size optimization
1774         lea             0x200(%rsp),%rax        # size optimization
1775         lea             .Lrot16(%rip),%r10
1776         lea             .Lrot24(%rip),%r11
1777
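        # Note on addressing: %rcx and %rax were just pointed 0x100 and
        # 0x200 bytes into the frame, so references like 0x80-0x100(%rcx)
        # or 0x200-0x200(%rax) are simply 0x80(%rsp) and 0x200(%rsp) in
        # the layout above. Biasing the base registers keeps every
        # displacement within the signed 8-bit range, e.g. 0x80-0x100 =
        # -0x80 and 0x260-0x200 = 0x60, which is the "size optimization"
        # the comments refer to.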
1778         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1779         vpshufd         \$0x55,$xa3,$xa1
1780         vmovdqa         $xa0,0x80-0x100(%rcx)   # ... and offload
1781         vpshufd         \$0xaa,$xa3,$xa2
1782         vmovdqa         $xa1,0xa0-0x100(%rcx)
1783         vpshufd         \$0xff,$xa3,$xa3
1784         vmovdqa         $xa2,0xc0-0x100(%rcx)
1785         vmovdqa         $xa3,0xe0-0x100(%rcx)
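        # Each vpshufd above replicates one dword of the broadcast key word
        # across the whole register: immediate 0x00 picks dword 0 in both
        # 128-bit lanes, 0x55 dword 1, 0xaa dword 2, 0xff dword 3. The same
        # is done for the remaining key words and the counters below, so
        # every ymm register ends up carrying a single ChaCha state word
        # for eight independent blocks.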
1786
1787         vpshufd         \$0x00,$xb3,$xb0
1788         vpshufd         \$0x55,$xb3,$xb1
1789         vmovdqa         $xb0,0x100-0x100(%rcx)
1790         vpshufd         \$0xaa,$xb3,$xb2
1791         vmovdqa         $xb1,0x120-0x100(%rcx)
1792         vpshufd         \$0xff,$xb3,$xb3
1793         vmovdqa         $xb2,0x140-0x100(%rcx)
1794         vmovdqa         $xb3,0x160-0x100(%rcx)
1795
1796         vpshufd         \$0x00,$xt3,$xt0        # "xc0"
1797         vpshufd         \$0x55,$xt3,$xt1        # "xc1"
1798         vmovdqa         $xt0,0x180-0x200(%rax)
1799         vpshufd         \$0xaa,$xt3,$xt2        # "xc2"
1800         vmovdqa         $xt1,0x1a0-0x200(%rax)
1801         vpshufd         \$0xff,$xt3,$xt3        # "xc3"
1802         vmovdqa         $xt2,0x1c0-0x200(%rax)
1803         vmovdqa         $xt3,0x1e0-0x200(%rax)
1804
1805         vpshufd         \$0x00,$xd3,$xd0
1806         vpshufd         \$0x55,$xd3,$xd1
1807         vpaddd          .Lincy(%rip),$xd0,$xd0  # don't save counters yet
1808         vpshufd         \$0xaa,$xd3,$xd2
1809         vmovdqa         $xd1,0x220-0x200(%rax)
1810         vpshufd         \$0xff,$xd3,$xd3
1811         vmovdqa         $xd2,0x240-0x200(%rax)
1812         vmovdqa         $xd3,0x260-0x200(%rax)
1813
1814         jmp             .Loop_enter8x
1815
1816 .align  32
1817 .Loop_outer8x:
1818         vmovdqa         0x80-0x100(%rcx),$xa0   # re-load smashed key
1819         vmovdqa         0xa0-0x100(%rcx),$xa1
1820         vmovdqa         0xc0-0x100(%rcx),$xa2
1821         vmovdqa         0xe0-0x100(%rcx),$xa3
1822         vmovdqa         0x100-0x100(%rcx),$xb0
1823         vmovdqa         0x120-0x100(%rcx),$xb1
1824         vmovdqa         0x140-0x100(%rcx),$xb2
1825         vmovdqa         0x160-0x100(%rcx),$xb3
1826         vmovdqa         0x180-0x200(%rax),$xt0  # "xc0"
1827         vmovdqa         0x1a0-0x200(%rax),$xt1  # "xc1"
1828         vmovdqa         0x1c0-0x200(%rax),$xt2  # "xc2"
1829         vmovdqa         0x1e0-0x200(%rax),$xt3  # "xc3"
1830         vmovdqa         0x200-0x200(%rax),$xd0
1831         vmovdqa         0x220-0x200(%rax),$xd1
1832         vmovdqa         0x240-0x200(%rax),$xd2
1833         vmovdqa         0x260-0x200(%rax),$xd3
1834         vpaddd          .Leight(%rip),$xd0,$xd0 # next SIMD counters
1835
1836 .Loop_enter8x:
1837         vmovdqa         $xt2,0x40(%rsp)         # SIMD equivalent of "@x[10]"
1838         vmovdqa         $xt3,0x60(%rsp)         # SIMD equivalent of "@x[11]"
1839         vbroadcasti128  (%r10),$xt3
1840         vmovdqa         $xd0,0x200-0x200(%rax)  # save SIMD counters
1841         mov             \$10,%eax
1842         jmp             .Loop8x
1843
1844 .align  32
1845 .Loop8x:
1846 ___
1847         foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1848         foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1849 $code.=<<___;
1850         dec             %eax
1851         jnz             .Loop8x
1852
1853         lea             0x200(%rsp),%rax        # size optimization
1854         vpaddd          0x80-0x100(%rcx),$xa0,$xa0      # accumulate key
1855         vpaddd          0xa0-0x100(%rcx),$xa1,$xa1
1856         vpaddd          0xc0-0x100(%rcx),$xa2,$xa2
1857         vpaddd          0xe0-0x100(%rcx),$xa3,$xa3
1858
1859         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1860         vpunpckldq      $xa3,$xa2,$xt3
1861         vpunpckhdq      $xa1,$xa0,$xa0
1862         vpunpckhdq      $xa3,$xa2,$xa2
1863         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1864         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1865         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1866         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1867 ___
1868         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1869 $code.=<<___;
1870         vpaddd          0x100-0x100(%rcx),$xb0,$xb0
1871         vpaddd          0x120-0x100(%rcx),$xb1,$xb1
1872         vpaddd          0x140-0x100(%rcx),$xb2,$xb2
1873         vpaddd          0x160-0x100(%rcx),$xb3,$xb3
1874
1875         vpunpckldq      $xb1,$xb0,$xt2
1876         vpunpckldq      $xb3,$xb2,$xt3
1877         vpunpckhdq      $xb1,$xb0,$xb0
1878         vpunpckhdq      $xb3,$xb2,$xb2
1879         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1880         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1881         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1882         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1883 ___
1884         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1885 $code.=<<___;
1886         vperm2i128      \$0x20,$xb0,$xa0,$xt3   # "de-interlace" further
1887         vperm2i128      \$0x31,$xb0,$xa0,$xb0
1888         vperm2i128      \$0x20,$xb1,$xa1,$xa0
1889         vperm2i128      \$0x31,$xb1,$xa1,$xb1
1890         vperm2i128      \$0x20,$xb2,$xa2,$xa1
1891         vperm2i128      \$0x31,$xb2,$xa2,$xb2
1892         vperm2i128      \$0x20,$xb3,$xa3,$xa2
1893         vperm2i128      \$0x31,$xb3,$xa3,$xb3
1894 ___
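        # To recap the two-stage "de-interlace": the vpunpck{l,h}dq and
        # vpunpck{l,h}qdq quartets transpose 4x4 dwords within each 128-bit
        # lane, so that a lane holds four consecutive state words of one
        # block, and vperm2i128 then pairs low lanes with low lanes and
        # high with high, so that each ymm covers 32 contiguous output
        # bytes of a single block. The 'c' and 'd' rows below get the same
        # treatment.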
1895         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1896         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1897 $code.=<<___;
1898         vmovdqa         $xa0,0x00(%rsp)         # offload $xaN
1899         vmovdqa         $xa1,0x20(%rsp)
1900         vmovdqa         0x40(%rsp),$xc2         # $xa0
1901         vmovdqa         0x60(%rsp),$xc3         # $xa1
1902
1903         vpaddd          0x180-0x200(%rax),$xc0,$xc0
1904         vpaddd          0x1a0-0x200(%rax),$xc1,$xc1
1905         vpaddd          0x1c0-0x200(%rax),$xc2,$xc2
1906         vpaddd          0x1e0-0x200(%rax),$xc3,$xc3
1907
1908         vpunpckldq      $xc1,$xc0,$xt2
1909         vpunpckldq      $xc3,$xc2,$xt3
1910         vpunpckhdq      $xc1,$xc0,$xc0
1911         vpunpckhdq      $xc3,$xc2,$xc2
1912         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1913         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1914         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1915         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1916 ___
1917         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1918 $code.=<<___;
1919         vpaddd          0x200-0x200(%rax),$xd0,$xd0
1920         vpaddd          0x220-0x200(%rax),$xd1,$xd1
1921         vpaddd          0x240-0x200(%rax),$xd2,$xd2
1922         vpaddd          0x260-0x200(%rax),$xd3,$xd3
1923
1924         vpunpckldq      $xd1,$xd0,$xt2
1925         vpunpckldq      $xd3,$xd2,$xt3
1926         vpunpckhdq      $xd1,$xd0,$xd0
1927         vpunpckhdq      $xd3,$xd2,$xd2
1928         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1929         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1930         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1931         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1932 ___
1933         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1934 $code.=<<___;
1935         vperm2i128      \$0x20,$xd0,$xc0,$xt3   # "de-interlace" further
1936         vperm2i128      \$0x31,$xd0,$xc0,$xd0
1937         vperm2i128      \$0x20,$xd1,$xc1,$xc0
1938         vperm2i128      \$0x31,$xd1,$xc1,$xd1
1939         vperm2i128      \$0x20,$xd2,$xc2,$xc1
1940         vperm2i128      \$0x31,$xd2,$xc2,$xd2
1941         vperm2i128      \$0x20,$xd3,$xc3,$xc2
1942         vperm2i128      \$0x31,$xd3,$xc3,$xd3
1943 ___
1944         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1945         ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1946         ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1947         ($xa0,$xa1)=($xt2,$xt3);
1948 $code.=<<___;
1949         vmovdqa         0x00(%rsp),$xa0         # $xaN was offloaded, remember?
1950         vmovdqa         0x20(%rsp),$xa1
1951
1952         cmp             \$64*8,$len
1953         jb              .Ltail8x
1954
1955         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1956         vpxor           0x20($inp),$xb0,$xb0
1957         vpxor           0x40($inp),$xc0,$xc0
1958         vpxor           0x60($inp),$xd0,$xd0
1959         lea             0x80($inp),$inp         # size optimization
1960         vmovdqu         $xa0,0x00($out)
1961         vmovdqu         $xb0,0x20($out)
1962         vmovdqu         $xc0,0x40($out)
1963         vmovdqu         $xd0,0x60($out)
1964         lea             0x80($out),$out         # size optimization
1965
1966         vpxor           0x00($inp),$xa1,$xa1
1967         vpxor           0x20($inp),$xb1,$xb1
1968         vpxor           0x40($inp),$xc1,$xc1
1969         vpxor           0x60($inp),$xd1,$xd1
1970         lea             0x80($inp),$inp         # size optimization
1971         vmovdqu         $xa1,0x00($out)
1972         vmovdqu         $xb1,0x20($out)
1973         vmovdqu         $xc1,0x40($out)
1974         vmovdqu         $xd1,0x60($out)
1975         lea             0x80($out),$out         # size optimization
1976
1977         vpxor           0x00($inp),$xa2,$xa2
1978         vpxor           0x20($inp),$xb2,$xb2
1979         vpxor           0x40($inp),$xc2,$xc2
1980         vpxor           0x60($inp),$xd2,$xd2
1981         lea             0x80($inp),$inp         # size optimization
1982         vmovdqu         $xa2,0x00($out)
1983         vmovdqu         $xb2,0x20($out)
1984         vmovdqu         $xc2,0x40($out)
1985         vmovdqu         $xd2,0x60($out)
1986         lea             0x80($out),$out         # size optimization
1987
1988         vpxor           0x00($inp),$xa3,$xa3
1989         vpxor           0x20($inp),$xb3,$xb3
1990         vpxor           0x40($inp),$xc3,$xc3
1991         vpxor           0x60($inp),$xd3,$xd3
1992         lea             0x80($inp),$inp         # size optimization
1993         vmovdqu         $xa3,0x00($out)
1994         vmovdqu         $xb3,0x20($out)
1995         vmovdqu         $xc3,0x40($out)
1996         vmovdqu         $xd3,0x60($out)
1997         lea             0x80($out),$out         # size optimization
1998
1999         sub             \$64*8,$len
2000         jnz             .Loop_outer8x
2001
2002         jmp             .Ldone8x
2003
2004 .Ltail8x:
2005         cmp             \$448,$len
2006         jae             .L448_or_more8x
2007         cmp             \$384,$len
2008         jae             .L384_or_more8x
2009         cmp             \$320,$len
2010         jae             .L320_or_more8x
2011         cmp             \$256,$len
2012         jae             .L256_or_more8x
2013         cmp             \$192,$len
2014         jae             .L192_or_more8x
2015         cmp             \$128,$len
2016         jae             .L128_or_more8x
2017         cmp             \$64,$len
2018         jae             .L64_or_more8x
2019
2020         xor             %r10,%r10
2021         vmovdqa         $xa0,0x00(%rsp)
2022         vmovdqa         $xb0,0x20(%rsp)
2023         jmp             .Loop_tail8x
2024
2025 .align  32
2026 .L64_or_more8x:
2027         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2028         vpxor           0x20($inp),$xb0,$xb0
2029         vmovdqu         $xa0,0x00($out)
2030         vmovdqu         $xb0,0x20($out)
2031         je              .Ldone8x
2032
2033         lea             0x40($inp),$inp         # inp+=64*1
2034         xor             %r10,%r10
2035         vmovdqa         $xc0,0x00(%rsp)
2036         lea             0x40($out),$out         # out+=64*1
2037         sub             \$64,$len               # len-=64*1
2038         vmovdqa         $xd0,0x20(%rsp)
2039         jmp             .Loop_tail8x
2040
2041 .align  32
2042 .L128_or_more8x:
2043         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2044         vpxor           0x20($inp),$xb0,$xb0
2045         vpxor           0x40($inp),$xc0,$xc0
2046         vpxor           0x60($inp),$xd0,$xd0
2047         vmovdqu         $xa0,0x00($out)
2048         vmovdqu         $xb0,0x20($out)
2049         vmovdqu         $xc0,0x40($out)
2050         vmovdqu         $xd0,0x60($out)
2051         je              .Ldone8x
2052
2053         lea             0x80($inp),$inp         # inp+=64*2
2054         xor             %r10,%r10
2055         vmovdqa         $xa1,0x00(%rsp)
2056         lea             0x80($out),$out         # out+=64*2
2057         sub             \$128,$len              # len-=64*2
2058         vmovdqa         $xb1,0x20(%rsp)
2059         jmp             .Loop_tail8x
2060
2061 .align  32
2062 .L192_or_more8x:
2063         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2064         vpxor           0x20($inp),$xb0,$xb0
2065         vpxor           0x40($inp),$xc0,$xc0
2066         vpxor           0x60($inp),$xd0,$xd0
2067         vpxor           0x80($inp),$xa1,$xa1
2068         vpxor           0xa0($inp),$xb1,$xb1
2069         vmovdqu         $xa0,0x00($out)
2070         vmovdqu         $xb0,0x20($out)
2071         vmovdqu         $xc0,0x40($out)
2072         vmovdqu         $xd0,0x60($out)
2073         vmovdqu         $xa1,0x80($out)
2074         vmovdqu         $xb1,0xa0($out)
2075         je              .Ldone8x
2076
2077         lea             0xc0($inp),$inp         # inp+=64*3
2078         xor             %r10,%r10
2079         vmovdqa         $xc1,0x00(%rsp)
2080         lea             0xc0($out),$out         # out+=64*3
2081         sub             \$192,$len              # len-=64*3
2082         vmovdqa         $xd1,0x20(%rsp)
2083         jmp             .Loop_tail8x
2084
2085 .align  32
2086 .L256_or_more8x:
2087         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2088         vpxor           0x20($inp),$xb0,$xb0
2089         vpxor           0x40($inp),$xc0,$xc0
2090         vpxor           0x60($inp),$xd0,$xd0
2091         vpxor           0x80($inp),$xa1,$xa1
2092         vpxor           0xa0($inp),$xb1,$xb1
2093         vpxor           0xc0($inp),$xc1,$xc1
2094         vpxor           0xe0($inp),$xd1,$xd1
2095         vmovdqu         $xa0,0x00($out)
2096         vmovdqu         $xb0,0x20($out)
2097         vmovdqu         $xc0,0x40($out)
2098         vmovdqu         $xd0,0x60($out)
2099         vmovdqu         $xa1,0x80($out)
2100         vmovdqu         $xb1,0xa0($out)
2101         vmovdqu         $xc1,0xc0($out)
2102         vmovdqu         $xd1,0xe0($out)
2103         je              .Ldone8x
2104
2105         lea             0x100($inp),$inp        # inp+=64*4
2106         xor             %r10,%r10
2107         vmovdqa         $xa2,0x00(%rsp)
2108         lea             0x100($out),$out        # out+=64*4
2109         sub             \$256,$len              # len-=64*4
2110         vmovdqa         $xb2,0x20(%rsp)
2111         jmp             .Loop_tail8x
2112
2113 .align  32
2114 .L320_or_more8x:
2115         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2116         vpxor           0x20($inp),$xb0,$xb0
2117         vpxor           0x40($inp),$xc0,$xc0
2118         vpxor           0x60($inp),$xd0,$xd0
2119         vpxor           0x80($inp),$xa1,$xa1
2120         vpxor           0xa0($inp),$xb1,$xb1
2121         vpxor           0xc0($inp),$xc1,$xc1
2122         vpxor           0xe0($inp),$xd1,$xd1
2123         vpxor           0x100($inp),$xa2,$xa2
2124         vpxor           0x120($inp),$xb2,$xb2
2125         vmovdqu         $xa0,0x00($out)
2126         vmovdqu         $xb0,0x20($out)
2127         vmovdqu         $xc0,0x40($out)
2128         vmovdqu         $xd0,0x60($out)
2129         vmovdqu         $xa1,0x80($out)
2130         vmovdqu         $xb1,0xa0($out)
2131         vmovdqu         $xc1,0xc0($out)
2132         vmovdqu         $xd1,0xe0($out)
2133         vmovdqu         $xa2,0x100($out)
2134         vmovdqu         $xb2,0x120($out)
2135         je              .Ldone8x
2136
2137         lea             0x140($inp),$inp        # inp+=64*5
2138         xor             %r10,%r10
2139         vmovdqa         $xc2,0x00(%rsp)
2140         lea             0x140($out),$out        # out+=64*5
2141         sub             \$320,$len              # len-=64*5
2142         vmovdqa         $xd2,0x20(%rsp)
2143         jmp             .Loop_tail8x
2144
2145 .align  32
2146 .L384_or_more8x:
2147         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2148         vpxor           0x20($inp),$xb0,$xb0
2149         vpxor           0x40($inp),$xc0,$xc0
2150         vpxor           0x60($inp),$xd0,$xd0
2151         vpxor           0x80($inp),$xa1,$xa1
2152         vpxor           0xa0($inp),$xb1,$xb1
2153         vpxor           0xc0($inp),$xc1,$xc1
2154         vpxor           0xe0($inp),$xd1,$xd1
2155         vpxor           0x100($inp),$xa2,$xa2
2156         vpxor           0x120($inp),$xb2,$xb2
2157         vpxor           0x140($inp),$xc2,$xc2
2158         vpxor           0x160($inp),$xd2,$xd2
2159         vmovdqu         $xa0,0x00($out)
2160         vmovdqu         $xb0,0x20($out)
2161         vmovdqu         $xc0,0x40($out)
2162         vmovdqu         $xd0,0x60($out)
2163         vmovdqu         $xa1,0x80($out)
2164         vmovdqu         $xb1,0xa0($out)
2165         vmovdqu         $xc1,0xc0($out)
2166         vmovdqu         $xd1,0xe0($out)
2167         vmovdqu         $xa2,0x100($out)
2168         vmovdqu         $xb2,0x120($out)
2169         vmovdqu         $xc2,0x140($out)
2170         vmovdqu         $xd2,0x160($out)
2171         je              .Ldone8x
2172
2173         lea             0x180($inp),$inp        # inp+=64*6
2174         xor             %r10,%r10
2175         vmovdqa         $xa3,0x00(%rsp)
2176         lea             0x180($out),$out        # out+=64*6
2177         sub             \$384,$len              # len-=64*6
2178         vmovdqa         $xb3,0x20(%rsp)
2179         jmp             .Loop_tail8x
2180
2181 .align  32
2182 .L448_or_more8x:
2183         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2184         vpxor           0x20($inp),$xb0,$xb0
2185         vpxor           0x40($inp),$xc0,$xc0
2186         vpxor           0x60($inp),$xd0,$xd0
2187         vpxor           0x80($inp),$xa1,$xa1
2188         vpxor           0xa0($inp),$xb1,$xb1
2189         vpxor           0xc0($inp),$xc1,$xc1
2190         vpxor           0xe0($inp),$xd1,$xd1
2191         vpxor           0x100($inp),$xa2,$xa2
2192         vpxor           0x120($inp),$xb2,$xb2
2193         vpxor           0x140($inp),$xc2,$xc2
2194         vpxor           0x160($inp),$xd2,$xd2
2195         vpxor           0x180($inp),$xa3,$xa3
2196         vpxor           0x1a0($inp),$xb3,$xb3
2197         vmovdqu         $xa0,0x00($out)
2198         vmovdqu         $xb0,0x20($out)
2199         vmovdqu         $xc0,0x40($out)
2200         vmovdqu         $xd0,0x60($out)
2201         vmovdqu         $xa1,0x80($out)
2202         vmovdqu         $xb1,0xa0($out)
2203         vmovdqu         $xc1,0xc0($out)
2204         vmovdqu         $xd1,0xe0($out)
2205         vmovdqu         $xa2,0x100($out)
2206         vmovdqu         $xb2,0x120($out)
2207         vmovdqu         $xc2,0x140($out)
2208         vmovdqu         $xd2,0x160($out)
2209         vmovdqu         $xa3,0x180($out)
2210         vmovdqu         $xb3,0x1a0($out)
2211         je              .Ldone8x
2212
2213         lea             0x1c0($inp),$inp        # inp+=64*7
2214         xor             %r10,%r10
2215         vmovdqa         $xc3,0x00(%rsp)
2216         lea             0x1c0($out),$out        # out+=64*7
2217         sub             \$448,$len              # len-=64*7
2218         vmovdqa         $xd3,0x20(%rsp)
2219
2220 .Loop_tail8x:
2221         movzb           ($inp,%r10),%eax
2222         movzb           (%rsp,%r10),%ecx
2223         lea             1(%r10),%r10
2224         xor             %ecx,%eax
2225         mov             %al,-1($out,%r10)
2226         dec             $len
2227         jnz             .Loop_tail8x
2228
2229 .Ldone8x:
2230         vzeroall
2231 ___
2232 $code.=<<___    if ($win64);
2233         lea             0x290+0x30(%rsp),%r11
2234         movaps          -0x30(%r11),%xmm6
2235         movaps          -0x20(%r11),%xmm7
2236         movaps          -0x10(%r11),%xmm8
2237         movaps          0x00(%r11),%xmm9
2238         movaps          0x10(%r11),%xmm10
2239         movaps          0x20(%r11),%xmm11
2240         movaps          0x30(%r11),%xmm12
2241         movaps          0x40(%r11),%xmm13
2242         movaps          0x50(%r11),%xmm14
2243         movaps          0x60(%r11),%xmm15
2244 ___
2245 $code.=<<___;
2246         mov             0x280(%rsp),%rsp
2247         ret
2248 .size   ChaCha20_8x,.-ChaCha20_8x
2249 ___
2250 }
2251
2252 ########################################################################
2253 # AVX512 code paths
2254 if ($avx>2) {
2255 # This one handles shorter inputs...
2256
2257 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2258 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2259
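# The idea of this path: keep each of the four ChaCha state rows broadcast
# across one zmm register, so the same register holds that row for four
# consecutive blocks; the counter row is staggered with .Lzeroz and stepped
# by .Lfourz (defined with the other constants in this file), and finished
# blocks are peeled off 128 bits at a time with vextracti32x4. Presumably
# this is cheaper than the full 16x transpose machinery for inputs up to
# the 512-byte cut-off checked at the top of ChaCha20_avx512.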
2260 sub AVX512ROUND {       # critical path is 14 "SIMD ticks" per round
2261         &vpaddd ($a,$a,$b);
2262         &vpxord ($d,$d,$a);
2263         &vprold ($d,$d,16);
2264
2265         &vpaddd ($c,$c,$d);
2266         &vpxord ($b,$b,$c);
2267         &vprold ($b,$b,12);
2268
2269         &vpaddd ($a,$a,$b);
2270         &vpxord ($d,$d,$a);
2271         &vprold ($d,$d,8);
2272
2273         &vpaddd ($c,$c,$d);
2274         &vpxord ($b,$b,$c);
2275         &vprold ($b,$b,7);
2276 }
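
# For reference, a plain-Perl model of the quarter-round that AVX512ROUND
# applies lane-wise; vprold does in one instruction the rotate that the
# SSSE3/AVX2 paths emulate with shift/shift/or or vpshufb. This is only a
# reader's sketch and is not used by the code generator.
sub scalar_quarter_round {
my ($a,$b,$c,$d)=@_;
my $rotl = sub { (($_[0]<<$_[1])|($_[0]>>(32-$_[1])))&0xffffffff };
        $a=($a+$b)&0xffffffff;  $d=$rotl->($d^$a,16);
        $c=($c+$d)&0xffffffff;  $b=$rotl->($b^$c,12);
        $a=($a+$b)&0xffffffff;  $d=$rotl->($d^$a,8);
        $c=($c+$d)&0xffffffff;  $b=$rotl->($b^$c,7);
        ($a,$b,$c,$d);
}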
2277
2278 my $xframe = $win64 ? 32+32+8 : 24;
2279
2280 $code.=<<___;
2281 .type   ChaCha20_avx512,\@function,5
2282 .align  32
2283 ChaCha20_avx512:
2284 .LChaCha20_avx512:
2285         cmp     \$512,$len
2286         ja      .LChaCha20_16x
2287
2288         push    %rbx                    # just to share SEH handler, no pops
2289         push    %rbp
2290         push    %r12
2291         push    %r13
2292         push    %r14
2293         push    %r15
2294
2295         sub     \$64+$xframe,%rsp
2296 ___
2297 $code.=<<___    if ($win64);
2298         movaps  %xmm6,64+32(%rsp)
2299         movaps  %xmm7,64+48(%rsp)
2300 ___
2301 $code.=<<___;
2302         vbroadcasti32x4 .Lsigma(%rip),$a
2303         vbroadcasti32x4 ($key),$b
2304         vbroadcasti32x4 16($key),$c
2305         vbroadcasti32x4 ($counter),$d
2306
2307         vmovdqa32       $a,$a_
2308         vmovdqa32       $b,$b_
2309         vmovdqa32       $c,$c_
2310         vpaddd          .Lzeroz(%rip),$d,$d
2311         vmovdqa32       .Lfourz(%rip),$fourz
2312         mov             \$10,$counter   # reuse $counter
2313         vmovdqa32       $d,$d_
2314         jmp             .Loop_avx512
2315
2316 .align  16
2317 .Loop_outer_avx512:
2318         vmovdqa32       $a_,$a
2319         vmovdqa32       $b_,$b
2320         vmovdqa32       $c_,$c
2321         vpaddd          $fourz,$d_,$d
2322         mov             \$10,$counter
2323         vmovdqa32       $d,$d_
2324         jmp             .Loop_avx512
2325
2326 .align  32
2327 .Loop_avx512:
2328 ___
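        # The vpshufd immediates below rotate dwords within each 128-bit
        # lane (each pair of imm8 bits selects a source dword):
        #
        #       0b00111001 -> (1,2,3,0),  0b01001110 -> (2,3,0,1),
        #       0b10010011 -> (3,0,1,2)
        #
        # Rotating $b, $c and $d by one, two and three dwords after the
        # first AVX512ROUND lines the diagonals up in column position for
        # the second one; the opposite rotations then restore column order.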
2329         &AVX512ROUND();
2330         &vpshufd        ($c,$c,0b01001110);
2331         &vpshufd        ($b,$b,0b00111001);
2332         &vpshufd        ($d,$d,0b10010011);
2333
2334         &AVX512ROUND();
2335         &vpshufd        ($c,$c,0b01001110);
2336         &vpshufd        ($b,$b,0b10010011);
2337         &vpshufd        ($d,$d,0b00111001);
2338
2339         &dec            ($counter);
2340         &jnz            (".Loop_avx512");
2341
2342 $code.=<<___;
2343         vpaddd          $a_,$a,$a
2344         vpaddd          $b_,$b,$b
2345         vpaddd          $c_,$c,$c
2346         vpaddd          $d_,$d,$d
2347
2348         sub             \$64,$len
2349         jb              .Ltail64_avx512
2350
2351         vpxor           0x00($inp),%x#$a,$t0    # xor with input
2352         vpxor           0x10($inp),%x#$b,$t1
2353         vpxor           0x20($inp),%x#$c,$t2
2354         vpxor           0x30($inp),%x#$d,$t3
2355         lea             0x40($inp),$inp         # inp+=64
2356
2357         vmovdqu         $t0,0x00($out)          # write output
2358         vmovdqu         $t1,0x10($out)
2359         vmovdqu         $t2,0x20($out)
2360         vmovdqu         $t3,0x30($out)
2361         lea             0x40($out),$out         # out+=64
2362
2363         jz              .Ldone_avx512
2364
2365         vextracti32x4   \$1,$a,$t0
2366         vextracti32x4   \$1,$b,$t1
2367         vextracti32x4   \$1,$c,$t2
2368         vextracti32x4   \$1,$d,$t3
2369
2370         sub             \$64,$len
2371         jb              .Ltail_avx512
2372
2373         vpxor           0x00($inp),$t0,$t0      # xor with input
2374         vpxor           0x10($inp),$t1,$t1
2375         vpxor           0x20($inp),$t2,$t2
2376         vpxor           0x30($inp),$t3,$t3
2377         lea             0x40($inp),$inp         # inp+=64
2378
2379         vmovdqu         $t0,0x00($out)          # write output
2380         vmovdqu         $t1,0x10($out)
2381         vmovdqu         $t2,0x20($out)
2382         vmovdqu         $t3,0x30($out)
2383         lea             0x40($out),$out         # out+=64
2384
2385         jz              .Ldone_avx512
2386
2387         vextracti32x4   \$2,$a,$t0
2388         vextracti32x4   \$2,$b,$t1
2389         vextracti32x4   \$2,$c,$t2
2390         vextracti32x4   \$2,$d,$t3
2391
2392         sub             \$64,$len
2393         jb              .Ltail_avx512
2394
2395         vpxor           0x00($inp),$t0,$t0      # xor with input
2396         vpxor           0x10($inp),$t1,$t1
2397         vpxor           0x20($inp),$t2,$t2
2398         vpxor           0x30($inp),$t3,$t3
2399         lea             0x40($inp),$inp         # inp+=64
2400
2401         vmovdqu         $t0,0x00($out)          # write output
2402         vmovdqu         $t1,0x10($out)
2403         vmovdqu         $t2,0x20($out)
2404         vmovdqu         $t3,0x30($out)
2405         lea             0x40($out),$out         # out+=64
2406
2407         jz              .Ldone_avx512
2408
2409         vextracti32x4   \$3,$a,$t0
2410         vextracti32x4   \$3,$b,$t1
2411         vextracti32x4   \$3,$c,$t2
2412         vextracti32x4   \$3,$d,$t3
2413
2414         sub             \$64,$len
2415         jb              .Ltail_avx512
2416
2417         vpxor           0x00($inp),$t0,$t0      # xor with input
2418         vpxor           0x10($inp),$t1,$t1
2419         vpxor           0x20($inp),$t2,$t2
2420         vpxor           0x30($inp),$t3,$t3
2421         lea             0x40($inp),$inp         # inp+=64
2422
2423         vmovdqu         $t0,0x00($out)          # write output
2424         vmovdqu         $t1,0x10($out)
2425         vmovdqu         $t2,0x20($out)
2426         vmovdqu         $t3,0x30($out)
2427         lea             0x40($out),$out         # out+=64
2428
2429         jnz             .Loop_outer_avx512
2430
2431         jmp             .Ldone_avx512
2432
2433 .align  16
2434 .Ltail64_avx512:
2435         vmovdqa         %x#$a,0x00(%rsp)
2436         vmovdqa         %x#$b,0x10(%rsp)
2437         vmovdqa         %x#$c,0x20(%rsp)
2438         vmovdqa         %x#$d,0x30(%rsp)
2439         add             \$64,$len
2440         jmp             .Loop_tail_avx512
2441
2442 .align  16
2443 .Ltail_avx512:
2444         vmovdqa         $t0,0x00(%rsp)
2445         vmovdqa         $t1,0x10(%rsp)
2446         vmovdqa         $t2,0x20(%rsp)
2447         vmovdqa         $t3,0x30(%rsp)
2448         add             \$64,$len
2449
2450 .Loop_tail_avx512:
2451         movzb           ($inp,$counter),%eax
2452         movzb           (%rsp,$counter),%ecx
2453         lea             1($counter),$counter
2454         xor             %ecx,%eax
2455         mov             %al,-1($out,$counter)
2456         dec             $len
2457         jnz             .Loop_tail_avx512
2458
2459         vmovdqa32       $a_,0x00(%rsp)          # overwrite the keystream block left on the stack
2460
2461 .Ldone_avx512:
2462         vzeroall
2463 ___
2464 $code.=<<___    if ($win64);
2465         movaps  64+32(%rsp),%xmm6
2466         movaps  64+48(%rsp),%xmm7
2467 ___
2468 $code.=<<___;
2469         add     \$64+$xframe+48,%rsp
2470         ret
2471 .size   ChaCha20_avx512,.-ChaCha20_avx512
2472 ___
2473 }
2474 if ($avx>2) {
2475 # This one handles longer inputs...
2476
2477 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2478     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2479 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2480          $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2481 my @key=map("%zmm$_",(16..31));
2482 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2483
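# Layout of this path: as in the AVX2 code, every vector register carries a
# single ChaCha state word, only now for sixteen independent blocks, and the
# smashed key/counter material lives permanently in %zmm16-%zmm31 (@key), so
# the outer loop can refresh most of the working set with register moves
# instead of stack loads.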
2484 sub AVX512_lane_ROUND {
2485 my ($a0,$b0,$c0,$d0)=@_;
2486 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2487 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2488 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2489 my @x=map("\"$_\"",@xx);
2490
2491         (
2492         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
2493          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
2494           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
2495            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
2496         "&vpxord        (@x[$d0],@x[$d0],@x[$a0])",
2497          "&vpxord       (@x[$d1],@x[$d1],@x[$a1])",
2498           "&vpxord      (@x[$d2],@x[$d2],@x[$a2])",
2499            "&vpxord     (@x[$d3],@x[$d3],@x[$a3])",
2500         "&vprold        (@x[$d0],@x[$d0],16)",
2501          "&vprold       (@x[$d1],@x[$d1],16)",
2502           "&vprold      (@x[$d2],@x[$d2],16)",
2503            "&vprold     (@x[$d3],@x[$d3],16)",
2504
2505         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
2506          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
2507           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
2508            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
2509         "&vpxord        (@x[$b0],@x[$b0],@x[$c0])",
2510          "&vpxord       (@x[$b1],@x[$b1],@x[$c1])",
2511           "&vpxord      (@x[$b2],@x[$b2],@x[$c2])",
2512            "&vpxord     (@x[$b3],@x[$b3],@x[$c3])",
2513         "&vprold        (@x[$b0],@x[$b0],12)",
2514          "&vprold       (@x[$b1],@x[$b1],12)",
2515           "&vprold      (@x[$b2],@x[$b2],12)",
2516            "&vprold     (@x[$b3],@x[$b3],12)",
2517
2518         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
2519          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
2520           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
2521            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
2522         "&vpxord        (@x[$d0],@x[$d0],@x[$a0])",
2523          "&vpxord       (@x[$d1],@x[$d1],@x[$a1])",
2524           "&vpxord      (@x[$d2],@x[$d2],@x[$a2])",
2525            "&vpxord     (@x[$d3],@x[$d3],@x[$a3])",
2526         "&vprold        (@x[$d0],@x[$d0],8)",
2527          "&vprold       (@x[$d1],@x[$d1],8)",
2528           "&vprold      (@x[$d2],@x[$d2],8)",
2529            "&vprold     (@x[$d3],@x[$d3],8)",
2530
2531         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
2532          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
2533           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
2534            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
2535         "&vpxord        (@x[$b0],@x[$b0],@x[$c0])",
2536          "&vpxord       (@x[$b1],@x[$b1],@x[$c1])",
2537           "&vpxord      (@x[$b2],@x[$b2],@x[$c2])",
2538            "&vpxord     (@x[$b3],@x[$b3],@x[$c3])",
2539         "&vprold        (@x[$b0],@x[$b0],7)",
2540          "&vprold       (@x[$b1],@x[$b1],7)",
2541           "&vprold      (@x[$b2],@x[$b2],7)",
2542            "&vprold     (@x[$b3],@x[$b3],7)"
2543         );
2544 }
2545
2546 my $xframe = $win64 ? 0xb0 : 8;
2547
2548 $code.=<<___;
2549 .type   ChaCha20_16x,\@function,5
2550 .align  32
2551 ChaCha20_16x:
2552 .LChaCha20_16x:
2553         mov             %rsp,%r11
2554         sub             \$64+$xframe,%rsp
2555         and             \$-64,%rsp
2556 ___
2557 $code.=<<___    if ($win64);
2558         lea             0x290+0x30(%rsp),%r11
2559         movaps          %xmm6,-0x30(%r11)
2560         movaps          %xmm7,-0x20(%r11)
2561         movaps          %xmm8,-0x10(%r11)
2562         movaps          %xmm9,0x00(%r11)
2563         movaps          %xmm10,0x10(%r11)
2564         movaps          %xmm11,0x20(%r11)
2565         movaps          %xmm12,0x30(%r11)
2566         movaps          %xmm13,0x40(%r11)
2567         movaps          %xmm14,0x50(%r11)
2568         movaps          %xmm15,0x60(%r11)
2569 ___
2570 $code.=<<___;
2571         vzeroupper
2572
2573         lea             .Lsigma(%rip),%r10
2574         vbroadcasti32x4 (%r10),$xa3             # key[0]
2575         vbroadcasti32x4 ($key),$xb3             # key[1]
2576         vbroadcasti32x4 16($key),$xc3           # key[2]
2577         vbroadcasti32x4 ($counter),$xd3         # key[3]
2578
2579         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
2580         vpshufd         \$0x55,$xa3,$xa1
2581         vpshufd         \$0xaa,$xa3,$xa2
2582         vpshufd         \$0xff,$xa3,$xa3
2583         vmovdqa64       $xa0,@key[0]
2584         vmovdqa64       $xa1,@key[1]
2585         vmovdqa64       $xa2,@key[2]
2586         vmovdqa64       $xa3,@key[3]
2587
2588         vpshufd         \$0x00,$xb3,$xb0
2589         vpshufd         \$0x55,$xb3,$xb1
2590         vpshufd         \$0xaa,$xb3,$xb2
2591         vpshufd         \$0xff,$xb3,$xb3
2592         vmovdqa64       $xb0,@key[4]
2593         vmovdqa64       $xb1,@key[5]
2594         vmovdqa64       $xb2,@key[6]
2595         vmovdqa64       $xb3,@key[7]
2596
2597         vpshufd         \$0x00,$xc3,$xc0
2598         vpshufd         \$0x55,$xc3,$xc1
2599         vpshufd         \$0xaa,$xc3,$xc2
2600         vpshufd         \$0xff,$xc3,$xc3
2601         vmovdqa64       $xc0,@key[8]
2602         vmovdqa64       $xc1,@key[9]
2603         vmovdqa64       $xc2,@key[10]
2604         vmovdqa64       $xc3,@key[11]
2605
2606         vpshufd         \$0x00,$xd3,$xd0
2607         vpshufd         \$0x55,$xd3,$xd1
2608         vpshufd         \$0xaa,$xd3,$xd2
2609         vpshufd         \$0xff,$xd3,$xd3
2610         vpaddd          .Lincz(%rip),$xd0,$xd0  # don't save counters yet
2611         vmovdqa64       $xd0,@key[12]
2612         vmovdqa64       $xd1,@key[13]
2613         vmovdqa64       $xd2,@key[14]
2614         vmovdqa64       $xd3,@key[15]
2615
2616         mov             \$10,%eax
2617         jmp             .Loop16x
2618
2619 .align  32
2620 .Loop_outer16x:
2621         vpbroadcastd    0(%r10),$xa0            # reload key
2622         vpbroadcastd    4(%r10),$xa1
2623         vpbroadcastd    8(%r10),$xa2
2624         vpbroadcastd    12(%r10),$xa3
2625         vpaddd          .Lsixteen(%rip),@key[12],@key[12]       # next SIMD counters
2626         vmovdqa64       @key[4],$xb0
2627         vmovdqa64       @key[5],$xb1
2628         vmovdqa64       @key[6],$xb2
2629         vmovdqa64       @key[7],$xb3
2630         vmovdqa64       @key[8],$xc0
2631         vmovdqa64       @key[9],$xc1
2632         vmovdqa64       @key[10],$xc2
2633         vmovdqa64       @key[11],$xc3
2634         vmovdqa64       @key[12],$xd0
2635         vmovdqa64       @key[13],$xd1
2636         vmovdqa64       @key[14],$xd2
2637         vmovdqa64       @key[15],$xd3
2638
2639         vmovdqa64       $xa0,@key[0]
2640         vmovdqa64       $xa1,@key[1]
2641         vmovdqa64       $xa2,@key[2]
2642         vmovdqa64       $xa3,@key[3]
2643
2644         mov             \$10,%eax
2645         jmp             .Loop16x
2646
2647 .align  32
2648 .Loop16x:
2649 ___
2650         foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2651         foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
2652 $code.=<<___;
2653         dec             %eax
2654         jnz             .Loop16x
2655
2656         vpaddd          @key[0],$xa0,$xa0       # accumulate key
2657         vpaddd          @key[1],$xa1,$xa1
2658         vpaddd          @key[2],$xa2,$xa2
2659         vpaddd          @key[3],$xa3,$xa3
2660
2661         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
2662         vpunpckldq      $xa3,$xa2,$xt3
2663         vpunpckhdq      $xa1,$xa0,$xa0
2664         vpunpckhdq      $xa3,$xa2,$xa2
2665         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
2666         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
2667         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
2668         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
2669 ___
2670         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2671 $code.=<<___;
2672         vpaddd          @key[4],$xb0,$xb0
2673         vpaddd          @key[5],$xb1,$xb1
2674         vpaddd          @key[6],$xb2,$xb2
2675         vpaddd          @key[7],$xb3,$xb3
2676
2677         vpunpckldq      $xb1,$xb0,$xt2
2678         vpunpckldq      $xb3,$xb2,$xt3
2679         vpunpckhdq      $xb1,$xb0,$xb0
2680         vpunpckhdq      $xb3,$xb2,$xb2
2681         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
2682         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
2683         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
2684         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
2685 ___
2686         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2687 $code.=<<___;
2688         vshufi32x4      \$0x44,$xb0,$xa0,$xt3   # "de-interlace" further
2689         vshufi32x4      \$0xee,$xb0,$xa0,$xb0
2690         vshufi32x4      \$0x44,$xb1,$xa1,$xa0
2691         vshufi32x4      \$0xee,$xb1,$xa1,$xb1
2692         vshufi32x4      \$0x44,$xb2,$xa2,$xa1
2693         vshufi32x4      \$0xee,$xb2,$xa2,$xb2
2694         vshufi32x4      \$0x44,$xb3,$xa3,$xa2
2695         vshufi32x4      \$0xee,$xb3,$xa3,$xb3
2696 ___
2697         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2698 $code.=<<___;
2699         vpaddd          @key[8],$xc0,$xc0
2700         vpaddd          @key[9],$xc1,$xc1
2701         vpaddd          @key[10],$xc2,$xc2
2702         vpaddd          @key[11],$xc3,$xc3
2703
2704         vpunpckldq      $xc1,$xc0,$xt2
2705         vpunpckldq      $xc3,$xc2,$xt3
2706         vpunpckhdq      $xc1,$xc0,$xc0
2707         vpunpckhdq      $xc3,$xc2,$xc2
2708         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
2709         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
2710         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
2711         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
2712 ___
2713         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2714 $code.=<<___;
2715         vpaddd          @key[12],$xd0,$xd0
2716         vpaddd          @key[13],$xd1,$xd1
2717         vpaddd          @key[14],$xd2,$xd2
2718         vpaddd          @key[15],$xd3,$xd3
2719
2720         vpunpckldq      $xd1,$xd0,$xt2
2721         vpunpckldq      $xd3,$xd2,$xt3
2722         vpunpckhdq      $xd1,$xd0,$xd0
2723         vpunpckhdq      $xd3,$xd2,$xd2
2724         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
2725         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
2726         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
2727         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
2728 ___
2729         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2730 $code.=<<___;
2731         vshufi32x4      \$0x44,$xd0,$xc0,$xt3   # "de-interlace" further
2732         vshufi32x4      \$0xee,$xd0,$xc0,$xd0
2733         vshufi32x4      \$0x44,$xd1,$xc1,$xc0
2734         vshufi32x4      \$0xee,$xd1,$xc1,$xd1
2735         vshufi32x4      \$0x44,$xd2,$xc2,$xc1
2736         vshufi32x4      \$0xee,$xd2,$xc2,$xd2
2737         vshufi32x4      \$0x44,$xd3,$xc3,$xc2
2738         vshufi32x4      \$0xee,$xd3,$xc3,$xd3
2739 ___
2740         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2741 $code.=<<___;
2742         vshufi32x4      \$0x88,$xc0,$xa0,$xt0   # "de-interlace" further
2743         vshufi32x4      \$0xdd,$xc0,$xa0,$xa0
2744          vshufi32x4     \$0x88,$xd0,$xb0,$xc0
2745          vshufi32x4     \$0xdd,$xd0,$xb0,$xd0
2746         vshufi32x4      \$0x88,$xc1,$xa1,$xt1
2747         vshufi32x4      \$0xdd,$xc1,$xa1,$xa1
2748          vshufi32x4     \$0x88,$xd1,$xb1,$xc1
2749          vshufi32x4     \$0xdd,$xd1,$xb1,$xd1
2750         vshufi32x4      \$0x88,$xc2,$xa2,$xt2
2751         vshufi32x4      \$0xdd,$xc2,$xa2,$xa2
2752          vshufi32x4     \$0x88,$xd2,$xb2,$xc2
2753          vshufi32x4     \$0xdd,$xd2,$xb2,$xd2
2754         vshufi32x4      \$0x88,$xc3,$xa3,$xt3
2755         vshufi32x4      \$0xdd,$xc3,$xa3,$xa3
2756          vshufi32x4     \$0x88,$xd3,$xb3,$xc3
2757          vshufi32x4     \$0xdd,$xd3,$xb3,$xd3
2758 ___
2759         ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2760         ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2761
2762         ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2763          $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2764         ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2765          $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
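        # After the three shuffle stages above (dword transpose within
        # 128-bit lanes, then vshufi32x4 0x44/0xee, then 0x88/0xdd), each
        # zmm register holds one complete 64-byte keystream block; the
        # renaming here arranges them so that $xa0 is block 0, $xb0 block 1,
        # ..., $xd3 block 15, which is why the stores below advance by 0x40
        # from one register to the next.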
$code.=<<___;
        cmp             \$64*16,$len
        jb              .Ltail16x

        vpxord          0x00($inp),$xa0,$xa0    # xor with input
        vpxord          0x40($inp),$xb0,$xb0
        vpxord          0x80($inp),$xc0,$xc0
        vpxord          0xc0($inp),$xd0,$xd0
        vmovdqu32       $xa0,0x00($out)
        vmovdqu32       $xb0,0x40($out)
        vmovdqu32       $xc0,0x80($out)
        vmovdqu32       $xd0,0xc0($out)

        vpxord          0x100($inp),$xa1,$xa1
        vpxord          0x140($inp),$xb1,$xb1
        vpxord          0x180($inp),$xc1,$xc1
        vpxord          0x1c0($inp),$xd1,$xd1
        vmovdqu32       $xa1,0x100($out)
        vmovdqu32       $xb1,0x140($out)
        vmovdqu32       $xc1,0x180($out)
        vmovdqu32       $xd1,0x1c0($out)

        vpxord          0x200($inp),$xa2,$xa2
        vpxord          0x240($inp),$xb2,$xb2
        vpxord          0x280($inp),$xc2,$xc2
        vpxord          0x2c0($inp),$xd2,$xd2
        vmovdqu32       $xa2,0x200($out)
        vmovdqu32       $xb2,0x240($out)
        vmovdqu32       $xc2,0x280($out)
        vmovdqu32       $xd2,0x2c0($out)

        vpxord          0x300($inp),$xa3,$xa3
        vpxord          0x340($inp),$xb3,$xb3
        vpxord          0x380($inp),$xc3,$xc3
        vpxord          0x3c0($inp),$xd3,$xd3
        lea             0x400($inp),$inp
        vmovdqu32       $xa3,0x300($out)
        vmovdqu32       $xb3,0x340($out)
        vmovdqu32       $xc3,0x380($out)
        vmovdqu32       $xd3,0x3c0($out)
        lea             0x400($out),$out

        sub             \$64*16,$len
        jnz             .Loop_outer16x

        jmp             .Ldone16x

.align  32
.Ltail16x:
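        # Fewer than 1024 bytes remain.  The output pointer is converted to an
        # offset from the input pointer, so only the input pointer has to be
        # advanced.  Whole 64-byte blocks are consumed one register at a time;
        # before each length check the register holding the next block is
        # copied into the first one, so .Less_than_64_16x always finds the
        # pending key stream there.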
        xor             %r10,%r10
        sub             $inp,$out
        cmp             \$64*1,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xa0,$xa0        # xor with input
        vmovdqu32       $xa0,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xb0,$xa0
        lea             64($inp),$inp

        cmp             \$64*2,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xb0,$xb0
        vmovdqu32       $xb0,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xc0,$xa0
        lea             64($inp),$inp

        cmp             \$64*3,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xc0,$xc0
        vmovdqu32       $xc0,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xd0,$xa0
        lea             64($inp),$inp

        cmp             \$64*4,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xd0,$xd0
        vmovdqu32       $xd0,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xa1,$xa0
        lea             64($inp),$inp

        cmp             \$64*5,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xa1,$xa1
        vmovdqu32       $xa1,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xb1,$xa0
        lea             64($inp),$inp

        cmp             \$64*6,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xb1,$xb1
        vmovdqu32       $xb1,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xc1,$xa0
        lea             64($inp),$inp

        cmp             \$64*7,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xc1,$xc1
        vmovdqu32       $xc1,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xd1,$xa0
        lea             64($inp),$inp

        cmp             \$64*8,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xd1,$xd1
        vmovdqu32       $xd1,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xa2,$xa0
        lea             64($inp),$inp

        cmp             \$64*9,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xa2,$xa2
        vmovdqu32       $xa2,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xb2,$xa0
        lea             64($inp),$inp

        cmp             \$64*10,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xb2,$xb2
        vmovdqu32       $xb2,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xc2,$xa0
        lea             64($inp),$inp

        cmp             \$64*11,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xc2,$xc2
        vmovdqu32       $xc2,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xd2,$xa0
        lea             64($inp),$inp

        cmp             \$64*12,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xd2,$xd2
        vmovdqu32       $xd2,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xa3,$xa0
        lea             64($inp),$inp

        cmp             \$64*13,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xa3,$xa3
        vmovdqu32       $xa3,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xb3,$xa0
        lea             64($inp),$inp

        cmp             \$64*14,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xb3,$xb3
        vmovdqu32       $xb3,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xc3,$xa0
        lea             64($inp),$inp

        cmp             \$64*15,$len
        jb              .Less_than_64_16x
        vpxord          ($inp),$xc3,$xc3
        vmovdqu32       $xc3,($out,$inp)
        je              .Ldone16x
        vmovdqa32       $xd3,$xa0
        lea             64($inp),$inp

.Less_than_64_16x:
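        # Fewer than 64 bytes are left: spill the pending key-stream block to
        # the stack, rebuild the absolute output pointer and XOR the remaining
        # bytes one at a time.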
        vmovdqa32       $xa0,0x00(%rsp)
        lea             ($out,$inp),$out
        and             \$63,$len

.Loop_tail16x:
        movzb           ($inp,%r10),%eax
        movzb           (%rsp,%r10),%ecx
        lea             1(%r10),%r10
        xor             %ecx,%eax
        mov             %al,-1($out,%r10)
        dec             $len
        jnz             .Loop_tail16x

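        # Wipe the key-stream block that was spilled to the stack.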
        vpxord          $xa0,$xa0,$xa0
        vmovdqa32       $xa0,0(%rsp)

.Ldone16x:
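        # vzeroall clears %ymm0-%ymm15 (the lower 16 vector registers), which
        # also disposes of the key-stream data that was held there.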
        vzeroall
___
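# On WIN64 the xmm6-xmm15 registers are non-volatile; restore the copies
# saved by the function prologue before returning.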
$code.=<<___    if ($win64);
        lea             0x290+0x30(%rsp),%r11
        movaps          -0x30(%r11),%xmm6
        movaps          -0x20(%r11),%xmm7
        movaps          -0x10(%r11),%xmm8
        movaps          0x00(%r11),%xmm9
        movaps          0x10(%r11),%xmm10
        movaps          0x20(%r11),%xmm11
        movaps          0x30(%r11),%xmm12
        movaps          0x40(%r11),%xmm13
        movaps          0x50(%r11),%xmm14
        movaps          0x60(%r11),%xmm15
___
$code.=<<___;
        mov             %r11,%rsp
        ret
.size   ChaCha20_16x,.-ChaCha20_16x
___
}

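# Post-process the accumulated code: backtick-quoted expressions are
# evaluated as Perl (compile-time constant folding), and the register
# notation "%x#%ymmN"/"%x#%zmmN" is collapsed to plain "%xmmN", i.e. the
# instruction is "down-shifted" to operate on the xmm form of the register.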
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/%x#%[yz]/%x/g;        # "down-shift"

        print $_,"\n";
}

close STDOUT;