1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # November 2014
18 #
19 # ChaCha20 for x86_64.
20 #
21 # Performance in cycles per byte out of large buffer.
22 #
23 #               IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     8xAVX2
24 #
25 # P4            9.48/+99%       -/22.7(ii)      -
26 # Core2         7.83/+55%       7.90/8.08       4.35
27 # Westmere      7.19/+50%       5.60/6.70       3.00
28 # Sandy Bridge  8.31/+42%       5.45/6.76       2.72
29 # Ivy Bridge    6.71/+46%       5.40/6.49       2.41
30 # Haswell       5.92/+43%       5.20/6.45       2.42        1.23
31 # Silvermont    12.0/+33%       7.75/7.40       7.03(iii)
32 # Sledgehammer  7.28/+52%       -/14.2(ii)      -
33 # Bulldozer     9.66/+28%       9.85/11.1       3.06(iv)
34 # VIA Nano      10.5/+46%       6.72/8.60       6.05
35 #
36 # (i)   compared to older gcc 3.x, one can observe a >2x improvement on
37 #       most platforms;
38 # (ii)  as can be seen, SSE2 performance is too low on legacy
39 #       processors; NxSSE2 results are naturally better, but not
40 #       impressively better than IALU ones, which is why you won't
41 #       find SSE2 code below;
42 # (iii) this is not an optimal result for Atom because of MSROM
43 #       limitations; SSE2 can do better, but the gain is considered too
44 #       low to justify the [maintenance] effort;
45 # (iv)  Bulldozer actually executes the 4xXOP code path, which delivers 2.20;
46
47 $flavour = shift;
48 $output  = shift;
49 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
50
51 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
52
53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56 die "can't locate x86_64-xlate.pl";
57
58 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60         $avx = ($1>=2.19) + ($1>=2.22);
61 }
62
63 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65         $avx = ($1>=2.09) + ($1>=2.10);
66 }
67
68 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70         $avx = ($1>=10) + ($1>=11);
71 }
72
73 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
74         $avx = ($2>=3.0) + ($2>3.0);
75 }
76
77 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
78 *STDOUT=*OUT;
79
80 # input parameter block
81 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
82
83 $code.=<<___;
84 .text
85
86 .extern OPENSSL_ia32cap_P
87
88 .align  64
89 .Lzero:
90 .long   0,0,0,0
91 .Lone:
92 .long   1,0,0,0
93 .Linc:
94 .long   0,1,2,3
95 .Lfour:
96 .long   4,4,4,4
97 .Lincy:
98 .long   0,2,4,6,1,3,5,7
99 .Leight:
100 .long   8,8,8,8,8,8,8,8
101 .Lrot16:
102 .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
103 .Lrot24:
104 .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
105 .Lsigma:
106 .asciz  "expand 32-byte k"
107 .asciz  "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
108 ___
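# A note on the constants above (an editorial summary, inferred from how
# they are used further down): .Lone and .Lfour are the per-iteration
# counter increments for the scalar/1xSSSE3 and 4-lane paths, .Linc seeds
# the four per-lane block counters, .Lrot16/.Lrot24 are pshufb masks that
# implement 32-bit rotations by 16 and 24 as pure byte shuffles, and
# .Lsigma is the standard "expand 32-byte k" ChaCha constant; .Lincy and
# .Leight are the corresponding vectors for the 8xAVX2 path.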
109
110 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
111 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
112   my $arg = pop;
113     $arg = "\$$arg" if ($arg*1 eq $arg);
114     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
115 }
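# For example (an illustrative note, not part of the original source):
# a call such as &rol("%eax",16) is caught by AUTOLOAD above and appends
# the line "rol\t$16,%eax" to $code, i.e. arguments are written
# destination-first in 32-bit perlasm style, numeric arguments get a '$'
# immediate prefix, and the operands are emitted in AT&T source-first
# order for x86_64-xlate.pl to translate.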
116
117 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
118     "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
119 @t=("%esi","%edi");
120
121 sub ROUND {                     # critical path is 24 cycles per round
122 my ($a0,$b0,$c0,$d0)=@_;
123 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
124 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
125 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
126 my ($xc,$xc_)=map("\"$_\"",@t);
127 my @x=map("\"$_\"",@x);
128
129         # Consider the order in which variables are addressed by their
130         # index:
131         #
132         #       a   b   c   d
133         #
134         #       0   4   8  12 < even round
135         #       1   5   9  13
136         #       2   6  10  14
137         #       3   7  11  15
138         #       0   5  10  15 < odd round
139         #       1   6  11  12
140         #       2   7   8  13
141         #       3   4   9  14
142         #
143         # 'a', 'b' and 'd's are permanently allocated in registers,
144         # @x[0..7,12..15], while 'c's are maintained in memory. If
145         # you observe the 'c' column, you'll notice that a pair of 'c's
146         # is invariant between rounds. This means that we have to reload
147         # them only once per round, in the middle. This is why you'll see
148         # a bunch of 'c' stores and loads in the middle, but none at the
149         # beginning or end (see the reference quarter-round after this sub).
150
151         # Normally instructions would be interleaved to favour in-order
152         # execution. Generally out-of-order cores manage it gracefully,
153         # but not this time for some reason. As in-order execution
154         # cores are a dying breed and old Atom is the only one around,
155         # instructions are left uninterleaved. Besides, Atom is better
156         # off executing 1xSSSE3 code anyway...
157
158         (
159         "&add   (@x[$a0],@x[$b0])",     # Q1
160         "&xor   (@x[$d0],@x[$a0])",
161         "&rol   (@x[$d0],16)",
162          "&add  (@x[$a1],@x[$b1])",     # Q2
163          "&xor  (@x[$d1],@x[$a1])",
164          "&rol  (@x[$d1],16)",
165
166         "&add   ($xc,@x[$d0])",
167         "&xor   (@x[$b0],$xc)",
168         "&rol   (@x[$b0],12)",
169          "&add  ($xc_,@x[$d1])",
170          "&xor  (@x[$b1],$xc_)",
171          "&rol  (@x[$b1],12)",
172
173         "&add   (@x[$a0],@x[$b0])",
174         "&xor   (@x[$d0],@x[$a0])",
175         "&rol   (@x[$d0],8)",
176          "&add  (@x[$a1],@x[$b1])",
177          "&xor  (@x[$d1],@x[$a1])",
178          "&rol  (@x[$d1],8)",
179
180         "&add   ($xc,@x[$d0])",
181         "&xor   (@x[$b0],$xc)",
182         "&rol   (@x[$b0],7)",
183          "&add  ($xc_,@x[$d1])",
184          "&xor  (@x[$b1],$xc_)",
185          "&rol  (@x[$b1],7)",
186
187         "&mov   (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
188          "&mov  (\"4*$c1(%rsp)\",$xc_)",
189         "&mov   ($xc,\"4*$c2(%rsp)\")",
190          "&mov  ($xc_,\"4*$c3(%rsp)\")",
191
192         "&add   (@x[$a2],@x[$b2])",     # Q3
193         "&xor   (@x[$d2],@x[$a2])",
194         "&rol   (@x[$d2],16)",
195          "&add  (@x[$a3],@x[$b3])",     # Q4
196          "&xor  (@x[$d3],@x[$a3])",
197          "&rol  (@x[$d3],16)",
198
199         "&add   ($xc,@x[$d2])",
200         "&xor   (@x[$b2],$xc)",
201         "&rol   (@x[$b2],12)",
202          "&add  ($xc_,@x[$d3])",
203          "&xor  (@x[$b3],$xc_)",
204          "&rol  (@x[$b3],12)",
205
206         "&add   (@x[$a2],@x[$b2])",
207         "&xor   (@x[$d2],@x[$a2])",
208         "&rol   (@x[$d2],8)",
209          "&add  (@x[$a3],@x[$b3])",
210          "&xor  (@x[$d3],@x[$a3])",
211          "&rol  (@x[$d3],8)",
212
213         "&add   ($xc,@x[$d2])",
214         "&xor   (@x[$b2],$xc)",
215         "&rol   (@x[$b2],7)",
216          "&add  ($xc_,@x[$d3])",
217          "&xor  (@x[$b3],$xc_)",
218          "&rol  (@x[$b3],7)"
219         );
220 }
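# For reference, each of the Q1..Q4 groups of add/xor/rol instructions
# above is one ChaCha quarter-round on (a,b,c,d). A minimal Perl sketch
# of that quarter-round is given below purely as an illustration; it is
# not used by the generator, the name quarter_round_reference is this
# commentary's own, and it assumes 64-bit Perl integers.
sub quarter_round_reference {
my ($x,$a,$b,$c,$d)=@_;			# $x is a ref to the 16-word state
	$x->[$a]=($x->[$a]+$x->[$b])&0xffffffff; $x->[$d]^=$x->[$a];
	$x->[$d]=($x->[$d]<<16|$x->[$d]>>16)&0xffffffff;
	$x->[$c]=($x->[$c]+$x->[$d])&0xffffffff; $x->[$b]^=$x->[$c];
	$x->[$b]=($x->[$b]<<12|$x->[$b]>>20)&0xffffffff;
	$x->[$a]=($x->[$a]+$x->[$b])&0xffffffff; $x->[$d]^=$x->[$a];
	$x->[$d]=($x->[$d]<<8|$x->[$d]>>24)&0xffffffff;
	$x->[$c]=($x->[$c]+$x->[$d])&0xffffffff; $x->[$b]^=$x->[$c];
	$x->[$b]=($x->[$b]<<7|$x->[$b]>>25)&0xffffffff;
}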
221
222 ########################################################################
223 # Generic code path that handles all lengths on pre-SSSE3 processors.
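# (Bit 41 of OPENSSL_ia32cap_P, tested below as bit 9 of the dword at
# offset 4, is the SSSE3 capability flag, so ChaCha20_ctr32 only falls
# through to this integer code on pre-SSSE3 parts.)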
224 $code.=<<___;
225 .globl  ChaCha20_ctr32
226 .type   ChaCha20_ctr32,\@function,5
227 .align  64
228 ChaCha20_ctr32:
229         cmp     \$0,$len
230         je      .Lno_data
231         mov     OPENSSL_ia32cap_P+4(%rip),%r10
232         test    \$`1<<(41-32)`,%r10d
233         jnz     .LChaCha20_ssse3
234
235         push    %rbx
236         push    %rbp
237         push    %r12
238         push    %r13
239         push    %r14
240         push    %r15
241         sub     \$64+24,%rsp
242
243         #movdqa .Lsigma(%rip),%xmm0
244         movdqu  ($key),%xmm1
245         movdqu  16($key),%xmm2
246         movdqu  ($counter),%xmm3
247         movdqa  .Lone(%rip),%xmm4
248
249         #movdqa %xmm0,4*0(%rsp)         # key[0]
250         movdqa  %xmm1,4*4(%rsp)         # key[1]
251         movdqa  %xmm2,4*8(%rsp)         # key[2]
252         movdqa  %xmm3,4*12(%rsp)        # key[3]
253         mov     $len,%rbp               # reassign $len
254         jmp     .Loop_outer
255
256 .align  32
257 .Loop_outer:
258         mov     \$0x61707865,@x[0]      # 'expa'
259         mov     \$0x3320646e,@x[1]      # 'nd 3'
260         mov     \$0x79622d32,@x[2]      # '2-by'
261         mov     \$0x6b206574,@x[3]      # 'te k'
262         mov     4*4(%rsp),@x[4]
263         mov     4*5(%rsp),@x[5]
264         mov     4*6(%rsp),@x[6]
265         mov     4*7(%rsp),@x[7]
266         movd    %xmm3,@x[12]
267         mov     4*13(%rsp),@x[13]
268         mov     4*14(%rsp),@x[14]
269         mov     4*15(%rsp),@x[15]
270
271         mov     %rbp,64+0(%rsp)         # save len
272         mov     \$10,%ebp
273         mov     $inp,64+8(%rsp)         # save inp
274         movq    %xmm2,%rsi              # "@x[8]"
275         mov     $out,64+16(%rsp)        # save out
276         mov     %rsi,%rdi
277         shr     \$32,%rdi               # "@x[9]"
278         jmp     .Loop
279
280 .align  32
281 .Loop:
282 ___
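        # Ten iterations of the pair below give ChaCha20's twenty rounds:
        # ROUND(0,4,8,12) is the "even"/column round and ROUND(0,5,10,15)
        # the "odd"/diagonal round from the addressing table above.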
283         foreach (&ROUND (0, 4, 8,12)) { eval; }
284         foreach (&ROUND (0, 5,10,15)) { eval; }
285         &dec    ("%ebp");
286         &jnz    (".Loop");
287
288 $code.=<<___;
289         mov     @t[1],4*9(%rsp)         # modulo-scheduled
290         mov     @t[0],4*8(%rsp)
291         mov     64(%rsp),%rbp           # load len
292         movdqa  %xmm2,%xmm1
293         mov     64+8(%rsp),$inp         # load inp
294         paddd   %xmm4,%xmm3             # increment counter
295         mov     64+16(%rsp),$out        # load out
296
297         add     \$0x61707865,@x[0]      # 'expa'
298         add     \$0x3320646e,@x[1]      # 'nd 3'
299         add     \$0x79622d32,@x[2]      # '2-by'
300         add     \$0x6b206574,@x[3]      # 'te k'
301         add     4*4(%rsp),@x[4]
302         add     4*5(%rsp),@x[5]
303         add     4*6(%rsp),@x[6]
304         add     4*7(%rsp),@x[7]
305         add     4*12(%rsp),@x[12]
306         add     4*13(%rsp),@x[13]
307         add     4*14(%rsp),@x[14]
308         add     4*15(%rsp),@x[15]
309         paddd   4*8(%rsp),%xmm1
310
311         cmp     \$64,%rbp
312         jb      .Ltail
313
314         xor     4*0($inp),@x[0]         # xor with input
315         xor     4*1($inp),@x[1]
316         xor     4*2($inp),@x[2]
317         xor     4*3($inp),@x[3]
318         xor     4*4($inp),@x[4]
319         xor     4*5($inp),@x[5]
320         xor     4*6($inp),@x[6]
321         xor     4*7($inp),@x[7]
322         movdqu  4*8($inp),%xmm0
323         xor     4*12($inp),@x[12]
324         xor     4*13($inp),@x[13]
325         xor     4*14($inp),@x[14]
326         xor     4*15($inp),@x[15]
327         lea     4*16($inp),$inp         # inp+=64
328         pxor    %xmm1,%xmm0
329
330         movdqa  %xmm2,4*8(%rsp)
331         movd    %xmm3,4*12(%rsp)
332
333         mov     @x[0],4*0($out)         # write output
334         mov     @x[1],4*1($out)
335         mov     @x[2],4*2($out)
336         mov     @x[3],4*3($out)
337         mov     @x[4],4*4($out)
338         mov     @x[5],4*5($out)
339         mov     @x[6],4*6($out)
340         mov     @x[7],4*7($out)
341         movdqu  %xmm0,4*8($out)
342         mov     @x[12],4*12($out)
343         mov     @x[13],4*13($out)
344         mov     @x[14],4*14($out)
345         mov     @x[15],4*15($out)
346         lea     4*16($out),$out         # out+=64
347
348         sub     \$64,%rbp
349         jnz     .Loop_outer
350
351         jmp     .Ldone
352
353 .align  16
354 .Ltail:
355         mov     @x[0],4*0(%rsp)
356         mov     @x[1],4*1(%rsp)
357         xor     %rbx,%rbx
358         mov     @x[2],4*2(%rsp)
359         mov     @x[3],4*3(%rsp)
360         mov     @x[4],4*4(%rsp)
361         mov     @x[5],4*5(%rsp)
362         mov     @x[6],4*6(%rsp)
363         mov     @x[7],4*7(%rsp)
364         movdqa  %xmm1,4*8(%rsp)
365         mov     @x[12],4*12(%rsp)
366         mov     @x[13],4*13(%rsp)
367         mov     @x[14],4*14(%rsp)
368         mov     @x[15],4*15(%rsp)
369
370 .Loop_tail:
371         movzb   ($inp,%rbx),%eax
372         movzb   (%rsp,%rbx),%edx
373         lea     1(%rbx),%rbx
374         xor     %edx,%eax
375         mov     %al,-1($out,%rbx)
376         dec     %rbp
377         jnz     .Loop_tail
378
379 .Ldone:
380         add     \$64+24,%rsp
381         pop     %r15
382         pop     %r14
383         pop     %r13
384         pop     %r12
385         pop     %rbp
386         pop     %rbx
387 .Lno_data:
388         ret
389 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
390 ___
391
392 ########################################################################
393 # SSSE3 code path that handles shorter lengths
394 {
395 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
396
397 sub SSSE3ROUND {        # critical path is 20 "SIMD ticks" per round
398         &paddd  ($a,$b);
399         &pxor   ($d,$a);
400         &pshufb ($d,$rot16);
401
402         &paddd  ($c,$d);
403         &pxor   ($b,$c);
404         &movdqa ($t,$b);
405         &psrld  ($b,20);
406         &pslld  ($t,12);
407         &por    ($b,$t);
408
409         &paddd  ($a,$b);
410         &pxor   ($d,$a);
411         &pshufb ($d,$rot24);
412
413         &paddd  ($c,$d);
414         &pxor   ($b,$c);
415         &movdqa ($t,$b);
416         &psrld  ($b,25);
417         &pslld  ($t,7);
418         &por    ($b,$t);
419 }
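# Note on the rotations above: the by-16 and by-24 rotations move whole
# bytes, so a single pshufb against the .Lrot16/.Lrot24 masks suffices,
# while the by-12 and by-7 rotations need the classic pslld/psrld/por
# sequence. The pshufd shuffles in .Loop_ssse3 below rotate the b/c/d
# rows between the column and diagonal arrangements of a double round,
# which is what lets one SSSE3ROUND body serve both half-rounds.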
420
421 my $xframe = $win64 ? 32+32+8 : 24;
422
423 $code.=<<___;
424 .type   ChaCha20_ssse3,\@function,5
425 .align  32
426 ChaCha20_ssse3:
427 .LChaCha20_ssse3:
428 ___
429 $code.=<<___    if ($avx);
430         test    \$`1<<(43-32)`,%r10d
431         jnz     .LChaCha20_4xop         # XOP is fastest even if we use 1/4
432 ___
433 $code.=<<___;
434         cmp     \$128,$len              # we might throw away some data,
435         ja      .LChaCha20_4x           # but overall it won't be slower
436
437 .Ldo_sse3_after_all:
438         push    %rbx
439         push    %rbp
440         push    %r12
441         push    %r13
442         push    %r14
443         push    %r15
444
445         sub     \$64+$xframe,%rsp
446 ___
447 $code.=<<___    if ($win64);
448         movaps  %xmm6,64+32(%rsp)
449         movaps  %xmm7,64+48(%rsp)
450 ___
451 $code.=<<___;
452         movdqa  .Lsigma(%rip),$a
453         movdqu  ($key),$b
454         movdqu  16($key),$c
455         movdqu  ($counter),$d
456         movdqa  .Lrot16(%rip),$rot16
457         movdqa  .Lrot24(%rip),$rot24
458
459         movdqa  $a,0x00(%rsp)
460         movdqa  $b,0x10(%rsp)
461         movdqa  $c,0x20(%rsp)
462         movdqa  $d,0x30(%rsp)
463         mov     \$10,%ebp
464         jmp     .Loop_ssse3
465
466 .align  32
467 .Loop_outer_ssse3:
468         movdqa  .Lone(%rip),$d
469         movdqa  0x00(%rsp),$a
470         movdqa  0x10(%rsp),$b
471         movdqa  0x20(%rsp),$c
472         paddd   0x30(%rsp),$d
473         mov     \$10,%ebp
474         movdqa  $d,0x30(%rsp)
475         jmp     .Loop_ssse3
476
477 .align  32
478 .Loop_ssse3:
479 ___
480         &SSSE3ROUND();
481         &pshufd ($c,$c,0b01001110);
482         &pshufd ($b,$b,0b00111001);
483         &pshufd ($d,$d,0b10010011);
484         &nop    ();
485
486         &SSSE3ROUND();
487         &pshufd ($c,$c,0b01001110);
488         &pshufd ($b,$b,0b10010011);
489         &pshufd ($d,$d,0b00111001);
490
491         &dec    ("%ebp");
492         &jnz    (".Loop_ssse3");
493
494 $code.=<<___;
495         paddd   0x00(%rsp),$a
496         paddd   0x10(%rsp),$b
497         paddd   0x20(%rsp),$c
498         paddd   0x30(%rsp),$d
499
500         cmp     \$64,$len
501         jb      .Ltail_ssse3
502
503         movdqu  0x00($inp),$t
504         movdqu  0x10($inp),$t1
505         pxor    $t,$a                   # xor with input
506         movdqu  0x20($inp),$t
507         pxor    $t1,$b
508         movdqu  0x30($inp),$t1
509         lea     0x40($inp),$inp         # inp+=64
510         pxor    $t,$c
511         pxor    $t1,$d
512
513         movdqu  $a,0x00($out)           # write output
514         movdqu  $b,0x10($out)
515         movdqu  $c,0x20($out)
516         movdqu  $d,0x30($out)
517         lea     0x40($out),$out         # out+=64
518
519         sub     \$64,$len
520         jnz     .Loop_outer_ssse3
521
522         jmp     .Ldone_ssse3
523
524 .align  16
525 .Ltail_ssse3:
526         movdqa  $a,0x00(%rsp)
527         movdqa  $b,0x10(%rsp)
528         movdqa  $c,0x20(%rsp)
529         movdqa  $d,0x30(%rsp)
530         xor     %rbx,%rbx
531
532 .Loop_tail_ssse3:
533         movzb   ($inp,%rbx),%eax
534         movzb   (%rsp,%rbx),%ecx
535         lea     1(%rbx),%rbx
536         xor     %ecx,%eax
537         mov     %al,-1($out,%rbx)
538         dec     $len
539         jnz     .Loop_tail_ssse3
540
541 .Ldone_ssse3:
542 ___
543 $code.=<<___    if ($win64);
544         movaps  64+32(%rsp),%xmm6
545         movaps  64+48(%rsp),%xmm7
546 ___
547 $code.=<<___;
548         add     \$64+$xframe,%rsp
549         pop     %r15
550         pop     %r14
551         pop     %r13
552         pop     %r12
553         pop     %rbp
554         pop     %rbx
555         ret
556 .size   ChaCha20_ssse3,.-ChaCha20_ssse3
557 ___
558 }
559
560 ########################################################################
561 # SSSE3 code path that handles longer messages.
562 {
563 # assign variables to favor Atom front-end
564 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
565     $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
566 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
567         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
568
569 sub SSSE3_lane_ROUND {
570 my ($a0,$b0,$c0,$d0)=@_;
571 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
572 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
573 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
574 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
575 my @x=map("\"$_\"",@xx);
576
577         # Consider the order in which variables are addressed by their
578         # index:
579         #
580         #       a   b   c   d
581         #
582         #       0   4   8  12 < even round
583         #       1   5   9  13
584         #       2   6  10  14
585         #       3   7  11  15
586         #       0   5  10  15 < odd round
587         #       1   6  11  12
588         #       2   7   8  13
589         #       3   4   9  14
590         #
591         # 'a', 'b' and 'd's are permanently allocated in registers,
592         # @x[0..7,12..15], while 'c's are maintained in memory. If
593         # you observe the 'c' column, you'll notice that a pair of 'c's
594         # is invariant between rounds. This means that we have to reload
595         # them only once per round, in the middle. This is why you'll see
596         # a bunch of 'c' stores and loads in the middle, but none at
597         # the beginning or end.
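        # In this 4x path the four 'c' lanes live in the first 64 bytes
        # of the stack frame, addressed as 16*($c-8)(%rsp); e.g. index 10
        # maps to 0x20(%rsp), which is exactly the "SIMD equivalent of
        # @x[10]" slot written in .Loop_enter4x below.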
598
599         (
600         "&paddd         (@x[$a0],@x[$b0])",     # Q1
601          "&paddd        (@x[$a1],@x[$b1])",     # Q2
602         "&pxor          (@x[$d0],@x[$a0])",
603          "&pxor         (@x[$d1],@x[$a1])",
604         "&pshufb        (@x[$d0],$t1)",
605          "&pshufb       (@x[$d1],$t1)",
606
607         "&paddd         ($xc,@x[$d0])",
608          "&paddd        ($xc_,@x[$d1])",
609         "&pxor          (@x[$b0],$xc)",
610          "&pxor         (@x[$b1],$xc_)",
611         "&movdqa        ($t0,@x[$b0])",
612         "&pslld         (@x[$b0],12)",
613         "&psrld         ($t0,20)",
614          "&movdqa       ($t1,@x[$b1])",
615          "&pslld        (@x[$b1],12)",
616         "&por           (@x[$b0],$t0)",
617          "&psrld        ($t1,20)",
618         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
619          "&por          (@x[$b1],$t1)",
620
621         "&paddd         (@x[$a0],@x[$b0])",
622          "&paddd        (@x[$a1],@x[$b1])",
623         "&pxor          (@x[$d0],@x[$a0])",
624          "&pxor         (@x[$d1],@x[$a1])",
625         "&pshufb        (@x[$d0],$t0)",
626          "&pshufb       (@x[$d1],$t0)",
627
628         "&paddd         ($xc,@x[$d0])",
629          "&paddd        ($xc_,@x[$d1])",
630         "&pxor          (@x[$b0],$xc)",
631          "&pxor         (@x[$b1],$xc_)",
632         "&movdqa        ($t1,@x[$b0])",
633         "&pslld         (@x[$b0],7)",
634         "&psrld         ($t1,25)",
635          "&movdqa       ($t0,@x[$b1])",
636          "&pslld        (@x[$b1],7)",
637         "&por           (@x[$b0],$t1)",
638          "&psrld        ($t0,25)",
639         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
640          "&por          (@x[$b1],$t0)",
641
642         "&movdqa        (\"`16*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
643          "&movdqa       (\"`16*($c1-8)`(%rsp)\",$xc_)",
644         "&movdqa        ($xc,\"`16*($c2-8)`(%rsp)\")",
645          "&movdqa       ($xc_,\"`16*($c3-8)`(%rsp)\")",
646
647         "&paddd         (@x[$a2],@x[$b2])",     # Q3
648          "&paddd        (@x[$a3],@x[$b3])",     # Q4
649         "&pxor          (@x[$d2],@x[$a2])",
650          "&pxor         (@x[$d3],@x[$a3])",
651         "&pshufb        (@x[$d2],$t1)",
652          "&pshufb       (@x[$d3],$t1)",
653
654         "&paddd         ($xc,@x[$d2])",
655          "&paddd        ($xc_,@x[$d3])",
656         "&pxor          (@x[$b2],$xc)",
657          "&pxor         (@x[$b3],$xc_)",
658         "&movdqa        ($t0,@x[$b2])",
659         "&pslld         (@x[$b2],12)",
660         "&psrld         ($t0,20)",
661          "&movdqa       ($t1,@x[$b3])",
662          "&pslld        (@x[$b3],12)",
663         "&por           (@x[$b2],$t0)",
664          "&psrld        ($t1,20)",
665         "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
666          "&por          (@x[$b3],$t1)",
667
668         "&paddd         (@x[$a2],@x[$b2])",
669          "&paddd        (@x[$a3],@x[$b3])",
670         "&pxor          (@x[$d2],@x[$a2])",
671          "&pxor         (@x[$d3],@x[$a3])",
672         "&pshufb        (@x[$d2],$t0)",
673          "&pshufb       (@x[$d3],$t0)",
674
675         "&paddd         ($xc,@x[$d2])",
676          "&paddd        ($xc_,@x[$d3])",
677         "&pxor          (@x[$b2],$xc)",
678          "&pxor         (@x[$b3],$xc_)",
679         "&movdqa        ($t1,@x[$b2])",
680         "&pslld         (@x[$b2],7)",
681         "&psrld         ($t1,25)",
682          "&movdqa       ($t0,@x[$b3])",
683          "&pslld        (@x[$b3],7)",
684         "&por           (@x[$b2],$t1)",
685          "&psrld        ($t0,25)",
686         "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
687          "&por          (@x[$b3],$t0)"
688         );
689 }
690
691 my $xframe = $win64 ? 0xa0 : 0;
692
693 $code.=<<___;
694 .type   ChaCha20_4x,\@function,5
695 .align  32
696 ChaCha20_4x:
697 .LChaCha20_4x:
698         mov             %r10,%r11
699 ___
700 $code.=<<___    if ($avx>1);
701         shr             \$32,%r10               # OPENSSL_ia32cap_P+8
702         test            \$`1<<5`,%r10           # test AVX2
703         jnz             .LChaCha20_8x
704 ___
705 $code.=<<___;
706         cmp             \$192,$len
707         ja              .Lproceed4x
708
709         and             \$`1<<26|1<<22`,%r11    # isolate XSAVE+MOVBE
710         cmp             \$`1<<22`,%r11          # check for MOVBE without XSAVE
711         je              .Ldo_sse3_after_all     # to detect Atom
712
713 .Lproceed4x:
714         lea             -0x78(%rsp),%r11
715         sub             \$0x148+$xframe,%rsp
716 ___
717         ################ stack layout
718         # +0x00         SIMD equivalent of @x[8-11]
719         # ...
720         # +0x40         constant copy of key[0-2] smashed by lanes
721         # ...
722         # +0x100        SIMD counters (with nonce smashed by lanes)
723         # ...
724         # +0x140
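        # That is 0x40 bytes of scratch, 0xc0 bytes of smashed key and
        # 0x40 bytes of counters, 0x140 in total; the extra 8 in the
        # 0x148 adjustment above restores 16-byte stack alignment after
        # the call, which the aligned movdqa accesses below rely on.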
725 $code.=<<___    if ($win64);
726         movaps          %xmm6,-0x30(%r11)
727         movaps          %xmm7,-0x20(%r11)
728         movaps          %xmm8,-0x10(%r11)
729         movaps          %xmm9,0x00(%r11)
730         movaps          %xmm10,0x10(%r11)
731         movaps          %xmm11,0x20(%r11)
732         movaps          %xmm12,0x30(%r11)
733         movaps          %xmm13,0x40(%r11)
734         movaps          %xmm14,0x50(%r11)
735         movaps          %xmm15,0x60(%r11)
736 ___
737 $code.=<<___;
738         movdqa          .Lsigma(%rip),$xa3      # key[0]
739         movdqu          ($key),$xb3             # key[1]
740         movdqu          16($key),$xt3           # key[2]
741         movdqu          ($counter),$xd3         # key[3]
742         lea             0x100(%rsp),%rcx        # size optimization
743         lea             .Lrot16(%rip),%r10
744         lea             .Lrot24(%rip),%r11
745
746         pshufd          \$0x00,$xa3,$xa0        # smash key by lanes...
747         pshufd          \$0x55,$xa3,$xa1
748         movdqa          $xa0,0x40(%rsp)         # ... and offload
749         pshufd          \$0xaa,$xa3,$xa2
750         movdqa          $xa1,0x50(%rsp)
751         pshufd          \$0xff,$xa3,$xa3
752         movdqa          $xa2,0x60(%rsp)
753         movdqa          $xa3,0x70(%rsp)
754
755         pshufd          \$0x00,$xb3,$xb0
756         pshufd          \$0x55,$xb3,$xb1
757         movdqa          $xb0,0x80-0x100(%rcx)
758         pshufd          \$0xaa,$xb3,$xb2
759         movdqa          $xb1,0x90-0x100(%rcx)
760         pshufd          \$0xff,$xb3,$xb3
761         movdqa          $xb2,0xa0-0x100(%rcx)
762         movdqa          $xb3,0xb0-0x100(%rcx)
763
764         pshufd          \$0x00,$xt3,$xt0        # "$xc0"
765         pshufd          \$0x55,$xt3,$xt1        # "$xc1"
766         movdqa          $xt0,0xc0-0x100(%rcx)
767         pshufd          \$0xaa,$xt3,$xt2        # "$xc2"
768         movdqa          $xt1,0xd0-0x100(%rcx)
769         pshufd          \$0xff,$xt3,$xt3        # "$xc3"
770         movdqa          $xt2,0xe0-0x100(%rcx)
771         movdqa          $xt3,0xf0-0x100(%rcx)
772
773         pshufd          \$0x00,$xd3,$xd0
774         pshufd          \$0x55,$xd3,$xd1
775         paddd           .Linc(%rip),$xd0        # don't save counters yet
776         pshufd          \$0xaa,$xd3,$xd2
777         movdqa          $xd1,0x110-0x100(%rcx)
778         pshufd          \$0xff,$xd3,$xd3
779         movdqa          $xd2,0x120-0x100(%rcx)
780         movdqa          $xd3,0x130-0x100(%rcx)
781
782         jmp             .Loop_enter4x
783
784 .align  32
785 .Loop_outer4x:
786         movdqa          0x40(%rsp),$xa0         # re-load smashed key
787         movdqa          0x50(%rsp),$xa1
788         movdqa          0x60(%rsp),$xa2
789         movdqa          0x70(%rsp),$xa3
790         movdqa          0x80-0x100(%rcx),$xb0
791         movdqa          0x90-0x100(%rcx),$xb1
792         movdqa          0xa0-0x100(%rcx),$xb2
793         movdqa          0xb0-0x100(%rcx),$xb3
794         movdqa          0xc0-0x100(%rcx),$xt0   # "$xc0"
795         movdqa          0xd0-0x100(%rcx),$xt1   # "$xc1"
796         movdqa          0xe0-0x100(%rcx),$xt2   # "$xc2"
797         movdqa          0xf0-0x100(%rcx),$xt3   # "$xc3"
798         movdqa          0x100-0x100(%rcx),$xd0
799         movdqa          0x110-0x100(%rcx),$xd1
800         movdqa          0x120-0x100(%rcx),$xd2
801         movdqa          0x130-0x100(%rcx),$xd3
802         paddd           .Lfour(%rip),$xd0       # next SIMD counters
803
804 .Loop_enter4x:
805         movdqa          $xt2,0x20(%rsp)         # SIMD equivalent of "@x[10]"
806         movdqa          $xt3,0x30(%rsp)         # SIMD equivalent of "@x[11]"
807         movdqa          (%r10),$xt3             # .Lrot16(%rip)
808         mov             \$10,%eax
809         movdqa          $xd0,0x100-0x100(%rcx)  # save SIMD counters
810         jmp             .Loop4x
811
812 .align  32
813 .Loop4x:
814 ___
815         foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
816         foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
817 $code.=<<___;
818         dec             %eax
819         jnz             .Loop4x
820
821         paddd           0x40(%rsp),$xa0         # accumulate key material
822         paddd           0x50(%rsp),$xa1
823         paddd           0x60(%rsp),$xa2
824         paddd           0x70(%rsp),$xa3
825
826         movdqa          $xa0,$xt2               # "de-interlace" data
827         punpckldq       $xa1,$xa0
828         movdqa          $xa2,$xt3
829         punpckldq       $xa3,$xa2
830         punpckhdq       $xa1,$xt2
831         punpckhdq       $xa3,$xt3
832         movdqa          $xa0,$xa1
833         punpcklqdq      $xa2,$xa0               # "a0"
834         movdqa          $xt2,$xa3
835         punpcklqdq      $xt3,$xt2               # "a2"
836         punpckhqdq      $xa2,$xa1               # "a1"
837         punpckhqdq      $xt3,$xa3               # "a3"
838 ___
839         ($xa2,$xt2)=($xt2,$xa2);
840 $code.=<<___;
841         paddd           0x80-0x100(%rcx),$xb0
842         paddd           0x90-0x100(%rcx),$xb1
843         paddd           0xa0-0x100(%rcx),$xb2
844         paddd           0xb0-0x100(%rcx),$xb3
845
846         movdqa          $xa0,0x00(%rsp)         # offload $xaN
847         movdqa          $xa1,0x10(%rsp)
848         movdqa          0x20(%rsp),$xa0         # "xc2"
849         movdqa          0x30(%rsp),$xa1         # "xc3"
850
851         movdqa          $xb0,$xt2
852         punpckldq       $xb1,$xb0
853         movdqa          $xb2,$xt3
854         punpckldq       $xb3,$xb2
855         punpckhdq       $xb1,$xt2
856         punpckhdq       $xb3,$xt3
857         movdqa          $xb0,$xb1
858         punpcklqdq      $xb2,$xb0               # "b0"
859         movdqa          $xt2,$xb3
860         punpcklqdq      $xt3,$xt2               # "b2"
861         punpckhqdq      $xb2,$xb1               # "b1"
862         punpckhqdq      $xt3,$xb3               # "b3"
863 ___
864         ($xb2,$xt2)=($xt2,$xb2);
865         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
866 $code.=<<___;
867         paddd           0xc0-0x100(%rcx),$xc0
868         paddd           0xd0-0x100(%rcx),$xc1
869         paddd           0xe0-0x100(%rcx),$xc2
870         paddd           0xf0-0x100(%rcx),$xc3
871
872         movdqa          $xa2,0x20(%rsp)         # keep offloading $xaN
873         movdqa          $xa3,0x30(%rsp)
874
875         movdqa          $xc0,$xt2
876         punpckldq       $xc1,$xc0
877         movdqa          $xc2,$xt3
878         punpckldq       $xc3,$xc2
879         punpckhdq       $xc1,$xt2
880         punpckhdq       $xc3,$xt3
881         movdqa          $xc0,$xc1
882         punpcklqdq      $xc2,$xc0               # "c0"
883         movdqa          $xt2,$xc3
884         punpcklqdq      $xt3,$xt2               # "c2"
885         punpckhqdq      $xc2,$xc1               # "c1"
886         punpckhqdq      $xt3,$xc3               # "c3"
887 ___
888         ($xc2,$xt2)=($xt2,$xc2);
889         ($xt0,$xt1)=($xa2,$xa3);                # use $xaN as temporary
890 $code.=<<___;
891         paddd           0x100-0x100(%rcx),$xd0
892         paddd           0x110-0x100(%rcx),$xd1
893         paddd           0x120-0x100(%rcx),$xd2
894         paddd           0x130-0x100(%rcx),$xd3
895
896         movdqa          $xd0,$xt2
897         punpckldq       $xd1,$xd0
898         movdqa          $xd2,$xt3
899         punpckldq       $xd3,$xd2
900         punpckhdq       $xd1,$xt2
901         punpckhdq       $xd3,$xt3
902         movdqa          $xd0,$xd1
903         punpcklqdq      $xd2,$xd0               # "d0"
904         movdqa          $xt2,$xd3
905         punpcklqdq      $xt3,$xt2               # "d2"
906         punpckhqdq      $xd2,$xd1               # "d1"
907         punpckhqdq      $xt3,$xd3               # "d3"
908 ___
909         ($xd2,$xt2)=($xt2,$xd2);
910 $code.=<<___;
911         cmp             \$64*4,$len
912         jb              .Ltail4x
913
914         movdqu          0x00($inp),$xt0         # xor with input
915         movdqu          0x10($inp),$xt1
916         movdqu          0x20($inp),$xt2
917         movdqu          0x30($inp),$xt3
918         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
919         pxor            $xb0,$xt1
920         pxor            $xc0,$xt2
921         pxor            $xd0,$xt3
922
923          movdqu         $xt0,0x00($out)
924         movdqu          0x40($inp),$xt0
925          movdqu         $xt1,0x10($out)
926         movdqu          0x50($inp),$xt1
927          movdqu         $xt2,0x20($out)
928         movdqu          0x60($inp),$xt2
929          movdqu         $xt3,0x30($out)
930         movdqu          0x70($inp),$xt3
931         lea             0x80($inp),$inp         # size optimization
932         pxor            0x10(%rsp),$xt0
933         pxor            $xb1,$xt1
934         pxor            $xc1,$xt2
935         pxor            $xd1,$xt3
936
937          movdqu         $xt0,0x40($out)
938         movdqu          0x00($inp),$xt0
939          movdqu         $xt1,0x50($out)
940         movdqu          0x10($inp),$xt1
941          movdqu         $xt2,0x60($out)
942         movdqu          0x20($inp),$xt2
943          movdqu         $xt3,0x70($out)
944          lea            0x80($out),$out         # size optimization
945         movdqu          0x30($inp),$xt3
946         pxor            0x20(%rsp),$xt0
947         pxor            $xb2,$xt1
948         pxor            $xc2,$xt2
949         pxor            $xd2,$xt3
950
951          movdqu         $xt0,0x00($out)
952         movdqu          0x40($inp),$xt0
953          movdqu         $xt1,0x10($out)
954         movdqu          0x50($inp),$xt1
955          movdqu         $xt2,0x20($out)
956         movdqu          0x60($inp),$xt2
957          movdqu         $xt3,0x30($out)
958         movdqu          0x70($inp),$xt3
959         lea             0x80($inp),$inp         # inp+=64*4
960         pxor            0x30(%rsp),$xt0
961         pxor            $xb3,$xt1
962         pxor            $xc3,$xt2
963         pxor            $xd3,$xt3
964         movdqu          $xt0,0x40($out)
965         movdqu          $xt1,0x50($out)
966         movdqu          $xt2,0x60($out)
967         movdqu          $xt3,0x70($out)
968         lea             0x80($out),$out         # out+=64*4
969
970         sub             \$64*4,$len
971         jnz             .Loop_outer4x
972
973         jmp             .Ldone4x
974
975 .Ltail4x:
976         cmp             \$192,$len
977         jae             .L192_or_more4x
978         cmp             \$128,$len
979         jae             .L128_or_more4x
980         cmp             \$64,$len
981         jae             .L64_or_more4x
982
983         #movdqa         0x00(%rsp),$xt0         # $xaN is offloaded, remember?
984         xor             %r10,%r10
985         #movdqa         $xt0,0x00(%rsp)
986         movdqa          $xb0,0x10(%rsp)
987         movdqa          $xc0,0x20(%rsp)
988         movdqa          $xd0,0x30(%rsp)
989         jmp             .Loop_tail4x
990
991 .align  32
992 .L64_or_more4x:
993         movdqu          0x00($inp),$xt0         # xor with input
994         movdqu          0x10($inp),$xt1
995         movdqu          0x20($inp),$xt2
996         movdqu          0x30($inp),$xt3
997         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
998         pxor            $xb0,$xt1
999         pxor            $xc0,$xt2
1000         pxor            $xd0,$xt3
1001         movdqu          $xt0,0x00($out)
1002         movdqu          $xt1,0x10($out)
1003         movdqu          $xt2,0x20($out)
1004         movdqu          $xt3,0x30($out)
1005         je              .Ldone4x
1006
1007         movdqa          0x10(%rsp),$xt0         # $xaN is offloaded, remember?
1008         lea             0x40($inp),$inp         # inp+=64*1
1009         xor             %r10,%r10
1010         movdqa          $xt0,0x00(%rsp)
1011         movdqa          $xb1,0x10(%rsp)
1012         lea             0x40($out),$out         # out+=64*1
1013         movdqa          $xc1,0x20(%rsp)
1014         sub             \$64,$len               # len-=64*1
1015         movdqa          $xd1,0x30(%rsp)
1016         jmp             .Loop_tail4x
1017
1018 .align  32
1019 .L128_or_more4x:
1020         movdqu          0x00($inp),$xt0         # xor with input
1021         movdqu          0x10($inp),$xt1
1022         movdqu          0x20($inp),$xt2
1023         movdqu          0x30($inp),$xt3
1024         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1025         pxor            $xb0,$xt1
1026         pxor            $xc0,$xt2
1027         pxor            $xd0,$xt3
1028
1029          movdqu         $xt0,0x00($out)
1030         movdqu          0x40($inp),$xt0
1031          movdqu         $xt1,0x10($out)
1032         movdqu          0x50($inp),$xt1
1033          movdqu         $xt2,0x20($out)
1034         movdqu          0x60($inp),$xt2
1035          movdqu         $xt3,0x30($out)
1036         movdqu          0x70($inp),$xt3
1037         pxor            0x10(%rsp),$xt0
1038         pxor            $xb1,$xt1
1039         pxor            $xc1,$xt2
1040         pxor            $xd1,$xt3
1041         movdqu          $xt0,0x40($out)
1042         movdqu          $xt1,0x50($out)
1043         movdqu          $xt2,0x60($out)
1044         movdqu          $xt3,0x70($out)
1045         je              .Ldone4x
1046
1047         movdqa          0x20(%rsp),$xt0         # $xaN is offloaded, remember?
1048         lea             0x80($inp),$inp         # inp+=64*2
1049         xor             %r10,%r10
1050         movdqa          $xt0,0x00(%rsp)
1051         movdqa          $xb2,0x10(%rsp)
1052         lea             0x80($out),$out         # out+=64*2
1053         movdqa          $xc2,0x20(%rsp)
1054         sub             \$128,$len              # len-=64*2
1055         movdqa          $xd2,0x30(%rsp)
1056         jmp             .Loop_tail4x
1057
1058 .align  32
1059 .L192_or_more4x:
1060         movdqu          0x00($inp),$xt0         # xor with input
1061         movdqu          0x10($inp),$xt1
1062         movdqu          0x20($inp),$xt2
1063         movdqu          0x30($inp),$xt3
1064         pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
1065         pxor            $xb0,$xt1
1066         pxor            $xc0,$xt2
1067         pxor            $xd0,$xt3
1068
1069          movdqu         $xt0,0x00($out)
1070         movdqu          0x40($inp),$xt0
1071          movdqu         $xt1,0x10($out)
1072         movdqu          0x50($inp),$xt1
1073          movdqu         $xt2,0x20($out)
1074         movdqu          0x60($inp),$xt2
1075          movdqu         $xt3,0x30($out)
1076         movdqu          0x70($inp),$xt3
1077         lea             0x80($inp),$inp         # size optimization
1078         pxor            0x10(%rsp),$xt0
1079         pxor            $xb1,$xt1
1080         pxor            $xc1,$xt2
1081         pxor            $xd1,$xt3
1082
1083          movdqu         $xt0,0x40($out)
1084         movdqu          0x00($inp),$xt0
1085          movdqu         $xt1,0x50($out)
1086         movdqu          0x10($inp),$xt1
1087          movdqu         $xt2,0x60($out)
1088         movdqu          0x20($inp),$xt2
1089          movdqu         $xt3,0x70($out)
1090          lea            0x80($out),$out         # size optimization
1091         movdqu          0x30($inp),$xt3
1092         pxor            0x20(%rsp),$xt0
1093         pxor            $xb2,$xt1
1094         pxor            $xc2,$xt2
1095         pxor            $xd2,$xt3
1096         movdqu          $xt0,0x00($out)
1097         movdqu          $xt1,0x10($out)
1098         movdqu          $xt2,0x20($out)
1099         movdqu          $xt3,0x30($out)
1100         je              .Ldone4x
1101
1102         movdqa          0x30(%rsp),$xt0         # $xaN is offloaded, remember?
1103         lea             0x40($inp),$inp         # inp+=64*3
1104         xor             %r10,%r10
1105         movdqa          $xt0,0x00(%rsp)
1106         movdqa          $xb3,0x10(%rsp)
1107         lea             0x40($out),$out         # out+=64*3
1108         movdqa          $xc3,0x20(%rsp)
1109         sub             \$192,$len              # len-=64*3
1110         movdqa          $xd3,0x30(%rsp)
1111
1112 .Loop_tail4x:
1113         movzb           ($inp,%r10),%eax
1114         movzb           (%rsp,%r10),%ecx
1115         lea             1(%r10),%r10
1116         xor             %ecx,%eax
1117         mov             %al,-1($out,%r10)
1118         dec             $len
1119         jnz             .Loop_tail4x
1120
1121 .Ldone4x:
1122 ___
1123 $code.=<<___    if ($win64);
1124         lea             0x140+0x30(%rsp),%r11
1125         movaps          -0x30(%r11),%xmm6
1126         movaps          -0x20(%r11),%xmm7
1127         movaps          -0x10(%r11),%xmm8
1128         movaps          0x00(%r11),%xmm9
1129         movaps          0x10(%r11),%xmm10
1130         movaps          0x20(%r11),%xmm11
1131         movaps          0x30(%r11),%xmm12
1132         movaps          0x40(%r11),%xmm13
1133         movaps          0x50(%r11),%xmm14
1134         movaps          0x60(%r11),%xmm15
1135 ___
1136 $code.=<<___;
1137         add             \$0x148+$xframe,%rsp
1138         ret
1139 .size   ChaCha20_4x,.-ChaCha20_4x
1140 ___
1141 }
1142
1143 ########################################################################
1144 # XOP code path that handles all lengths.
1145 if ($avx) {
1146 # There is some "anomaly" observed depending on instruction size or
1147 # alignment. If you look closely at the code below you'll notice that
1148 # the argument order sometimes varies. The order affects instruction
1149 # encoding by making it larger, and such fiddling gives a 5% performance
1150 # improvement. This is on FX-4100...
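# (A plausible explanation, offered here as an editorial guess rather than
# the author's: with AVX/XOP the source placed in the ModRM.rm slot must be
# encodable without the extra VEX byte to keep the 2-byte VEX form, so
# swapping which source lands in rm versus VEX.vvvv can change instruction
# length and hence fetch/alignment behaviour.)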
1151
1152 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1153     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1154 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1155          $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1156
1157 sub XOP_lane_ROUND {
1158 my ($a0,$b0,$c0,$d0)=@_;
1159 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1160 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1161 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1162 my @x=map("\"$_\"",@xx);
1163
1164         (
1165         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1166          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1167           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1168            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1169         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1170          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1171           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1172            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1173         "&vprotd        (@x[$d0],@x[$d0],16)",
1174          "&vprotd       (@x[$d1],@x[$d1],16)",
1175           "&vprotd      (@x[$d2],@x[$d2],16)",
1176            "&vprotd     (@x[$d3],@x[$d3],16)",
1177
1178         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1179          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1180           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1181            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1182         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1183          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1184           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1185            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1186         "&vprotd        (@x[$b0],@x[$b0],12)",
1187          "&vprotd       (@x[$b1],@x[$b1],12)",
1188           "&vprotd      (@x[$b2],@x[$b2],12)",
1189            "&vprotd     (@x[$b3],@x[$b3],12)",
1190
1191         "&vpaddd        (@x[$a0],@x[$b0],@x[$a0])",     # flip
1192          "&vpaddd       (@x[$a1],@x[$b1],@x[$a1])",     # flip
1193           "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
1194            "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
1195         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1196          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1197           "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
1198            "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
1199         "&vprotd        (@x[$d0],@x[$d0],8)",
1200          "&vprotd       (@x[$d1],@x[$d1],8)",
1201           "&vprotd      (@x[$d2],@x[$d2],8)",
1202            "&vprotd     (@x[$d3],@x[$d3],8)",
1203
1204         "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
1205          "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
1206           "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
1207            "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
1208         "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
1209          "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
1210           "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
1211            "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
1212         "&vprotd        (@x[$b0],@x[$b0],7)",
1213          "&vprotd       (@x[$b1],@x[$b1],7)",
1214           "&vprotd      (@x[$b2],@x[$b2],7)",
1215            "&vprotd     (@x[$b3],@x[$b3],7)"
1216         );
1217 }
1218
1219 my $xframe = $win64 ? 0xa0 : 0;
1220
1221 $code.=<<___;
1222 .type   ChaCha20_4xop,\@function,5
1223 .align  32
1224 ChaCha20_4xop:
1225 .LChaCha20_4xop:
1226         lea             -0x78(%rsp),%r11
1227         sub             \$0x148+$xframe,%rsp
1228 ___
1229         ################ stack layout
1230         # +0x00         SIMD equivalent of @x[8-11]
1231         # ...
1232         # +0x40         constant copy of key[0-2] smashed by lanes
1233         # ...
1234         # +0x100        SIMD counters (with nonce smashed by lanes)
1235         # ...
1236         # +0x140
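        # (Same stack layout as in ChaCha20_4x above; see the size
        # breakdown noted there.)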
1237 $code.=<<___    if ($win64);
1238         movaps          %xmm6,-0x30(%r11)
1239         movaps          %xmm7,-0x20(%r11)
1240         movaps          %xmm8,-0x10(%r11)
1241         movaps          %xmm9,0x00(%r11)
1242         movaps          %xmm10,0x10(%r11)
1243         movaps          %xmm11,0x20(%r11)
1244         movaps          %xmm12,0x30(%r11)
1245         movaps          %xmm13,0x40(%r11)
1246         movaps          %xmm14,0x50(%r11)
1247         movaps          %xmm15,0x60(%r11)
1248 ___
1249 $code.=<<___;
1250         vzeroupper
1251
1252         vmovdqa         .Lsigma(%rip),$xa3      # key[0]
1253         vmovdqu         ($key),$xb3             # key[1]
1254         vmovdqu         16($key),$xt3           # key[2]
1255         vmovdqu         ($counter),$xd3         # key[3]
1256         lea             0x100(%rsp),%rcx        # size optimization
1257
1258         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1259         vpshufd         \$0x55,$xa3,$xa1
1260         vmovdqa         $xa0,0x40(%rsp)         # ... and offload
1261         vpshufd         \$0xaa,$xa3,$xa2
1262         vmovdqa         $xa1,0x50(%rsp)
1263         vpshufd         \$0xff,$xa3,$xa3
1264         vmovdqa         $xa2,0x60(%rsp)
1265         vmovdqa         $xa3,0x70(%rsp)
1266
1267         vpshufd         \$0x00,$xb3,$xb0
1268         vpshufd         \$0x55,$xb3,$xb1
1269         vmovdqa         $xb0,0x80-0x100(%rcx)
1270         vpshufd         \$0xaa,$xb3,$xb2
1271         vmovdqa         $xb1,0x90-0x100(%rcx)
1272         vpshufd         \$0xff,$xb3,$xb3
1273         vmovdqa         $xb2,0xa0-0x100(%rcx)
1274         vmovdqa         $xb3,0xb0-0x100(%rcx)
1275
1276         vpshufd         \$0x00,$xt3,$xt0        # "$xc0"
1277         vpshufd         \$0x55,$xt3,$xt1        # "$xc1"
1278         vmovdqa         $xt0,0xc0-0x100(%rcx)
1279         vpshufd         \$0xaa,$xt3,$xt2        # "$xc2"
1280         vmovdqa         $xt1,0xd0-0x100(%rcx)
1281         vpshufd         \$0xff,$xt3,$xt3        # "$xc3"
1282         vmovdqa         $xt2,0xe0-0x100(%rcx)
1283         vmovdqa         $xt3,0xf0-0x100(%rcx)
1284
1285         vpshufd         \$0x00,$xd3,$xd0
1286         vpshufd         \$0x55,$xd3,$xd1
1287         vpaddd          .Linc(%rip),$xd0,$xd0   # don't save counters yet
1288         vpshufd         \$0xaa,$xd3,$xd2
1289         vmovdqa         $xd1,0x110-0x100(%rcx)
1290         vpshufd         \$0xff,$xd3,$xd3
1291         vmovdqa         $xd2,0x120-0x100(%rcx)
1292         vmovdqa         $xd3,0x130-0x100(%rcx)
1293
1294         jmp             .Loop_enter4xop
1295
1296 .align  32
1297 .Loop_outer4xop:
1298         vmovdqa         0x40(%rsp),$xa0         # re-load smashed key
1299         vmovdqa         0x50(%rsp),$xa1
1300         vmovdqa         0x60(%rsp),$xa2
1301         vmovdqa         0x70(%rsp),$xa3
1302         vmovdqa         0x80-0x100(%rcx),$xb0
1303         vmovdqa         0x90-0x100(%rcx),$xb1
1304         vmovdqa         0xa0-0x100(%rcx),$xb2
1305         vmovdqa         0xb0-0x100(%rcx),$xb3
1306         vmovdqa         0xc0-0x100(%rcx),$xt0   # "$xc0"
1307         vmovdqa         0xd0-0x100(%rcx),$xt1   # "$xc1"
1308         vmovdqa         0xe0-0x100(%rcx),$xt2   # "$xc2"
1309         vmovdqa         0xf0-0x100(%rcx),$xt3   # "$xc3"
1310         vmovdqa         0x100-0x100(%rcx),$xd0
1311         vmovdqa         0x110-0x100(%rcx),$xd1
1312         vmovdqa         0x120-0x100(%rcx),$xd2
1313         vmovdqa         0x130-0x100(%rcx),$xd3
1314         vpaddd          .Lfour(%rip),$xd0,$xd0  # next SIMD counters
1315
1316 .Loop_enter4xop:
1317         mov             \$10,%eax
1318         vmovdqa         $xd0,0x100-0x100(%rcx)  # save SIMD counters
1319         jmp             .Loop4xop
1320
1321 .align  32
1322 .Loop4xop:
1323 ___
1324         foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1325         foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1326 $code.=<<___;
1327         dec             %eax
1328         jnz             .Loop4xop
1329
1330         vpaddd          0x40(%rsp),$xa0,$xa0    # accumulate key material
1331         vpaddd          0x50(%rsp),$xa1,$xa1
1332         vpaddd          0x60(%rsp),$xa2,$xa2
1333         vpaddd          0x70(%rsp),$xa3,$xa3
1334
1335         vmovdqa         $xt2,0x20(%rsp)         # offload $xc2,3
1336         vmovdqa         $xt3,0x30(%rsp)
1337
1338         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1339         vpunpckldq      $xa3,$xa2,$xt3
1340         vpunpckhdq      $xa1,$xa0,$xa0
1341         vpunpckhdq      $xa3,$xa2,$xa2
1342         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1343         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1344         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1345         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1346 ___
1347         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1348 $code.=<<___;
1349         vpaddd          0x80-0x100(%rcx),$xb0,$xb0
1350         vpaddd          0x90-0x100(%rcx),$xb1,$xb1
1351         vpaddd          0xa0-0x100(%rcx),$xb2,$xb2
1352         vpaddd          0xb0-0x100(%rcx),$xb3,$xb3
1353
1354         vmovdqa         $xa0,0x00(%rsp)         # offload $xa0,1
1355         vmovdqa         $xa1,0x10(%rsp)
1356         vmovdqa         0x20(%rsp),$xa0         # "xc2"
1357         vmovdqa         0x30(%rsp),$xa1         # "xc3"
1358
1359         vpunpckldq      $xb1,$xb0,$xt2
1360         vpunpckldq      $xb3,$xb2,$xt3
1361         vpunpckhdq      $xb1,$xb0,$xb0
1362         vpunpckhdq      $xb3,$xb2,$xb2
1363         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1364         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1365         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1366         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1367 ___
1368         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1369         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1370 $code.=<<___;
1371         vpaddd          0xc0-0x100(%rcx),$xc0,$xc0
1372         vpaddd          0xd0-0x100(%rcx),$xc1,$xc1
1373         vpaddd          0xe0-0x100(%rcx),$xc2,$xc2
1374         vpaddd          0xf0-0x100(%rcx),$xc3,$xc3
1375
1376         vpunpckldq      $xc1,$xc0,$xt2
1377         vpunpckldq      $xc3,$xc2,$xt3
1378         vpunpckhdq      $xc1,$xc0,$xc0
1379         vpunpckhdq      $xc3,$xc2,$xc2
1380         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1381         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1382         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1383         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1384 ___
1385         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1386 $code.=<<___;
1387         vpaddd          0x100-0x100(%rcx),$xd0,$xd0
1388         vpaddd          0x110-0x100(%rcx),$xd1,$xd1
1389         vpaddd          0x120-0x100(%rcx),$xd2,$xd2
1390         vpaddd          0x130-0x100(%rcx),$xd3,$xd3
1391
1392         vpunpckldq      $xd1,$xd0,$xt2
1393         vpunpckldq      $xd3,$xd2,$xt3
1394         vpunpckhdq      $xd1,$xd0,$xd0
1395         vpunpckhdq      $xd3,$xd2,$xd2
1396         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1397         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1398         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1399         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1400 ___
1401         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1402         ($xa0,$xa1)=($xt2,$xt3);
1403 $code.=<<___;
1404         vmovdqa         0x00(%rsp),$xa0         # restore $xa0,1
1405         vmovdqa         0x10(%rsp),$xa1
1406
1407         cmp             \$64*4,$len
1408         jb              .Ltail4xop
1409
1410         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1411         vpxor           0x10($inp),$xb0,$xb0
1412         vpxor           0x20($inp),$xc0,$xc0
1413         vpxor           0x30($inp),$xd0,$xd0
1414         vpxor           0x40($inp),$xa1,$xa1
1415         vpxor           0x50($inp),$xb1,$xb1
1416         vpxor           0x60($inp),$xc1,$xc1
1417         vpxor           0x70($inp),$xd1,$xd1
1418         lea             0x80($inp),$inp         # size optimization
1419         vpxor           0x00($inp),$xa2,$xa2
1420         vpxor           0x10($inp),$xb2,$xb2
1421         vpxor           0x20($inp),$xc2,$xc2
1422         vpxor           0x30($inp),$xd2,$xd2
1423         vpxor           0x40($inp),$xa3,$xa3
1424         vpxor           0x50($inp),$xb3,$xb3
1425         vpxor           0x60($inp),$xc3,$xc3
1426         vpxor           0x70($inp),$xd3,$xd3
1427         lea             0x80($inp),$inp         # inp+=64*4
1428
1429         vmovdqu         $xa0,0x00($out)
1430         vmovdqu         $xb0,0x10($out)
1431         vmovdqu         $xc0,0x20($out)
1432         vmovdqu         $xd0,0x30($out)
1433         vmovdqu         $xa1,0x40($out)
1434         vmovdqu         $xb1,0x50($out)
1435         vmovdqu         $xc1,0x60($out)
1436         vmovdqu         $xd1,0x70($out)
1437         lea             0x80($out),$out         # size optimization
1438         vmovdqu         $xa2,0x00($out)
1439         vmovdqu         $xb2,0x10($out)
1440         vmovdqu         $xc2,0x20($out)
1441         vmovdqu         $xd2,0x30($out)
1442         vmovdqu         $xa3,0x40($out)
1443         vmovdqu         $xb3,0x50($out)
1444         vmovdqu         $xc3,0x60($out)
1445         vmovdqu         $xd3,0x70($out)
1446         lea             0x80($out),$out         # out+=64*4
1447
1448         sub             \$64*4,$len
1449         jnz             .Loop_outer4xop
1450
1451         jmp             .Ldone4xop
1452
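	# fewer than 64*4 bytes remain: store whatever whole 64-byte blocks
	# still fit, then stash the keystream for the final partial block on
	# the stack and XOR it byte by byte in .Loop_tail4xop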
1453 .align  32
1454 .Ltail4xop:
1455         cmp             \$192,$len
1456         jae             .L192_or_more4xop
1457         cmp             \$128,$len
1458         jae             .L128_or_more4xop
1459         cmp             \$64,$len
1460         jae             .L64_or_more4xop
1461
1462         xor             %r10,%r10
1463         vmovdqa         $xa0,0x00(%rsp)
1464         vmovdqa         $xb0,0x10(%rsp)
1465         vmovdqa         $xc0,0x20(%rsp)
1466         vmovdqa         $xd0,0x30(%rsp)
1467         jmp             .Loop_tail4xop
1468
1469 .align  32
1470 .L64_or_more4xop:
1471         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1472         vpxor           0x10($inp),$xb0,$xb0
1473         vpxor           0x20($inp),$xc0,$xc0
1474         vpxor           0x30($inp),$xd0,$xd0
1475         vmovdqu         $xa0,0x00($out)
1476         vmovdqu         $xb0,0x10($out)
1477         vmovdqu         $xc0,0x20($out)
1478         vmovdqu         $xd0,0x30($out)
1479         je              .Ldone4xop
1480
1481         lea             0x40($inp),$inp         # inp+=64*1
1482         vmovdqa         $xa1,0x00(%rsp)
1483         xor             %r10,%r10
1484         vmovdqa         $xb1,0x10(%rsp)
1485         lea             0x40($out),$out         # out+=64*1
1486         vmovdqa         $xc1,0x20(%rsp)
1487         sub             \$64,$len               # len-=64*1
1488         vmovdqa         $xd1,0x30(%rsp)
1489         jmp             .Loop_tail4xop
1490
1491 .align  32
1492 .L128_or_more4xop:
1493         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1494         vpxor           0x10($inp),$xb0,$xb0
1495         vpxor           0x20($inp),$xc0,$xc0
1496         vpxor           0x30($inp),$xd0,$xd0
1497         vpxor           0x40($inp),$xa1,$xa1
1498         vpxor           0x50($inp),$xb1,$xb1
1499         vpxor           0x60($inp),$xc1,$xc1
1500         vpxor           0x70($inp),$xd1,$xd1
1501
1502         vmovdqu         $xa0,0x00($out)
1503         vmovdqu         $xb0,0x10($out)
1504         vmovdqu         $xc0,0x20($out)
1505         vmovdqu         $xd0,0x30($out)
1506         vmovdqu         $xa1,0x40($out)
1507         vmovdqu         $xb1,0x50($out)
1508         vmovdqu         $xc1,0x60($out)
1509         vmovdqu         $xd1,0x70($out)
1510         je              .Ldone4xop
1511
1512         lea             0x80($inp),$inp         # inp+=64*2
1513         vmovdqa         $xa2,0x00(%rsp)
1514         xor             %r10,%r10
1515         vmovdqa         $xb2,0x10(%rsp)
1516         lea             0x80($out),$out         # out+=64*2
1517         vmovdqa         $xc2,0x20(%rsp)
1518         sub             \$128,$len              # len-=64*2
1519         vmovdqa         $xd2,0x30(%rsp)
1520         jmp             .Loop_tail4xop
1521
1522 .align  32
1523 .L192_or_more4xop:
1524         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1525         vpxor           0x10($inp),$xb0,$xb0
1526         vpxor           0x20($inp),$xc0,$xc0
1527         vpxor           0x30($inp),$xd0,$xd0
1528         vpxor           0x40($inp),$xa1,$xa1
1529         vpxor           0x50($inp),$xb1,$xb1
1530         vpxor           0x60($inp),$xc1,$xc1
1531         vpxor           0x70($inp),$xd1,$xd1
1532         lea             0x80($inp),$inp         # size optimization
1533         vpxor           0x00($inp),$xa2,$xa2
1534         vpxor           0x10($inp),$xb2,$xb2
1535         vpxor           0x20($inp),$xc2,$xc2
1536         vpxor           0x30($inp),$xd2,$xd2
1537
1538         vmovdqu         $xa0,0x00($out)
1539         vmovdqu         $xb0,0x10($out)
1540         vmovdqu         $xc0,0x20($out)
1541         vmovdqu         $xd0,0x30($out)
1542         vmovdqu         $xa1,0x40($out)
1543         vmovdqu         $xb1,0x50($out)
1544         vmovdqu         $xc1,0x60($out)
1545         vmovdqu         $xd1,0x70($out)
1546         lea             0x80($out),$out         # size optimization
1547         vmovdqu         $xa2,0x00($out)
1548         vmovdqu         $xb2,0x10($out)
1549         vmovdqu         $xc2,0x20($out)
1550         vmovdqu         $xd2,0x30($out)
1551         je              .Ldone4xop
1552
1553         lea             0x40($inp),$inp         # inp+=64*3
1554         vmovdqa         $xa3,0x00(%rsp)
1555         xor             %r10,%r10
1556         vmovdqa         $xb3,0x10(%rsp)
1557         lea             0x40($out),$out         # out+=64*3
1558         vmovdqa         $xc3,0x20(%rsp)
1559         sub             \$192,$len              # len-=64*3
1560         vmovdqa         $xd3,0x30(%rsp)
1561
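	# XOR the last 1..63 bytes; %r10 indexes both the input and the
	# keystream that was just stashed at (%rsp)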
1562 .Loop_tail4xop:
1563         movzb           ($inp,%r10),%eax
1564         movzb           (%rsp,%r10),%ecx
1565         lea             1(%r10),%r10
1566         xor             %ecx,%eax
1567         mov             %al,-1($out,%r10)
1568         dec             $len
1569         jnz             .Loop_tail4xop
1570
1571 .Ldone4xop:
1572         vzeroupper
1573 ___
1574 $code.=<<___    if ($win64);
1575         lea             0x140+0x30(%rsp),%r11
1576         movaps          -0x30(%r11),%xmm6
1577         movaps          -0x20(%r11),%xmm7
1578         movaps          -0x10(%r11),%xmm8
1579         movaps          0x00(%r11),%xmm9
1580         movaps          0x10(%r11),%xmm10
1581         movaps          0x20(%r11),%xmm11
1582         movaps          0x30(%r11),%xmm12
1583         movaps          0x40(%r11),%xmm13
1584         movaps          0x50(%r11),%xmm14
1585         movaps          0x60(%r11),%xmm15
1586 ___
1587 $code.=<<___;
1588         add             \$0x148+$xframe,%rsp
1589         ret
1590 .size   ChaCha20_4xop,.-ChaCha20_4xop
1591 ___
1592 }
1593
1594 ########################################################################
1595 # AVX2 code path
1596 if ($avx>1) {
1597 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1598     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1599 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1600         "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
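# @xx[8..11] are dummy "%nox" placeholders: the four 'c' rows of the state
# live on the stack, because 16 ymm registers cannot hold all 16 state rows
# plus the $xt0..$xt3 temporaries.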
1601
1602 sub AVX2_lane_ROUND {
1603 my ($a0,$b0,$c0,$d0)=@_;
1604 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1605 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1606 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
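# e.g. starting from (0,4,8,12) the map above yields (1,5,9,13), then
# (2,6,10,14) and (3,7,11,15); starting from (0,5,10,15) it yields
# (1,6,11,12), (2,7,8,13) and (3,4,9,14), matching the table below.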
1607 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1608 my @x=map("\"$_\"",@xx);
1609
1610	# Consider the order in which the variables are addressed by their
1611         # index:
1612         #
1613         #       a   b   c   d
1614         #
1615         #       0   4   8  12 < even round
1616         #       1   5   9  13
1617         #       2   6  10  14
1618         #       3   7  11  15
1619         #       0   5  10  15 < odd round
1620         #       1   6  11  12
1621         #       2   7   8  13
1622         #       3   4   9  14
1623         #
1624	# 'a', 'b' and 'd's are permanently allocated in registers,
1625	# @x[0..7,12..15], while the 'c's are maintained in memory.
1626	# Observe the 'c' column: the pair of 'c's that is live at the
1627	# end of one round is exactly the pair needed at the start of
1628	# the next, so the 'c's only have to be reloaded once per round,
1629	# in the middle. This is why the 'c' stores and loads appear
1630	# mid-round, but never at the very beginning or end.
1631
1632         (
1633         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
1634         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1635         "&vpshufb       (@x[$d0],@x[$d0],$t1)",
1636          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
1637          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1638          "&vpshufb      (@x[$d1],@x[$d1],$t1)",
1639
1640         "&vpaddd        ($xc,$xc,@x[$d0])",
1641         "&vpxor         (@x[$b0],$xc,@x[$b0])",
1642         "&vpslld        ($t0,@x[$b0],12)",
1643         "&vpsrld        (@x[$b0],@x[$b0],20)",
1644         "&vpor          (@x[$b0],$t0,@x[$b0])",
1645         "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
1646          "&vpaddd       ($xc_,$xc_,@x[$d1])",
1647          "&vpxor        (@x[$b1],$xc_,@x[$b1])",
1648          "&vpslld       ($t1,@x[$b1],12)",
1649          "&vpsrld       (@x[$b1],@x[$b1],20)",
1650          "&vpor         (@x[$b1],$t1,@x[$b1])",
1651
1652         "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
1653         "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
1654         "&vpshufb       (@x[$d0],@x[$d0],$t0)",
1655          "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
1656          "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
1657          "&vpshufb      (@x[$d1],@x[$d1],$t0)",
1658
1659         "&vpaddd        ($xc,$xc,@x[$d0])",
1660         "&vpxor         (@x[$b0],$xc,@x[$b0])",
1661         "&vpslld        ($t1,@x[$b0],7)",
1662         "&vpsrld        (@x[$b0],@x[$b0],25)",
1663         "&vpor          (@x[$b0],$t1,@x[$b0])",
1664         "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
1665          "&vpaddd       ($xc_,$xc_,@x[$d1])",
1666          "&vpxor        (@x[$b1],$xc_,@x[$b1])",
1667          "&vpslld       ($t0,@x[$b1],7)",
1668          "&vpsrld       (@x[$b1],@x[$b1],25)",
1669          "&vpor         (@x[$b1],$t0,@x[$b1])",
1670
1671         "&vmovdqa       (\"`32*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
1672          "&vmovdqa      (\"`32*($c1-8)`(%rsp)\",$xc_)",
1673         "&vmovdqa       ($xc,\"`32*($c2-8)`(%rsp)\")",
1674          "&vmovdqa      ($xc_,\"`32*($c3-8)`(%rsp)\")",
1675
1676         "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",     # Q3
1677         "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
1678         "&vpshufb       (@x[$d2],@x[$d2],$t1)",
1679          "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",     # Q4
1680          "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
1681          "&vpshufb      (@x[$d3],@x[$d3],$t1)",
1682
1683         "&vpaddd        ($xc,$xc,@x[$d2])",
1684         "&vpxor         (@x[$b2],$xc,@x[$b2])",
1685         "&vpslld        ($t0,@x[$b2],12)",
1686         "&vpsrld        (@x[$b2],@x[$b2],20)",
1687         "&vpor          (@x[$b2],$t0,@x[$b2])",
1688         "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
1689          "&vpaddd       ($xc_,$xc_,@x[$d3])",
1690          "&vpxor        (@x[$b3],$xc_,@x[$b3])",
1691          "&vpslld       ($t1,@x[$b3],12)",
1692          "&vpsrld       (@x[$b3],@x[$b3],20)",
1693          "&vpor         (@x[$b3],$t1,@x[$b3])",
1694
1695         "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",
1696         "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
1697         "&vpshufb       (@x[$d2],@x[$d2],$t0)",
1698          "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",
1699          "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
1700          "&vpshufb      (@x[$d3],@x[$d3],$t0)",
1701
1702         "&vpaddd        ($xc,$xc,@x[$d2])",
1703         "&vpxor         (@x[$b2],$xc,@x[$b2])",
1704         "&vpslld        ($t1,@x[$b2],7)",
1705         "&vpsrld        (@x[$b2],@x[$b2],25)",
1706         "&vpor          (@x[$b2],$t1,@x[$b2])",
1707         "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
1708          "&vpaddd       ($xc_,$xc_,@x[$d3])",
1709          "&vpxor        (@x[$b3],$xc_,@x[$b3])",
1710          "&vpslld       ($t0,@x[$b3],7)",
1711          "&vpsrld       (@x[$b3],@x[$b3],25)",
1712          "&vpor         (@x[$b3],$t0,@x[$b3])"
1713         );
1714 }
1715
1716 my $xframe = $win64 ? 0xb0 : 8;
1717
1718 $code.=<<___;
1719 .type   ChaCha20_8x,\@function,5
1720 .align  32
1721 ChaCha20_8x:
1722 .LChaCha20_8x:
1723         mov             %rsp,%r10
1724         sub             \$0x280+$xframe,%rsp
1725         and             \$-32,%rsp
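	# %rsp is aligned to 32 bytes so that the vmovdqa spills of ymm
	# registers below are legal; the original %rsp is kept in %r10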
1726 ___
1727 $code.=<<___    if ($win64);
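	# xmm6-xmm15 are non-volatile in the Win64 ABI and have to be
	# preserved for the caller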
1728         lea             0x290+0x30(%rsp),%r11
1729         movaps          %xmm6,-0x30(%r11)
1730         movaps          %xmm7,-0x20(%r11)
1731         movaps          %xmm8,-0x10(%r11)
1732         movaps          %xmm9,0x00(%r11)
1733         movaps          %xmm10,0x10(%r11)
1734         movaps          %xmm11,0x20(%r11)
1735         movaps          %xmm12,0x30(%r11)
1736         movaps          %xmm13,0x40(%r11)
1737         movaps          %xmm14,0x50(%r11)
1738         movaps          %xmm15,0x60(%r11)
1739 ___
1740 $code.=<<___;
1741         vzeroupper
1742         mov             %r10,0x280(%rsp)
1743
1744         ################ stack layout
1745	# +0x00		SIMD equivalent of @x[8-11]
1746         # ...
1747         # +0x80         constant copy of key[0-2] smashed by lanes
1748         # ...
1749         # +0x200        SIMD counters (with nonce smashed by lanes)
1750         # ...
1751         # +0x280        saved %rsp
1752
1753         vbroadcasti128  .Lsigma(%rip),$xa3      # key[0]
1754         vbroadcasti128  ($key),$xb3             # key[1]
1755         vbroadcasti128  16($key),$xt3           # key[2]
1756         vbroadcasti128  ($counter),$xd3         # key[3]
1757         lea             0x100(%rsp),%rcx        # size optimization
1758         lea             0x200(%rsp),%rax        # size optimization
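	# %rcx and %rax are biased by 0x100 and 0x200 so that the displacements
	# used below fit in a signed byte, which keeps the instructions shorter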
1759         lea             .Lrot16(%rip),%r10
1760         lea             .Lrot24(%rip),%r11
1761
1762         vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
1763         vpshufd         \$0x55,$xa3,$xa1
1764         vmovdqa         $xa0,0x80-0x100(%rcx)   # ... and offload
1765         vpshufd         \$0xaa,$xa3,$xa2
1766         vmovdqa         $xa1,0xa0-0x100(%rcx)
1767         vpshufd         \$0xff,$xa3,$xa3
1768         vmovdqa         $xa2,0xc0-0x100(%rcx)
1769         vmovdqa         $xa3,0xe0-0x100(%rcx)
1770
1771         vpshufd         \$0x00,$xb3,$xb0
1772         vpshufd         \$0x55,$xb3,$xb1
1773         vmovdqa         $xb0,0x100-0x100(%rcx)
1774         vpshufd         \$0xaa,$xb3,$xb2
1775         vmovdqa         $xb1,0x120-0x100(%rcx)
1776         vpshufd         \$0xff,$xb3,$xb3
1777         vmovdqa         $xb2,0x140-0x100(%rcx)
1778         vmovdqa         $xb3,0x160-0x100(%rcx)
1779
1780         vpshufd         \$0x00,$xt3,$xt0        # "xc0"
1781         vpshufd         \$0x55,$xt3,$xt1        # "xc1"
1782         vmovdqa         $xt0,0x180-0x200(%rax)
1783         vpshufd         \$0xaa,$xt3,$xt2        # "xc2"
1784         vmovdqa         $xt1,0x1a0-0x200(%rax)
1785         vpshufd         \$0xff,$xt3,$xt3        # "xc3"
1786         vmovdqa         $xt2,0x1c0-0x200(%rax)
1787         vmovdqa         $xt3,0x1e0-0x200(%rax)
1788
1789         vpshufd         \$0x00,$xd3,$xd0
1790         vpshufd         \$0x55,$xd3,$xd1
1791         vpaddd          .Lincy(%rip),$xd0,$xd0  # don't save counters yet
1792         vpshufd         \$0xaa,$xd3,$xd2
1793         vmovdqa         $xd1,0x220-0x200(%rax)
1794         vpshufd         \$0xff,$xd3,$xd3
1795         vmovdqa         $xd2,0x240-0x200(%rax)
1796         vmovdqa         $xd3,0x260-0x200(%rax)
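	# .Lincy gives each of the eight lanes its own block-counter offset,
	# so one iteration processes eight consecutive 64-byte blocks;
	# .Leight in .Loop_outer8x then advances all lanes by 8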
1797
1798         jmp             .Loop_enter8x
1799
1800 .align  32
1801 .Loop_outer8x:
1802         vmovdqa         0x80-0x100(%rcx),$xa0   # re-load smashed key
1803         vmovdqa         0xa0-0x100(%rcx),$xa1
1804         vmovdqa         0xc0-0x100(%rcx),$xa2
1805         vmovdqa         0xe0-0x100(%rcx),$xa3
1806         vmovdqa         0x100-0x100(%rcx),$xb0
1807         vmovdqa         0x120-0x100(%rcx),$xb1
1808         vmovdqa         0x140-0x100(%rcx),$xb2
1809         vmovdqa         0x160-0x100(%rcx),$xb3
1810         vmovdqa         0x180-0x200(%rax),$xt0  # "xc0"
1811         vmovdqa         0x1a0-0x200(%rax),$xt1  # "xc1"
1812         vmovdqa         0x1c0-0x200(%rax),$xt2  # "xc2"
1813         vmovdqa         0x1e0-0x200(%rax),$xt3  # "xc3"
1814         vmovdqa         0x200-0x200(%rax),$xd0
1815         vmovdqa         0x220-0x200(%rax),$xd1
1816         vmovdqa         0x240-0x200(%rax),$xd2
1817         vmovdqa         0x260-0x200(%rax),$xd3
1818         vpaddd          .Leight(%rip),$xd0,$xd0 # next SIMD counters
1819
1820 .Loop_enter8x:
1821         vmovdqa         $xt2,0x40(%rsp)         # SIMD equivalent of "@x[10]"
1822         vmovdqa         $xt3,0x60(%rsp)         # SIMD equivalent of "@x[11]"
1823         vbroadcasti128  (%r10),$xt3
1824         vmovdqa         $xd0,0x200-0x200(%rax)  # save SIMD counters
1825         mov             \$10,%eax
1826         jmp             .Loop8x
1827
1828 .align  32
1829 .Loop8x:
1830 ___
1831         foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1832         foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
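	# each pass through .Loop8x is one even ("column") round followed by
	# one odd ("diagonal") round; %eax was loaded with 10, i.e. 20 rounds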
1833 $code.=<<___;
1834         dec             %eax
1835         jnz             .Loop8x
1836
1837         lea             0x200(%rsp),%rax        # size optimization
1838         vpaddd          0x80-0x100(%rcx),$xa0,$xa0      # accumulate key
1839         vpaddd          0xa0-0x100(%rcx),$xa1,$xa1
1840         vpaddd          0xc0-0x100(%rcx),$xa2,$xa2
1841         vpaddd          0xe0-0x100(%rcx),$xa3,$xa3
1842
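	# at this point every ymm register holds one state word from each of
	# the eight blocks; the vpunpck* steps transpose 4x4 word groups within
	# the 128-bit lanes and the vperm2i128 steps below sort the lanes, so
	# that pairs of registers end up holding whole 64-byte output blocks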
1843         vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
1844         vpunpckldq      $xa3,$xa2,$xt3
1845         vpunpckhdq      $xa1,$xa0,$xa0
1846         vpunpckhdq      $xa3,$xa2,$xa2
1847         vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
1848         vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
1849         vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
1850         vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
1851 ___
1852         ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1853 $code.=<<___;
1854         vpaddd          0x100-0x100(%rcx),$xb0,$xb0
1855         vpaddd          0x120-0x100(%rcx),$xb1,$xb1
1856         vpaddd          0x140-0x100(%rcx),$xb2,$xb2
1857         vpaddd          0x160-0x100(%rcx),$xb3,$xb3
1858
1859         vpunpckldq      $xb1,$xb0,$xt2
1860         vpunpckldq      $xb3,$xb2,$xt3
1861         vpunpckhdq      $xb1,$xb0,$xb0
1862         vpunpckhdq      $xb3,$xb2,$xb2
1863         vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
1864         vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
1865         vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
1866         vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
1867 ___
1868         ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1869 $code.=<<___;
1870         vperm2i128      \$0x20,$xb0,$xa0,$xt3   # "de-interlace" further
1871         vperm2i128      \$0x31,$xb0,$xa0,$xb0
1872         vperm2i128      \$0x20,$xb1,$xa1,$xa0
1873         vperm2i128      \$0x31,$xb1,$xa1,$xb1
1874         vperm2i128      \$0x20,$xb2,$xa2,$xa1
1875         vperm2i128      \$0x31,$xb2,$xa2,$xb2
1876         vperm2i128      \$0x20,$xb3,$xa3,$xa2
1877         vperm2i128      \$0x31,$xb3,$xa3,$xb3
1878 ___
1879         ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1880         my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1881 $code.=<<___;
1882         vmovdqa         $xa0,0x00(%rsp)         # offload $xaN
1883         vmovdqa         $xa1,0x20(%rsp)
1884         vmovdqa         0x40(%rsp),$xc2         # $xa0
1885         vmovdqa         0x60(%rsp),$xc3         # $xa1
1886
1887         vpaddd          0x180-0x200(%rax),$xc0,$xc0
1888         vpaddd          0x1a0-0x200(%rax),$xc1,$xc1
1889         vpaddd          0x1c0-0x200(%rax),$xc2,$xc2
1890         vpaddd          0x1e0-0x200(%rax),$xc3,$xc3
1891
1892         vpunpckldq      $xc1,$xc0,$xt2
1893         vpunpckldq      $xc3,$xc2,$xt3
1894         vpunpckhdq      $xc1,$xc0,$xc0
1895         vpunpckhdq      $xc3,$xc2,$xc2
1896         vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
1897         vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
1898         vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
1899         vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
1900 ___
1901         ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1902 $code.=<<___;
1903         vpaddd          0x200-0x200(%rax),$xd0,$xd0
1904         vpaddd          0x220-0x200(%rax),$xd1,$xd1
1905         vpaddd          0x240-0x200(%rax),$xd2,$xd2
1906         vpaddd          0x260-0x200(%rax),$xd3,$xd3
1907
1908         vpunpckldq      $xd1,$xd0,$xt2
1909         vpunpckldq      $xd3,$xd2,$xt3
1910         vpunpckhdq      $xd1,$xd0,$xd0
1911         vpunpckhdq      $xd3,$xd2,$xd2
1912         vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
1913         vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
1914         vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
1915         vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
1916 ___
1917         ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1918 $code.=<<___;
1919         vperm2i128      \$0x20,$xd0,$xc0,$xt3   # "de-interlace" further
1920         vperm2i128      \$0x31,$xd0,$xc0,$xd0
1921         vperm2i128      \$0x20,$xd1,$xc1,$xc0
1922         vperm2i128      \$0x31,$xd1,$xc1,$xd1
1923         vperm2i128      \$0x20,$xd2,$xc2,$xc1
1924         vperm2i128      \$0x31,$xd2,$xc2,$xd2
1925         vperm2i128      \$0x20,$xd3,$xc3,$xc2
1926         vperm2i128      \$0x31,$xd3,$xc3,$xd3
1927 ___
1928         ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1929         ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1930         ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1931         ($xa0,$xa1)=($xt2,$xt3);
1932 $code.=<<___;
1933         vmovdqa         0x00(%rsp),$xa0         # $xaN was offloaded, remember?
1934         vmovdqa         0x20(%rsp),$xa1
1935
1936         cmp             \$64*8,$len
1937         jb              .Ltail8x
1938
1939         vpxor           0x00($inp),$xa0,$xa0    # xor with input
1940         vpxor           0x20($inp),$xb0,$xb0
1941         vpxor           0x40($inp),$xc0,$xc0
1942         vpxor           0x60($inp),$xd0,$xd0
1943         lea             0x80($inp),$inp         # size optimization
1944         vmovdqu         $xa0,0x00($out)
1945         vmovdqu         $xb0,0x20($out)
1946         vmovdqu         $xc0,0x40($out)
1947         vmovdqu         $xd0,0x60($out)
1948         lea             0x80($out),$out         # size optimization
1949
1950         vpxor           0x00($inp),$xa1,$xa1
1951         vpxor           0x20($inp),$xb1,$xb1
1952         vpxor           0x40($inp),$xc1,$xc1
1953         vpxor           0x60($inp),$xd1,$xd1
1954         lea             0x80($inp),$inp         # size optimization
1955         vmovdqu         $xa1,0x00($out)
1956         vmovdqu         $xb1,0x20($out)
1957         vmovdqu         $xc1,0x40($out)
1958         vmovdqu         $xd1,0x60($out)
1959         lea             0x80($out),$out         # size optimization
1960
1961         vpxor           0x00($inp),$xa2,$xa2
1962         vpxor           0x20($inp),$xb2,$xb2
1963         vpxor           0x40($inp),$xc2,$xc2
1964         vpxor           0x60($inp),$xd2,$xd2
1965         lea             0x80($inp),$inp         # size optimization
1966         vmovdqu         $xa2,0x00($out)
1967         vmovdqu         $xb2,0x20($out)
1968         vmovdqu         $xc2,0x40($out)
1969         vmovdqu         $xd2,0x60($out)
1970         lea             0x80($out),$out         # size optimization
1971
1972         vpxor           0x00($inp),$xa3,$xa3
1973         vpxor           0x20($inp),$xb3,$xb3
1974         vpxor           0x40($inp),$xc3,$xc3
1975         vpxor           0x60($inp),$xd3,$xd3
1976         lea             0x80($inp),$inp         # size optimization
1977         vmovdqu         $xa3,0x00($out)
1978         vmovdqu         $xb3,0x20($out)
1979         vmovdqu         $xc3,0x40($out)
1980         vmovdqu         $xd3,0x60($out)
1981         lea             0x80($out),$out         # size optimization
1982
1983         sub             \$64*8,$len
1984         jnz             .Loop_outer8x
1985
1986         jmp             .Ldone8x
1987
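	# fewer than 64*8 bytes remain: store whatever whole 64-byte blocks
	# still fit, then stash the keystream for the final partial block at
	# 0x00(%rsp) and XOR it byte by byte in .Loop_tail8x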
1988 .Ltail8x:
1989         cmp             \$448,$len
1990         jae             .L448_or_more8x
1991         cmp             \$384,$len
1992         jae             .L384_or_more8x
1993         cmp             \$320,$len
1994         jae             .L320_or_more8x
1995         cmp             \$256,$len
1996         jae             .L256_or_more8x
1997         cmp             \$192,$len
1998         jae             .L192_or_more8x
1999         cmp             \$128,$len
2000         jae             .L128_or_more8x
2001         cmp             \$64,$len
2002         jae             .L64_or_more8x
2003
2004         xor             %r10,%r10
2005         vmovdqa         $xa0,0x00(%rsp)
2006         vmovdqa         $xb0,0x20(%rsp)
2007         jmp             .Loop_tail8x
2008
2009 .align  32
2010 .L64_or_more8x:
2011         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2012         vpxor           0x20($inp),$xb0,$xb0
2013         vmovdqu         $xa0,0x00($out)
2014         vmovdqu         $xb0,0x20($out)
2015         je              .Ldone8x
2016
2017         lea             0x40($inp),$inp         # inp+=64*1
2018         xor             %r10,%r10
2019         vmovdqa         $xc0,0x00(%rsp)
2020         lea             0x40($out),$out         # out+=64*1
2021         sub             \$64,$len               # len-=64*1
2022         vmovdqa         $xd0,0x20(%rsp)
2023         jmp             .Loop_tail8x
2024
2025 .align  32
2026 .L128_or_more8x:
2027         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2028         vpxor           0x20($inp),$xb0,$xb0
2029         vpxor           0x40($inp),$xc0,$xc0
2030         vpxor           0x60($inp),$xd0,$xd0
2031         vmovdqu         $xa0,0x00($out)
2032         vmovdqu         $xb0,0x20($out)
2033         vmovdqu         $xc0,0x40($out)
2034         vmovdqu         $xd0,0x60($out)
2035         je              .Ldone8x
2036
2037         lea             0x80($inp),$inp         # inp+=64*2
2038         xor             %r10,%r10
2039         vmovdqa         $xa1,0x00(%rsp)
2040         lea             0x80($out),$out         # out+=64*2
2041         sub             \$128,$len              # len-=64*2
2042         vmovdqa         $xb1,0x20(%rsp)
2043         jmp             .Loop_tail8x
2044
2045 .align  32
2046 .L192_or_more8x:
2047         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2048         vpxor           0x20($inp),$xb0,$xb0
2049         vpxor           0x40($inp),$xc0,$xc0
2050         vpxor           0x60($inp),$xd0,$xd0
2051         vpxor           0x80($inp),$xa1,$xa1
2052         vpxor           0xa0($inp),$xb1,$xb1
2053         vmovdqu         $xa0,0x00($out)
2054         vmovdqu         $xb0,0x20($out)
2055         vmovdqu         $xc0,0x40($out)
2056         vmovdqu         $xd0,0x60($out)
2057         vmovdqu         $xa1,0x80($out)
2058         vmovdqu         $xb1,0xa0($out)
2059         je              .Ldone8x
2060
2061         lea             0xc0($inp),$inp         # inp+=64*3
2062         xor             %r10,%r10
2063         vmovdqa         $xc1,0x00(%rsp)
2064         lea             0xc0($out),$out         # out+=64*3
2065         sub             \$192,$len              # len-=64*3
2066         vmovdqa         $xd1,0x20(%rsp)
2067         jmp             .Loop_tail8x
2068
2069 .align  32
2070 .L256_or_more8x:
2071         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2072         vpxor           0x20($inp),$xb0,$xb0
2073         vpxor           0x40($inp),$xc0,$xc0
2074         vpxor           0x60($inp),$xd0,$xd0
2075         vpxor           0x80($inp),$xa1,$xa1
2076         vpxor           0xa0($inp),$xb1,$xb1
2077         vpxor           0xc0($inp),$xc1,$xc1
2078         vpxor           0xe0($inp),$xd1,$xd1
2079         vmovdqu         $xa0,0x00($out)
2080         vmovdqu         $xb0,0x20($out)
2081         vmovdqu         $xc0,0x40($out)
2082         vmovdqu         $xd0,0x60($out)
2083         vmovdqu         $xa1,0x80($out)
2084         vmovdqu         $xb1,0xa0($out)
2085         vmovdqu         $xc1,0xc0($out)
2086         vmovdqu         $xd1,0xe0($out)
2087         je              .Ldone8x
2088
2089         lea             0x100($inp),$inp        # inp+=64*4
2090         xor             %r10,%r10
2091         vmovdqa         $xa2,0x00(%rsp)
2092         lea             0x100($out),$out        # out+=64*4
2093         sub             \$256,$len              # len-=64*4
2094         vmovdqa         $xb2,0x20(%rsp)
2095         jmp             .Loop_tail8x
2096
2097 .align  32
2098 .L320_or_more8x:
2099         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2100         vpxor           0x20($inp),$xb0,$xb0
2101         vpxor           0x40($inp),$xc0,$xc0
2102         vpxor           0x60($inp),$xd0,$xd0
2103         vpxor           0x80($inp),$xa1,$xa1
2104         vpxor           0xa0($inp),$xb1,$xb1
2105         vpxor           0xc0($inp),$xc1,$xc1
2106         vpxor           0xe0($inp),$xd1,$xd1
2107         vpxor           0x100($inp),$xa2,$xa2
2108         vpxor           0x120($inp),$xb2,$xb2
2109         vmovdqu         $xa0,0x00($out)
2110         vmovdqu         $xb0,0x20($out)
2111         vmovdqu         $xc0,0x40($out)
2112         vmovdqu         $xd0,0x60($out)
2113         vmovdqu         $xa1,0x80($out)
2114         vmovdqu         $xb1,0xa0($out)
2115         vmovdqu         $xc1,0xc0($out)
2116         vmovdqu         $xd1,0xe0($out)
2117         vmovdqu         $xa2,0x100($out)
2118         vmovdqu         $xb2,0x120($out)
2119         je              .Ldone8x
2120
2121         lea             0x140($inp),$inp        # inp+=64*5
2122         xor             %r10,%r10
2123         vmovdqa         $xc2,0x00(%rsp)
2124         lea             0x140($out),$out        # out+=64*5
2125         sub             \$320,$len              # len-=64*5
2126         vmovdqa         $xd2,0x20(%rsp)
2127         jmp             .Loop_tail8x
2128
2129 .align  32
2130 .L384_or_more8x:
2131         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2132         vpxor           0x20($inp),$xb0,$xb0
2133         vpxor           0x40($inp),$xc0,$xc0
2134         vpxor           0x60($inp),$xd0,$xd0
2135         vpxor           0x80($inp),$xa1,$xa1
2136         vpxor           0xa0($inp),$xb1,$xb1
2137         vpxor           0xc0($inp),$xc1,$xc1
2138         vpxor           0xe0($inp),$xd1,$xd1
2139         vpxor           0x100($inp),$xa2,$xa2
2140         vpxor           0x120($inp),$xb2,$xb2
2141         vpxor           0x140($inp),$xc2,$xc2
2142         vpxor           0x160($inp),$xd2,$xd2
2143         vmovdqu         $xa0,0x00($out)
2144         vmovdqu         $xb0,0x20($out)
2145         vmovdqu         $xc0,0x40($out)
2146         vmovdqu         $xd0,0x60($out)
2147         vmovdqu         $xa1,0x80($out)
2148         vmovdqu         $xb1,0xa0($out)
2149         vmovdqu         $xc1,0xc0($out)
2150         vmovdqu         $xd1,0xe0($out)
2151         vmovdqu         $xa2,0x100($out)
2152         vmovdqu         $xb2,0x120($out)
2153         vmovdqu         $xc2,0x140($out)
2154         vmovdqu         $xd2,0x160($out)
2155         je              .Ldone8x
2156
2157         lea             0x180($inp),$inp        # inp+=64*6
2158         xor             %r10,%r10
2159         vmovdqa         $xa3,0x00(%rsp)
2160         lea             0x180($out),$out        # out+=64*6
2161         sub             \$384,$len              # len-=64*6
2162         vmovdqa         $xb3,0x20(%rsp)
2163         jmp             .Loop_tail8x
2164
2165 .align  32
2166 .L448_or_more8x:
2167         vpxor           0x00($inp),$xa0,$xa0    # xor with input
2168         vpxor           0x20($inp),$xb0,$xb0
2169         vpxor           0x40($inp),$xc0,$xc0
2170         vpxor           0x60($inp),$xd0,$xd0
2171         vpxor           0x80($inp),$xa1,$xa1
2172         vpxor           0xa0($inp),$xb1,$xb1
2173         vpxor           0xc0($inp),$xc1,$xc1
2174         vpxor           0xe0($inp),$xd1,$xd1
2175         vpxor           0x100($inp),$xa2,$xa2
2176         vpxor           0x120($inp),$xb2,$xb2
2177         vpxor           0x140($inp),$xc2,$xc2
2178         vpxor           0x160($inp),$xd2,$xd2
2179         vpxor           0x180($inp),$xa3,$xa3
2180         vpxor           0x1a0($inp),$xb3,$xb3
2181         vmovdqu         $xa0,0x00($out)
2182         vmovdqu         $xb0,0x20($out)
2183         vmovdqu         $xc0,0x40($out)
2184         vmovdqu         $xd0,0x60($out)
2185         vmovdqu         $xa1,0x80($out)
2186         vmovdqu         $xb1,0xa0($out)
2187         vmovdqu         $xc1,0xc0($out)
2188         vmovdqu         $xd1,0xe0($out)
2189         vmovdqu         $xa2,0x100($out)
2190         vmovdqu         $xb2,0x120($out)
2191         vmovdqu         $xc2,0x140($out)
2192         vmovdqu         $xd2,0x160($out)
2193         vmovdqu         $xa3,0x180($out)
2194         vmovdqu         $xb3,0x1a0($out)
2195         je              .Ldone8x
2196
2197         lea             0x1c0($inp),$inp        # inp+=64*7
2198         xor             %r10,%r10
2199         vmovdqa         $xc3,0x00(%rsp)
2200         lea             0x1c0($out),$out        # out+=64*7
2201         sub             \$448,$len              # len-=64*7
2202         vmovdqa         $xd3,0x20(%rsp)
2203
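	# XOR the last 1..63 bytes; %r10 indexes both the input and the
	# keystream stashed at (%rsp)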
2204 .Loop_tail8x:
2205         movzb           ($inp,%r10),%eax
2206         movzb           (%rsp,%r10),%ecx
2207         lea             1(%r10),%r10
2208         xor             %ecx,%eax
2209         mov             %al,-1($out,%r10)
2210         dec             $len
2211         jnz             .Loop_tail8x
2212
2213 .Ldone8x:
2214         vzeroall
2215 ___
2216 $code.=<<___    if ($win64);
2217         lea             0x290+0x30(%rsp),%r11
2218         movaps          -0x30(%r11),%xmm6
2219         movaps          -0x20(%r11),%xmm7
2220         movaps          -0x10(%r11),%xmm8
2221         movaps          0x00(%r11),%xmm9
2222         movaps          0x10(%r11),%xmm10
2223         movaps          0x20(%r11),%xmm11
2224         movaps          0x30(%r11),%xmm12
2225         movaps          0x40(%r11),%xmm13
2226         movaps          0x50(%r11),%xmm14
2227         movaps          0x60(%r11),%xmm15
2228 ___
2229 $code.=<<___;
2230         mov             0x280(%rsp),%rsp
2231         ret
2232 .size   ChaCha20_8x,.-ChaCha20_8x
2233 ___
2234 }
2235
2236 foreach (split("\n",$code)) {
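	# interpolate the deferred `...` expressions (e.g. the 32*($c0-8)
	# stack offsets produced by AVX2_lane_ROUND) before printing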
2237         s/\`([^\`]*)\`/eval $1/geo;
2238
2239         s/%x#%y/%x/go;
2240
2241         print $_,"\n";
2242 }
2243
2244 close STDOUT;