# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ChaCha20 for x86_64.
#
# Add AVX512F code path.
#
# Add AVX512VL code path.
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42	    1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31	    1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40	    2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is a code path optimized specifically for the 128-byte
#	inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations; SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers
#	2.20 and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever is best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in a single thread, the corresponding capability is suppressed;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
    $avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx = ($2>=3.0) + ($2>3.0);
}
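# $avx encodes what the assembler can be trusted to encode (the usual
# perlasm convention): 1 allows AVX/XOP output, 2 adds AVX2, 3 adds
# AVX512F/VL; 0 restricts output to the integer and SSSE3 code paths.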
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
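# These map onto the C prototype as declared in OpenSSL's chacha.h:
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);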
.extern OPENSSL_ia32cap_P

.section .rodata align=64
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
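# For illustration: &add("%eax","%ebx") reaches AUTOLOAD and appends
# "add %ebx,%eax" to $code, i.e. calls are written destination-first
# (32-bit perlasm style) but emitted in AT&T src,dst order; a trailing
# numeric argument becomes an immediate, so &mov("%eax",10) emits
# "mov $10,%eax".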
147 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
148 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
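# The map keeps the row (index&~3) and steps the column, so from
# ROUND(0,4,8,12) the four quarter-rounds touch (0,4,8,12), (1,5,9,13),
# (2,6,10,14), (3,7,11,15), and from ROUND(0,5,10,15) they touch
# (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14) - exactly ChaCha's
# column and diagonal quarter-rounds.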
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);
	# Consider the order in which variables are addressed by their
	# index:
	#
	#	0   4   8  12 < even round
	#	0   5  10  15 < odd round
	#
	# The 'a', 'b' and 'd' values are permanently allocated in
	# registers, @x[0..7,12..15], while the 'c' values are maintained
	# in memory. If you observe the 'c' column, you'll notice that a
	# pair of 'c's stays invariant between rounds. This means that we
	# only have to reload them once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
	#
	# Normally instructions would be interleaved to favour in-order
	# execution. Out-of-order cores generally manage it gracefully,
	# but not this time, for some reason. As in-order cores are a
	# dying breed, old Atom being the only one still around, the
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...
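	# For reference, each group of add/xor (plus rotate) steps below
	# applies the standard ChaCha quarter-round to one (a,b,c,d)
	# quadruple:
	#   a += b; d ^= a; d <<<= 16;
	#   c += d; b ^= c; b <<<= 12;
	#   a += b; d ^= a; d <<<= 8;
	#   c += d; b ^= c; b <<<= 7;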
189 "&add (@x[$a0],@x[$b0])", # Q1
190 "&xor (@x[$d0],@x[$a0])",
192 "&add (@x[$a1],@x[$b1])", # Q2
193 "&xor (@x[$d1],@x[$a1])",
196 "&add ($xc,@x[$d0])",
197 "&xor (@x[$b0],$xc)",
199 "&add ($xc_,@x[$d1])",
200 "&xor (@x[$b1],$xc_)",
203 "&add (@x[$a0],@x[$b0])",
204 "&xor (@x[$d0],@x[$a0])",
206 "&add (@x[$a1],@x[$b1])",
207 "&xor (@x[$d1],@x[$a1])",
210 "&add ($xc,@x[$d0])",
211 "&xor (@x[$b0],$xc)",
213 "&add ($xc_,@x[$d1])",
214 "&xor (@x[$b1],$xc_)",
217 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
218 "&mov (\"4*$c1(%rsp)\",$xc_)",
219 "&mov ($xc,\"4*$c2(%rsp)\")",
220 "&mov ($xc_,\"4*$c3(%rsp)\")",
222 "&add (@x[$a2],@x[$b2])", # Q3
223 "&xor (@x[$d2],@x[$a2])",
225 "&add (@x[$a3],@x[$b3])", # Q4
226 "&xor (@x[$d3],@x[$a3])",
229 "&add ($xc,@x[$d2])",
230 "&xor (@x[$b2],$xc)",
232 "&add ($xc_,@x[$d3])",
233 "&xor (@x[$b3],$xc_)",
236 "&add (@x[$a2],@x[$b2])",
237 "&xor (@x[$d2],@x[$a2])",
239 "&add (@x[$a3],@x[$b3])",
240 "&xor (@x[$d3],@x[$a3])",
243 "&add ($xc,@x[$d2])",
244 "&xor (@x[$b2],$xc)",
246 "&add ($xc_,@x[$d3])",
247 "&xor (@x[$b3],$xc_)",
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,\@function,5
	mov OPENSSL_ia32cap_P+4(%rip),%r10
$code.=<<___ if ($avx>2);
	bt \$48,%r10			# check for AVX512F
	test %r10,%r10			# check for AVX512VL
	js .LChaCha20_avx512vl
	test \$`1<<(41-32)`,%r10d
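	# (In OPENSSL_ia32cap_P terms: bit 48 of the loaded qword is
	# CPUID.7:EBX bit 16, AVX512F; its sign bit is CPUID.7:EBX bit 31,
	# AVX512VL; bit 9 of %r10d, i.e. cap bit 41, is CPUID.1:ECX bit 9,
	# SSSE3.)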
.cfi_adjust_cfa_offset 64+24
	#movdqa .Lsigma(%rip),%xmm0
	movdqu 16($key),%xmm2
	movdqu ($counter),%xmm3
	movdqa .Lone(%rip),%xmm4
	#movdqa %xmm0,4*0(%rsp)		# key[0]
	movdqa %xmm1,4*4(%rsp)		# key[1]
	movdqa %xmm2,4*8(%rsp)		# key[2]
	movdqa %xmm3,4*12(%rsp)		# key[3]
	mov $len,%rbp			# reassign $len
	mov \$0x61707865,@x[0]		# 'expa'
	mov \$0x3320646e,@x[1]		# 'nd 3'
	mov \$0x79622d32,@x[2]		# '2-by'
	mov \$0x6b206574,@x[3]		# 'te k'
	mov 4*13(%rsp),@x[13]
	mov 4*14(%rsp),@x[14]
	mov 4*15(%rsp),@x[15]
	mov %rbp,64+0(%rsp)		# save len
	mov $inp,64+8(%rsp)		# save inp
	movq %xmm2,%rsi			# "@x[8]"
	mov $out,64+16(%rsp)		# save out
	shr \$32,%rdi			# "@x[9]"
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND (0, 5,10,15)) { eval; }
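	# ROUND(0, 4, 8,12) is the even ("column") round and
	# ROUND(0, 5,10,15) the odd ("diagonal") round, so each pass of
	# the enclosing loop is one ChaCha double-round; ten passes give
	# the full 20 rounds.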
	mov @t[1],4*9(%rsp)		# modulo-scheduled
	mov 64(%rsp),%rbp		# load len
	mov 64+8(%rsp),$inp		# load inp
	paddd %xmm4,%xmm3		# increment counter
	mov 64+16(%rsp),$out		# load out
	add \$0x61707865,@x[0]		# 'expa'
	add \$0x3320646e,@x[1]		# 'nd 3'
	add \$0x79622d32,@x[2]		# '2-by'
	add \$0x6b206574,@x[3]		# 'te k'
	add 4*12(%rsp),@x[12]
	add 4*13(%rsp),@x[13]
	add 4*14(%rsp),@x[14]
	add 4*15(%rsp),@x[15]
	paddd 4*8(%rsp),%xmm1
	xor 4*0($inp),@x[0]		# xor with input
	movdqu 4*8($inp),%xmm0
	xor 4*12($inp),@x[12]
	xor 4*13($inp),@x[13]
	xor 4*14($inp),@x[14]
	xor 4*15($inp),@x[15]
	lea 4*16($inp),$inp		# inp+=64
	movdqa %xmm2,4*8(%rsp)
	movd %xmm3,4*12(%rsp)
	mov @x[0],4*0($out)		# write output
	movdqu %xmm0,4*8($out)
	mov @x[12],4*12($out)
	mov @x[13],4*13($out)
	mov @x[14],4*14($out)
	mov @x[15],4*15($out)
	lea 4*16($out),$out		# out+=64
	movdqa %xmm1,4*8(%rsp)
	mov @x[12],4*12(%rsp)
	mov @x[13],4*13(%rsp)
	mov @x[14],4*14(%rsp)
	mov @x[15],4*15(%rsp)
	movzb ($inp,%rbx),%eax
	movzb (%rsp,%rbx),%edx
	mov %al,-1($out,%rbx)
	lea 64+24+48(%rsp),%rsi
.cfi_def_cfa_register %rsp
.size ChaCha20_ctr32,.-ChaCha20_ctr32
########################################################################
# SSSE3 code path that handles shorter lengths
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
my $xframe = $win64 ? 160+8 : 8;
.type ChaCha20_ssse3,\@function,5
	mov %rsp,%r9		# frame pointer
.cfi_def_cfa_register %r9
$code.=<<___ if ($avx);
	test \$`1<<(43-32)`,%r10d
	jnz .LChaCha20_4xop	# XOP is fastest even if we use 1/4
	cmp \$128,$len		# we might throw away some data,
	ja .LChaCha20_4x	# but overall it won't be slower
	sub \$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps %xmm6,-0x28(%r9)
	movaps %xmm7,-0x18(%r9)
	movdqa .Lsigma(%rip),$a
	movdqa .Lrot16(%rip),$rot16
	movdqa .Lrot24(%rip),$rot24
	mov \$10,$counter	# reuse $counter
	movdqa .Lone(%rip),$d
	&pshufd ($c,$c,0b01001110);
	&pshufd ($b,$b,0b00111001);
	&pshufd ($d,$d,0b10010011);
	&pshufd ($c,$c,0b01001110);
	&pshufd ($b,$b,0b10010011);
	&pshufd ($d,$d,0b00111001);
547 &jnz (".Loop_ssse3");
559 movdqu 0x10($inp),$t1
560 pxor $t,$a # xor with input
563 movdqu 0x30($inp),$t1
564 lea 0x40($inp),$inp # inp+=64
568 movdqu $a,0x00($out) # write output
572 lea 0x40($out),$out # out+=64
575 jnz .Loop_outer_ssse3
585 xor $counter,$counter
588 movzb ($inp,$counter),%eax
589 movzb (%rsp,$counter),%ecx
590 lea 1($counter),$counter
592 mov %al,-1($out,$counter)
598 $code.=<<___ if ($win64);
599 movaps -0x28(%r9),%xmm6
600 movaps -0x18(%r9),%xmm7
604 .cfi_def_cfa_register %rsp
608 .size ChaCha20_ssse3,.-ChaCha20_ssse3
612 ########################################################################
613 # SSSE3 code path that handles 128-byte inputs
615 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
616 my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
660 my $xframe = $win64 ? 0x68 : 8;
663 .type ChaCha20_128,\@function,5
668 mov %rsp,%r9 # frame pointer
669 .cfi_def_cfa_register %r9
670 sub \$64+$xframe,%rsp
672 $code.=<<___ if ($win64);
673 movaps %xmm6,-0x68(%r9)
674 movaps %xmm7,-0x58(%r9)
675 movaps %xmm8,-0x48(%r9)
676 movaps %xmm9,-0x38(%r9)
677 movaps %xmm10,-0x28(%r9)
678 movaps %xmm11,-0x18(%r9)
682 movdqa .Lsigma(%rip),$a
686 movdqa .Lone(%rip),$d1
687 movdqa .Lrot16(%rip),$rot16
688 movdqa .Lrot24(%rip),$rot24
698 mov \$10,$counter # reuse $counter
705 &pshufd ($c,$c,0b01001110);
706 &pshufd ($b,$b,0b00111001);
707 &pshufd ($d,$d,0b10010011);
708 &pshufd ($c1,$c1,0b01001110);
709 &pshufd ($b1,$b1,0b00111001);
710 &pshufd ($d1,$d1,0b10010011);
713 &pshufd ($c,$c,0b01001110);
714 &pshufd ($b,$b,0b10010011);
715 &pshufd ($d,$d,0b00111001);
716 &pshufd ($c1,$c1,0b01001110);
717 &pshufd ($b1,$b1,0b10010011);
718 &pshufd ($d1,$d1,0b00111001);
728 paddd .Lone(%rip),$d1
735 movdqu 0x10($inp),$t1
736 pxor $t,$a # xor with input
739 movdqu 0x30($inp),$t1
743 movdqu 0x50($inp),$t1
747 movdqu 0x70($inp),$t1
751 movdqu $a,0x00($out) # write output
755 movdqu $a1,0x40($out)
756 movdqu $b1,0x50($out)
757 movdqu $c1,0x60($out)
758 movdqu $d1,0x70($out)
760 $code.=<<___ if ($win64);
761 movaps -0x68(%r9),%xmm6
762 movaps -0x58(%r9),%xmm7
763 movaps -0x48(%r9),%xmm8
764 movaps -0x38(%r9),%xmm9
765 movaps -0x28(%r9),%xmm10
766 movaps -0x18(%r9),%xmm11
770 .cfi_def_cfa_register %rsp
774 .size ChaCha20_128,.-ChaCha20_128
778 ########################################################################
779 # SSSE3 code path that handles longer messages.
781 # assign variables to favor Atom front-end
782 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
783 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
784 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
785 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
787 sub SSSE3_lane_ROUND {
788 my ($a0,$b0,$c0,$d0)=@_;
789 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
790 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
791 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
792 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
793 my @x=map("\"$_\"",@xx);
	# Consider the order in which variables are addressed by their
	# index:
	#
	#	0   4   8  12 < even round
	#	0   5  10  15 < odd round
	#
	# The 'a', 'b' and 'd' values are permanently allocated in
	# registers, @x[0..7,12..15], while the 'c' values are maintained
	# in memory. If you observe the 'c' column, you'll notice that a
	# pair of 'c's stays invariant between rounds. This means that we
	# only have to reload them once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
818 "&paddd (@x[$a0],@x[$b0])", # Q1
819 "&paddd (@x[$a1],@x[$b1])", # Q2
820 "&pxor (@x[$d0],@x[$a0])",
821 "&pxor (@x[$d1],@x[$a1])",
822 "&pshufb (@x[$d0],$t1)",
823 "&pshufb (@x[$d1],$t1)",
825 "&paddd ($xc,@x[$d0])",
826 "&paddd ($xc_,@x[$d1])",
827 "&pxor (@x[$b0],$xc)",
828 "&pxor (@x[$b1],$xc_)",
829 "&movdqa ($t0,@x[$b0])",
830 "&pslld (@x[$b0],12)",
832 "&movdqa ($t1,@x[$b1])",
833 "&pslld (@x[$b1],12)",
834 "&por (@x[$b0],$t0)",
836 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
837 "&por (@x[$b1],$t1)",
839 "&paddd (@x[$a0],@x[$b0])",
840 "&paddd (@x[$a1],@x[$b1])",
841 "&pxor (@x[$d0],@x[$a0])",
842 "&pxor (@x[$d1],@x[$a1])",
843 "&pshufb (@x[$d0],$t0)",
844 "&pshufb (@x[$d1],$t0)",
846 "&paddd ($xc,@x[$d0])",
847 "&paddd ($xc_,@x[$d1])",
848 "&pxor (@x[$b0],$xc)",
849 "&pxor (@x[$b1],$xc_)",
850 "&movdqa ($t1,@x[$b0])",
851 "&pslld (@x[$b0],7)",
853 "&movdqa ($t0,@x[$b1])",
854 "&pslld (@x[$b1],7)",
855 "&por (@x[$b0],$t1)",
857 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
858 "&por (@x[$b1],$t0)",
860 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
861 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
862 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
863 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
865 "&paddd (@x[$a2],@x[$b2])", # Q3
866 "&paddd (@x[$a3],@x[$b3])", # Q4
867 "&pxor (@x[$d2],@x[$a2])",
868 "&pxor (@x[$d3],@x[$a3])",
869 "&pshufb (@x[$d2],$t1)",
870 "&pshufb (@x[$d3],$t1)",
872 "&paddd ($xc,@x[$d2])",
873 "&paddd ($xc_,@x[$d3])",
874 "&pxor (@x[$b2],$xc)",
875 "&pxor (@x[$b3],$xc_)",
876 "&movdqa ($t0,@x[$b2])",
877 "&pslld (@x[$b2],12)",
879 "&movdqa ($t1,@x[$b3])",
880 "&pslld (@x[$b3],12)",
881 "&por (@x[$b2],$t0)",
883 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
884 "&por (@x[$b3],$t1)",
886 "&paddd (@x[$a2],@x[$b2])",
887 "&paddd (@x[$a3],@x[$b3])",
888 "&pxor (@x[$d2],@x[$a2])",
889 "&pxor (@x[$d3],@x[$a3])",
890 "&pshufb (@x[$d2],$t0)",
891 "&pshufb (@x[$d3],$t0)",
893 "&paddd ($xc,@x[$d2])",
894 "&paddd ($xc_,@x[$d3])",
895 "&pxor (@x[$b2],$xc)",
896 "&pxor (@x[$b3],$xc_)",
897 "&movdqa ($t1,@x[$b2])",
898 "&pslld (@x[$b2],7)",
900 "&movdqa ($t0,@x[$b3])",
901 "&pslld (@x[$b3],7)",
902 "&por (@x[$b2],$t1)",
904 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
my $xframe = $win64 ? 0xa8 : 8;
.type ChaCha20_4x,\@function,5
	mov %rsp,%r9		# frame pointer
.cfi_def_cfa_register %r9
$code.=<<___ if ($avx>1);
	shr \$32,%r10		# OPENSSL_ia32cap_P+8
	test \$`1<<5`,%r10	# test AVX2
	and \$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp \$`1<<22`,%r11	# check for MOVBE without XSAVE
	je .Ldo_sse3_after_all	# to detect Atom
	sub \$0x140+$xframe,%rsp
################ stack layout
# +0x00	SIMD equivalent of @x[8-12]
# +0x40	constant copy of key[0-2] smashed by lanes
# +0x100	SIMD counters (with nonce smashed by lanes)
$code.=<<___ if ($win64);
	movaps %xmm6,-0xa8(%r9)
	movaps %xmm7,-0x98(%r9)
	movaps %xmm8,-0x88(%r9)
	movaps %xmm9,-0x78(%r9)
	movaps %xmm10,-0x68(%r9)
	movaps %xmm11,-0x58(%r9)
	movaps %xmm12,-0x48(%r9)
	movaps %xmm13,-0x38(%r9)
	movaps %xmm14,-0x28(%r9)
	movaps %xmm15,-0x18(%r9)
	movdqa .Lsigma(%rip),$xa3	# key[0]
	movdqu ($key),$xb3		# key[1]
	movdqu 16($key),$xt3		# key[2]
	movdqu ($counter),$xd3		# key[3]
	lea 0x100(%rsp),%rcx		# size optimization
	lea .Lrot16(%rip),%r10
	lea .Lrot24(%rip),%r11
	pshufd \$0x00,$xa3,$xa0		# smash key by lanes...
	pshufd \$0x55,$xa3,$xa1
	movdqa $xa0,0x40(%rsp)		# ... and offload
	pshufd \$0xaa,$xa3,$xa2
	movdqa $xa1,0x50(%rsp)
	pshufd \$0xff,$xa3,$xa3
	movdqa $xa2,0x60(%rsp)
	movdqa $xa3,0x70(%rsp)
	pshufd \$0x00,$xb3,$xb0
	pshufd \$0x55,$xb3,$xb1
	movdqa $xb0,0x80-0x100(%rcx)
	pshufd \$0xaa,$xb3,$xb2
	movdqa $xb1,0x90-0x100(%rcx)
	pshufd \$0xff,$xb3,$xb3
	movdqa $xb2,0xa0-0x100(%rcx)
	movdqa $xb3,0xb0-0x100(%rcx)
	pshufd \$0x00,$xt3,$xt0		# "$xc0"
	pshufd \$0x55,$xt3,$xt1		# "$xc1"
	movdqa $xt0,0xc0-0x100(%rcx)
	pshufd \$0xaa,$xt3,$xt2		# "$xc2"
	movdqa $xt1,0xd0-0x100(%rcx)
	pshufd \$0xff,$xt3,$xt3		# "$xc3"
	movdqa $xt2,0xe0-0x100(%rcx)
	movdqa $xt3,0xf0-0x100(%rcx)
	pshufd \$0x00,$xd3,$xd0
	pshufd \$0x55,$xd3,$xd1
	paddd .Linc(%rip),$xd0		# don't save counters yet
	pshufd \$0xaa,$xd3,$xd2
	movdqa $xd1,0x110-0x100(%rcx)
	pshufd \$0xff,$xd3,$xd3
	movdqa $xd2,0x120-0x100(%rcx)
	movdqa $xd3,0x130-0x100(%rcx)
	movdqa 0x40(%rsp),$xa0		# re-load smashed key
	movdqa 0x50(%rsp),$xa1
	movdqa 0x60(%rsp),$xa2
	movdqa 0x70(%rsp),$xa3
	movdqa 0x80-0x100(%rcx),$xb0
	movdqa 0x90-0x100(%rcx),$xb1
	movdqa 0xa0-0x100(%rcx),$xb2
	movdqa 0xb0-0x100(%rcx),$xb3
	movdqa 0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa 0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa 0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa 0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa 0x100-0x100(%rcx),$xd0
	movdqa 0x110-0x100(%rcx),$xd1
	movdqa 0x120-0x100(%rcx),$xd2
	movdqa 0x130-0x100(%rcx),$xd3
	paddd .Lfour(%rip),$xd0		# next SIMD counters
	movdqa $xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa $xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa (%r10),$xt3		# .Lrot16(%rip)
	movdqa $xd0,0x100-0x100(%rcx)	# save SIMD counters
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
	paddd 0x40(%rsp),$xa0		# accumulate key material
	paddd 0x50(%rsp),$xa1
	paddd 0x60(%rsp),$xa2
	paddd 0x70(%rsp),$xa3
	movdqa $xa0,$xt2		# "de-interlace" data
	punpcklqdq $xa2,$xa0		# "a0"
	punpcklqdq $xt3,$xt2		# "a2"
	punpckhqdq $xa2,$xa1		# "a1"
	punpckhqdq $xt3,$xa3		# "a3"
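	# The punpck{l,h}dq/punpck{l,h}qdq sequence is a 4x4 dword
	# transpose: going in, each register holds the same state word
	# for four independent blocks (lanes); coming out, each register
	# holds 16 contiguous output bytes of one block.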
	($xa2,$xt2)=($xt2,$xa2);
	paddd 0x80-0x100(%rcx),$xb0
	paddd 0x90-0x100(%rcx),$xb1
	paddd 0xa0-0x100(%rcx),$xb2
	paddd 0xb0-0x100(%rcx),$xb3
	movdqa $xa0,0x00(%rsp)		# offload $xaN
	movdqa $xa1,0x10(%rsp)
	movdqa 0x20(%rsp),$xa0		# "xc2"
	movdqa 0x30(%rsp),$xa1		# "xc3"
	punpcklqdq $xb2,$xb0		# "b0"
	punpcklqdq $xt3,$xt2		# "b2"
	punpckhqdq $xb2,$xb1		# "b1"
	punpckhqdq $xt3,$xb3		# "b3"
	($xb2,$xt2)=($xt2,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
	paddd 0xc0-0x100(%rcx),$xc0
	paddd 0xd0-0x100(%rcx),$xc1
	paddd 0xe0-0x100(%rcx),$xc2
	paddd 0xf0-0x100(%rcx),$xc3
	movdqa $xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa $xa3,0x30(%rsp)
	punpcklqdq $xc2,$xc0		# "c0"
	punpcklqdq $xt3,$xt2		# "c2"
	punpckhqdq $xc2,$xc1		# "c1"
	punpckhqdq $xt3,$xc3		# "c3"
	($xc2,$xt2)=($xt2,$xc2);
	($xt0,$xt1)=($xa2,$xa3);	# use $xaN as temporary
	paddd 0x100-0x100(%rcx),$xd0
	paddd 0x110-0x100(%rcx),$xd1
	paddd 0x120-0x100(%rcx),$xd2
	paddd 0x130-0x100(%rcx),$xd3
	punpcklqdq $xd2,$xd0		# "d0"
	punpcklqdq $xt3,$xt2		# "d2"
	punpckhqdq $xd2,$xd1		# "d1"
	punpckhqdq $xt3,$xd3		# "d3"
	($xd2,$xt2)=($xt2,$xd2);
	movdqu 0x00($inp),$xt0		# xor with input
	movdqu 0x10($inp),$xt1
	movdqu 0x20($inp),$xt2
	movdqu 0x30($inp),$xt3
	pxor 0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu $xt0,0x00($out)
	movdqu 0x40($inp),$xt0
	movdqu $xt1,0x10($out)
	movdqu 0x50($inp),$xt1
	movdqu $xt2,0x20($out)
	movdqu 0x60($inp),$xt2
	movdqu $xt3,0x30($out)
	movdqu 0x70($inp),$xt3
	lea 0x80($inp),$inp		# size optimization
	pxor 0x10(%rsp),$xt0
	movdqu $xt0,0x40($out)
	movdqu 0x00($inp),$xt0
	movdqu $xt1,0x50($out)
	movdqu 0x10($inp),$xt1
	movdqu $xt2,0x60($out)
	movdqu 0x20($inp),$xt2
	movdqu $xt3,0x70($out)
	lea 0x80($out),$out		# size optimization
	movdqu 0x30($inp),$xt3
	pxor 0x20(%rsp),$xt0
	movdqu $xt0,0x00($out)
	movdqu 0x40($inp),$xt0
	movdqu $xt1,0x10($out)
	movdqu 0x50($inp),$xt1
	movdqu $xt2,0x20($out)
	movdqu 0x60($inp),$xt2
	movdqu $xt3,0x30($out)
	movdqu 0x70($inp),$xt3
	lea 0x80($inp),$inp		# inp+=64*4
	pxor 0x30(%rsp),$xt0
	movdqu $xt0,0x40($out)
	movdqu $xt1,0x50($out)
	movdqu $xt2,0x60($out)
	movdqu $xt3,0x70($out)
	lea 0x80($out),$out		# out+=64*4
	#movdqa 0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	#movdqa $xt0,0x00(%rsp)
	movdqa $xb0,0x10(%rsp)
	movdqa $xc0,0x20(%rsp)
	movdqa $xd0,0x30(%rsp)
	movdqu 0x00($inp),$xt0		# xor with input
	movdqu 0x10($inp),$xt1
	movdqu 0x20($inp),$xt2
	movdqu 0x30($inp),$xt3
	pxor 0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu $xt0,0x00($out)
	movdqu $xt1,0x10($out)
	movdqu $xt2,0x20($out)
	movdqu $xt3,0x30($out)
	movdqa 0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea 0x40($inp),$inp		# inp+=64*1
	movdqa $xt0,0x00(%rsp)
	movdqa $xb1,0x10(%rsp)
	lea 0x40($out),$out		# out+=64*1
	movdqa $xc1,0x20(%rsp)
	sub \$64,$len			# len-=64*1
	movdqa $xd1,0x30(%rsp)
	movdqu 0x00($inp),$xt0		# xor with input
	movdqu 0x10($inp),$xt1
	movdqu 0x20($inp),$xt2
	movdqu 0x30($inp),$xt3
	pxor 0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu $xt0,0x00($out)
	movdqu 0x40($inp),$xt0
	movdqu $xt1,0x10($out)
	movdqu 0x50($inp),$xt1
	movdqu $xt2,0x20($out)
	movdqu 0x60($inp),$xt2
	movdqu $xt3,0x30($out)
	movdqu 0x70($inp),$xt3
	pxor 0x10(%rsp),$xt0
	movdqu $xt0,0x40($out)
	movdqu $xt1,0x50($out)
	movdqu $xt2,0x60($out)
	movdqu $xt3,0x70($out)
	movdqa 0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea 0x80($inp),$inp		# inp+=64*2
	movdqa $xt0,0x00(%rsp)
	movdqa $xb2,0x10(%rsp)
	lea 0x80($out),$out		# out+=64*2
	movdqa $xc2,0x20(%rsp)
	sub \$128,$len			# len-=64*2
	movdqa $xd2,0x30(%rsp)
	movdqu 0x00($inp),$xt0		# xor with input
	movdqu 0x10($inp),$xt1
	movdqu 0x20($inp),$xt2
	movdqu 0x30($inp),$xt3
	pxor 0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu $xt0,0x00($out)
	movdqu 0x40($inp),$xt0
	movdqu $xt1,0x10($out)
	movdqu 0x50($inp),$xt1
	movdqu $xt2,0x20($out)
	movdqu 0x60($inp),$xt2
	movdqu $xt3,0x30($out)
	movdqu 0x70($inp),$xt3
	lea 0x80($inp),$inp		# size optimization
	pxor 0x10(%rsp),$xt0
	movdqu $xt0,0x40($out)
	movdqu 0x00($inp),$xt0
	movdqu $xt1,0x50($out)
	movdqu 0x10($inp),$xt1
	movdqu $xt2,0x60($out)
	movdqu 0x20($inp),$xt2
	movdqu $xt3,0x70($out)
	lea 0x80($out),$out		# size optimization
	movdqu 0x30($inp),$xt3
	pxor 0x20(%rsp),$xt0
	movdqu $xt0,0x00($out)
	movdqu $xt1,0x10($out)
	movdqu $xt2,0x20($out)
	movdqu $xt3,0x30($out)
	movdqa 0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea 0x40($inp),$inp		# inp+=64*3
	movdqa $xt0,0x00(%rsp)
	movdqa $xb3,0x10(%rsp)
	lea 0x40($out),$out		# out+=64*3
	movdqa $xc3,0x20(%rsp)
	sub \$192,$len			# len-=64*3
	movdqa $xd3,0x30(%rsp)
	movzb ($inp,%r10),%eax
	movzb (%rsp,%r10),%ecx
	mov %al,-1($out,%r10)
$code.=<<___ if ($win64);
	movaps -0xa8(%r9),%xmm6
	movaps -0x98(%r9),%xmm7
	movaps -0x88(%r9),%xmm8
	movaps -0x78(%r9),%xmm9
	movaps -0x68(%r9),%xmm10
	movaps -0x58(%r9),%xmm11
	movaps -0x48(%r9),%xmm12
	movaps -0x38(%r9),%xmm13
	movaps -0x28(%r9),%xmm14
	movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.size ChaCha20_4x,.-ChaCha20_4x
########################################################################
# XOP code path that handles all lengths.
#
# There is some "anomaly" observed depending on instruction size or
# alignment. If you look closely at the code below, you'll notice that
# sometimes the argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5%
# performance improvement. This is on an FX-4100...
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",
	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],7)",
	"&vprotd	(@x[$b1],@x[$b1],7)",
	"&vprotd	(@x[$b2],@x[$b2],7)",
	"&vprotd	(@x[$b3],@x[$b3],7)"
my $xframe = $win64 ? 0xa8 : 8;
.type ChaCha20_4xop,\@function,5
	mov %rsp,%r9		# frame pointer
.cfi_def_cfa_register %r9
	sub \$0x140+$xframe,%rsp
################ stack layout
# +0x00	SIMD equivalent of @x[8-12]
# +0x40	constant copy of key[0-2] smashed by lanes
# +0x100	SIMD counters (with nonce smashed by lanes)
$code.=<<___ if ($win64);
	movaps %xmm6,-0xa8(%r9)
	movaps %xmm7,-0x98(%r9)
	movaps %xmm8,-0x88(%r9)
	movaps %xmm9,-0x78(%r9)
	movaps %xmm10,-0x68(%r9)
	movaps %xmm11,-0x58(%r9)
	movaps %xmm12,-0x48(%r9)
	movaps %xmm13,-0x38(%r9)
	movaps %xmm14,-0x28(%r9)
	movaps %xmm15,-0x18(%r9)
	vmovdqa .Lsigma(%rip),$xa3	# key[0]
	vmovdqu ($key),$xb3		# key[1]
	vmovdqu 16($key),$xt3		# key[2]
	vmovdqu ($counter),$xd3		# key[3]
	lea 0x100(%rsp),%rcx		# size optimization
	vpshufd \$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd \$0x55,$xa3,$xa1
	vmovdqa $xa0,0x40(%rsp)		# ... and offload
	vpshufd \$0xaa,$xa3,$xa2
	vmovdqa $xa1,0x50(%rsp)
	vpshufd \$0xff,$xa3,$xa3
	vmovdqa $xa2,0x60(%rsp)
	vmovdqa $xa3,0x70(%rsp)
	vpshufd \$0x00,$xb3,$xb0
	vpshufd \$0x55,$xb3,$xb1
	vmovdqa $xb0,0x80-0x100(%rcx)
	vpshufd \$0xaa,$xb3,$xb2
	vmovdqa $xb1,0x90-0x100(%rcx)
	vpshufd \$0xff,$xb3,$xb3
	vmovdqa $xb2,0xa0-0x100(%rcx)
	vmovdqa $xb3,0xb0-0x100(%rcx)
	vpshufd \$0x00,$xt3,$xt0	# "$xc0"
	vpshufd \$0x55,$xt3,$xt1	# "$xc1"
	vmovdqa $xt0,0xc0-0x100(%rcx)
	vpshufd \$0xaa,$xt3,$xt2	# "$xc2"
	vmovdqa $xt1,0xd0-0x100(%rcx)
	vpshufd \$0xff,$xt3,$xt3	# "$xc3"
	vmovdqa $xt2,0xe0-0x100(%rcx)
	vmovdqa $xt3,0xf0-0x100(%rcx)
	vpshufd \$0x00,$xd3,$xd0
	vpshufd \$0x55,$xd3,$xd1
	vpaddd .Linc(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd \$0xaa,$xd3,$xd2
	vmovdqa $xd1,0x110-0x100(%rcx)
	vpshufd \$0xff,$xd3,$xd3
	vmovdqa $xd2,0x120-0x100(%rcx)
	vmovdqa $xd3,0x130-0x100(%rcx)
	vmovdqa 0x40(%rsp),$xa0		# re-load smashed key
	vmovdqa 0x50(%rsp),$xa1
	vmovdqa 0x60(%rsp),$xa2
	vmovdqa 0x70(%rsp),$xa3
	vmovdqa 0x80-0x100(%rcx),$xb0
	vmovdqa 0x90-0x100(%rcx),$xb1
	vmovdqa 0xa0-0x100(%rcx),$xb2
	vmovdqa 0xb0-0x100(%rcx),$xb3
	vmovdqa 0xc0-0x100(%rcx),$xt0	# "$xc0"
	vmovdqa 0xd0-0x100(%rcx),$xt1	# "$xc1"
	vmovdqa 0xe0-0x100(%rcx),$xt2	# "$xc2"
	vmovdqa 0xf0-0x100(%rcx),$xt3	# "$xc3"
	vmovdqa 0x100-0x100(%rcx),$xd0
	vmovdqa 0x110-0x100(%rcx),$xd1
	vmovdqa 0x120-0x100(%rcx),$xd2
	vmovdqa 0x130-0x100(%rcx),$xd3
	vpaddd .Lfour(%rip),$xd0,$xd0	# next SIMD counters
	vmovdqa $xd0,0x100-0x100(%rcx)	# save SIMD counters
	foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
	vpaddd 0x40(%rsp),$xa0,$xa0	# accumulate key material
	vpaddd 0x50(%rsp),$xa1,$xa1
	vpaddd 0x60(%rsp),$xa2,$xa2
	vpaddd 0x70(%rsp),$xa3,$xa3
	vmovdqa $xt2,0x20(%rsp)		# offload $xc2,3
	vmovdqa $xt3,0x30(%rsp)
	vpunpckldq $xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq $xa3,$xa2,$xt3
	vpunpckhdq $xa1,$xa0,$xa0
	vpunpckhdq $xa3,$xa2,$xa2
	vpunpcklqdq $xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq $xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq $xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq $xa2,$xa0,$xa0	# "a3"
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
	vpaddd 0x80-0x100(%rcx),$xb0,$xb0
	vpaddd 0x90-0x100(%rcx),$xb1,$xb1
	vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
	vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
	vmovdqa $xa0,0x00(%rsp)		# offload $xa0,1
	vmovdqa $xa1,0x10(%rsp)
	vmovdqa 0x20(%rsp),$xa0		# "xc2"
	vmovdqa 0x30(%rsp),$xa1		# "xc3"
	vpunpckldq $xb1,$xb0,$xt2
	vpunpckldq $xb3,$xb2,$xt3
	vpunpckhdq $xb1,$xb0,$xb0
	vpunpckhdq $xb3,$xb2,$xb2
	vpunpcklqdq $xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq $xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq $xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq $xb2,$xb0,$xb0	# "b3"
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
	vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
	vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
	vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
	vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
	vpunpckldq $xc1,$xc0,$xt2
	vpunpckldq $xc3,$xc2,$xt3
	vpunpckhdq $xc1,$xc0,$xc0
	vpunpckhdq $xc3,$xc2,$xc2
	vpunpcklqdq $xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq $xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq $xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq $xc2,$xc0,$xc0	# "c3"
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
	vpaddd 0x100-0x100(%rcx),$xd0,$xd0
	vpaddd 0x110-0x100(%rcx),$xd1,$xd1
	vpaddd 0x120-0x100(%rcx),$xd2,$xd2
	vpaddd 0x130-0x100(%rcx),$xd3,$xd3
	vpunpckldq $xd1,$xd0,$xt2
	vpunpckldq $xd3,$xd2,$xt3
	vpunpckhdq $xd1,$xd0,$xd0
	vpunpckhdq $xd3,$xd2,$xd2
	vpunpcklqdq $xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq $xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq $xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq $xd2,$xd0,$xd0	# "d3"
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
	($xa0,$xa1)=($xt2,$xt3);
	vmovdqa 0x00(%rsp),$xa0		# restore $xa0,1
	vmovdqa 0x10(%rsp),$xa1
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x10($inp),$xb0,$xb0
	vpxor 0x20($inp),$xc0,$xc0
	vpxor 0x30($inp),$xd0,$xd0
	vpxor 0x40($inp),$xa1,$xa1
	vpxor 0x50($inp),$xb1,$xb1
	vpxor 0x60($inp),$xc1,$xc1
	vpxor 0x70($inp),$xd1,$xd1
	lea 0x80($inp),$inp		# size optimization
	vpxor 0x00($inp),$xa2,$xa2
	vpxor 0x10($inp),$xb2,$xb2
	vpxor 0x20($inp),$xc2,$xc2
	vpxor 0x30($inp),$xd2,$xd2
	vpxor 0x40($inp),$xa3,$xa3
	vpxor 0x50($inp),$xb3,$xb3
	vpxor 0x60($inp),$xc3,$xc3
	vpxor 0x70($inp),$xd3,$xd3
	lea 0x80($inp),$inp		# inp+=64*4
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x10($out)
	vmovdqu $xc0,0x20($out)
	vmovdqu $xd0,0x30($out)
	vmovdqu $xa1,0x40($out)
	vmovdqu $xb1,0x50($out)
	vmovdqu $xc1,0x60($out)
	vmovdqu $xd1,0x70($out)
	lea 0x80($out),$out		# size optimization
	vmovdqu $xa2,0x00($out)
	vmovdqu $xb2,0x10($out)
	vmovdqu $xc2,0x20($out)
	vmovdqu $xd2,0x30($out)
	vmovdqu $xa3,0x40($out)
	vmovdqu $xb3,0x50($out)
	vmovdqu $xc3,0x60($out)
	vmovdqu $xd3,0x70($out)
	lea 0x80($out),$out		# out+=64*4
	jae .L192_or_more4xop
	jae .L128_or_more4xop
	jae .L64_or_more4xop
	vmovdqa $xa0,0x00(%rsp)
	vmovdqa $xb0,0x10(%rsp)
	vmovdqa $xc0,0x20(%rsp)
	vmovdqa $xd0,0x30(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x10($inp),$xb0,$xb0
	vpxor 0x20($inp),$xc0,$xc0
	vpxor 0x30($inp),$xd0,$xd0
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x10($out)
	vmovdqu $xc0,0x20($out)
	vmovdqu $xd0,0x30($out)
	lea 0x40($inp),$inp		# inp+=64*1
	vmovdqa $xa1,0x00(%rsp)
	vmovdqa $xb1,0x10(%rsp)
	lea 0x40($out),$out		# out+=64*1
	vmovdqa $xc1,0x20(%rsp)
	sub \$64,$len			# len-=64*1
	vmovdqa $xd1,0x30(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x10($inp),$xb0,$xb0
	vpxor 0x20($inp),$xc0,$xc0
	vpxor 0x30($inp),$xd0,$xd0
	vpxor 0x40($inp),$xa1,$xa1
	vpxor 0x50($inp),$xb1,$xb1
	vpxor 0x60($inp),$xc1,$xc1
	vpxor 0x70($inp),$xd1,$xd1
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x10($out)
	vmovdqu $xc0,0x20($out)
	vmovdqu $xd0,0x30($out)
	vmovdqu $xa1,0x40($out)
	vmovdqu $xb1,0x50($out)
	vmovdqu $xc1,0x60($out)
	vmovdqu $xd1,0x70($out)
	lea 0x80($inp),$inp		# inp+=64*2
	vmovdqa $xa2,0x00(%rsp)
	vmovdqa $xb2,0x10(%rsp)
	lea 0x80($out),$out		# out+=64*2
	vmovdqa $xc2,0x20(%rsp)
	sub \$128,$len			# len-=64*2
	vmovdqa $xd2,0x30(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x10($inp),$xb0,$xb0
	vpxor 0x20($inp),$xc0,$xc0
	vpxor 0x30($inp),$xd0,$xd0
	vpxor 0x40($inp),$xa1,$xa1
	vpxor 0x50($inp),$xb1,$xb1
	vpxor 0x60($inp),$xc1,$xc1
	vpxor 0x70($inp),$xd1,$xd1
	lea 0x80($inp),$inp		# size optimization
	vpxor 0x00($inp),$xa2,$xa2
	vpxor 0x10($inp),$xb2,$xb2
	vpxor 0x20($inp),$xc2,$xc2
	vpxor 0x30($inp),$xd2,$xd2
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x10($out)
	vmovdqu $xc0,0x20($out)
	vmovdqu $xd0,0x30($out)
	vmovdqu $xa1,0x40($out)
	vmovdqu $xb1,0x50($out)
	vmovdqu $xc1,0x60($out)
	vmovdqu $xd1,0x70($out)
	lea 0x80($out),$out		# size optimization
	vmovdqu $xa2,0x00($out)
	vmovdqu $xb2,0x10($out)
	vmovdqu $xc2,0x20($out)
	vmovdqu $xd2,0x30($out)
	lea 0x40($inp),$inp		# inp+=64*3
	vmovdqa $xa3,0x00(%rsp)
	vmovdqa $xb3,0x10(%rsp)
	lea 0x40($out),$out		# out+=64*3
	vmovdqa $xc3,0x20(%rsp)
	sub \$192,$len			# len-=64*3
	vmovdqa $xd3,0x30(%rsp)
	movzb ($inp,%r10),%eax
	movzb (%rsp,%r10),%ecx
	mov %al,-1($out,%r10)
$code.=<<___ if ($win64);
	movaps -0xa8(%r9),%xmm6
	movaps -0x98(%r9),%xmm7
	movaps -0x88(%r9),%xmm8
	movaps -0x78(%r9),%xmm9
	movaps -0x68(%r9),%xmm10
	movaps -0x58(%r9),%xmm11
	movaps -0x48(%r9),%xmm12
	movaps -0x38(%r9),%xmm13
	movaps -0x28(%r9),%xmm14
	movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.size ChaCha20_4xop,.-ChaCha20_4xop
########################################################################
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);
	# Consider the order in which variables are addressed by their
	# index:
	#
	#	0   4   8  12 < even round
	#	0   5  10  15 < odd round
	#
	# The 'a', 'b' and 'd' values are permanently allocated in
	# registers, @x[0..7,12..15], while the 'c' values are maintained
	# in memory. If you observe the 'c' column, you'll notice that a
	# pair of 'c's stays invariant between rounds. This means that we
	# only have to reload them once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
1861 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1862 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1863 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1864 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1865 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1866 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1868 "&vpaddd ($xc,$xc,@x[$d0])",
1869 "&vpxor (@x[$b0],$xc,@x[$b0])",
1870 "&vpslld ($t0,@x[$b0],12)",
1871 "&vpsrld (@x[$b0],@x[$b0],20)",
1872 "&vpor (@x[$b0],$t0,@x[$b0])",
1873 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1874 "&vpaddd ($xc_,$xc_,@x[$d1])",
1875 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1876 "&vpslld ($t1,@x[$b1],12)",
1877 "&vpsrld (@x[$b1],@x[$b1],20)",
1878 "&vpor (@x[$b1],$t1,@x[$b1])",
1880 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1881 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1882 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1883 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1884 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1885 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1887 "&vpaddd ($xc,$xc,@x[$d0])",
1888 "&vpxor (@x[$b0],$xc,@x[$b0])",
1889 "&vpslld ($t1,@x[$b0],7)",
1890 "&vpsrld (@x[$b0],@x[$b0],25)",
1891 "&vpor (@x[$b0],$t1,@x[$b0])",
1892 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1893 "&vpaddd ($xc_,$xc_,@x[$d1])",
1894 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1895 "&vpslld ($t0,@x[$b1],7)",
1896 "&vpsrld (@x[$b1],@x[$b1],25)",
1897 "&vpor (@x[$b1],$t0,@x[$b1])",
1899 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1900 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1901 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1902 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1904 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1905 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1906 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1907 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1908 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1909 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1911 "&vpaddd ($xc,$xc,@x[$d2])",
1912 "&vpxor (@x[$b2],$xc,@x[$b2])",
1913 "&vpslld ($t0,@x[$b2],12)",
1914 "&vpsrld (@x[$b2],@x[$b2],20)",
1915 "&vpor (@x[$b2],$t0,@x[$b2])",
1916 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1917 "&vpaddd ($xc_,$xc_,@x[$d3])",
1918 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1919 "&vpslld ($t1,@x[$b3],12)",
1920 "&vpsrld (@x[$b3],@x[$b3],20)",
1921 "&vpor (@x[$b3],$t1,@x[$b3])",
1923 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1924 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1925 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1926 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1927 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1928 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1930 "&vpaddd ($xc,$xc,@x[$d2])",
1931 "&vpxor (@x[$b2],$xc,@x[$b2])",
1932 "&vpslld ($t1,@x[$b2],7)",
1933 "&vpsrld (@x[$b2],@x[$b2],25)",
1934 "&vpor (@x[$b2],$t1,@x[$b2])",
1935 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1936 "&vpaddd ($xc_,$xc_,@x[$d3])",
1937 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1938 "&vpslld ($t0,@x[$b3],7)",
1939 "&vpsrld (@x[$b3],@x[$b3],25)",
1940 "&vpor (@x[$b3],$t0,@x[$b3])"
1944 my $xframe = $win64 ? 0xa8 : 8;
1947 .type ChaCha20_8x,\@function,5
1952 mov %rsp,%r9 # frame register
1953 .cfi_def_cfa_register %r9
1954 sub \$0x280+$xframe,%rsp
1957 $code.=<<___ if ($win64);
1958 movaps %xmm6,-0xa8(%r9)
1959 movaps %xmm7,-0x98(%r9)
1960 movaps %xmm8,-0x88(%r9)
1961 movaps %xmm9,-0x78(%r9)
1962 movaps %xmm10,-0x68(%r9)
1963 movaps %xmm11,-0x58(%r9)
1964 movaps %xmm12,-0x48(%r9)
1965 movaps %xmm13,-0x38(%r9)
1966 movaps %xmm14,-0x28(%r9)
1967 movaps %xmm15,-0x18(%r9)
1973 ################ stack layout
1974 # +0x00 SIMD equivalent of @x[8-12]
1976 # +0x80 constant copy of key[0-2] smashed by lanes
1978 # +0x200 SIMD counters (with nonce smashed by lanes)
1982 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1983 vbroadcasti128 ($key),$xb3 # key[1]
1984 vbroadcasti128 16($key),$xt3 # key[2]
1985 vbroadcasti128 ($counter),$xd3 # key[3]
1986 lea 0x100(%rsp),%rcx # size optimization
1987 lea 0x200(%rsp),%rax # size optimization
1988 lea .Lrot16(%rip),%r10
1989 lea .Lrot24(%rip),%r11
1991 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1992 vpshufd \$0x55,$xa3,$xa1
1993 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1994 vpshufd \$0xaa,$xa3,$xa2
1995 vmovdqa $xa1,0xa0-0x100(%rcx)
1996 vpshufd \$0xff,$xa3,$xa3
1997 vmovdqa $xa2,0xc0-0x100(%rcx)
1998 vmovdqa $xa3,0xe0-0x100(%rcx)
2000 vpshufd \$0x00,$xb3,$xb0
2001 vpshufd \$0x55,$xb3,$xb1
2002 vmovdqa $xb0,0x100-0x100(%rcx)
2003 vpshufd \$0xaa,$xb3,$xb2
2004 vmovdqa $xb1,0x120-0x100(%rcx)
2005 vpshufd \$0xff,$xb3,$xb3
2006 vmovdqa $xb2,0x140-0x100(%rcx)
2007 vmovdqa $xb3,0x160-0x100(%rcx)
2009 vpshufd \$0x00,$xt3,$xt0 # "xc0"
2010 vpshufd \$0x55,$xt3,$xt1 # "xc1"
2011 vmovdqa $xt0,0x180-0x200(%rax)
2012 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
2013 vmovdqa $xt1,0x1a0-0x200(%rax)
2014 vpshufd \$0xff,$xt3,$xt3 # "xc3"
2015 vmovdqa $xt2,0x1c0-0x200(%rax)
2016 vmovdqa $xt3,0x1e0-0x200(%rax)
2018 vpshufd \$0x00,$xd3,$xd0
2019 vpshufd \$0x55,$xd3,$xd1
2020 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
2021 vpshufd \$0xaa,$xd3,$xd2
2022 vmovdqa $xd1,0x220-0x200(%rax)
2023 vpshufd \$0xff,$xd3,$xd3
2024 vmovdqa $xd2,0x240-0x200(%rax)
2025 vmovdqa $xd3,0x260-0x200(%rax)
2031 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
2032 vmovdqa 0xa0-0x100(%rcx),$xa1
2033 vmovdqa 0xc0-0x100(%rcx),$xa2
2034 vmovdqa 0xe0-0x100(%rcx),$xa3
2035 vmovdqa 0x100-0x100(%rcx),$xb0
2036 vmovdqa 0x120-0x100(%rcx),$xb1
2037 vmovdqa 0x140-0x100(%rcx),$xb2
2038 vmovdqa 0x160-0x100(%rcx),$xb3
2039 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
2040 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
2041 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
2042 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
2043 vmovdqa 0x200-0x200(%rax),$xd0
2044 vmovdqa 0x220-0x200(%rax),$xd1
2045 vmovdqa 0x240-0x200(%rax),$xd2
2046 vmovdqa 0x260-0x200(%rax),$xd3
2047 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
2050 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
2051 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
2052 vbroadcasti128 (%r10),$xt3
2053 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
2060 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2061 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
2066 lea 0x200(%rsp),%rax # size optimization
2067 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
2068 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
2069 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
2070 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
2072 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2073 vpunpckldq $xa3,$xa2,$xt3
2074 vpunpckhdq $xa1,$xa0,$xa0
2075 vpunpckhdq $xa3,$xa2,$xa2
2076 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2077 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2078 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2079 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2081 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2083 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
2084 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
2085 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
2086 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
2088 vpunpckldq $xb1,$xb0,$xt2
2089 vpunpckldq $xb3,$xb2,$xt3
2090 vpunpckhdq $xb1,$xb0,$xb0
2091 vpunpckhdq $xb3,$xb2,$xb2
2092 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2093 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2094 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2095 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2097 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2099 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
2100 vperm2i128 \$0x31,$xb0,$xa0,$xb0
2101 vperm2i128 \$0x20,$xb1,$xa1,$xa0
2102 vperm2i128 \$0x31,$xb1,$xa1,$xb1
2103 vperm2i128 \$0x20,$xb2,$xa2,$xa1
2104 vperm2i128 \$0x31,$xb2,$xa2,$xb2
2105 vperm2i128 \$0x20,$xb3,$xa3,$xa2
2106 vperm2i128 \$0x31,$xb3,$xa3,$xb3
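	# vperm2i128 finishes the de-interlace across the two 128-bit
	# lanes of each ymm register: \$0x20 gathers the low halves of a
	# pair and \$0x31 the high halves, so each register ends up with
	# 32 contiguous output bytes of one block.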
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
	vmovdqa $xa0,0x00(%rsp)		# offload $xaN
	vmovdqa $xa1,0x20(%rsp)
	vmovdqa 0x40(%rsp),$xc2		# $xa0
	vmovdqa 0x60(%rsp),$xc3		# $xa1
	vpaddd 0x180-0x200(%rax),$xc0,$xc0
	vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
	vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
	vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
	vpunpckldq $xc1,$xc0,$xt2
	vpunpckldq $xc3,$xc2,$xt3
	vpunpckhdq $xc1,$xc0,$xc0
	vpunpckhdq $xc3,$xc2,$xc2
	vpunpcklqdq $xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq $xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq $xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq $xc2,$xc0,$xc0		# "c3"
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
	vpaddd 0x200-0x200(%rax),$xd0,$xd0
	vpaddd 0x220-0x200(%rax),$xd1,$xd1
	vpaddd 0x240-0x200(%rax),$xd2,$xd2
	vpaddd 0x260-0x200(%rax),$xd3,$xd3
	vpunpckldq $xd1,$xd0,$xt2
	vpunpckldq $xd3,$xd2,$xt3
	vpunpckhdq $xd1,$xd0,$xd0
	vpunpckhdq $xd3,$xd2,$xd2
	vpunpcklqdq $xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq $xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq $xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq $xd2,$xd0,$xd0		# "d3"
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
	vperm2i128 \$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128 \$0x31,$xd0,$xc0,$xd0
	vperm2i128 \$0x20,$xd1,$xc1,$xc0
	vperm2i128 \$0x31,$xd1,$xc1,$xd1
	vperm2i128 \$0x20,$xd2,$xc2,$xc1
	vperm2i128 \$0x31,$xd2,$xc2,$xd2
	vperm2i128 \$0x20,$xd3,$xc3,$xc2
	vperm2i128 \$0x31,$xd3,$xc3,$xd3
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
	($xa0,$xa1)=($xt2,$xt3);
	vmovdqa 0x00(%rsp),$xa0		# $xaN was offloaded, remember?
	vmovdqa 0x20(%rsp),$xa1
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	lea 0x80($inp),$inp		# size optimization
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	lea 0x80($out),$out		# size optimization
	vpxor 0x00($inp),$xa1,$xa1
	vpxor 0x20($inp),$xb1,$xb1
	vpxor 0x40($inp),$xc1,$xc1
	vpxor 0x60($inp),$xd1,$xd1
	lea 0x80($inp),$inp		# size optimization
	vmovdqu $xa1,0x00($out)
	vmovdqu $xb1,0x20($out)
	vmovdqu $xc1,0x40($out)
	vmovdqu $xd1,0x60($out)
	lea 0x80($out),$out		# size optimization
	vpxor 0x00($inp),$xa2,$xa2
	vpxor 0x20($inp),$xb2,$xb2
	vpxor 0x40($inp),$xc2,$xc2
	vpxor 0x60($inp),$xd2,$xd2
	lea 0x80($inp),$inp		# size optimization
	vmovdqu $xa2,0x00($out)
	vmovdqu $xb2,0x20($out)
	vmovdqu $xc2,0x40($out)
	vmovdqu $xd2,0x60($out)
	lea 0x80($out),$out		# size optimization
	vpxor 0x00($inp),$xa3,$xa3
	vpxor 0x20($inp),$xb3,$xb3
	vpxor 0x40($inp),$xc3,$xc3
	vpxor 0x60($inp),$xd3,$xd3
	lea 0x80($inp),$inp		# size optimization
	vmovdqu $xa3,0x00($out)
	vmovdqu $xb3,0x20($out)
	vmovdqu $xc3,0x40($out)
	vmovdqu $xd3,0x60($out)
	lea 0x80($out),$out		# size optimization
	vmovdqa $xa0,0x00(%rsp)
	vmovdqa $xb0,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	lea 0x40($inp),$inp		# inp+=64*1
	vmovdqa $xc0,0x00(%rsp)
	lea 0x40($out),$out		# out+=64*1
	sub \$64,$len			# len-=64*1
	vmovdqa $xd0,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	lea 0x80($inp),$inp		# inp+=64*2
	vmovdqa $xa1,0x00(%rsp)
	lea 0x80($out),$out		# out+=64*2
	sub \$128,$len			# len-=64*2
	vmovdqa $xb1,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	vpxor 0x80($inp),$xa1,$xa1
	vpxor 0xa0($inp),$xb1,$xb1
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	vmovdqu $xa1,0x80($out)
	vmovdqu $xb1,0xa0($out)
	lea 0xc0($inp),$inp		# inp+=64*3
	vmovdqa $xc1,0x00(%rsp)
	lea 0xc0($out),$out		# out+=64*3
	sub \$192,$len			# len-=64*3
	vmovdqa $xd1,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	vpxor 0x80($inp),$xa1,$xa1
	vpxor 0xa0($inp),$xb1,$xb1
	vpxor 0xc0($inp),$xc1,$xc1
	vpxor 0xe0($inp),$xd1,$xd1
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	vmovdqu $xa1,0x80($out)
	vmovdqu $xb1,0xa0($out)
	vmovdqu $xc1,0xc0($out)
	vmovdqu $xd1,0xe0($out)
	lea 0x100($inp),$inp		# inp+=64*4
	vmovdqa $xa2,0x00(%rsp)
	lea 0x100($out),$out		# out+=64*4
	sub \$256,$len			# len-=64*4
	vmovdqa $xb2,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	vpxor 0x80($inp),$xa1,$xa1
	vpxor 0xa0($inp),$xb1,$xb1
	vpxor 0xc0($inp),$xc1,$xc1
	vpxor 0xe0($inp),$xd1,$xd1
	vpxor 0x100($inp),$xa2,$xa2
	vpxor 0x120($inp),$xb2,$xb2
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	vmovdqu $xa1,0x80($out)
	vmovdqu $xb1,0xa0($out)
	vmovdqu $xc1,0xc0($out)
	vmovdqu $xd1,0xe0($out)
	vmovdqu $xa2,0x100($out)
	vmovdqu $xb2,0x120($out)
	lea 0x140($inp),$inp		# inp+=64*5
	vmovdqa $xc2,0x00(%rsp)
	lea 0x140($out),$out		# out+=64*5
	sub \$320,$len			# len-=64*5
	vmovdqa $xd2,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	vpxor 0x80($inp),$xa1,$xa1
	vpxor 0xa0($inp),$xb1,$xb1
	vpxor 0xc0($inp),$xc1,$xc1
	vpxor 0xe0($inp),$xd1,$xd1
	vpxor 0x100($inp),$xa2,$xa2
	vpxor 0x120($inp),$xb2,$xb2
	vpxor 0x140($inp),$xc2,$xc2
	vpxor 0x160($inp),$xd2,$xd2
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	vmovdqu $xa1,0x80($out)
	vmovdqu $xb1,0xa0($out)
	vmovdqu $xc1,0xc0($out)
	vmovdqu $xd1,0xe0($out)
	vmovdqu $xa2,0x100($out)
	vmovdqu $xb2,0x120($out)
	vmovdqu $xc2,0x140($out)
	vmovdqu $xd2,0x160($out)
	lea 0x180($inp),$inp		# inp+=64*6
	vmovdqa $xa3,0x00(%rsp)
	lea 0x180($out),$out		# out+=64*6
	sub \$384,$len			# len-=64*6
	vmovdqa $xb3,0x20(%rsp)
	vpxor 0x00($inp),$xa0,$xa0	# xor with input
	vpxor 0x20($inp),$xb0,$xb0
	vpxor 0x40($inp),$xc0,$xc0
	vpxor 0x60($inp),$xd0,$xd0
	vpxor 0x80($inp),$xa1,$xa1
	vpxor 0xa0($inp),$xb1,$xb1
	vpxor 0xc0($inp),$xc1,$xc1
	vpxor 0xe0($inp),$xd1,$xd1
	vpxor 0x100($inp),$xa2,$xa2
	vpxor 0x120($inp),$xb2,$xb2
	vpxor 0x140($inp),$xc2,$xc2
	vpxor 0x160($inp),$xd2,$xd2
	vpxor 0x180($inp),$xa3,$xa3
	vpxor 0x1a0($inp),$xb3,$xb3
	vmovdqu $xa0,0x00($out)
	vmovdqu $xb0,0x20($out)
	vmovdqu $xc0,0x40($out)
	vmovdqu $xd0,0x60($out)
	vmovdqu $xa1,0x80($out)
	vmovdqu $xb1,0xa0($out)
	vmovdqu $xc1,0xc0($out)
	vmovdqu $xd1,0xe0($out)
	vmovdqu $xa2,0x100($out)
	vmovdqu $xb2,0x120($out)
	vmovdqu $xc2,0x140($out)
	vmovdqu $xd2,0x160($out)
	vmovdqu $xa3,0x180($out)
	vmovdqu $xb3,0x1a0($out)
	lea 0x1c0($inp),$inp		# inp+=64*7
	vmovdqa $xc3,0x00(%rsp)
	lea 0x1c0($out),$out		# out+=64*7
	sub \$448,$len			# len-=64*7
	vmovdqa $xd3,0x20(%rsp)
	movzb ($inp,%r10),%eax
	movzb (%rsp,%r10),%ecx
	mov %al,-1($out,%r10)
$code.=<<___ if ($win64);
	movaps -0xa8(%r9),%xmm6
	movaps -0x98(%r9),%xmm7
	movaps -0x88(%r9),%xmm8
	movaps -0x78(%r9),%xmm9
	movaps -0x68(%r9),%xmm10
	movaps -0x58(%r9),%xmm11
	movaps -0x48(%r9),%xmm12
	movaps -0x38(%r9),%xmm13
	movaps -0x28(%r9),%xmm14
	movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.size ChaCha20_8x,.-ChaCha20_8x
########################################################################
# This one handles shorter inputs...
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
sub vpxord()		# size optimization
{ my $opcode = "vpxor";	# adhere to vpxor when possible

    foreach (@_) {
	if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
	    $opcode = "vpxord";
	    last;
	}
    }

    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
}
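# Rationale: vpxor exists only in a VEX encoding, which cannot address
# %zmm registers or %xmm16+/%ymm16+; whenever an operand needs EVEX,
# the instruction must be spelled vpxord. VEX is preferred elsewhere
# because its encoding is shorter.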
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
my $xframe = $win64 ? 160+8 : 8;
.type ChaCha20_avx512,\@function,5
	mov %rsp,%r9		# frame pointer
.cfi_def_cfa_register %r9
	sub \$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps %xmm6,-0xa8(%r9)
	movaps %xmm7,-0x98(%r9)
	movaps %xmm8,-0x88(%r9)
	movaps %xmm9,-0x78(%r9)
	movaps %xmm10,-0x68(%r9)
	movaps %xmm11,-0x58(%r9)
	movaps %xmm12,-0x48(%r9)
	movaps %xmm13,-0x38(%r9)
	movaps %xmm14,-0x28(%r9)
	movaps %xmm15,-0x18(%r9)
	vbroadcasti32x4 .Lsigma(%rip),$a
	vbroadcasti32x4 ($key),$b
	vbroadcasti32x4 16($key),$c
	vbroadcasti32x4 ($counter),$d
	vpaddd .Lzeroz(%rip),$d,$d
	vmovdqa32 .Lfourz(%rip),$fourz
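	# This path keeps four consecutive blocks in one zmm register:
	# the 128-bit state rows are replicated with vbroadcasti32x4,
	# .Lzeroz gives the four lanes counter offsets +0,+1,+2,+3, and
	# .Lfourz advances every lane by 4 per outer iteration.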
2545 mov \$10,$counter # reuse $counter
2554 vpaddd $fourz,$d_,$d
2563 &vpshufd ($c,$c,0b01001110);
2564 &vpshufd ($b,$b,0b00111001);
2565 &vpshufd ($d,$d,0b10010011);
2568 &vpshufd ($c,$c,0b01001110);
2569 &vpshufd ($b,$b,0b10010011);
2570 &vpshufd ($d,$d,0b00111001);
2573 &jnz (".Loop_avx512");
2584 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2585 vpxor 0x10($inp),%x#$b,$t1
2586 vpxor 0x20($inp),%x#$c,$t2
2587 vpxor 0x30($inp),%x#$d,$t3
2588 lea 0x40($inp),$inp # inp+=64
2590 vmovdqu $t0,0x00($out) # write output
2591 vmovdqu $t1,0x10($out)
2592 vmovdqu $t2,0x20($out)
2593 vmovdqu $t3,0x30($out)
2594 lea 0x40($out),$out # out+=64
vextracti32x4 \$1,$a,$t0
vextracti32x4 \$1,$b,$t1
vextracti32x4 \$1,$c,$t2
vextracti32x4 \$1,$d,$t3
vpxor 0x00($inp),$t0,$t0 # xor with input
vpxor 0x10($inp),$t1,$t1
vpxor 0x20($inp),$t2,$t2
vpxor 0x30($inp),$t3,$t3
lea 0x40($inp),$inp # inp+=64
vmovdqu $t0,0x00($out) # write output
vmovdqu $t1,0x10($out)
vmovdqu $t2,0x20($out)
vmovdqu $t3,0x30($out)
lea 0x40($out),$out # out+=64
vextracti32x4 \$2,$a,$t0
vextracti32x4 \$2,$b,$t1
vextracti32x4 \$2,$c,$t2
vextracti32x4 \$2,$d,$t3
vpxor 0x00($inp),$t0,$t0 # xor with input
vpxor 0x10($inp),$t1,$t1
vpxor 0x20($inp),$t2,$t2
vpxor 0x30($inp),$t3,$t3
lea 0x40($inp),$inp # inp+=64
vmovdqu $t0,0x00($out) # write output
vmovdqu $t1,0x10($out)
vmovdqu $t2,0x20($out)
vmovdqu $t3,0x30($out)
lea 0x40($out),$out # out+=64
vextracti32x4 \$3,$a,$t0
vextracti32x4 \$3,$b,$t1
vextracti32x4 \$3,$c,$t2
vextracti32x4 \$3,$d,$t3
vpxor 0x00($inp),$t0,$t0 # xor with input
vpxor 0x10($inp),$t1,$t1
vpxor 0x20($inp),$t2,$t2
vpxor 0x30($inp),$t3,$t3
lea 0x40($inp),$inp # inp+=64
vmovdqu $t0,0x00($out) # write output
vmovdqu $t1,0x10($out)
vmovdqu $t2,0x20($out)
vmovdqu $t3,0x30($out)
lea 0x40($out),$out # out+=64
jnz .Loop_outer_avx512
vmovdqa %x#$a,0x00(%rsp)
vmovdqa %x#$b,0x10(%rsp)
vmovdqa %x#$c,0x20(%rsp)
vmovdqa %x#$d,0x30(%rsp)
jmp .Loop_tail_avx512
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x10(%rsp)
vmovdqa $t2,0x20(%rsp)
vmovdqa $t3,0x30(%rsp)
movzb ($inp,$counter),%eax
movzb (%rsp,$counter),%ecx
lea 1($counter),$counter
mov %al,-1($out,$counter)
jnz .Loop_tail_avx512
vmovdqu32 $a_,0x00(%rsp) # wipe keystream copy off the stack
$code.=<<___ if ($win64);
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.size ChaCha20_avx512,.-ChaCha20_avx512
map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
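# The AVX512VL path below reuses the very same code template with every
# %zmm renamed to %ymm: identical algorithm, run at 256-bit width, i.e.
# two 64-byte blocks per outer iteration instead of four.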
.type ChaCha20_avx512vl,\@function,5
.LChaCha20_avx512vl:
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register %r9
sub \$64+$xframe,%rsp
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8(%r9)
movaps %xmm7,-0x98(%r9)
movaps %xmm8,-0x88(%r9)
movaps %xmm9,-0x78(%r9)
movaps %xmm10,-0x68(%r9)
movaps %xmm11,-0x58(%r9)
movaps %xmm12,-0x48(%r9)
movaps %xmm13,-0x38(%r9)
movaps %xmm14,-0x28(%r9)
movaps %xmm15,-0x18(%r9)
vbroadcasti128 .Lsigma(%rip),$a
vbroadcasti128 ($key),$b
vbroadcasti128 16($key),$c
vbroadcasti128 ($counter),$d
vpaddd .Lzeroz(%rip),$d,$d
vmovdqa32 .Ltwoy(%rip),$fourz
mov \$10,$counter # reuse $counter as round counter (10 double-rounds = 20 rounds)
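# Note that the register named "fourz" in the perl source now holds
# .Ltwoy, so the counter lanes advance by two per outer iteration,
# matching the two 64-byte blocks that fit in a %ymm register.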
.Loop_outer_avx512vl:
vpaddd $fourz,$d_,$d
&vpshufd ($c,$c,0b01001110);
&vpshufd ($b,$b,0b00111001);
&vpshufd ($d,$d,0b10010011);
&vpshufd ($c,$c,0b01001110);
&vpshufd ($b,$b,0b10010011);
&vpshufd ($d,$d,0b00111001);
&jnz (".Loop_avx512vl");
jb .Ltail64_avx512vl
vpxor 0x00($inp),%x#$a,$t0 # xor with input
vpxor 0x10($inp),%x#$b,$t1
vpxor 0x20($inp),%x#$c,$t2
vpxor 0x30($inp),%x#$d,$t3
lea 0x40($inp),$inp # inp+=64
vmovdqu $t0,0x00($out) # write output
vmovdqu $t1,0x10($out)
vmovdqu $t2,0x20($out)
vmovdqu $t3,0x30($out)
lea 0x40($out),$out # out+=64
vextracti128 \$1,$a,$t0
vextracti128 \$1,$b,$t1
vextracti128 \$1,$c,$t2
vextracti128 \$1,$d,$t3
vpxor 0x00($inp),$t0,$t0 # xor with input
vpxor 0x10($inp),$t1,$t1
vpxor 0x20($inp),$t2,$t2
vpxor 0x30($inp),$t3,$t3
lea 0x40($inp),$inp # inp+=64
vmovdqu $t0,0x00($out) # write output
vmovdqu $t1,0x10($out)
vmovdqu $t2,0x20($out)
vmovdqu $t3,0x30($out)
lea 0x40($out),$out # out+=64
jnz .Loop_outer_avx512vl
vmovdqa %x#$a,0x00(%rsp)
vmovdqa %x#$b,0x10(%rsp)
vmovdqa %x#$c,0x20(%rsp)
vmovdqa %x#$d,0x30(%rsp)
jmp .Loop_tail_avx512vl
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x10(%rsp)
vmovdqa $t2,0x20(%rsp)
vmovdqa $t3,0x30(%rsp)
.Loop_tail_avx512vl:
movzb ($inp,$counter),%eax
movzb (%rsp,$counter),%ecx
lea 1($counter),$counter
mov %al,-1($out,$counter)
jnz .Loop_tail_avx512vl
vmovdqu32 $a_,0x00(%rsp) # wipe keystream copy off the stack
vmovdqu32 $a_,0x20(%rsp)
$code.=<<___ if ($win64);
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.Lavx512vl_epilogue:
.size ChaCha20_avx512vl,.-ChaCha20_avx512vl
# This one handles longer inputs...
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);
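# The ($_&~3)+(($_+1)&3) arithmetic rotates register indices within
# each group of four, so one call covers all sixteen state words:
# e.g. the diagonal invocation (0,5,10,15) expands to the quartets
# (0,5,10,15), (1,6,11,12), (2,7,8,13) and (3,4,9,14).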
"&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
"&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
"&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
"&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
"&vpxord (@x[$d0],@x[$d0],@x[$a0])",
"&vpxord (@x[$d1],@x[$d1],@x[$a1])",
"&vpxord (@x[$d2],@x[$d2],@x[$a2])",
"&vpxord (@x[$d3],@x[$d3],@x[$a3])",
"&vprold (@x[$d0],@x[$d0],16)",
"&vprold (@x[$d1],@x[$d1],16)",
"&vprold (@x[$d2],@x[$d2],16)",
"&vprold (@x[$d3],@x[$d3],16)",
"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
"&vpxord (@x[$b0],@x[$b0],@x[$c0])",
"&vpxord (@x[$b1],@x[$b1],@x[$c1])",
"&vpxord (@x[$b2],@x[$b2],@x[$c2])",
"&vpxord (@x[$b3],@x[$b3],@x[$c3])",
"&vprold (@x[$b0],@x[$b0],12)",
"&vprold (@x[$b1],@x[$b1],12)",
"&vprold (@x[$b2],@x[$b2],12)",
"&vprold (@x[$b3],@x[$b3],12)",
"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
"&vpxord (@x[$d0],@x[$d0],@x[$a0])",
"&vpxord (@x[$d1],@x[$d1],@x[$a1])",
"&vpxord (@x[$d2],@x[$d2],@x[$a2])",
"&vpxord (@x[$d3],@x[$d3],@x[$a3])",
"&vprold (@x[$d0],@x[$d0],8)",
"&vprold (@x[$d1],@x[$d1],8)",
"&vprold (@x[$d2],@x[$d2],8)",
"&vprold (@x[$d3],@x[$d3],8)",
"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
"&vpxord (@x[$b0],@x[$b0],@x[$c0])",
"&vpxord (@x[$b1],@x[$b1],@x[$c1])",
"&vpxord (@x[$b2],@x[$b2],@x[$c2])",
"&vpxord (@x[$b3],@x[$b3],@x[$c3])",
"&vprold (@x[$b0],@x[$b0],7)",
"&vprold (@x[$b1],@x[$b1],7)",
"&vprold (@x[$b2],@x[$b2],7)",
"&vprold (@x[$b3],@x[$b3],7)"
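# The 16/12/8/7 rotation counts are the standard ChaCha quarter-round
# constants; emitting the four independent quarter-rounds interleaved
# gives the out-of-order core enough parallel work to hide the latency
# of each vpaddd/vpxord/vprold dependency chain.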
my $xframe = $win64 ? 0xa8 : 8; # 0xa8 = 160+8, same xmm6-15 spill area
.type ChaCha20_16x,\@function,5
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register %r9
sub \$64+$xframe,%rsp
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8(%r9)
movaps %xmm7,-0x98(%r9)
movaps %xmm8,-0x88(%r9)
movaps %xmm9,-0x78(%r9)
movaps %xmm10,-0x68(%r9)
movaps %xmm11,-0x58(%r9)
movaps %xmm12,-0x48(%r9)
movaps %xmm13,-0x38(%r9)
movaps %xmm14,-0x28(%r9)
movaps %xmm15,-0x18(%r9)
lea .Lsigma(%rip),%r10
vbroadcasti32x4 (%r10),$xa3 # key[0]
vbroadcasti32x4 ($key),$xb3 # key[1]
vbroadcasti32x4 16($key),$xc3 # key[2]
vbroadcasti32x4 ($counter),$xd3 # key[3]
vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
vpshufd \$0x55,$xa3,$xa1
vpshufd \$0xaa,$xa3,$xa2
vpshufd \$0xff,$xa3,$xa3
vmovdqa64 $xa0,@key[0]
vmovdqa64 $xa1,@key[1]
vmovdqa64 $xa2,@key[2]
vmovdqa64 $xa3,@key[3]
vpshufd \$0x00,$xb3,$xb0
vpshufd \$0x55,$xb3,$xb1
vpshufd \$0xaa,$xb3,$xb2
vpshufd \$0xff,$xb3,$xb3
vmovdqa64 $xb0,@key[4]
vmovdqa64 $xb1,@key[5]
vmovdqa64 $xb2,@key[6]
vmovdqa64 $xb3,@key[7]
vpshufd \$0x00,$xc3,$xc0
vpshufd \$0x55,$xc3,$xc1
vpshufd \$0xaa,$xc3,$xc2
vpshufd \$0xff,$xc3,$xc3
vmovdqa64 $xc0,@key[8]
vmovdqa64 $xc1,@key[9]
vmovdqa64 $xc2,@key[10]
vmovdqa64 $xc3,@key[11]
vpshufd \$0x00,$xd3,$xd0
vpshufd \$0x55,$xd3,$xd1
vpshufd \$0xaa,$xd3,$xd2
vpshufd \$0xff,$xd3,$xd3
vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
vmovdqa64 $xd0,@key[12]
vmovdqa64 $xd1,@key[13]
vmovdqa64 $xd2,@key[14]
vmovdqa64 $xd3,@key[15]
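# From here on each of the 16 state words occupies its own 512-bit
# register, broadcast across all sixteen 32-bit lanes, so lane N of
# every register belongs to block N; .Lincz seeded the counter word so
# that the sixteen lanes encrypt sixteen consecutive blocks.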
vpbroadcastd 0(%r10),$xa0 # reload key
vpbroadcastd 4(%r10),$xa1
vpbroadcastd 8(%r10),$xa2
vpbroadcastd 12(%r10),$xa3
vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
vmovdqa64 @key[4],$xb0
vmovdqa64 @key[5],$xb1
vmovdqa64 @key[6],$xb2
vmovdqa64 @key[7],$xb3
vmovdqa64 @key[8],$xc0
vmovdqa64 @key[9],$xc1
vmovdqa64 @key[10],$xc2
vmovdqa64 @key[11],$xc3
vmovdqa64 @key[12],$xd0
vmovdqa64 @key[13],$xd1
vmovdqa64 @key[14],$xd2
vmovdqa64 @key[15],$xd3
vmovdqa64 $xa0,@key[0]
vmovdqa64 $xa1,@key[1]
vmovdqa64 $xa2,@key[2]
vmovdqa64 $xa3,@key[3]
foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
vpaddd @key[0],$xa0,$xa0 # accumulate key
vpaddd @key[1],$xa1,$xa1
vpaddd @key[2],$xa2,$xa2
vpaddd @key[3],$xa3,$xa3
vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
vpunpckldq $xa3,$xa2,$xt3
vpunpckhdq $xa1,$xa0,$xa0
vpunpckhdq $xa3,$xa2,$xa2
vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
vpaddd @key[4],$xb0,$xb0
vpaddd @key[5],$xb1,$xb1
vpaddd @key[6],$xb2,$xb2
vpaddd @key[7],$xb3,$xb3
vpunpckldq $xb1,$xb0,$xt2
vpunpckldq $xb3,$xb2,$xt3
vpunpckhdq $xb1,$xb0,$xb0
vpunpckhdq $xb3,$xb2,$xb2
vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
vshufi32x4 \$0xee,$xb0,$xa0,$xb0
vshufi32x4 \$0x44,$xb1,$xa1,$xa0
vshufi32x4 \$0xee,$xb1,$xa1,$xb1
vshufi32x4 \$0x44,$xb2,$xa2,$xa1
vshufi32x4 \$0xee,$xb2,$xa2,$xb2
vshufi32x4 \$0x44,$xb3,$xa3,$xa2
vshufi32x4 \$0xee,$xb3,$xa3,$xb3
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
vpaddd @key[8],$xc0,$xc0
vpaddd @key[9],$xc1,$xc1
vpaddd @key[10],$xc2,$xc2
vpaddd @key[11],$xc3,$xc3
vpunpckldq $xc1,$xc0,$xt2
vpunpckldq $xc3,$xc2,$xt3
vpunpckhdq $xc1,$xc0,$xc0
vpunpckhdq $xc3,$xc2,$xc2
vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
vpaddd @key[12],$xd0,$xd0
vpaddd @key[13],$xd1,$xd1
vpaddd @key[14],$xd2,$xd2
vpaddd @key[15],$xd3,$xd3
vpunpckldq $xd1,$xd0,$xt2
vpunpckldq $xd3,$xd2,$xt3
vpunpckhdq $xd1,$xd0,$xd0
vpunpckhdq $xd3,$xd2,$xd2
vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
vshufi32x4 \$0xee,$xd0,$xc0,$xd0
vshufi32x4 \$0x44,$xd1,$xc1,$xc0
vshufi32x4 \$0xee,$xd1,$xc1,$xd1
vshufi32x4 \$0x44,$xd2,$xc2,$xc1
vshufi32x4 \$0xee,$xd2,$xc2,$xd2
vshufi32x4 \$0x44,$xd3,$xc3,$xc2
vshufi32x4 \$0xee,$xd3,$xc3,$xd3
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
vshufi32x4 \$0x88,$xd0,$xb0,$xc0
vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
vshufi32x4 \$0x88,$xc1,$xa1,$xt1
vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
vshufi32x4 \$0x88,$xd1,$xb1,$xc1
vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
vshufi32x4 \$0x88,$xc2,$xa2,$xt2
vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
vshufi32x4 \$0x88,$xd2,$xb2,$xc2
vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
vshufi32x4 \$0x88,$xc3,$xa3,$xt3
vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
vshufi32x4 \$0x88,$xd3,$xb3,$xc3
vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
$xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
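# The unpack/shuffle cascade above is a 16x16 dword matrix
# transposition done in three strides (dwords, qwords, then 128-bit
# lanes): it turns sixteen lane-sliced block states into sixteen
# contiguous 64-byte keystream blocks for the sequential stores below.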
vpxord 0x00($inp),$xa0,$xa0 # xor with input
vpxord 0x40($inp),$xb0,$xb0
vpxord 0x80($inp),$xc0,$xc0
vpxord 0xc0($inp),$xd0,$xd0
vmovdqu32 $xa0,0x00($out)
vmovdqu32 $xb0,0x40($out)
vmovdqu32 $xc0,0x80($out)
vmovdqu32 $xd0,0xc0($out)
vpxord 0x100($inp),$xa1,$xa1
vpxord 0x140($inp),$xb1,$xb1
vpxord 0x180($inp),$xc1,$xc1
vpxord 0x1c0($inp),$xd1,$xd1
vmovdqu32 $xa1,0x100($out)
vmovdqu32 $xb1,0x140($out)
vmovdqu32 $xc1,0x180($out)
vmovdqu32 $xd1,0x1c0($out)
vpxord 0x200($inp),$xa2,$xa2
vpxord 0x240($inp),$xb2,$xb2
vpxord 0x280($inp),$xc2,$xc2
vpxord 0x2c0($inp),$xd2,$xd2
vmovdqu32 $xa2,0x200($out)
vmovdqu32 $xb2,0x240($out)
vmovdqu32 $xc2,0x280($out)
vmovdqu32 $xd2,0x2c0($out)
vpxord 0x300($inp),$xa3,$xa3
vpxord 0x340($inp),$xb3,$xb3
vpxord 0x380($inp),$xc3,$xc3
vpxord 0x3c0($inp),$xd3,$xd3
lea 0x400($inp),$inp
vmovdqu32 $xa3,0x300($out)
vmovdqu32 $xb3,0x340($out)
vmovdqu32 $xc3,0x380($out)
vmovdqu32 $xd3,0x3c0($out)
lea 0x400($out),$out
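# One full iteration thus consumes 16*64 = 1024 (0x400) bytes of input
# and produces as much output.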
jb .Less_than_64_16x
vpxord ($inp),$xa0,$xa0 # xor with input
vmovdqu32 $xa0,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xb0,$xb0
vmovdqu32 $xb0,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xc0,$xc0
vmovdqu32 $xc0,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xd0,$xd0
vmovdqu32 $xd0,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xa1,$xa1
vmovdqu32 $xa1,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xb1,$xb1
vmovdqu32 $xb1,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xc1,$xc1
vmovdqu32 $xc1,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xd1,$xd1
vmovdqu32 $xd1,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xa2,$xa2
vmovdqu32 $xa2,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xb2,$xb2
vmovdqu32 $xb2,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xc2,$xc2
vmovdqu32 $xc2,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xd2,$xd2
vmovdqu32 $xd2,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xa3,$xa3
vmovdqu32 $xa3,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xb3,$xb3
vmovdqu32 $xb3,($out,$inp)
jb .Less_than_64_16x
vpxord ($inp),$xc3,$xc3
vmovdqu32 $xc3,($out,$inp)
vmovdqa32 $xa0,0x00(%rsp)
lea ($out,$inp),$out
movzb ($inp,%r10),%eax
movzb (%rsp,%r10),%ecx
mov %al,-1($out,%r10)
vpxord $xa0,$xa0,$xa0
vmovdqa32 $xa0,0(%rsp)
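# Zero the keystream copy that was spilled to the stack, so that no
# key-derived material is left behind on return.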
$code.=<<___ if ($win64);
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.size ChaCha20_16x,.-ChaCha20_16x
# switch to %ymm domain
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
@key=map("%ymm$_",(16..31));
($xt0,$xt1,$xt2,$xt3)=@key[0..3];
.type ChaCha20_8xvl,\@function,5
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register %r9
sub \$64+$xframe,%rsp
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8(%r9)
movaps %xmm7,-0x98(%r9)
movaps %xmm8,-0x88(%r9)
movaps %xmm9,-0x78(%r9)
movaps %xmm10,-0x68(%r9)
movaps %xmm11,-0x58(%r9)
movaps %xmm12,-0x48(%r9)
movaps %xmm13,-0x38(%r9)
movaps %xmm14,-0x28(%r9)
movaps %xmm15,-0x18(%r9)
lea .Lsigma(%rip),%r10
vbroadcasti128 (%r10),$xa3 # key[0]
vbroadcasti128 ($key),$xb3 # key[1]
vbroadcasti128 16($key),$xc3 # key[2]
vbroadcasti128 ($counter),$xd3 # key[3]
vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
vpshufd \$0x55,$xa3,$xa1
vpshufd \$0xaa,$xa3,$xa2
vpshufd \$0xff,$xa3,$xa3
vmovdqa64 $xa0,@key[0]
vmovdqa64 $xa1,@key[1]
vmovdqa64 $xa2,@key[2]
vmovdqa64 $xa3,@key[3]
vpshufd \$0x00,$xb3,$xb0
vpshufd \$0x55,$xb3,$xb1
vpshufd \$0xaa,$xb3,$xb2
vpshufd \$0xff,$xb3,$xb3
vmovdqa64 $xb0,@key[4]
vmovdqa64 $xb1,@key[5]
vmovdqa64 $xb2,@key[6]
vmovdqa64 $xb3,@key[7]
vpshufd \$0x00,$xc3,$xc0
vpshufd \$0x55,$xc3,$xc1
vpshufd \$0xaa,$xc3,$xc2
vpshufd \$0xff,$xc3,$xc3
vmovdqa64 $xc0,@key[8]
vmovdqa64 $xc1,@key[9]
vmovdqa64 $xc2,@key[10]
vmovdqa64 $xc3,@key[11]
vpshufd \$0x00,$xd3,$xd0
vpshufd \$0x55,$xd3,$xd1
vpshufd \$0xaa,$xd3,$xd2
vpshufd \$0xff,$xd3,$xd3
vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
vmovdqa64 $xd0,@key[12]
vmovdqa64 $xd1,@key[13]
vmovdqa64 $xd2,@key[14]
vmovdqa64 $xd3,@key[15]
#vpbroadcastd 0(%r10),$xa0 # reload key
#vpbroadcastd 4(%r10),$xa1
vpbroadcastd 8(%r10),$xa2
vpbroadcastd 12(%r10),$xa3
vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
vmovdqa64 @key[4],$xb0
vmovdqa64 @key[5],$xb1
vmovdqa64 @key[6],$xb2
vmovdqa64 @key[7],$xb3
vmovdqa64 @key[8],$xc0
vmovdqa64 @key[9],$xc1
vmovdqa64 @key[10],$xc2
vmovdqa64 @key[11],$xc3
vmovdqa64 @key[12],$xd0
vmovdqa64 @key[13],$xd1
vmovdqa64 @key[14],$xd2
vmovdqa64 @key[15],$xd3
vmovdqa64 $xa0,@key[0]
vmovdqa64 $xa1,@key[1]
vmovdqa64 $xa2,@key[2]
vmovdqa64 $xa3,@key[3]
foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
vpaddd @key[0],$xa0,$xa0 # accumulate key
vpaddd @key[1],$xa1,$xa1
vpaddd @key[2],$xa2,$xa2
vpaddd @key[3],$xa3,$xa3
vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
vpunpckldq $xa3,$xa2,$xt3
vpunpckhdq $xa1,$xa0,$xa0
vpunpckhdq $xa3,$xa2,$xa2
vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
vpaddd @key[4],$xb0,$xb0
vpaddd @key[5],$xb1,$xb1
vpaddd @key[6],$xb2,$xb2
vpaddd @key[7],$xb3,$xb3
vpunpckldq $xb1,$xb0,$xt2
vpunpckldq $xb3,$xb2,$xt3
vpunpckhdq $xb1,$xb0,$xb0
vpunpckhdq $xb3,$xb2,$xb2
vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
vshufi32x4 \$3,$xb0,$xa0,$xb0
vshufi32x4 \$0,$xb1,$xa1,$xa0
vshufi32x4 \$3,$xb1,$xa1,$xb1
vshufi32x4 \$0,$xb2,$xa2,$xa1
vshufi32x4 \$3,$xb2,$xa2,$xb2
vshufi32x4 \$0,$xb3,$xa3,$xa2
vshufi32x4 \$3,$xb3,$xa3,$xb3
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
vpaddd @key[8],$xc0,$xc0
vpaddd @key[9],$xc1,$xc1
vpaddd @key[10],$xc2,$xc2
vpaddd @key[11],$xc3,$xc3
vpunpckldq $xc1,$xc0,$xt2
vpunpckldq $xc3,$xc2,$xt3
vpunpckhdq $xc1,$xc0,$xc0
vpunpckhdq $xc3,$xc2,$xc2
vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
vpaddd @key[12],$xd0,$xd0
vpaddd @key[13],$xd1,$xd1
vpaddd @key[14],$xd2,$xd2
vpaddd @key[15],$xd3,$xd3
vpunpckldq $xd1,$xd0,$xt2
vpunpckldq $xd3,$xd2,$xt3
vpunpckhdq $xd1,$xd0,$xd0
vpunpckhdq $xd3,$xd2,$xd2
vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
vperm2i128 \$0x31,$xd0,$xc0,$xd0
vperm2i128 \$0x20,$xd1,$xc1,$xc0
vperm2i128 \$0x31,$xd1,$xc1,$xd1
vperm2i128 \$0x20,$xd2,$xc2,$xc1
vperm2i128 \$0x31,$xd2,$xc2,$xd2
vperm2i128 \$0x20,$xd3,$xc3,$xc2
vperm2i128 \$0x31,$xd3,$xc3,$xd3
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
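# Same three-stride transposition as in ChaCha20_16x; at %ymm width the
# final stage only has to interleave two 128-bit lanes, done with
# vshufi32x4 \$0/\$3 for the a/b halves and with the equivalent AVX2
# vperm2i128 \$0x20/\$0x31 for the c/d halves.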
mov \$0x80,%eax # size optimization
vpxord 0x00($inp),$xa0,$xa0 # xor with input
vpxor 0x20($inp),$xb0,$xb0
vpxor 0x40($inp),$xc0,$xc0
vpxor 0x60($inp),$xd0,$xd0
lea ($inp,%rax),$inp # size optimization
vmovdqu32 $xa0,0x00($out)
vmovdqu $xb0,0x20($out)
vmovdqu $xc0,0x40($out)
vmovdqu $xd0,0x60($out)
lea ($out,%rax),$out # size optimization
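# Keeping the 0x80 stride in %rax lets every load and store use a
# base+index address with at most a one-byte displacement (0x00-0x60)
# instead of a four-byte one, hence the "size optimization" remarks.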
vpxor 0x00($inp),$xa1,$xa1
vpxor 0x20($inp),$xb1,$xb1
vpxor 0x40($inp),$xc1,$xc1
vpxor 0x60($inp),$xd1,$xd1
lea ($inp,%rax),$inp # size optimization
vmovdqu $xa1,0x00($out)
vmovdqu $xb1,0x20($out)
vmovdqu $xc1,0x40($out)
vmovdqu $xd1,0x60($out)
lea ($out,%rax),$out # size optimization
vpxord 0x00($inp),$xa2,$xa2
vpxor 0x20($inp),$xb2,$xb2
vpxor 0x40($inp),$xc2,$xc2
vpxor 0x60($inp),$xd2,$xd2
lea ($inp,%rax),$inp # size optimization
vmovdqu32 $xa2,0x00($out)
vmovdqu $xb2,0x20($out)
vmovdqu $xc2,0x40($out)
vmovdqu $xd2,0x60($out)
lea ($out,%rax),$out # size optimization
vpxor 0x00($inp),$xa3,$xa3
vpxor 0x20($inp),$xb3,$xb3
vpxor 0x40($inp),$xc3,$xc3
vpxor 0x60($inp),$xd3,$xd3
lea ($inp,%rax),$inp # size optimization
vmovdqu $xa3,0x00($out)
vmovdqu $xb3,0x20($out)
vmovdqu $xc3,0x40($out)
vmovdqu $xd3,0x60($out)
lea ($out,%rax),$out # size optimization
vpbroadcastd 0(%r10),%ymm0 # reload key
vpbroadcastd 4(%r10),%ymm1
vmovdqa64 $xa0,%ymm8 # size optimization
jb .Less_than_64_8xvl
vpxor 0x00($inp),$xa0,$xa0 # xor with input
vpxor 0x20($inp),$xb0,$xb0
vmovdqu $xa0,0x00($out,$inp)
vmovdqu $xb0,0x20($out,$inp)
jb .Less_than_64_8xvl
vpxor 0x00($inp),$xc0,$xc0
vpxor 0x20($inp),$xd0,$xd0
vmovdqu $xc0,0x00($out,$inp)
vmovdqu $xd0,0x20($out,$inp)
jb .Less_than_64_8xvl
vpxor 0x00($inp),$xa1,$xa1
vpxor 0x20($inp),$xb1,$xb1
vmovdqu $xa1,0x00($out,$inp)
vmovdqu $xb1,0x20($out,$inp)
jb .Less_than_64_8xvl
vpxor 0x00($inp),$xc1,$xc1
vpxor 0x20($inp),$xd1,$xd1
vmovdqu $xc1,0x00($out,$inp)
vmovdqu $xd1,0x20($out,$inp)
jb .Less_than_64_8xvl
vpxord 0x00($inp),$xa2,$xa2
vpxor 0x20($inp),$xb2,$xb2
vmovdqu32 $xa2,0x00($out,$inp)
vmovdqu $xb2,0x20($out,$inp)
jb .Less_than_64_8xvl
vpxor 0x00($inp),$xc2,$xc2
vpxor 0x20($inp),$xd2,$xd2
vmovdqu $xc2,0x00($out,$inp)
vmovdqu $xd2,0x20($out,$inp)
jb .Less_than_64_8xvl
vpxor 0x00($inp),$xa3,$xa3
vpxor 0x20($inp),$xb3,$xb3
vmovdqu $xa3,0x00($out,$inp)
vmovdqu $xb3,0x20($out,$inp)
vmovdqa $xa0,0x00(%rsp)
vmovdqa $xb0,0x20(%rsp)
lea ($out,$inp),$out
movzb ($inp,%r10),%eax
movzb (%rsp,%r10),%ecx
mov %al,-1($out,%r10)
vpxor $xa0,$xa0,$xa0
vmovdqa $xa0,0x00(%rsp) # wipe keystream copy off the stack
vmovdqa $xa0,0x20(%rsp)
$code.=<<___ if ($win64);
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
.cfi_def_cfa_register %rsp
.size ChaCha20_8xvl,.-ChaCha20_8xvl
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
lea .Lctr32_body(%rip),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lctr32_body
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea .Lno_data(%rip),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lno_data
jae .Lcommon_seh_tail
lea 64+24+48(%rax),%rax
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in qwords
.long 0xa548f3fc # cld; rep movsq
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
.size se_handler,.-se_handler
.type simd_handler,\@abi-omnipotent
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 192($context),%rax # pull context->R9
mov 4(%r11),%r10d # HandlerData[1]
mov 8(%r11),%ecx # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea -8(%rax,%rcx),%rsi
lea 512($context),%rdi # &context.Xmm6
.long 0xa548f3fc # cld; rep movsq
jmp .Lcommon_seh_tail
.size simd_handler,.-simd_handler
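# Both handlers fall through (or jump) to a common tail that copies the
# fixed-up CONTEXT and calls RtlVirtualUnwind to continue unwinding; for
# the SIMD paths the HandlerData[] entries referenced below supply the
# prologue/epilogue labels and the size of the xmm save area that
# simd_handler copies back into the CONTEXT record.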
.rva .LSEH_begin_ChaCha20_ctr32
.rva .LSEH_end_ChaCha20_ctr32
.rva .LSEH_info_ChaCha20_ctr32
.rva .LSEH_begin_ChaCha20_ssse3
.rva .LSEH_end_ChaCha20_ssse3
.rva .LSEH_info_ChaCha20_ssse3
.rva .LSEH_begin_ChaCha20_128
.rva .LSEH_end_ChaCha20_128
.rva .LSEH_info_ChaCha20_128
.rva .LSEH_begin_ChaCha20_4x
.rva .LSEH_end_ChaCha20_4x
.rva .LSEH_info_ChaCha20_4x
$code.=<<___ if ($avx);
.rva .LSEH_begin_ChaCha20_4xop
.rva .LSEH_end_ChaCha20_4xop
.rva .LSEH_info_ChaCha20_4xop
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_ChaCha20_8x
.rva .LSEH_end_ChaCha20_8x
.rva .LSEH_info_ChaCha20_8x
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_ChaCha20_avx512
.rva .LSEH_end_ChaCha20_avx512
.rva .LSEH_info_ChaCha20_avx512
.rva .LSEH_begin_ChaCha20_avx512vl
.rva .LSEH_end_ChaCha20_avx512vl
.rva .LSEH_info_ChaCha20_avx512vl
.rva .LSEH_begin_ChaCha20_16x
.rva .LSEH_end_ChaCha20_16x
.rva .LSEH_info_ChaCha20_16x
.rva .LSEH_begin_ChaCha20_8xvl
.rva .LSEH_end_ChaCha20_8xvl
.rva .LSEH_info_ChaCha20_8xvl
.LSEH_info_ChaCha20_ctr32:
.LSEH_info_ChaCha20_ssse3:
.rva .Lssse3_body,.Lssse3_epilogue # HandlerData[]
.LSEH_info_ChaCha20_128:
.rva .L128_body,.L128_epilogue # HandlerData[]
.LSEH_info_ChaCha20_4x:
.rva .L4x_body,.L4x_epilogue # HandlerData[]
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
.rva .L4xop_body,.L4xop_epilogue # HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
.rva .L8x_body,.L8x_epilogue # HandlerData[]
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
.rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
.LSEH_info_ChaCha20_avx512vl:
.rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
.LSEH_info_ChaCha20_16x:
.rva .L16x_body,.L16x_epilogue # HandlerData[]
.LSEH_info_ChaCha20_8xvl:
.rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/%x#%[yz]/%x/g; # "down-shift" %x#%ymmN or %x#%zmmN to %xmmN
close STDOUT or die "error closing STDOUT: $!";