2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # ChaCha20 for x86_64.
23 # Add AVX512F code path.
25 # Performance in cycles per byte out of large buffer.
27 # IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2
29 # P4 9.48/+99% -/22.7(ii) -
30 # Core2 7.83/+55% 7.90/8.08 4.35
31 # Westmere 7.19/+50% 5.60/6.70 3.00
32 # Sandy Bridge 8.31/+42% 5.45/6.76 2.72
33 # Ivy Bridge 6.71/+46% 5.40/6.49 2.41
34 # Haswell 5.92/+43% 5.20/6.45 2.42 1.23
35 # Skylake 5.87/+39% 4.70/- 2.31 1.19
36 # Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
37 # Goldmont 10.6/+17% 5.10/- 3.28
38 # Sledgehammer 7.28/+52% -/14.2(ii) -
39 # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
40 # VIA Nano 10.5/+46% 6.72/8.60 6.05
42 # (i) compared to older gcc 3.x one can observe >2x improvement on
44 # (ii)	as can be seen, SSE2 performance is too low on legacy
45 # processors; NxSSE2 results are naturally better, but not
46 # impressively better than IALU ones, which is why you won't
47 # find SSE2 code below;
48 # (iii)	this is not an optimal result for Atom because of MSROM
49 # limitations, SSE2 can do better, but gain is considered too
50 # low to justify the [maintenance] effort;
51 # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
64 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
69 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
71 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
72 $avx += 1 if ($1==2.11 && $2>=8);
75 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
76 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
77 $avx = ($1>=10) + ($1>=11);
80 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
81 $avx = ($2>=3.0) + ($2>3.0);
84 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
87 # input parameter block
88 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
93 .extern OPENSSL_ia32cap_P
105 .long 0,2,4,6,1,3,5,7
107 .long 8,8,8,8,8,8,8,8
109 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
111 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
113 .asciz "expand 32-byte k"
116 .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
118 .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
120 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
122 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
123 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
126 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
127 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
129 $arg = "\$$arg" if ($arg*1 eq $arg);
130 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
133 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
134 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
137 sub ROUND { # critical path is 24 cycles per round
138 my ($a0,$b0,$c0,$d0)=@_;
139 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
140 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
141 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
142 my ($xc,$xc_)=map("\"$_\"",@t);
143 my @x=map("\"$_\"",@x);
145 # Consider order in which variables are addressed by their
150 # 0 4 8 12 < even round
154 # 0 5 10 15 < odd round
159 # 'a', 'b' and 'd's are permanently allocated in registers,
160 # @x[0..7,12..15], while 'c's are maintained in memory. If
161 # you observe 'c' column, you'll notice that pair of 'c's is
162 # invariant between rounds. This means that we have to reload
163 # them once per round, in the middle. This is why you'll see
164 # bunch of 'c' stores and loads in the middle, but none in
165 # the beginning or end.
167 # Normally instructions would be interleaved to favour in-order
168 # execution. Generally out-of-order cores manage it gracefully,
169 # but not this time for some reason. As in-order execution
170 # cores are a dying breed, old Atom is the only one around,
171 # instructions are left uninterleaved. Besides, Atom is better
172 # off executing 1xSSSE3 code anyway...
175 "&add (@x[$a0],@x[$b0])", # Q1
176 "&xor (@x[$d0],@x[$a0])",
178 "&add (@x[$a1],@x[$b1])", # Q2
179 "&xor (@x[$d1],@x[$a1])",
182 "&add ($xc,@x[$d0])",
183 "&xor (@x[$b0],$xc)",
185 "&add ($xc_,@x[$d1])",
186 "&xor (@x[$b1],$xc_)",
189 "&add (@x[$a0],@x[$b0])",
190 "&xor (@x[$d0],@x[$a0])",
192 "&add (@x[$a1],@x[$b1])",
193 "&xor (@x[$d1],@x[$a1])",
196 "&add ($xc,@x[$d0])",
197 "&xor (@x[$b0],$xc)",
199 "&add ($xc_,@x[$d1])",
200 "&xor (@x[$b1],$xc_)",
203 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
204 "&mov (\"4*$c1(%rsp)\",$xc_)",
205 "&mov ($xc,\"4*$c2(%rsp)\")",
206 "&mov ($xc_,\"4*$c3(%rsp)\")",
208 "&add (@x[$a2],@x[$b2])", # Q3
209 "&xor (@x[$d2],@x[$a2])",
211 "&add (@x[$a3],@x[$b3])", # Q4
212 "&xor (@x[$d3],@x[$a3])",
215 "&add ($xc,@x[$d2])",
216 "&xor (@x[$b2],$xc)",
218 "&add ($xc_,@x[$d3])",
219 "&xor (@x[$b3],$xc_)",
222 "&add (@x[$a2],@x[$b2])",
223 "&xor (@x[$d2],@x[$a2])",
225 "&add (@x[$a3],@x[$b3])",
226 "&xor (@x[$d3],@x[$a3])",
229 "&add ($xc,@x[$d2])",
230 "&xor (@x[$b2],$xc)",
232 "&add ($xc_,@x[$d3])",
233 "&xor (@x[$b3],$xc_)",
238 ########################################################################
239 # Generic code path that handles all lengths on pre-SSSE3 processors.
241 .globl ChaCha20_ctr32
242 .type ChaCha20_ctr32,\@function,5
247 mov OPENSSL_ia32cap_P+4(%rip),%r10
249 $code.=<<___ if ($avx>2);
250 bt \$48,%r10 # check for AVX512F
254 test \$`1<<(41-32)`,%r10d
265 #movdqa .Lsigma(%rip),%xmm0
267 movdqu 16($key),%xmm2
268 movdqu ($counter),%xmm3
269 movdqa .Lone(%rip),%xmm4
271 #movdqa %xmm0,4*0(%rsp) # key[0]
272 movdqa %xmm1,4*4(%rsp) # key[1]
273 movdqa %xmm2,4*8(%rsp) # key[2]
274 movdqa %xmm3,4*12(%rsp) # key[3]
275 mov $len,%rbp # reassign $len
280 mov \$0x61707865,@x[0] # 'expa'
281 mov \$0x3320646e,@x[1] # 'nd 3'
282 mov \$0x79622d32,@x[2] # '2-by'
283 mov \$0x6b206574,@x[3] # 'te k'
289 mov 4*13(%rsp),@x[13]
290 mov 4*14(%rsp),@x[14]
291 mov 4*15(%rsp),@x[15]
293 mov %rbp,64+0(%rsp) # save len
295 mov $inp,64+8(%rsp) # save inp
296 movq %xmm2,%rsi # "@x[8]"
297 mov $out,64+16(%rsp) # save out
299 shr \$32,%rdi # "@x[9]"
305 foreach (&ROUND (0, 4, 8,12)) { eval; }
306 foreach (&ROUND (0, 5,10,15)) { eval; }
311 mov @t[1],4*9(%rsp) # modulo-scheduled
313 mov 64(%rsp),%rbp # load len
315 mov 64+8(%rsp),$inp # load inp
316 paddd %xmm4,%xmm3 # increment counter
317 mov 64+16(%rsp),$out # load out
319 add \$0x61707865,@x[0] # 'expa'
320 add \$0x3320646e,@x[1] # 'nd 3'
321 add \$0x79622d32,@x[2] # '2-by'
322 add \$0x6b206574,@x[3] # 'te k'
327 add 4*12(%rsp),@x[12]
328 add 4*13(%rsp),@x[13]
329 add 4*14(%rsp),@x[14]
330 add 4*15(%rsp),@x[15]
331 paddd 4*8(%rsp),%xmm1
336 xor 4*0($inp),@x[0] # xor with input
344 movdqu 4*8($inp),%xmm0
345 xor 4*12($inp),@x[12]
346 xor 4*13($inp),@x[13]
347 xor 4*14($inp),@x[14]
348 xor 4*15($inp),@x[15]
349 lea 4*16($inp),$inp # inp+=64
352 movdqa %xmm2,4*8(%rsp)
353 movd %xmm3,4*12(%rsp)
355 mov @x[0],4*0($out) # write output
363 movdqu %xmm0,4*8($out)
364 mov @x[12],4*12($out)
365 mov @x[13],4*13($out)
366 mov @x[14],4*14($out)
367 mov @x[15],4*15($out)
368 lea 4*16($out),$out # out+=64
386 movdqa %xmm1,4*8(%rsp)
387 mov @x[12],4*12(%rsp)
388 mov @x[13],4*13(%rsp)
389 mov @x[14],4*14(%rsp)
390 mov @x[15],4*15(%rsp)
393 movzb ($inp,%rbx),%eax
394 movzb (%rsp,%rbx),%edx
397 mov %al,-1($out,%rbx)
411 .size ChaCha20_ctr32,.-ChaCha20_ctr32
414 ########################################################################
415 # SSSE3 code path that handles shorter lengths
417 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
419 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
443 my $xframe = $win64 ? 32+32+8 : 24;
446 .type ChaCha20_ssse3,\@function,5
451 $code.=<<___ if ($avx);
452 test \$`1<<(43-32)`,%r10d
453 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
456 cmp \$128,$len # we might throw away some data,
457 ja .LChaCha20_4x # but overall it won't be slower
460 push %rbx # just to share SEH handler, no pops
467 sub \$64+$xframe,%rsp
469 $code.=<<___ if ($win64);
470 movaps %xmm6,64+32(%rsp)
471 movaps %xmm7,64+48(%rsp)
474 movdqa .Lsigma(%rip),$a
478 movdqa .Lrot16(%rip),$rot16
479 movdqa .Lrot24(%rip),$rot24
485 mov \$10,$counter # reuse $counter
490 movdqa .Lone(%rip),$d
503 &pshufd ($c,$c,0b01001110);
504 &pshufd ($b,$b,0b00111001);
505 &pshufd ($d,$d,0b10010011);
509 &pshufd ($c,$c,0b01001110);
510 &pshufd ($b,$b,0b10010011);
511 &pshufd ($d,$d,0b00111001);
514 &jnz (".Loop_ssse3");
526 movdqu 0x10($inp),$t1
527 pxor $t,$a # xor with input
530 movdqu 0x30($inp),$t1
531 lea 0x40($inp),$inp # inp+=64
535 movdqu $a,0x00($out) # write output
539 lea 0x40($out),$out # out+=64
542 jnz .Loop_outer_ssse3
552 xor $counter,$counter
555 movzb ($inp,$counter),%eax
556 movzb (%rsp,$counter),%ecx
557 lea 1($counter),$counter
559 mov %al,-1($out,$counter)
565 $code.=<<___ if ($win64);
566 movaps 64+32(%rsp),%xmm6
567 movaps 64+48(%rsp),%xmm7
570 add \$64+$xframe+48,%rsp
572 .size ChaCha20_ssse3,.-ChaCha20_ssse3
576 ########################################################################
577 # SSSE3 code path that handles longer messages.
579 # assign variables to favor Atom front-end
580 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
581 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
582 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
583 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
585 sub SSSE3_lane_ROUND {
586 my ($a0,$b0,$c0,$d0)=@_;
587 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
588 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
589 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
590 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
591 my @x=map("\"$_\"",@xx);
593 # Consider order in which variables are addressed by their
598 # 0 4 8 12 < even round
602 # 0 5 10 15 < odd round
607 # 'a', 'b' and 'd's are permanently allocated in registers,
608 # @x[0..7,12..15], while 'c's are maintained in memory. If
609 # you observe 'c' column, you'll notice that pair of 'c's is
610 # invariant between rounds. This means that we have to reload
611 # them once per round, in the middle. This is why you'll see
612 # bunch of 'c' stores and loads in the middle, but none in
613 # the beginning or end.
616 "&paddd (@x[$a0],@x[$b0])", # Q1
617 "&paddd (@x[$a1],@x[$b1])", # Q2
618 "&pxor (@x[$d0],@x[$a0])",
619 "&pxor (@x[$d1],@x[$a1])",
620 "&pshufb (@x[$d0],$t1)",
621 "&pshufb (@x[$d1],$t1)",
623 "&paddd ($xc,@x[$d0])",
624 "&paddd ($xc_,@x[$d1])",
625 "&pxor (@x[$b0],$xc)",
626 "&pxor (@x[$b1],$xc_)",
627 "&movdqa ($t0,@x[$b0])",
628 "&pslld (@x[$b0],12)",
630 "&movdqa ($t1,@x[$b1])",
631 "&pslld (@x[$b1],12)",
632 "&por (@x[$b0],$t0)",
634 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
635 "&por (@x[$b1],$t1)",
637 "&paddd (@x[$a0],@x[$b0])",
638 "&paddd (@x[$a1],@x[$b1])",
639 "&pxor (@x[$d0],@x[$a0])",
640 "&pxor (@x[$d1],@x[$a1])",
641 "&pshufb (@x[$d0],$t0)",
642 "&pshufb (@x[$d1],$t0)",
644 "&paddd ($xc,@x[$d0])",
645 "&paddd ($xc_,@x[$d1])",
646 "&pxor (@x[$b0],$xc)",
647 "&pxor (@x[$b1],$xc_)",
648 "&movdqa ($t1,@x[$b0])",
649 "&pslld (@x[$b0],7)",
651 "&movdqa ($t0,@x[$b1])",
652 "&pslld (@x[$b1],7)",
653 "&por (@x[$b0],$t1)",
655 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
656 "&por (@x[$b1],$t0)",
658 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
659 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
660 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
661 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
663 "&paddd (@x[$a2],@x[$b2])", # Q3
664 "&paddd (@x[$a3],@x[$b3])", # Q4
665 "&pxor (@x[$d2],@x[$a2])",
666 "&pxor (@x[$d3],@x[$a3])",
667 "&pshufb (@x[$d2],$t1)",
668 "&pshufb (@x[$d3],$t1)",
670 "&paddd ($xc,@x[$d2])",
671 "&paddd ($xc_,@x[$d3])",
672 "&pxor (@x[$b2],$xc)",
673 "&pxor (@x[$b3],$xc_)",
674 "&movdqa ($t0,@x[$b2])",
675 "&pslld (@x[$b2],12)",
677 "&movdqa ($t1,@x[$b3])",
678 "&pslld (@x[$b3],12)",
679 "&por (@x[$b2],$t0)",
681 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
682 "&por (@x[$b3],$t1)",
684 "&paddd (@x[$a2],@x[$b2])",
685 "&paddd (@x[$a3],@x[$b3])",
686 "&pxor (@x[$d2],@x[$a2])",
687 "&pxor (@x[$d3],@x[$a3])",
688 "&pshufb (@x[$d2],$t0)",
689 "&pshufb (@x[$d3],$t0)",
691 "&paddd ($xc,@x[$d2])",
692 "&paddd ($xc_,@x[$d3])",
693 "&pxor (@x[$b2],$xc)",
694 "&pxor (@x[$b3],$xc_)",
695 "&movdqa ($t1,@x[$b2])",
696 "&pslld (@x[$b2],7)",
698 "&movdqa ($t0,@x[$b3])",
699 "&pslld (@x[$b3],7)",
700 "&por (@x[$b2],$t1)",
702 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
707 my $xframe = $win64 ? 0xa0 : 0;
710 .type ChaCha20_4x,\@function,5
716 $code.=<<___ if ($avx>1);
717 shr \$32,%r10 # OPENSSL_ia32cap_P+8
718 test \$`1<<5`,%r10 # test AVX2
725 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
726 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
727 je .Ldo_sse3_after_all # to detect Atom
731 sub \$0x148+$xframe,%rsp
733 ################ stack layout
734 # +0x00 SIMD equivalent of @x[8-12]
736 # +0x40 constant copy of key[0-2] smashed by lanes
738 # +0x100 SIMD counters (with nonce smashed by lanes)
741 $code.=<<___ if ($win64);
742 movaps %xmm6,-0x30(%r11)
743 movaps %xmm7,-0x20(%r11)
744 movaps %xmm8,-0x10(%r11)
745 movaps %xmm9,0x00(%r11)
746 movaps %xmm10,0x10(%r11)
747 movaps %xmm11,0x20(%r11)
748 movaps %xmm12,0x30(%r11)
749 movaps %xmm13,0x40(%r11)
750 movaps %xmm14,0x50(%r11)
751 movaps %xmm15,0x60(%r11)
754 movdqa .Lsigma(%rip),$xa3 # key[0]
755 movdqu ($key),$xb3 # key[1]
756 movdqu 16($key),$xt3 # key[2]
757 movdqu ($counter),$xd3 # key[3]
758 lea 0x100(%rsp),%rcx # size optimization
759 lea .Lrot16(%rip),%r10
760 lea .Lrot24(%rip),%r11
762 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
763 pshufd \$0x55,$xa3,$xa1
764 movdqa $xa0,0x40(%rsp) # ... and offload
765 pshufd \$0xaa,$xa3,$xa2
766 movdqa $xa1,0x50(%rsp)
767 pshufd \$0xff,$xa3,$xa3
768 movdqa $xa2,0x60(%rsp)
769 movdqa $xa3,0x70(%rsp)
771 pshufd \$0x00,$xb3,$xb0
772 pshufd \$0x55,$xb3,$xb1
773 movdqa $xb0,0x80-0x100(%rcx)
774 pshufd \$0xaa,$xb3,$xb2
775 movdqa $xb1,0x90-0x100(%rcx)
776 pshufd \$0xff,$xb3,$xb3
777 movdqa $xb2,0xa0-0x100(%rcx)
778 movdqa $xb3,0xb0-0x100(%rcx)
780 pshufd \$0x00,$xt3,$xt0 # "$xc0"
781 pshufd \$0x55,$xt3,$xt1 # "$xc1"
782 movdqa $xt0,0xc0-0x100(%rcx)
783 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
784 movdqa $xt1,0xd0-0x100(%rcx)
785 pshufd \$0xff,$xt3,$xt3 # "$xc3"
786 movdqa $xt2,0xe0-0x100(%rcx)
787 movdqa $xt3,0xf0-0x100(%rcx)
789 pshufd \$0x00,$xd3,$xd0
790 pshufd \$0x55,$xd3,$xd1
791 paddd .Linc(%rip),$xd0 # don't save counters yet
792 pshufd \$0xaa,$xd3,$xd2
793 movdqa $xd1,0x110-0x100(%rcx)
794 pshufd \$0xff,$xd3,$xd3
795 movdqa $xd2,0x120-0x100(%rcx)
796 movdqa $xd3,0x130-0x100(%rcx)
802 movdqa 0x40(%rsp),$xa0 # re-load smashed key
803 movdqa 0x50(%rsp),$xa1
804 movdqa 0x60(%rsp),$xa2
805 movdqa 0x70(%rsp),$xa3
806 movdqa 0x80-0x100(%rcx),$xb0
807 movdqa 0x90-0x100(%rcx),$xb1
808 movdqa 0xa0-0x100(%rcx),$xb2
809 movdqa 0xb0-0x100(%rcx),$xb3
810 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
811 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
812 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
813 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
814 movdqa 0x100-0x100(%rcx),$xd0
815 movdqa 0x110-0x100(%rcx),$xd1
816 movdqa 0x120-0x100(%rcx),$xd2
817 movdqa 0x130-0x100(%rcx),$xd3
818 paddd .Lfour(%rip),$xd0 # next SIMD counters
821 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
822 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
823 movdqa (%r10),$xt3 # .Lrot16(%rip)
825 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
831 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
832 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
837 paddd 0x40(%rsp),$xa0 # accumulate key material
838 paddd 0x50(%rsp),$xa1
839 paddd 0x60(%rsp),$xa2
840 paddd 0x70(%rsp),$xa3
842 movdqa $xa0,$xt2 # "de-interlace" data
849 punpcklqdq $xa2,$xa0 # "a0"
851 punpcklqdq $xt3,$xt2 # "a2"
852 punpckhqdq $xa2,$xa1 # "a1"
853 punpckhqdq $xt3,$xa3 # "a3"
855 ($xa2,$xt2)=($xt2,$xa2);
857 paddd 0x80-0x100(%rcx),$xb0
858 paddd 0x90-0x100(%rcx),$xb1
859 paddd 0xa0-0x100(%rcx),$xb2
860 paddd 0xb0-0x100(%rcx),$xb3
862 movdqa $xa0,0x00(%rsp) # offload $xaN
863 movdqa $xa1,0x10(%rsp)
864 movdqa 0x20(%rsp),$xa0 # "xc2"
865 movdqa 0x30(%rsp),$xa1 # "xc3"
874 punpcklqdq $xb2,$xb0 # "b0"
876 punpcklqdq $xt3,$xt2 # "b2"
877 punpckhqdq $xb2,$xb1 # "b1"
878 punpckhqdq $xt3,$xb3 # "b3"
880 ($xb2,$xt2)=($xt2,$xb2);
881 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
883 paddd 0xc0-0x100(%rcx),$xc0
884 paddd 0xd0-0x100(%rcx),$xc1
885 paddd 0xe0-0x100(%rcx),$xc2
886 paddd 0xf0-0x100(%rcx),$xc3
888 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
889 movdqa $xa3,0x30(%rsp)
898 punpcklqdq $xc2,$xc0 # "c0"
900 punpcklqdq $xt3,$xt2 # "c2"
901 punpckhqdq $xc2,$xc1 # "c1"
902 punpckhqdq $xt3,$xc3 # "c3"
904 ($xc2,$xt2)=($xt2,$xc2);
905 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
907 paddd 0x100-0x100(%rcx),$xd0
908 paddd 0x110-0x100(%rcx),$xd1
909 paddd 0x120-0x100(%rcx),$xd2
910 paddd 0x130-0x100(%rcx),$xd3
919 punpcklqdq $xd2,$xd0 # "d0"
921 punpcklqdq $xt3,$xt2 # "d2"
922 punpckhqdq $xd2,$xd1 # "d1"
923 punpckhqdq $xt3,$xd3 # "d3"
925 ($xd2,$xt2)=($xt2,$xd2);
930 movdqu 0x00($inp),$xt0 # xor with input
931 movdqu 0x10($inp),$xt1
932 movdqu 0x20($inp),$xt2
933 movdqu 0x30($inp),$xt3
934 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
939 movdqu $xt0,0x00($out)
940 movdqu 0x40($inp),$xt0
941 movdqu $xt1,0x10($out)
942 movdqu 0x50($inp),$xt1
943 movdqu $xt2,0x20($out)
944 movdqu 0x60($inp),$xt2
945 movdqu $xt3,0x30($out)
946 movdqu 0x70($inp),$xt3
947 lea 0x80($inp),$inp # size optimization
953 movdqu $xt0,0x40($out)
954 movdqu 0x00($inp),$xt0
955 movdqu $xt1,0x50($out)
956 movdqu 0x10($inp),$xt1
957 movdqu $xt2,0x60($out)
958 movdqu 0x20($inp),$xt2
959 movdqu $xt3,0x70($out)
960 lea 0x80($out),$out # size optimization
961 movdqu 0x30($inp),$xt3
967 movdqu $xt0,0x00($out)
968 movdqu 0x40($inp),$xt0
969 movdqu $xt1,0x10($out)
970 movdqu 0x50($inp),$xt1
971 movdqu $xt2,0x20($out)
972 movdqu 0x60($inp),$xt2
973 movdqu $xt3,0x30($out)
974 movdqu 0x70($inp),$xt3
975 lea 0x80($inp),$inp # inp+=64*4
980 movdqu $xt0,0x40($out)
981 movdqu $xt1,0x50($out)
982 movdqu $xt2,0x60($out)
983 movdqu $xt3,0x70($out)
984 lea 0x80($out),$out # out+=64*4
999 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1001 #movdqa $xt0,0x00(%rsp)
1002 movdqa $xb0,0x10(%rsp)
1003 movdqa $xc0,0x20(%rsp)
1004 movdqa $xd0,0x30(%rsp)
1009 movdqu 0x00($inp),$xt0 # xor with input
1010 movdqu 0x10($inp),$xt1
1011 movdqu 0x20($inp),$xt2
1012 movdqu 0x30($inp),$xt3
1013 	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1017 movdqu $xt0,0x00($out)
1018 movdqu $xt1,0x10($out)
1019 movdqu $xt2,0x20($out)
1020 movdqu $xt3,0x30($out)
1023 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1024 lea 0x40($inp),$inp # inp+=64*1
1026 movdqa $xt0,0x00(%rsp)
1027 movdqa $xb1,0x10(%rsp)
1028 lea 0x40($out),$out # out+=64*1
1029 movdqa $xc1,0x20(%rsp)
1030 sub \$64,$len # len-=64*1
1031 movdqa $xd1,0x30(%rsp)
1036 movdqu 0x00($inp),$xt0 # xor with input
1037 movdqu 0x10($inp),$xt1
1038 movdqu 0x20($inp),$xt2
1039 movdqu 0x30($inp),$xt3
1040 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1045 movdqu $xt0,0x00($out)
1046 movdqu 0x40($inp),$xt0
1047 movdqu $xt1,0x10($out)
1048 movdqu 0x50($inp),$xt1
1049 movdqu $xt2,0x20($out)
1050 movdqu 0x60($inp),$xt2
1051 movdqu $xt3,0x30($out)
1052 movdqu 0x70($inp),$xt3
1053 pxor 0x10(%rsp),$xt0
1057 movdqu $xt0,0x40($out)
1058 movdqu $xt1,0x50($out)
1059 movdqu $xt2,0x60($out)
1060 movdqu $xt3,0x70($out)
1063 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1064 lea 0x80($inp),$inp # inp+=64*2
1066 movdqa $xt0,0x00(%rsp)
1067 movdqa $xb2,0x10(%rsp)
1068 lea 0x80($out),$out # out+=64*2
1069 movdqa $xc2,0x20(%rsp)
1070 sub \$128,$len # len-=64*2
1071 movdqa $xd2,0x30(%rsp)
1076 movdqu 0x00($inp),$xt0 # xor with input
1077 movdqu 0x10($inp),$xt1
1078 movdqu 0x20($inp),$xt2
1079 movdqu 0x30($inp),$xt3
1080 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1085 movdqu $xt0,0x00($out)
1086 movdqu 0x40($inp),$xt0
1087 movdqu $xt1,0x10($out)
1088 movdqu 0x50($inp),$xt1
1089 movdqu $xt2,0x20($out)
1090 movdqu 0x60($inp),$xt2
1091 movdqu $xt3,0x30($out)
1092 movdqu 0x70($inp),$xt3
1093 lea 0x80($inp),$inp # size optimization
1094 pxor 0x10(%rsp),$xt0
1099 movdqu $xt0,0x40($out)
1100 movdqu 0x00($inp),$xt0
1101 movdqu $xt1,0x50($out)
1102 movdqu 0x10($inp),$xt1
1103 movdqu $xt2,0x60($out)
1104 movdqu 0x20($inp),$xt2
1105 movdqu $xt3,0x70($out)
1106 lea 0x80($out),$out # size optimization
1107 movdqu 0x30($inp),$xt3
1108 pxor 0x20(%rsp),$xt0
1112 movdqu $xt0,0x00($out)
1113 movdqu $xt1,0x10($out)
1114 movdqu $xt2,0x20($out)
1115 movdqu $xt3,0x30($out)
1118 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1119 lea 0x40($inp),$inp # inp+=64*3
1121 movdqa $xt0,0x00(%rsp)
1122 movdqa $xb3,0x10(%rsp)
1123 lea 0x40($out),$out # out+=64*3
1124 movdqa $xc3,0x20(%rsp)
1125 sub \$192,$len # len-=64*3
1126 movdqa $xd3,0x30(%rsp)
1129 movzb ($inp,%r10),%eax
1130 movzb (%rsp,%r10),%ecx
1133 mov %al,-1($out,%r10)
1139 $code.=<<___ if ($win64);
1140 lea 0x140+0x30(%rsp),%r11
1141 movaps -0x30(%r11),%xmm6
1142 movaps -0x20(%r11),%xmm7
1143 movaps -0x10(%r11),%xmm8
1144 movaps 0x00(%r11),%xmm9
1145 movaps 0x10(%r11),%xmm10
1146 movaps 0x20(%r11),%xmm11
1147 movaps 0x30(%r11),%xmm12
1148 movaps 0x40(%r11),%xmm13
1149 movaps 0x50(%r11),%xmm14
1150 movaps 0x60(%r11),%xmm15
1153 add \$0x148+$xframe,%rsp
1155 .size ChaCha20_4x,.-ChaCha20_4x
1159 ########################################################################
1160 # XOP code path that handles all lengths.
1162 # There is some "anomaly" observed depending on instructions' size or
1163 # alignment. If you look closely at below code you'll notice that
1164 # sometimes argument order varies. The order affects instruction
1165 # encoding by making it larger, and such fiddling gives 5% performance
1166 # improvement. This is on FX-4100...
1168 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1169 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1170 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1171 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1173 sub XOP_lane_ROUND {
1174 my ($a0,$b0,$c0,$d0)=@_;
1175 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1176 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1177 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1178 my @x=map("\"$_\"",@xx);
1181 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1182 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1183 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1184 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1185 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1186 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1187 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1188 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1189 "&vprotd (@x[$d0],@x[$d0],16)",
1190 "&vprotd (@x[$d1],@x[$d1],16)",
1191 "&vprotd (@x[$d2],@x[$d2],16)",
1192 "&vprotd (@x[$d3],@x[$d3],16)",
1194 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1195 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1196 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1197 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1198 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1199 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1200 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1201 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1202 "&vprotd (@x[$b0],@x[$b0],12)",
1203 "&vprotd (@x[$b1],@x[$b1],12)",
1204 "&vprotd (@x[$b2],@x[$b2],12)",
1205 "&vprotd (@x[$b3],@x[$b3],12)",
1207 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1208 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1209 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1210 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1211 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1212 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1213 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1214 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1215 "&vprotd (@x[$d0],@x[$d0],8)",
1216 "&vprotd (@x[$d1],@x[$d1],8)",
1217 "&vprotd (@x[$d2],@x[$d2],8)",
1218 "&vprotd (@x[$d3],@x[$d3],8)",
1220 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1221 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1222 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1223 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1224 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1225 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1226 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1227 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1228 "&vprotd (@x[$b0],@x[$b0],7)",
1229 "&vprotd (@x[$b1],@x[$b1],7)",
1230 "&vprotd (@x[$b2],@x[$b2],7)",
1231 "&vprotd (@x[$b3],@x[$b3],7)"
1235 my $xframe = $win64 ? 0xa0 : 0;
1238 .type ChaCha20_4xop,\@function,5
1242 lea -0x78(%rsp),%r11
1243 sub \$0x148+$xframe,%rsp
1245 ################ stack layout
1246 # +0x00 SIMD equivalent of @x[8-12]
1248 # +0x40 constant copy of key[0-2] smashed by lanes
1250 # +0x100 SIMD counters (with nonce smashed by lanes)
1253 $code.=<<___ if ($win64);
1254 movaps %xmm6,-0x30(%r11)
1255 movaps %xmm7,-0x20(%r11)
1256 movaps %xmm8,-0x10(%r11)
1257 movaps %xmm9,0x00(%r11)
1258 movaps %xmm10,0x10(%r11)
1259 movaps %xmm11,0x20(%r11)
1260 movaps %xmm12,0x30(%r11)
1261 movaps %xmm13,0x40(%r11)
1262 movaps %xmm14,0x50(%r11)
1263 movaps %xmm15,0x60(%r11)
1268 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1269 vmovdqu ($key),$xb3 # key[1]
1270 vmovdqu 16($key),$xt3 # key[2]
1271 vmovdqu ($counter),$xd3 # key[3]
1272 lea 0x100(%rsp),%rcx # size optimization
1274 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1275 vpshufd \$0x55,$xa3,$xa1
1276 vmovdqa $xa0,0x40(%rsp) # ... and offload
1277 vpshufd \$0xaa,$xa3,$xa2
1278 vmovdqa $xa1,0x50(%rsp)
1279 vpshufd \$0xff,$xa3,$xa3
1280 vmovdqa $xa2,0x60(%rsp)
1281 vmovdqa $xa3,0x70(%rsp)
1283 vpshufd \$0x00,$xb3,$xb0
1284 vpshufd \$0x55,$xb3,$xb1
1285 vmovdqa $xb0,0x80-0x100(%rcx)
1286 vpshufd \$0xaa,$xb3,$xb2
1287 vmovdqa $xb1,0x90-0x100(%rcx)
1288 vpshufd \$0xff,$xb3,$xb3
1289 vmovdqa $xb2,0xa0-0x100(%rcx)
1290 vmovdqa $xb3,0xb0-0x100(%rcx)
1292 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1293 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1294 vmovdqa $xt0,0xc0-0x100(%rcx)
1295 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1296 vmovdqa $xt1,0xd0-0x100(%rcx)
1297 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1298 vmovdqa $xt2,0xe0-0x100(%rcx)
1299 vmovdqa $xt3,0xf0-0x100(%rcx)
1301 vpshufd \$0x00,$xd3,$xd0
1302 vpshufd \$0x55,$xd3,$xd1
1303 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1304 vpshufd \$0xaa,$xd3,$xd2
1305 vmovdqa $xd1,0x110-0x100(%rcx)
1306 vpshufd \$0xff,$xd3,$xd3
1307 vmovdqa $xd2,0x120-0x100(%rcx)
1308 vmovdqa $xd3,0x130-0x100(%rcx)
1314 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1315 vmovdqa 0x50(%rsp),$xa1
1316 vmovdqa 0x60(%rsp),$xa2
1317 vmovdqa 0x70(%rsp),$xa3
1318 vmovdqa 0x80-0x100(%rcx),$xb0
1319 vmovdqa 0x90-0x100(%rcx),$xb1
1320 vmovdqa 0xa0-0x100(%rcx),$xb2
1321 vmovdqa 0xb0-0x100(%rcx),$xb3
1322 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1323 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1324 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1325 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1326 vmovdqa 0x100-0x100(%rcx),$xd0
1327 vmovdqa 0x110-0x100(%rcx),$xd1
1328 vmovdqa 0x120-0x100(%rcx),$xd2
1329 vmovdqa 0x130-0x100(%rcx),$xd3
1330 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1334 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1340 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1341 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1346 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1347 vpaddd 0x50(%rsp),$xa1,$xa1
1348 vpaddd 0x60(%rsp),$xa2,$xa2
1349 vpaddd 0x70(%rsp),$xa3,$xa3
1351 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1352 vmovdqa $xt3,0x30(%rsp)
1354 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1355 vpunpckldq $xa3,$xa2,$xt3
1356 vpunpckhdq $xa1,$xa0,$xa0
1357 vpunpckhdq $xa3,$xa2,$xa2
1358 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1359 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1360 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1361 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1363 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1365 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1366 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1367 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1368 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1370 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1371 vmovdqa $xa1,0x10(%rsp)
1372 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1373 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1375 vpunpckldq $xb1,$xb0,$xt2
1376 vpunpckldq $xb3,$xb2,$xt3
1377 vpunpckhdq $xb1,$xb0,$xb0
1378 vpunpckhdq $xb3,$xb2,$xb2
1379 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1380 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1381 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1382 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1384 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1385 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1387 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1388 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1389 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1390 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1392 vpunpckldq $xc1,$xc0,$xt2
1393 vpunpckldq $xc3,$xc2,$xt3
1394 vpunpckhdq $xc1,$xc0,$xc0
1395 vpunpckhdq $xc3,$xc2,$xc2
1396 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
# --- ChaCha20_4xop epilogue (fragment): finish transposing the four
# parallel states back into byte order, add the saved counters, xor the
# keystream with input, store output, then handle sub-256-byte tails.
# NOTE(review): heredoc boundaries and labels are outside this view;
# statement order below is load/store-scheduled — do not reorder.
1397 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1398 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1399 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1401 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
# Accumulate original "d" words (counters/nonce) kept at 0x100..0x130
# relative to %rcx (which holds %rsp+0x100 as a size optimization).
1403 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1404 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1405 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1406 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1408 vpunpckldq $xd1,$xd0,$xt2
1409 vpunpckldq $xd3,$xd2,$xt3
1410 vpunpckhdq $xd1,$xd0,$xd0
1411 vpunpckhdq $xd3,$xd2,$xd2
1412 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1413 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1414 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1415 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
# Perl-side register renames so the names below match the transposed data.
1417 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1418 ($xa0,$xa1)=($xt2,$xt3);
1420 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1421 vmovdqa 0x10(%rsp),$xa1
# Full 256-byte path: xor 4 blocks of keystream with input and store.
1426 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1427 vpxor 0x10($inp),$xb0,$xb0
1428 vpxor 0x20($inp),$xc0,$xc0
1429 vpxor 0x30($inp),$xd0,$xd0
1430 vpxor 0x40($inp),$xa1,$xa1
1431 vpxor 0x50($inp),$xb1,$xb1
1432 vpxor 0x60($inp),$xc1,$xc1
1433 vpxor 0x70($inp),$xd1,$xd1
1434 lea 0x80($inp),$inp # size optimization
1435 vpxor 0x00($inp),$xa2,$xa2
1436 vpxor 0x10($inp),$xb2,$xb2
1437 vpxor 0x20($inp),$xc2,$xc2
1438 vpxor 0x30($inp),$xd2,$xd2
1439 vpxor 0x40($inp),$xa3,$xa3
1440 vpxor 0x50($inp),$xb3,$xb3
1441 vpxor 0x60($inp),$xc3,$xc3
1442 vpxor 0x70($inp),$xd3,$xd3
1443 lea 0x80($inp),$inp # inp+=64*4
1445 vmovdqu $xa0,0x00($out)
1446 vmovdqu $xb0,0x10($out)
1447 vmovdqu $xc0,0x20($out)
1448 vmovdqu $xd0,0x30($out)
1449 vmovdqu $xa1,0x40($out)
1450 vmovdqu $xb1,0x50($out)
1451 vmovdqu $xc1,0x60($out)
1452 vmovdqu $xd1,0x70($out)
1453 lea 0x80($out),$out # size optimization
1454 vmovdqu $xa2,0x00($out)
1455 vmovdqu $xb2,0x10($out)
1456 vmovdqu $xc2,0x20($out)
1457 vmovdqu $xd2,0x30($out)
1458 vmovdqu $xa3,0x40($out)
1459 vmovdqu $xb3,0x50($out)
1460 vmovdqu $xc3,0x60($out)
1461 vmovdqu $xd3,0x70($out)
1462 lea 0x80($out),$out # out+=64*4
# Tail dispatch: branch by remaining length (compares not visible here).
1472 jae .L192_or_more4xop
1474 jae .L128_or_more4xop
1476 jae .L64_or_more4xop
# <64 bytes left: stage one keystream block on the stack for the byte loop.
1479 vmovdqa $xa0,0x00(%rsp)
1480 vmovdqa $xb0,0x10(%rsp)
1481 vmovdqa $xc0,0x20(%rsp)
1482 vmovdqa $xd0,0x30(%rsp)
# 64..127 bytes: emit first full block, stage the next on the stack.
1487 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1488 vpxor 0x10($inp),$xb0,$xb0
1489 vpxor 0x20($inp),$xc0,$xc0
1490 vpxor 0x30($inp),$xd0,$xd0
1491 vmovdqu $xa0,0x00($out)
1492 vmovdqu $xb0,0x10($out)
1493 vmovdqu $xc0,0x20($out)
1494 vmovdqu $xd0,0x30($out)
1497 lea 0x40($inp),$inp # inp+=64*1
1498 vmovdqa $xa1,0x00(%rsp)
1500 vmovdqa $xb1,0x10(%rsp)
1501 lea 0x40($out),$out # out+=64*1
1502 vmovdqa $xc1,0x20(%rsp)
1503 sub \$64,$len # len-=64*1
1504 vmovdqa $xd1,0x30(%rsp)
# 128..191 bytes: emit two full blocks, stage the third.
1509 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1510 vpxor 0x10($inp),$xb0,$xb0
1511 vpxor 0x20($inp),$xc0,$xc0
1512 vpxor 0x30($inp),$xd0,$xd0
1513 vpxor 0x40($inp),$xa1,$xa1
1514 vpxor 0x50($inp),$xb1,$xb1
1515 vpxor 0x60($inp),$xc1,$xc1
1516 vpxor 0x70($inp),$xd1,$xd1
1518 vmovdqu $xa0,0x00($out)
1519 vmovdqu $xb0,0x10($out)
1520 vmovdqu $xc0,0x20($out)
1521 vmovdqu $xd0,0x30($out)
1522 vmovdqu $xa1,0x40($out)
1523 vmovdqu $xb1,0x50($out)
1524 vmovdqu $xc1,0x60($out)
1525 vmovdqu $xd1,0x70($out)
1528 lea 0x80($inp),$inp # inp+=64*2
1529 vmovdqa $xa2,0x00(%rsp)
1531 vmovdqa $xb2,0x10(%rsp)
1532 lea 0x80($out),$out # out+=64*2
1533 vmovdqa $xc2,0x20(%rsp)
1534 sub \$128,$len # len-=64*2
1535 vmovdqa $xd2,0x30(%rsp)
# 192..255 bytes: emit three full blocks, stage the fourth.
1540 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1541 vpxor 0x10($inp),$xb0,$xb0
1542 vpxor 0x20($inp),$xc0,$xc0
1543 vpxor 0x30($inp),$xd0,$xd0
1544 vpxor 0x40($inp),$xa1,$xa1
1545 vpxor 0x50($inp),$xb1,$xb1
1546 vpxor 0x60($inp),$xc1,$xc1
1547 vpxor 0x70($inp),$xd1,$xd1
1548 lea 0x80($inp),$inp # size optimization
1549 vpxor 0x00($inp),$xa2,$xa2
1550 vpxor 0x10($inp),$xb2,$xb2
1551 vpxor 0x20($inp),$xc2,$xc2
1552 vpxor 0x30($inp),$xd2,$xd2
1554 vmovdqu $xa0,0x00($out)
1555 vmovdqu $xb0,0x10($out)
1556 vmovdqu $xc0,0x20($out)
1557 vmovdqu $xd0,0x30($out)
1558 vmovdqu $xa1,0x40($out)
1559 vmovdqu $xb1,0x50($out)
1560 vmovdqu $xc1,0x60($out)
1561 vmovdqu $xd1,0x70($out)
1562 lea 0x80($out),$out # size optimization
1563 vmovdqu $xa2,0x00($out)
1564 vmovdqu $xb2,0x10($out)
1565 vmovdqu $xc2,0x20($out)
1566 vmovdqu $xd2,0x30($out)
1569 lea 0x40($inp),$inp # inp+=64*3
1570 vmovdqa $xa3,0x00(%rsp)
1572 vmovdqa $xb3,0x10(%rsp)
1573 lea 0x40($out),$out # out+=64*3
1574 vmovdqa $xc3,0x20(%rsp)
1575 sub \$192,$len # len-=64*3
1576 vmovdqa $xd3,0x30(%rsp)
# Byte-at-a-time tail: xor input bytes with staged keystream on the stack.
1579 movzb ($inp,%r10),%eax
1580 movzb (%rsp,%r10),%ecx
1583 mov %al,-1($out,%r10)
# Win64 ABI: xmm6-xmm15 are non-volatile, restore them before returning.
1590 $code.=<<___ if ($win64);
1591 lea 0x140+0x30(%rsp),%r11
1592 movaps -0x30(%r11),%xmm6
1593 movaps -0x20(%r11),%xmm7
1594 movaps -0x10(%r11),%xmm8
1595 movaps 0x00(%r11),%xmm9
1596 movaps 0x10(%r11),%xmm10
1597 movaps 0x20(%r11),%xmm11
1598 movaps 0x30(%r11),%xmm12
1599 movaps 0x40(%r11),%xmm13
1600 movaps 0x50(%r11),%xmm14
1601 movaps 0x60(%r11),%xmm15
1604 add \$0x148+$xframe,%rsp
1606 .size ChaCha20_4xop,.-ChaCha20_4xop
1610 ########################################################################
# AVX2 (8-way) code path: map the sixteen %ymm registers onto the ChaCha
# state columns. The 'c' rows get placeholder "%nox" names because they
# live on the stack (see AVX2_lane_ROUND below) rather than in registers.
1613 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1614     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1615 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1616     "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
# AVX2_lane_ROUND: emit one double-round (four quarter-rounds) of ChaCha20
# operating on 8 independent blocks lane-wise. Returns a list of strings,
# each a perlasm instruction to be eval'ed by the caller; this lets the
# caller interleave instructions from independent quarter-rounds for ILP.
# Args ($a0,$b0,$c0,$d0) are @xx indices of the first quarter-round; the
# other three quarter-rounds are derived by rotating within each row group.
# NOTE(review): the closing of this sub is outside the visible window.
1618 sub AVX2_lane_ROUND {
1619 my ($a0,$b0,$c0,$d0)=@_;
1620 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1621 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1622 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1623 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1624 my @x=map("\"$_\"",@xx);
1626 # Consider order in which variables are addressed by their
1631 # 0 4 8 12 < even round
1635 # 0 5 10 15 < odd round
1640 # 'a', 'b' and 'd's are permanently allocated in registers,
1641 # @x[0..7,12..15], while 'c's are maintained in memory. If
1642 # you observe 'c' column, you'll notice that pair of 'c's is
1643 # invariant between rounds. This means that we have to reload
1644 # them once per round, in the middle. This is why you'll see
1645 # bunch of 'c' stores and loads in the middle, but none in
1646 # the beginning or end.
# Q1/Q2 first half: a+=b; d^=a; d<<<=16 (via pshufb table in $t1).
1649 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1650 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1651 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1652 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1653 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1654 "&vpshufb (@x[$d1],@x[$d1],$t1)",
# c+=d; b^=c; b<<<=12 (shift-pair, since AVX2 has no rotate).
1656 "&vpaddd ($xc,$xc,@x[$d0])",
1657 "&vpxor (@x[$b0],$xc,@x[$b0])",
1658 "&vpslld ($t0,@x[$b0],12)",
1659 "&vpsrld (@x[$b0],@x[$b0],20)",
1660 "&vpor (@x[$b0],$t0,@x[$b0])",
1661 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1662 "&vpaddd ($xc_,$xc_,@x[$d1])",
1663 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1664 "&vpslld ($t1,@x[$b1],12)",
1665 "&vpsrld (@x[$b1],@x[$b1],20)",
1666 "&vpor (@x[$b1],$t1,@x[$b1])",
# Q1/Q2 second half: a+=b; d^=a; d<<<=8; then c+=d; b^=c; b<<<=7.
1668 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1669 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1670 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1671 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1672 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1673 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1675 "&vpaddd ($xc,$xc,@x[$d0])",
1676 "&vpxor (@x[$b0],$xc,@x[$b0])",
1677 "&vpslld ($t1,@x[$b0],7)",
1678 "&vpsrld (@x[$b0],@x[$b0],25)",
1679 "&vpor (@x[$b0],$t1,@x[$b0])",
1680 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1681 "&vpaddd ($xc_,$xc_,@x[$d1])",
1682 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1683 "&vpslld ($t0,@x[$b1],7)",
1684 "&vpsrld (@x[$b1],@x[$b1],25)",
1685 "&vpor (@x[$b1],$t0,@x[$b1])",
# Mid-round swap: spill Q1/Q2 'c' values, load Q3/Q4 'c' values.
1687 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1688 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1689 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1690 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
# Q3/Q4: same sequence as above for the remaining two quarter-rounds.
1692 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1693 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1694 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1695 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1696 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1697 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1699 "&vpaddd ($xc,$xc,@x[$d2])",
1700 "&vpxor (@x[$b2],$xc,@x[$b2])",
1701 "&vpslld ($t0,@x[$b2],12)",
1702 "&vpsrld (@x[$b2],@x[$b2],20)",
1703 "&vpor (@x[$b2],$t0,@x[$b2])",
1704 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1705 "&vpaddd ($xc_,$xc_,@x[$d3])",
1706 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1707 "&vpslld ($t1,@x[$b3],12)",
1708 "&vpsrld (@x[$b3],@x[$b3],20)",
1709 "&vpor (@x[$b3],$t1,@x[$b3])",
1711 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1712 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1713 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1714 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1715 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1716 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1718 "&vpaddd ($xc,$xc,@x[$d2])",
1719 "&vpxor (@x[$b2],$xc,@x[$b2])",
1720 "&vpslld ($t1,@x[$b2],7)",
1721 "&vpsrld (@x[$b2],@x[$b2],25)",
1722 "&vpor (@x[$b2],$t1,@x[$b2])",
1723 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1724 "&vpaddd ($xc_,$xc_,@x[$d3])",
1725 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1726 "&vpslld ($t0,@x[$b3],7)",
1727 "&vpsrld (@x[$b3],@x[$b3],25)",
1728 "&vpor (@x[$b3],$t0,@x[$b3])"
# ChaCha20_8x: AVX2 code path processing 8 blocks (512 bytes) per outer
# iteration. Stack frame holds SIMD 'c' rows, lane-smashed key copies, and
# lane-smashed counters. NOTE(review): heredoc delimiters and several
# labels fall outside this view; code below must stay byte-identical.
1732 my $xframe = $win64 ? 0xb0 : 8;
1735 .type ChaCha20_8x,\@function,5
1740 sub \$0x280+$xframe,%rsp
# Win64 ABI: save non-volatile xmm6-xmm15 (restored in the epilogue).
1743 $code.=<<___ if ($win64);
1744 lea 0x290+0x30(%rsp),%r11
1745 movaps %xmm6,-0x30(%r11)
1746 movaps %xmm7,-0x20(%r11)
1747 movaps %xmm8,-0x10(%r11)
1748 movaps %xmm9,0x00(%r11)
1749 movaps %xmm10,0x10(%r11)
1750 movaps %xmm11,0x20(%r11)
1751 movaps %xmm12,0x30(%r11)
1752 movaps %xmm13,0x40(%r11)
1753 movaps %xmm14,0x50(%r11)
1754 movaps %xmm15,0x60(%r11)
1758 mov %r10,0x280(%rsp)
1760 ################ stack layout
1761 # +0x00 SIMD equivalent of @x[8-12]
1763 # +0x80 constant copy of key[0-2] smashed by lanes
1765 # +0x200 SIMD counters (with nonce smashed by lanes)
# Load sigma constant and the caller's key/counter, 128 bits broadcast
# to both ymm halves.
1769 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1770 vbroadcasti128 ($key),$xb3 # key[1]
1771 vbroadcasti128 16($key),$xt3 # key[2]
1772 vbroadcasti128 ($counter),$xd3 # key[3]
1773 lea 0x100(%rsp),%rcx # size optimization
1774 lea 0x200(%rsp),%rax # size optimization
1775 lea .Lrot16(%rip),%r10
1776 lea .Lrot24(%rip),%r11
# "Smash by lanes": replicate each 32-bit state word across a whole ymm
# register so 8 blocks are processed word-parallel; offload to the stack.
1778 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1779 vpshufd \$0x55,$xa3,$xa1
1780 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1781 vpshufd \$0xaa,$xa3,$xa2
1782 vmovdqa $xa1,0xa0-0x100(%rcx)
1783 vpshufd \$0xff,$xa3,$xa3
1784 vmovdqa $xa2,0xc0-0x100(%rcx)
1785 vmovdqa $xa3,0xe0-0x100(%rcx)
1787 vpshufd \$0x00,$xb3,$xb0
1788 vpshufd \$0x55,$xb3,$xb1
1789 vmovdqa $xb0,0x100-0x100(%rcx)
1790 vpshufd \$0xaa,$xb3,$xb2
1791 vmovdqa $xb1,0x120-0x100(%rcx)
1792 vpshufd \$0xff,$xb3,$xb3
1793 vmovdqa $xb2,0x140-0x100(%rcx)
1794 vmovdqa $xb3,0x160-0x100(%rcx)
1796 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1797 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1798 vmovdqa $xt0,0x180-0x200(%rax)
1799 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1800 vmovdqa $xt1,0x1a0-0x200(%rax)
1801 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1802 vmovdqa $xt2,0x1c0-0x200(%rax)
1803 vmovdqa $xt3,0x1e0-0x200(%rax)
1805 vpshufd \$0x00,$xd3,$xd0
1806 vpshufd \$0x55,$xd3,$xd1
1807 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1808 vpshufd \$0xaa,$xd3,$xd2
1809 vmovdqa $xd1,0x220-0x200(%rax)
1810 vpshufd \$0xff,$xd3,$xd3
1811 vmovdqa $xd2,0x240-0x200(%rax)
1812 vmovdqa $xd3,0x260-0x200(%rax)
# Outer-loop re-entry: reload smashed key material for the next 8 blocks.
1818 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1819 vmovdqa 0xa0-0x100(%rcx),$xa1
1820 vmovdqa 0xc0-0x100(%rcx),$xa2
1821 vmovdqa 0xe0-0x100(%rcx),$xa3
1822 vmovdqa 0x100-0x100(%rcx),$xb0
1823 vmovdqa 0x120-0x100(%rcx),$xb1
1824 vmovdqa 0x140-0x100(%rcx),$xb2
1825 vmovdqa 0x160-0x100(%rcx),$xb3
1826 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1827 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1828 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1829 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1830 vmovdqa 0x200-0x200(%rax),$xd0
1831 vmovdqa 0x220-0x200(%rax),$xd1
1832 vmovdqa 0x240-0x200(%rax),$xd2
1833 vmovdqa 0x260-0x200(%rax),$xd3
1834 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1837 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1838 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1839 vbroadcasti128 (%r10),$xt3
1840 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
# One even (column) + one odd (diagonal) round per loop iteration;
# the 10-iteration loop driver around these calls is outside this view.
1847 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1848 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
# Post-rounds: accumulate original state and transpose lanes back to
# byte order (dword unpack, then qword unpack, then 128-bit permute).
1853 lea 0x200(%rsp),%rax # size optimization
1854 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1855 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1856 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1857 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1859 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1860 vpunpckldq $xa3,$xa2,$xt3
1861 vpunpckhdq $xa1,$xa0,$xa0
1862 vpunpckhdq $xa3,$xa2,$xa2
1863 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1864 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1865 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1866 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1868 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1870 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1871 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1872 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1873 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1875 vpunpckldq $xb1,$xb0,$xt2
1876 vpunpckldq $xb3,$xb2,$xt3
1877 vpunpckhdq $xb1,$xb0,$xb0
1878 vpunpckhdq $xb3,$xb2,$xb2
1879 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1880 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1881 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1882 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1884 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1886 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1887 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1888 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1889 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1890 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1891 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1892 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1893 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1895 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1896 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1898 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1899 vmovdqa $xa1,0x20(%rsp)
1900 vmovdqa 0x40(%rsp),$xc2 # $xa0
1901 vmovdqa 0x60(%rsp),$xc3 # $xa1
1903 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1904 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1905 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1906 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1908 vpunpckldq $xc1,$xc0,$xt2
1909 vpunpckldq $xc3,$xc2,$xt3
1910 vpunpckhdq $xc1,$xc0,$xc0
1911 vpunpckhdq $xc3,$xc2,$xc2
1912 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1913 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1914 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1915 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1917 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1919 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1920 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1921 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1922 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1924 vpunpckldq $xd1,$xd0,$xt2
1925 vpunpckldq $xd3,$xd2,$xt3
1926 vpunpckhdq $xd1,$xd0,$xd0
1927 vpunpckhdq $xd3,$xd2,$xd2
1928 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1929 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1930 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1931 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1933 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1935 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1936 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1937 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1938 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1939 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1940 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1941 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1942 vperm2i128 \$0x31,$xd3,$xc3,$xd3
# Perl-side renames tracking where the transposed data actually lives.
1944 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1945 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1946 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1947 ($xa0,$xa1)=($xt2,$xt3);
1949 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1950 vmovdqa 0x20(%rsp),$xa1
# Full 512-byte path: xor 8 blocks with input, store, 128 bytes at a time.
1955 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1956 vpxor 0x20($inp),$xb0,$xb0
1957 vpxor 0x40($inp),$xc0,$xc0
1958 vpxor 0x60($inp),$xd0,$xd0
1959 lea 0x80($inp),$inp # size optimization
1960 vmovdqu $xa0,0x00($out)
1961 vmovdqu $xb0,0x20($out)
1962 vmovdqu $xc0,0x40($out)
1963 vmovdqu $xd0,0x60($out)
1964 lea 0x80($out),$out # size optimization
1966 vpxor 0x00($inp),$xa1,$xa1
1967 vpxor 0x20($inp),$xb1,$xb1
1968 vpxor 0x40($inp),$xc1,$xc1
1969 vpxor 0x60($inp),$xd1,$xd1
1970 lea 0x80($inp),$inp # size optimization
1971 vmovdqu $xa1,0x00($out)
1972 vmovdqu $xb1,0x20($out)
1973 vmovdqu $xc1,0x40($out)
1974 vmovdqu $xd1,0x60($out)
1975 lea 0x80($out),$out # size optimization
1977 vpxor 0x00($inp),$xa2,$xa2
1978 vpxor 0x20($inp),$xb2,$xb2
1979 vpxor 0x40($inp),$xc2,$xc2
1980 vpxor 0x60($inp),$xd2,$xd2
1981 lea 0x80($inp),$inp # size optimization
1982 vmovdqu $xa2,0x00($out)
1983 vmovdqu $xb2,0x20($out)
1984 vmovdqu $xc2,0x40($out)
1985 vmovdqu $xd2,0x60($out)
1986 lea 0x80($out),$out # size optimization
1988 vpxor 0x00($inp),$xa3,$xa3
1989 vpxor 0x20($inp),$xb3,$xb3
1990 vpxor 0x40($inp),$xc3,$xc3
1991 vpxor 0x60($inp),$xd3,$xd3
1992 lea 0x80($inp),$inp # size optimization
1993 vmovdqu $xa3,0x00($out)
1994 vmovdqu $xb3,0x20($out)
1995 vmovdqu $xc3,0x40($out)
1996 vmovdqu $xd3,0x60($out)
1997 lea 0x80($out),$out # size optimization
# ChaCha20_8x tail handling (fragment): for each residual length bucket
# (64*k .. 64*(k+1)-1 bytes), emit the k full blocks, stage the next 64
# bytes of keystream on the stack, and fall into the byte-tail loop.
# The length comparisons and labels between buckets are outside this view.
2021 vmovdqa $xa0,0x00(%rsp)
2022 vmovdqa $xb0,0x20(%rsp)
# 64..127 bytes remaining.
2027 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2028 vpxor 0x20($inp),$xb0,$xb0
2029 vmovdqu $xa0,0x00($out)
2030 vmovdqu $xb0,0x20($out)
2033 lea 0x40($inp),$inp # inp+=64*1
2035 vmovdqa $xc0,0x00(%rsp)
2036 lea 0x40($out),$out # out+=64*1
2037 sub \$64,$len # len-=64*1
2038 vmovdqa $xd0,0x20(%rsp)
# 128..191 bytes remaining.
2043 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2044 vpxor 0x20($inp),$xb0,$xb0
2045 vpxor 0x40($inp),$xc0,$xc0
2046 vpxor 0x60($inp),$xd0,$xd0
2047 vmovdqu $xa0,0x00($out)
2048 vmovdqu $xb0,0x20($out)
2049 vmovdqu $xc0,0x40($out)
2050 vmovdqu $xd0,0x60($out)
2053 lea 0x80($inp),$inp # inp+=64*2
2055 vmovdqa $xa1,0x00(%rsp)
2056 lea 0x80($out),$out # out+=64*2
2057 sub \$128,$len # len-=64*2
2058 vmovdqa $xb1,0x20(%rsp)
# 192..255 bytes remaining.
2063 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2064 vpxor 0x20($inp),$xb0,$xb0
2065 vpxor 0x40($inp),$xc0,$xc0
2066 vpxor 0x60($inp),$xd0,$xd0
2067 vpxor 0x80($inp),$xa1,$xa1
2068 vpxor 0xa0($inp),$xb1,$xb1
2069 vmovdqu $xa0,0x00($out)
2070 vmovdqu $xb0,0x20($out)
2071 vmovdqu $xc0,0x40($out)
2072 vmovdqu $xd0,0x60($out)
2073 vmovdqu $xa1,0x80($out)
2074 vmovdqu $xb1,0xa0($out)
2077 lea 0xc0($inp),$inp # inp+=64*3
2079 vmovdqa $xc1,0x00(%rsp)
2080 lea 0xc0($out),$out # out+=64*3
2081 sub \$192,$len # len-=64*3
2082 vmovdqa $xd1,0x20(%rsp)
# 256..319 bytes remaining.
2087 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2088 vpxor 0x20($inp),$xb0,$xb0
2089 vpxor 0x40($inp),$xc0,$xc0
2090 vpxor 0x60($inp),$xd0,$xd0
2091 vpxor 0x80($inp),$xa1,$xa1
2092 vpxor 0xa0($inp),$xb1,$xb1
2093 vpxor 0xc0($inp),$xc1,$xc1
2094 vpxor 0xe0($inp),$xd1,$xd1
2095 vmovdqu $xa0,0x00($out)
2096 vmovdqu $xb0,0x20($out)
2097 vmovdqu $xc0,0x40($out)
2098 vmovdqu $xd0,0x60($out)
2099 vmovdqu $xa1,0x80($out)
2100 vmovdqu $xb1,0xa0($out)
2101 vmovdqu $xc1,0xc0($out)
2102 vmovdqu $xd1,0xe0($out)
2105 lea 0x100($inp),$inp # inp+=64*4
2107 vmovdqa $xa2,0x00(%rsp)
2108 lea 0x100($out),$out # out+=64*4
2109 sub \$256,$len # len-=64*4
2110 vmovdqa $xb2,0x20(%rsp)
# 320..383 bytes remaining.
2115 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2116 vpxor 0x20($inp),$xb0,$xb0
2117 vpxor 0x40($inp),$xc0,$xc0
2118 vpxor 0x60($inp),$xd0,$xd0
2119 vpxor 0x80($inp),$xa1,$xa1
2120 vpxor 0xa0($inp),$xb1,$xb1
2121 vpxor 0xc0($inp),$xc1,$xc1
2122 vpxor 0xe0($inp),$xd1,$xd1
2123 vpxor 0x100($inp),$xa2,$xa2
2124 vpxor 0x120($inp),$xb2,$xb2
2125 vmovdqu $xa0,0x00($out)
2126 vmovdqu $xb0,0x20($out)
2127 vmovdqu $xc0,0x40($out)
2128 vmovdqu $xd0,0x60($out)
2129 vmovdqu $xa1,0x80($out)
2130 vmovdqu $xb1,0xa0($out)
2131 vmovdqu $xc1,0xc0($out)
2132 vmovdqu $xd1,0xe0($out)
2133 vmovdqu $xa2,0x100($out)
2134 vmovdqu $xb2,0x120($out)
2137 lea 0x140($inp),$inp # inp+=64*5
2139 vmovdqa $xc2,0x00(%rsp)
2140 lea 0x140($out),$out # out+=64*5
2141 sub \$320,$len # len-=64*5
2142 vmovdqa $xd2,0x20(%rsp)
# 384..447 bytes remaining.
2147 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2148 vpxor 0x20($inp),$xb0,$xb0
2149 vpxor 0x40($inp),$xc0,$xc0
2150 vpxor 0x60($inp),$xd0,$xd0
2151 vpxor 0x80($inp),$xa1,$xa1
2152 vpxor 0xa0($inp),$xb1,$xb1
2153 vpxor 0xc0($inp),$xc1,$xc1
2154 vpxor 0xe0($inp),$xd1,$xd1
2155 vpxor 0x100($inp),$xa2,$xa2
2156 vpxor 0x120($inp),$xb2,$xb2
2157 vpxor 0x140($inp),$xc2,$xc2
2158 vpxor 0x160($inp),$xd2,$xd2
2159 vmovdqu $xa0,0x00($out)
2160 vmovdqu $xb0,0x20($out)
2161 vmovdqu $xc0,0x40($out)
2162 vmovdqu $xd0,0x60($out)
2163 vmovdqu $xa1,0x80($out)
2164 vmovdqu $xb1,0xa0($out)
2165 vmovdqu $xc1,0xc0($out)
2166 vmovdqu $xd1,0xe0($out)
2167 vmovdqu $xa2,0x100($out)
2168 vmovdqu $xb2,0x120($out)
2169 vmovdqu $xc2,0x140($out)
2170 vmovdqu $xd2,0x160($out)
2173 lea 0x180($inp),$inp # inp+=64*6
2175 vmovdqa $xa3,0x00(%rsp)
2176 lea 0x180($out),$out # out+=64*6
2177 sub \$384,$len # len-=64*6
2178 vmovdqa $xb3,0x20(%rsp)
# 448..511 bytes remaining.
2183 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2184 vpxor 0x20($inp),$xb0,$xb0
2185 vpxor 0x40($inp),$xc0,$xc0
2186 vpxor 0x60($inp),$xd0,$xd0
2187 vpxor 0x80($inp),$xa1,$xa1
2188 vpxor 0xa0($inp),$xb1,$xb1
2189 vpxor 0xc0($inp),$xc1,$xc1
2190 vpxor 0xe0($inp),$xd1,$xd1
2191 vpxor 0x100($inp),$xa2,$xa2
2192 vpxor 0x120($inp),$xb2,$xb2
2193 vpxor 0x140($inp),$xc2,$xc2
2194 vpxor 0x160($inp),$xd2,$xd2
2195 vpxor 0x180($inp),$xa3,$xa3
2196 vpxor 0x1a0($inp),$xb3,$xb3
2197 vmovdqu $xa0,0x00($out)
2198 vmovdqu $xb0,0x20($out)
2199 vmovdqu $xc0,0x40($out)
2200 vmovdqu $xd0,0x60($out)
2201 vmovdqu $xa1,0x80($out)
2202 vmovdqu $xb1,0xa0($out)
2203 vmovdqu $xc1,0xc0($out)
2204 vmovdqu $xd1,0xe0($out)
2205 vmovdqu $xa2,0x100($out)
2206 vmovdqu $xb2,0x120($out)
2207 vmovdqu $xc2,0x140($out)
2208 vmovdqu $xd2,0x160($out)
2209 vmovdqu $xa3,0x180($out)
2210 vmovdqu $xb3,0x1a0($out)
2213 lea 0x1c0($inp),$inp # inp+=64*7
2215 vmovdqa $xc3,0x00(%rsp)
2216 lea 0x1c0($out),$out # out+=64*7
2217 sub \$448,$len # len-=64*7
2218 vmovdqa $xd3,0x20(%rsp)
# Byte-at-a-time tail: keystream was staged at (%rsp).
2221 movzb ($inp,%r10),%eax
2222 movzb (%rsp,%r10),%ecx
2225 mov %al,-1($out,%r10)
# Win64 ABI: restore non-volatile xmm6-xmm15 saved in the prologue.
2232 $code.=<<___ if ($win64);
2233 lea 0x290+0x30(%rsp),%r11
2234 movaps -0x30(%r11),%xmm6
2235 movaps -0x20(%r11),%xmm7
2236 movaps -0x10(%r11),%xmm8
2237 movaps 0x00(%r11),%xmm9
2238 movaps 0x10(%r11),%xmm10
2239 movaps 0x20(%r11),%xmm11
2240 movaps 0x30(%r11),%xmm12
2241 movaps 0x40(%r11),%xmm13
2242 movaps 0x50(%r11),%xmm14
2243 movaps 0x60(%r11),%xmm15
# Restore original %rsp saved at 0x280(%rsp) in the prologue.
2246 mov 0x280(%rsp),%rsp
2248 .size ChaCha20_8x,.-ChaCha20_8x
2252 ########################################################################
# AVX512F single-state path for shorter inputs: one zmm holds four copies
# of each state row, so four blocks are produced per round trip; rotates
# use native vprold (see AVX512ROUND, whose body is outside this view).
2255 # This one handles shorter inputs...
2257 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2258 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2260 sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
2278 my $xframe = $win64 ? 32+32+8 : 24;
2281 .type ChaCha20_avx512,\@function,5
2288 push %rbx # just to share SEH handler, no pops
2295 sub \$64+$xframe,%rsp
# Win64 ABI: only xmm6/xmm7 are clobbered on this path.
2297 $code.=<<___ if ($win64);
2298 movaps %xmm6,64+32(%rsp)
2299 movaps %xmm7,64+48(%rsp)
2302 vbroadcasti32x4 .Lsigma(%rip),$a
2303 vbroadcasti32x4 ($key),$b
2304 vbroadcasti32x4 16($key),$c
2305 vbroadcasti32x4 ($counter),$d
# Give each 128-bit lane a distinct counter (+0,+1,+2,+3), then step by 4.
2310 vpaddd .Lzeroz(%rip),$d,$d
2311 vmovdqa32 .Lfourz(%rip),$fourz
2312 mov \$10,$counter # reuse $counter
2321 vpaddd $fourz,$d_,$d
# Odd/even rounds via lane rotation of b/c/d around the quarter-round.
2330 &vpshufd ($c,$c,0b01001110);
2331 &vpshufd ($b,$b,0b00111001);
2332 &vpshufd ($d,$d,0b10010011);
2335 &vpshufd ($c,$c,0b01001110);
2336 &vpshufd ($b,$b,0b10010011);
2337 &vpshufd ($d,$d,0b00111001);
2340 &jnz (".Loop_avx512");
# Emit up to four 64-byte blocks, one 128-bit lane at a time.
2351 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2352 vpxor 0x10($inp),%x#$b,$t1
2353 vpxor 0x20($inp),%x#$c,$t2
2354 vpxor 0x30($inp),%x#$d,$t3
2355 lea 0x40($inp),$inp # inp+=64
2357 vmovdqu $t0,0x00($out) # write output
2358 vmovdqu $t1,0x10($out)
2359 vmovdqu $t2,0x20($out)
2360 vmovdqu $t3,0x30($out)
2361 lea 0x40($out),$out # out+=64
2365 vextracti32x4 \$1,$a,$t0
2366 vextracti32x4 \$1,$b,$t1
2367 vextracti32x4 \$1,$c,$t2
2368 vextracti32x4 \$1,$d,$t3
2373 vpxor 0x00($inp),$t0,$t0 # xor with input
2374 vpxor 0x10($inp),$t1,$t1
2375 vpxor 0x20($inp),$t2,$t2
2376 vpxor 0x30($inp),$t3,$t3
2377 lea 0x40($inp),$inp # inp+=64
2379 vmovdqu $t0,0x00($out) # write output
2380 vmovdqu $t1,0x10($out)
2381 vmovdqu $t2,0x20($out)
2382 vmovdqu $t3,0x30($out)
2383 lea 0x40($out),$out # out+=64
2387 vextracti32x4 \$2,$a,$t0
2388 vextracti32x4 \$2,$b,$t1
2389 vextracti32x4 \$2,$c,$t2
2390 vextracti32x4 \$2,$d,$t3
2395 vpxor 0x00($inp),$t0,$t0 # xor with input
2396 vpxor 0x10($inp),$t1,$t1
2397 vpxor 0x20($inp),$t2,$t2
2398 vpxor 0x30($inp),$t3,$t3
2399 lea 0x40($inp),$inp # inp+=64
2401 vmovdqu $t0,0x00($out) # write output
2402 vmovdqu $t1,0x10($out)
2403 vmovdqu $t2,0x20($out)
2404 vmovdqu $t3,0x30($out)
2405 lea 0x40($out),$out # out+=64
2409 vextracti32x4 \$3,$a,$t0
2410 vextracti32x4 \$3,$b,$t1
2411 vextracti32x4 \$3,$c,$t2
2412 vextracti32x4 \$3,$d,$t3
2417 vpxor 0x00($inp),$t0,$t0 # xor with input
2418 vpxor 0x10($inp),$t1,$t1
2419 vpxor 0x20($inp),$t2,$t2
2420 vpxor 0x30($inp),$t3,$t3
2421 lea 0x40($inp),$inp # inp+=64
2423 vmovdqu $t0,0x00($out) # write output
2424 vmovdqu $t1,0x10($out)
2425 vmovdqu $t2,0x20($out)
2426 vmovdqu $t3,0x30($out)
2427 lea 0x40($out),$out # out+=64
2429 jnz .Loop_outer_avx512
# <64-byte tail: stage keystream on the stack for the byte loop.
2435 vmovdqa %x#$a,0x00(%rsp)
2436 vmovdqa %x#$b,0x10(%rsp)
2437 vmovdqa %x#$c,0x20(%rsp)
2438 vmovdqa %x#$d,0x30(%rsp)
2440 jmp .Loop_tail_avx512
2444 vmovdqa $t0,0x00(%rsp)
2445 vmovdqa $t1,0x10(%rsp)
2446 vmovdqa $t2,0x20(%rsp)
2447 vmovdqa $t3,0x30(%rsp)
# Byte-at-a-time tail loop; $counter register is reused as the index.
2451 movzb ($inp,$counter),%eax
2452 movzb (%rsp,$counter),%ecx
2453 lea 1($counter),$counter
2455 mov %al,-1($out,$counter)
2457 jnz .Loop_tail_avx512
# NOTE(review): presumably clears key material left on the stack — confirm.
2459 vmovdqa32 $a_,0x00(%rsp)
2464 $code.=<<___ if ($win64);
2465 movaps 64+32(%rsp),%xmm6
2466 movaps 64+48(%rsp),%xmm7
2469 add \$64+$xframe+48,%rsp
2471 .size ChaCha20_avx512,.-ChaCha20_avx512
# AVX512 (16-way) code path: all sixteen state rows live in %zmm0-15, and
# the lane-smashed key/counter copies live in %zmm16-31 (@key), avoiding
# stack spills entirely; $xt0-$xt3 alias @key[0..3] as scratch.
2475 # This one handles longer inputs...
2477 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2478     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2479 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2480     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2481 my @key=map("%zmm$_",(16..31));
2482 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
# AVX512_lane_ROUND: emit one double-round for 16 parallel blocks. With
# all rows in registers and native 32-bit rotates (vprold), the sequence
# is much simpler than the AVX2 version: four quarter-rounds fully
# interleaved, no mid-round spills. Returns perlasm strings to eval.
# NOTE(review): the closing of this sub is outside the visible window.
2484 sub AVX512_lane_ROUND {
2485 my ($a0,$b0,$c0,$d0)=@_;
2486 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2487 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2488 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2489 my @x=map("\"$_\"",@xx);
# a+=b; d^=a; d<<<=16
2492 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2493 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2494 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2495 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2496 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2497 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2498 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2499 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2500 "&vprold (@x[$d0],@x[$d0],16)",
2501 "&vprold (@x[$d1],@x[$d1],16)",
2502 "&vprold (@x[$d2],@x[$d2],16)",
2503 "&vprold (@x[$d3],@x[$d3],16)",
# c+=d; b^=c; b<<<=12
2505 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2506 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2507 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2508 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2509 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2510 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2511 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2512 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2513 "&vprold (@x[$b0],@x[$b0],12)",
2514 "&vprold (@x[$b1],@x[$b1],12)",
2515 "&vprold (@x[$b2],@x[$b2],12)",
2516 "&vprold (@x[$b3],@x[$b3],12)",
# a+=b; d^=a; d<<<=8
2518 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2519 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2520 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2521 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2522 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2523 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2524 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2525 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2526 "&vprold (@x[$d0],@x[$d0],8)",
2527 "&vprold (@x[$d1],@x[$d1],8)",
2528 "&vprold (@x[$d2],@x[$d2],8)",
2529 "&vprold (@x[$d3],@x[$d3],8)",
# c+=d; b^=c; b<<<=7
2531 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2532 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2533 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2534 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2535 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2536 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2537 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2538 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2539 "&vprold (@x[$b0],@x[$b0],7)",
2540 "&vprold (@x[$b1],@x[$b1],7)",
2541 "&vprold (@x[$b2],@x[$b2],7)",
2542 "&vprold (@x[$b3],@x[$b3],7)"
# ChaCha20_16x: AVX512 code path processing 16 blocks (1024 bytes) per
# outer iteration. Key and counters are lane-smashed into %zmm16-31 so no
# stack reloads are needed inside the round loop.
# NOTE(review): this function runs past the end of the visible window.
2546 my $xframe = $win64 ? 0xb0 : 8;
2549 .type ChaCha20_16x,\@function,5
2554 sub \$64+$xframe,%rsp
# Win64 ABI: save non-volatile xmm6-xmm15.
2557 $code.=<<___ if ($win64);
2558 lea 0x290+0x30(%rsp),%r11
2559 movaps %xmm6,-0x30(%r11)
2560 movaps %xmm7,-0x20(%r11)
2561 movaps %xmm8,-0x10(%r11)
2562 movaps %xmm9,0x00(%r11)
2563 movaps %xmm10,0x10(%r11)
2564 movaps %xmm11,0x20(%r11)
2565 movaps %xmm12,0x30(%r11)
2566 movaps %xmm13,0x40(%r11)
2567 movaps %xmm14,0x50(%r11)
2568 movaps %xmm15,0x60(%r11)
2573 lea .Lsigma(%rip),%r10
2574 vbroadcasti32x4 (%r10),$xa3 # key[0]
2575 vbroadcasti32x4 ($key),$xb3 # key[1]
2576 vbroadcasti32x4 16($key),$xc3 # key[2]
2577 vbroadcasti32x4 ($counter),$xd3 # key[3]
# Smash every state word across a zmm register; keep copies in @key.
2579 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2580 vpshufd \$0x55,$xa3,$xa1
2581 vpshufd \$0xaa,$xa3,$xa2
2582 vpshufd \$0xff,$xa3,$xa3
2583 vmovdqa64 $xa0,@key[0]
2584 vmovdqa64 $xa1,@key[1]
2585 vmovdqa64 $xa2,@key[2]
2586 vmovdqa64 $xa3,@key[3]
2588 vpshufd \$0x00,$xb3,$xb0
2589 vpshufd \$0x55,$xb3,$xb1
2590 vpshufd \$0xaa,$xb3,$xb2
2591 vpshufd \$0xff,$xb3,$xb3
2592 vmovdqa64 $xb0,@key[4]
2593 vmovdqa64 $xb1,@key[5]
2594 vmovdqa64 $xb2,@key[6]
2595 vmovdqa64 $xb3,@key[7]
2597 vpshufd \$0x00,$xc3,$xc0
2598 vpshufd \$0x55,$xc3,$xc1
2599 vpshufd \$0xaa,$xc3,$xc2
2600 vpshufd \$0xff,$xc3,$xc3
2601 vmovdqa64 $xc0,@key[8]
2602 vmovdqa64 $xc1,@key[9]
2603 vmovdqa64 $xc2,@key[10]
2604 vmovdqa64 $xc3,@key[11]
2606 vpshufd \$0x00,$xd3,$xd0
2607 vpshufd \$0x55,$xd3,$xd1
2608 vpshufd \$0xaa,$xd3,$xd2
2609 vpshufd \$0xff,$xd3,$xd3
2610 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2611 vmovdqa64 $xd0,@key[12]
2612 vmovdqa64 $xd1,@key[13]
2613 vmovdqa64 $xd2,@key[14]
2614 vmovdqa64 $xd3,@key[15]
# Outer-loop re-entry: restore working state from @key, bump counters.
2621 vpbroadcastd 0(%r10),$xa0 # reload key
2622 vpbroadcastd 4(%r10),$xa1
2623 vpbroadcastd 8(%r10),$xa2
2624 vpbroadcastd 12(%r10),$xa3
2625 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
2626 vmovdqa64 @key[4],$xb0
2627 vmovdqa64 @key[5],$xb1
2628 vmovdqa64 @key[6],$xb2
2629 vmovdqa64 @key[7],$xb3
2630 vmovdqa64 @key[8],$xc0
2631 vmovdqa64 @key[9],$xc1
2632 vmovdqa64 @key[10],$xc2
2633 vmovdqa64 @key[11],$xc3
2634 vmovdqa64 @key[12],$xd0
2635 vmovdqa64 @key[13],$xd1
2636 vmovdqa64 @key[14],$xd2
2637 vmovdqa64 @key[15],$xd3
2639 vmovdqa64 $xa0,@key[0]
2640 vmovdqa64 $xa1,@key[1]
2641 vmovdqa64 $xa2,@key[2]
2642 vmovdqa64 $xa3,@key[3]
# Even + odd round per iteration (loop driver outside this view).
2650 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2651 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
# Post-rounds: accumulate key and transpose the 16 lanes back to byte
# order (dword unpack -> qword unpack -> 128-bit shuffles, two levels).
2656 vpaddd @key[0],$xa0,$xa0 # accumulate key
2657 vpaddd @key[1],$xa1,$xa1
2658 vpaddd @key[2],$xa2,$xa2
2659 vpaddd @key[3],$xa3,$xa3
2661 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2662 vpunpckldq $xa3,$xa2,$xt3
2663 vpunpckhdq $xa1,$xa0,$xa0
2664 vpunpckhdq $xa3,$xa2,$xa2
2665 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2666 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2667 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2668 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2670 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2672 vpaddd @key[4],$xb0,$xb0
2673 vpaddd @key[5],$xb1,$xb1
2674 vpaddd @key[6],$xb2,$xb2
2675 vpaddd @key[7],$xb3,$xb3
2677 vpunpckldq $xb1,$xb0,$xt2
2678 vpunpckldq $xb3,$xb2,$xt3
2679 vpunpckhdq $xb1,$xb0,$xb0
2680 vpunpckhdq $xb3,$xb2,$xb2
2681 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2682 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2683 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2684 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2686 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2688 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2689 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
2690 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
2691 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
2692 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
2693 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
2694 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
2695 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
2697 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2699 vpaddd @key[8],$xc0,$xc0
2700 vpaddd @key[9],$xc1,$xc1
2701 vpaddd @key[10],$xc2,$xc2
2702 vpaddd @key[11],$xc3,$xc3
2704 vpunpckldq $xc1,$xc0,$xt2
2705 vpunpckldq $xc3,$xc2,$xt3
2706 vpunpckhdq $xc1,$xc0,$xc0
2707 vpunpckhdq $xc3,$xc2,$xc2
2708 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2709 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2710 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2711 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2713 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2715 vpaddd @key[12],$xd0,$xd0
2716 vpaddd @key[13],$xd1,$xd1
2717 vpaddd @key[14],$xd2,$xd2
2718 vpaddd @key[15],$xd3,$xd3
2720 vpunpckldq $xd1,$xd0,$xt2
2721 vpunpckldq $xd3,$xd2,$xt3
2722 vpunpckhdq $xd1,$xd0,$xd0
2723 vpunpckhdq $xd3,$xd2,$xd2
2724 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2725 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2726 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2727 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2729 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2731 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2732 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
2733 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
2734 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
2735 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
2736 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
2737 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
2738 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
2740 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2742 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2743 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
2744 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
2745 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
2746 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
2747 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
2748 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
2749 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
2750 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
2751 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
2752 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
2753 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
2754 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
2755 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
2756 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
2757 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
# Perl-side renames to match the final physical layout after shuffles.
2759 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2760 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2762 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2763 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2764 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2765 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# Full 1024-byte path: xor 16 blocks with input and store.
2770 vpxord 0x00($inp),$xa0,$xa0 # xor with input
2771 vpxord 0x40($inp),$xb0,$xb0
2772 vpxord 0x80($inp),$xc0,$xc0
2773 vpxord 0xc0($inp),$xd0,$xd0
2774 vmovdqu32 $xa0,0x00($out)
2775 vmovdqu32 $xb0,0x40($out)
2776 vmovdqu32 $xc0,0x80($out)
2777 vmovdqu32 $xd0,0xc0($out)
2779 vpxord 0x100($inp),$xa1,$xa1
2780 vpxord 0x140($inp),$xb1,$xb1
2781 vpxord 0x180($inp),$xc1,$xc1
2782 vpxord 0x1c0($inp),$xd1,$xd1
2783 vmovdqu32 $xa1,0x100($out)
2784 vmovdqu32 $xb1,0x140($out)
2785 vmovdqu32 $xc1,0x180($out)
2786 vmovdqu32 $xd1,0x1c0($out)
2788 vpxord 0x200($inp),$xa2,$xa2
2789 vpxord 0x240($inp),$xb2,$xb2
2790 vpxord 0x280($inp),$xc2,$xc2
2791 vpxord 0x2c0($inp),$xd2,$xd2
2792 vmovdqu32 $xa2,0x200($out)
2793 vmovdqu32 $xb2,0x240($out)
2794 vmovdqu32 $xc2,0x280($out)
2795 vmovdqu32 $xd2,0x2c0($out)
2797 vpxord 0x300($inp),$xa3,$xa3
2798 vpxord 0x340($inp),$xb3,$xb3
2799 vpxord 0x380($inp),$xc3,$xc3
2800 vpxord 0x3c0($inp),$xd3,$xd3
2801 lea 0x400($inp),$inp
2802 vmovdqu32 $xa3,0x300($out)
2803 vmovdqu32 $xb3,0x340($out)
2804 vmovdqu32 $xc3,0x380($out)
2805 vmovdqu32 $xd3,0x3c0($out)
2806 lea 0x400($out),$out
# Tail: emit whole 64-byte blocks one register at a time; once fewer
# than 64 bytes remain, branch to the sub-block handler.
2818 jb .Less_than_64_16x
2819 vpxord ($inp),$xa0,$xa0 # xor with input
2820 vmovdqu32 $xa0,($out,$inp)
2826 jb .Less_than_64_16x
2827 vpxord ($inp),$xb0,$xb0
2828 vmovdqu32 $xb0,($out,$inp)
2834 jb .Less_than_64_16x
2835 vpxord ($inp),$xc0,$xc0
2836 vmovdqu32 $xc0,($out,$inp)
2842 jb .Less_than_64_16x
2843 vpxord ($inp),$xd0,$xd0
2844 vmovdqu32 $xd0,($out,$inp)
2850 jb .Less_than_64_16x
2851 vpxord ($inp),$xa1,$xa1
2852 vmovdqu32 $xa1,($out,$inp)
2858 jb .Less_than_64_16x
2859 vpxord ($inp),$xb1,$xb1
2860 vmovdqu32 $xb1,($out,$inp)
2866 jb .Less_than_64_16x
2867 vpxord ($inp),$xc1,$xc1
2868 vmovdqu32 $xc1,($out,$inp)
2874 jb .Less_than_64_16x
2875 vpxord ($inp),$xd1,$xd1
2876 vmovdqu32 $xd1,($out,$inp)
2882 jb .Less_than_64_16x
2883 vpxord ($inp),$xa2,$xa2
2884 vmovdqu32 $xa2,($out,$inp)
2890 jb .Less_than_64_16x
2891 vpxord ($inp),$xb2,$xb2
2892 vmovdqu32 $xb2,($out,$inp)
2898 jb .Less_than_64_16x
2899 vpxord ($inp),$xc2,$xc2
2900 vmovdqu32 $xc2,($out,$inp)
2906 jb .Less_than_64_16x
2907 vpxord ($inp),$xd2,$xd2
2908 vmovdqu32 $xd2,($out,$inp)
2914 jb .Less_than_64_16x
2915 vpxord ($inp),$xa3,$xa3
2916 vmovdqu32 $xa3,($out,$inp)
2922 jb .Less_than_64_16x
2923 vpxord ($inp),$xb3,$xb3
2924 vmovdqu32 $xb3,($out,$inp)
2930 jb .Less_than_64_16x
2931 vpxord ($inp),$xc3,$xc3
2932 vmovdqu32 $xc3,($out,$inp)
2938 vmovdqa32 $xa0,0x00(%rsp)
2939 lea ($out,$inp),$out
2943 movzb ($inp,%r10),%eax
2944 movzb (%rsp,%r10),%ecx
2947 mov %al,-1($out,%r10)
2951 vpxord $xa0,$xa0,$xa0
2952 vmovdqa32 $xa0,0(%rsp)
2957 $code.=<<___ if ($win64);
2958 lea 0x290+0x30(%rsp),%r11
2959 movaps -0x30(%r11),%xmm6
2960 movaps -0x20(%r11),%xmm7
2961 movaps -0x10(%r11),%xmm8
2962 movaps 0x00(%r11),%xmm9
2963 movaps 0x10(%r11),%xmm10
2964 movaps 0x20(%r11),%xmm11
2965 movaps 0x30(%r11),%xmm12
2966 movaps 0x40(%r11),%xmm13
2967 movaps 0x50(%r11),%xmm14
2968 movaps 0x60(%r11),%xmm15
2973 .size ChaCha20_16x,.-ChaCha20_16x
2977 foreach (split("\n",$code)) {
2978 s/\`([^\`]*)\`/eval $1/ge;
2980 s/%x#%[yz]/%x/g; # "down-shift"