3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # ChaCha20 for x86_64.
14 # Performance in cycles per byte out of a large buffer.
16 # IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2
18 # P4 9.48/+99% -/22.7(ii) -
19 # Core2 7.83/+55% 7.90/8.08 4.35
20 # Westmere 7.19/+50% 5.60/6.70 3.00
21 # Sandy Bridge 8.31/+42% 5.45/6.76 2.72
22 # Ivy Bridge 6.71/+46% 5.40/6.49 2.41
23 # Haswell 5.92/+43% 5.20/6.45 2.42 1.23
24 # Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
25 # Sledgehammer 7.28/+52% -/14.2(ii) -
26 # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
27 # VIA Nano 10.5/+46% 6.72/8.60 6.05
29 # (i) compared to older gcc 3.x one can observe >2x improvement on most platforms;
31 # (ii) as can be seen, SSE2 performance is too low on legacy
32 # processors; NxSSE2 results are naturally better, but not
33 # impressively better than IALU ones, which is why you won't
34 # find SSE2 code below;
35 # (iii) this is not an optimal result for Atom because of MSROM
36 # limitations, SSE2 can do better, but the gain is considered too
37 # low to justify the [maintenance] effort;
38 # (iv) Bulldozer actually executes the 4xXOP code path, which delivers 2.20;
42 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
44 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
49 die "can't locate x86_64-xlate.pl";
51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53 $avx = ($1>=2.19) + ($1>=2.22);
56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.09) + ($1>=2.10);
61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63 $avx = ($1>=10) + ($1>=11);
66 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
67 $avx = ($2>=3.0) + ($2>3.0);
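# Editorial sketch (not part of the original detection logic) of how the
# resulting $avx level gates the code paths emitted further down:
#
#   $avx == 0  ->  only the IALU, 1xSSSE3 and 4xSSSE3 paths are generated
#   $avx >= 1  ->  additionally the 4xXOP path (AVX-capable assembler)
#   $avx >= 2  ->  additionally the 8xAVX2 path
#
# e.g. GNU as 2.22+, nasm 2.10+ or MASM ("ml64") 11+ all map to $avx=2.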
70 open OUT,"| \"$^X\" $xlate $flavour $output";
73 # input parameter block
74 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
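# For reference, these registers carry the arguments of a C prototype of
# roughly this shape (matching how OpenSSL declares the routine; Win64
# register differences are handled by the xlate script):
#
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);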
79 .extern OPENSSL_ia32cap_P
95 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
97 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
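# Editorial note: the two byte tables above are pshufb masks. Within each
# 32-bit lane the first permutes bytes as 2,3,0,1 (a rotate by 16 bits) and
# the second as 3,0,1,2 (a rotate left by 8, i.e. right by 24, hence the
# rot24 label); they implement the 16- and 8-bit ChaCha rotations in a
# single instruction on SSSE3-capable hardware.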
99 .asciz "expand 32-byte k"
100 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
103 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
104 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
106 $arg = "\$$arg" if ($arg*1 eq $arg);
107 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
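# For illustration only (nothing here is executed): a call such as
#   &add(@x[0],@x[4])
# falls through this AUTOLOAD thunk and appends "\tadd\t%r8d,%eax\n" to
# $code, i.e. the first perlasm argument is the destination, arguments are
# emitted in reversed (AT&T) order, and bare numbers are prefixed with '$'
# to become immediates.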
110 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
111 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
114 sub ROUND { # critical path is 24 cycles per round
115 my ($a0,$b0,$c0,$d0)=@_;
116 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
117 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
118 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
119 my ($xc,$xc_)=map("\"$_\"",@t);
120 my @x=map("\"$_\"",@x);
122 # Consider the order in which variables are addressed by their
127 # 0 4 8 12 < even round
131 # 0 5 10 15 < odd round
136 # 'a', 'b' and 'd's are permanently allocated in registers,
137 # @x[0..7,12..15], while 'c's are maintained in memory. If
138 # you observe the 'c' column, you'll notice that a pair of 'c's is
139 # invariant between rounds. This means that we have to reload
140 # them once per round, in the middle. This is why you'll see
141 # a bunch of 'c' stores and loads in the middle, but none at
142 # the beginning or end.
144 # Normally instructions would be interleaved to favour in-order
145 # execution. Generally out-of-order cores manage it gracefully,
146 # but not this time for some reason. As in-order execution
147 # cores are a dying breed, old Atom being the only one still around,
148 # the instructions are left uninterleaved. Besides, Atom is better
149 # off executing 1xSSSE3 code anyway...
152 "&add (@x[$a0],@x[$b0])", # Q1
153 "&xor (@x[$d0],@x[$a0])",
155 "&add (@x[$a1],@x[$b1])", # Q2
156 "&xor (@x[$d1],@x[$a1])",
159 "&add ($xc,@x[$d0])",
160 "&xor (@x[$b0],$xc)",
162 "&add ($xc_,@x[$d1])",
163 "&xor (@x[$b1],$xc_)",
166 "&add (@x[$a0],@x[$b0])",
167 "&xor (@x[$d0],@x[$a0])",
169 "&add (@x[$a1],@x[$b1])",
170 "&xor (@x[$d1],@x[$a1])",
173 "&add ($xc,@x[$d0])",
174 "&xor (@x[$b0],$xc)",
176 "&add ($xc_,@x[$d1])",
177 "&xor (@x[$b1],$xc_)",
180 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
181 "&mov (\"4*$c1(%rsp)\",$xc_)",
182 "&mov ($xc,\"4*$c2(%rsp)\")",
183 "&mov ($xc_,\"4*$c3(%rsp)\")",
185 "&add (@x[$a2],@x[$b2])", # Q3
186 "&xor (@x[$d2],@x[$a2])",
188 "&add (@x[$a3],@x[$b3])", # Q4
189 "&xor (@x[$d3],@x[$a3])",
192 "&add ($xc,@x[$d2])",
193 "&xor (@x[$b2],$xc)",
195 "&add ($xc_,@x[$d3])",
196 "&xor (@x[$b3],$xc_)",
199 "&add (@x[$a2],@x[$b2])",
200 "&xor (@x[$d2],@x[$a2])",
202 "&add (@x[$a3],@x[$b3])",
203 "&xor (@x[$d3],@x[$a3])",
206 "&add ($xc,@x[$d2])",
207 "&xor (@x[$b2],$xc)",
209 "&add ($xc_,@x[$d3])",
210 "&xor (@x[$b3],$xc_)",
215 ########################################################################
216 # Generic code path that handles all lengths on pre-SSSE3 processors.
218 .globl ChaCha20_ctr32
219 .type ChaCha20_ctr32,\@function,5
222 mov OPENSSL_ia32cap_P+4(%rip),%r10
223 test \$`1<<(41-32)`,%r10d
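	# Editorial note: the dword at OPENSSL_ia32cap_P+4 mirrors ECX of
	# CPUID leaf 1, so bit 41-32 = 9 is the SSSE3 feature flag; when it
	# is set we branch to the vectorized implementations below.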
234 #movdqa .Lsigma(%rip),%xmm0
236 movdqu 16($key),%xmm2
237 movdqu ($counter),%xmm3
238 movdqa .Lone(%rip),%xmm4
240 #movdqa %xmm0,4*0(%rsp) # key[0]
241 movdqa %xmm1,4*4(%rsp) # key[1]
242 movdqa %xmm2,4*8(%rsp) # key[2]
243 movdqa %xmm3,4*12(%rsp) # key[3]
244 mov $len,%rbp # reassign $len
249 mov \$0x61707865,@x[0] # 'expa'
250 mov \$0x3320646e,@x[1] # 'nd 3'
251 mov \$0x79622d32,@x[2] # '2-by'
252 mov \$0x6b206574,@x[3] # 'te k'
258 mov 4*13(%rsp),@x[13]
259 mov 4*14(%rsp),@x[14]
260 mov 4*15(%rsp),@x[15]
262 mov %rbp,64+0(%rsp) # save len
264 mov $inp,64+8(%rsp) # save inp
265 movq %xmm2,%rsi # "@x[8]"
266 mov $out,64+16(%rsp) # save out
268 shr \$32,%rdi # "@x[9]"
274 foreach (&ROUND (0, 4, 8,12)) { eval; }
275 foreach (&ROUND (0, 5,10,15)) { eval; }
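# Editorial note: the enclosing loop executes this pair (one column round
# followed by one diagonal round) ten times, giving the twenty rounds of
# ChaCha20 per 64-byte block.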
280 mov @t[1],4*9(%rsp) # modulo-scheduled
282 mov 64(%rsp),%rbp # load len
284 mov 64+8(%rsp),$inp # load inp
285 paddd %xmm4,%xmm3 # increment counter
286 mov 64+16(%rsp),$out # load out
288 add \$0x61707865,@x[0] # 'expa'
289 add \$0x3320646e,@x[1] # 'nd 3'
290 add \$0x79622d32,@x[2] # '2-by'
291 add \$0x6b206574,@x[3] # 'te k'
296 add 4*12(%rsp),@x[12]
297 add 4*13(%rsp),@x[13]
298 add 4*14(%rsp),@x[14]
299 add 4*15(%rsp),@x[15]
300 paddd 4*8(%rsp),%xmm1
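	# Editorial note: the additions above implement the ChaCha
	# feed-forward, i.e. the initial input state is added back into the
	# permuted state before the keystream is XORed with the data below.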
305 xor 4*0($inp),@x[0] # xor with input
313 movdqu 4*8($inp),%xmm0
314 xor 4*12($inp),@x[12]
315 xor 4*13($inp),@x[13]
316 xor 4*14($inp),@x[14]
317 xor 4*15($inp),@x[15]
318 lea 4*16($inp),$inp # inp+=64
321 movdqa %xmm2,4*8(%rsp)
322 movd %xmm3,4*12(%rsp)
324 mov @x[0],4*0($out) # write output
332 movdqu %xmm0,4*8($out)
333 mov @x[12],4*12($out)
334 mov @x[13],4*13($out)
335 mov @x[14],4*14($out)
336 mov @x[15],4*15($out)
337 lea 4*16($out),$out # out+=64
355 movdqa %xmm1,4*8(%rsp)
356 mov @x[12],4*12(%rsp)
357 mov @x[13],4*13(%rsp)
358 mov @x[14],4*14(%rsp)
359 mov @x[15],4*15(%rsp)
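	# Editorial note: for a final partial block the 64 bytes of
	# keystream were just spilled to the stack, and the loop below XORs
	# them into the remaining input one byte at a time.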
362 movzb ($inp,%rbx),%eax
363 movzb (%rsp,%rbx),%edx
366 mov %al,-1($out,%rbx)
379 .size ChaCha20_ctr32,.-ChaCha20_ctr32
382 ########################################################################
383 # SSSE3 code path that handles shorter lengths
385 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
387 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
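# Editorial note: in this 1x path the whole 4x4 state lives in four xmm
# registers, so a round is, roughly, paddd/pxor plus pshufb against the
# .Lrot16/.Lrot24 masks for the 16- and 8-bit rotations and pslld/psrld/por
# pairs for the 12- and 7-bit ones; the caller realigns the b/c/d rows with
# pshufd between the column and diagonal half-rounds.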
411 my $xframe = $win64 ? 32+32+8 : 24;
414 .type ChaCha20_ssse3,\@function,5
419 $code.=<<___ if ($avx);
420 test \$`1<<(43-32)`,%r10d
421 jnz .LChaCha20_4xop # XOP is fastest even if we only use 1/4 of it
424 cmp \$128,$len # we might throw away some data,
425 ja .LChaCha20_4x # but overall it won't be slower
435 sub \$64+$xframe,%rsp
437 $code.=<<___ if ($win64);
438 movaps %xmm6,64+32(%rsp)
439 movaps %xmm7,64+48(%rsp)
442 movdqa .Lsigma(%rip),$a
446 movdqa .Lrot16(%rip),$rot16
447 movdqa .Lrot24(%rip),$rot24
458 movdqa .Lone(%rip),$d
471 &pshufd ($c,$c,0b01001110);
472 &pshufd ($b,$b,0b00111001);
473 &pshufd ($d,$d,0b10010011);
477 &pshufd ($c,$c,0b01001110);
478 &pshufd ($b,$b,0b10010011);
479 &pshufd ($d,$d,0b00111001);
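# Editorial note: the first pshufd triple rotates the b, c and d rows by
# one, two and three lanes respectively, turning the column quarter-rounds
# into diagonal ones; the second triple (with the b and d amounts swapped)
# undoes it, so the state is back in column order before looping.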
482 &jnz (".Loop_ssse3");
494 movdqu 0x10($inp),$t1
495 pxor $t,$a # xor with input
498 movdqu 0x30($inp),$t1
499 lea 0x40($inp),$inp # inp+=64
503 movdqu $a,0x00($out) # write output
507 lea 0x40($out),$out # out+=64
510 jnz .Loop_outer_ssse3
523 movzb ($inp,%rbx),%eax
524 movzb (%rsp,%rbx),%edx
527 mov %al,-1($out,%rbx)
533 $code.=<<___ if ($win64);
534 movaps 64+32(%rsp),%xmm6
535 movaps 64+48(%rsp),%xmm7
538 add \$64+$xframe,%rsp
546 .size ChaCha20_ssse3,.-ChaCha20_ssse3
550 ########################################################################
551 # SSSE3 code path that handles longer messages.
553 # assign variables to favor Atom front-end
554 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
555 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
556 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
557 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
559 sub SSSE3_lane_ROUND {
560 my ($a0,$b0,$c0,$d0)=@_;
561 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
562 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
563 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
564 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
565 my @x=map("\"$_\"",@xx);
567 # Consider the order in which variables are addressed by their
572 # 0 4 8 12 < even round
576 # 0 5 10 15 < odd round
581 # 'a', 'b' and 'd's are permanently allocated in registers,
582 # @x[0..7,12..15], while 'c's are maintained in memory. If
583 # you observe the 'c' column, you'll notice that a pair of 'c's is
584 # invariant between rounds. This means that we have to reload
585 # them once per round, in the middle. This is why you'll see
586 # a bunch of 'c' stores and loads in the middle, but none at
587 # the beginning or end.
590 "&paddd (@x[$a0],@x[$b0])", # Q1
591 "&paddd (@x[$a1],@x[$b1])", # Q2
592 "&pxor (@x[$d0],@x[$a0])",
593 "&pxor (@x[$d1],@x[$a1])",
594 "&pshufb (@x[$d0],$t1)",
595 "&pshufb (@x[$d1],$t1)",
597 "&paddd ($xc,@x[$d0])",
598 "&paddd ($xc_,@x[$d1])",
599 "&pxor (@x[$b0],$xc)",
600 "&pxor (@x[$b1],$xc_)",
601 "&movdqa ($t0,@x[$b0])",
602 "&pslld (@x[$b0],12)",
604 "&movdqa ($t1,@x[$b1])",
605 "&pslld (@x[$b1],12)",
606 "&por (@x[$b0],$t0)",
608 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
609 "&por (@x[$b1],$t1)",
611 "&paddd (@x[$a0],@x[$b0])",
612 "&paddd (@x[$a1],@x[$b1])",
613 "&pxor (@x[$d0],@x[$a0])",
614 "&pxor (@x[$d1],@x[$a1])",
615 "&pshufb (@x[$d0],$t0)",
616 "&pshufb (@x[$d1],$t0)",
618 "&paddd ($xc,@x[$d0])",
619 "&paddd ($xc_,@x[$d1])",
620 "&pxor (@x[$b0],$xc)",
621 "&pxor (@x[$b1],$xc_)",
622 "&movdqa ($t1,@x[$b0])",
623 "&pslld (@x[$b0],7)",
625 "&movdqa ($t0,@x[$b1])",
626 "&pslld (@x[$b1],7)",
627 "&por (@x[$b0],$t1)",
629 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
630 "&por (@x[$b1],$t0)",
632 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
633 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
634 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
635 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
637 "&paddd (@x[$a2],@x[$b2])", # Q3
638 "&paddd (@x[$a3],@x[$b3])", # Q4
639 "&pxor (@x[$d2],@x[$a2])",
640 "&pxor (@x[$d3],@x[$a3])",
641 "&pshufb (@x[$d2],$t1)",
642 "&pshufb (@x[$d3],$t1)",
644 "&paddd ($xc,@x[$d2])",
645 "&paddd ($xc_,@x[$d3])",
646 "&pxor (@x[$b2],$xc)",
647 "&pxor (@x[$b3],$xc_)",
648 "&movdqa ($t0,@x[$b2])",
649 "&pslld (@x[$b2],12)",
651 "&movdqa ($t1,@x[$b3])",
652 "&pslld (@x[$b3],12)",
653 "&por (@x[$b2],$t0)",
655 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
656 "&por (@x[$b3],$t1)",
658 "&paddd (@x[$a2],@x[$b2])",
659 "&paddd (@x[$a3],@x[$b3])",
660 "&pxor (@x[$d2],@x[$a2])",
661 "&pxor (@x[$d3],@x[$a3])",
662 "&pshufb (@x[$d2],$t0)",
663 "&pshufb (@x[$d3],$t0)",
665 "&paddd ($xc,@x[$d2])",
666 "&paddd ($xc_,@x[$d3])",
667 "&pxor (@x[$b2],$xc)",
668 "&pxor (@x[$b3],$xc_)",
669 "&movdqa ($t1,@x[$b2])",
670 "&pslld (@x[$b2],7)",
672 "&movdqa ($t0,@x[$b3])",
673 "&pslld (@x[$b3],7)",
674 "&por (@x[$b2],$t1)",
676 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
681 my $xframe = $win64 ? 0xa0 : 0;
684 .type ChaCha20_4x,\@function,5
690 $code.=<<___ if ($avx>1);
691 shr \$32,%r10 # OPENSSL_ia32cap_P+8
692 test \$`1<<5`,%r10 # test AVX2
699 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
700 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
701 je .Ldo_sse3_after_all # to detect Atom
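	# Editorial note: a CPU advertising MOVBE but not XSAVE is taken to
	# be an in-order Atom, for which the 1x SSSE3 loop is faster than
	# this 4x path, hence the fallback above.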
705 sub \$0x148+$xframe,%rsp
707 ################ stack layout
708 # +0x00 SIMD equivalent of @x[8-11]
710 # +0x40 constant copy of key[0-2] smashed by lanes
712 # +0x100 SIMD counters (with nonce smashed by lanes)
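# Editorial sketch of the 'c' offset math used by SSSE3_lane_ROUND: the
# four memory-resident rows @x[8..11] live at 16*($c-8) bytes from %rsp,
# e.g. $c0=8 -> 0x00(%rsp), $c1=9 -> 0x10(%rsp), ..., $c3=11 -> 0x30(%rsp),
# which is exactly the +0x00..+0x3f region described above.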
715 $code.=<<___ if ($win64);
716 movaps %xmm6,-0x30(%r11)
717 movaps %xmm7,-0x20(%r11)
718 movaps %xmm8,-0x10(%r11)
719 movaps %xmm9,0x00(%r11)
720 movaps %xmm10,0x10(%r11)
721 movaps %xmm11,0x20(%r11)
722 movaps %xmm12,0x30(%r11)
723 movaps %xmm13,0x40(%r11)
724 movaps %xmm14,0x50(%r11)
725 movaps %xmm15,0x60(%r11)
728 movdqa .Lsigma(%rip),$xa3 # key[0]
729 movdqu ($key),$xb3 # key[1]
730 movdqu 16($key),$xt3 # key[2]
731 movdqu ($counter),$xd3 # key[3]
732 lea 0x100(%rsp),%rcx # size optimization
733 lea .Lrot16(%rip),%r10
734 lea .Lrot24(%rip),%r11
736 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
737 pshufd \$0x55,$xa3,$xa1
738 movdqa $xa0,0x40(%rsp) # ... and offload
739 pshufd \$0xaa,$xa3,$xa2
740 movdqa $xa1,0x50(%rsp)
741 pshufd \$0xff,$xa3,$xa3
742 movdqa $xa2,0x60(%rsp)
743 movdqa $xa3,0x70(%rsp)
745 pshufd \$0x00,$xb3,$xb0
746 pshufd \$0x55,$xb3,$xb1
747 movdqa $xb0,0x80-0x100(%rcx)
748 pshufd \$0xaa,$xb3,$xb2
749 movdqa $xb1,0x90-0x100(%rcx)
750 pshufd \$0xff,$xb3,$xb3
751 movdqa $xb2,0xa0-0x100(%rcx)
752 movdqa $xb3,0xb0-0x100(%rcx)
754 pshufd \$0x00,$xt3,$xt0 # "$xc0"
755 pshufd \$0x55,$xt3,$xt1 # "$xc1"
756 movdqa $xt0,0xc0-0x100(%rcx)
757 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
758 movdqa $xt1,0xd0-0x100(%rcx)
759 pshufd \$0xff,$xt3,$xt3 # "$xc3"
760 movdqa $xt2,0xe0-0x100(%rcx)
761 movdqa $xt3,0xf0-0x100(%rcx)
763 pshufd \$0x00,$xd3,$xd0
764 pshufd \$0x55,$xd3,$xd1
765 paddd .Linc(%rip),$xd0 # don't save counters yet
766 pshufd \$0xaa,$xd3,$xd2
767 movdqa $xd1,0x110-0x100(%rcx)
768 pshufd \$0xff,$xd3,$xd3
769 movdqa $xd2,0x120-0x100(%rcx)
770 movdqa $xd3,0x130-0x100(%rcx)
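	# Editorial note: each 32-bit word of the state has just been
	# broadcast across all four xmm lanes, so lane N of every vector
	# belongs to block N; adding the 0,1,2,3 vector from .Linc gives the
	# four consecutive block counters processed per iteration.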
776 movdqa 0x40(%rsp),$xa0 # re-load smashed key
777 movdqa 0x50(%rsp),$xa1
778 movdqa 0x60(%rsp),$xa2
779 movdqa 0x70(%rsp),$xa3
780 movdqa 0x80-0x100(%rcx),$xb0
781 movdqa 0x90-0x100(%rcx),$xb1
782 movdqa 0xa0-0x100(%rcx),$xb2
783 movdqa 0xb0-0x100(%rcx),$xb3
784 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
785 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
786 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
787 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
788 movdqa 0x100-0x100(%rcx),$xd0
789 movdqa 0x110-0x100(%rcx),$xd1
790 movdqa 0x120-0x100(%rcx),$xd2
791 movdqa 0x130-0x100(%rcx),$xd3
792 paddd .Lfour(%rip),$xd0 # next SIMD counters
795 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
796 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
797 movdqa (%r10),$xt3 # .Lrot16(%rip)
799 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
805 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
806 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
811 paddd 0x40(%rsp),$xa0 # accumulate key material
812 paddd 0x50(%rsp),$xa1
813 paddd 0x60(%rsp),$xa2
814 paddd 0x70(%rsp),$xa3
816 movdqa $xa0,$xt2 # "de-interlace" data
823 punpcklqdq $xa2,$xa0 # "a0"
825 punpcklqdq $xt3,$xt2 # "a2"
826 punpckhqdq $xa2,$xa1 # "a1"
827 punpckhqdq $xt3,$xa3 # "a3"
829 ($xa2,$xt2)=($xt2,$xa2);
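# Editorial note: the punpckldq/punpckhdq + punpcklqdq/punpckhqdq ladder
# above is a 4x4 dword transpose: on entry each xmm holds one state word
# for four blocks, on exit each holds 16 consecutive keystream bytes of a
# single block, ready to be XORed against the input stream.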
831 paddd 0x80-0x100(%rcx),$xb0
832 paddd 0x90-0x100(%rcx),$xb1
833 paddd 0xa0-0x100(%rcx),$xb2
834 paddd 0xb0-0x100(%rcx),$xb3
836 movdqa $xa0,0x00(%rsp) # offload $xaN
837 movdqa $xa1,0x10(%rsp)
838 movdqa 0x20(%rsp),$xa0 # "xc2"
839 movdqa 0x30(%rsp),$xa1 # "xc3"
848 punpcklqdq $xb2,$xb0 # "b0"
850 punpcklqdq $xt3,$xt2 # "b2"
851 punpckhqdq $xb2,$xb1 # "b1"
852 punpckhqdq $xt3,$xb3 # "b3"
854 ($xb2,$xt2)=($xt2,$xb2);
855 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
857 paddd 0xc0-0x100(%rcx),$xc0
858 paddd 0xd0-0x100(%rcx),$xc1
859 paddd 0xe0-0x100(%rcx),$xc2
860 paddd 0xf0-0x100(%rcx),$xc3
862 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
863 movdqa $xa3,0x30(%rsp)
872 punpcklqdq $xc2,$xc0 # "c0"
874 punpcklqdq $xt3,$xt2 # "c2"
875 punpckhqdq $xc2,$xc1 # "c1"
876 punpckhqdq $xt3,$xc3 # "c3"
878 ($xc2,$xt2)=($xt2,$xc2);
879 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
881 paddd 0x100-0x100(%rcx),$xd0
882 paddd 0x110-0x100(%rcx),$xd1
883 paddd 0x120-0x100(%rcx),$xd2
884 paddd 0x130-0x100(%rcx),$xd3
893 punpcklqdq $xd2,$xd0 # "d0"
895 punpcklqdq $xt3,$xt2 # "d2"
896 punpckhqdq $xd2,$xd1 # "d1"
897 punpckhqdq $xt3,$xd3 # "d3"
899 ($xd2,$xt2)=($xt2,$xd2);
904 movdqu 0x00($inp),$xt0 # xor with input
905 movdqu 0x10($inp),$xt1
906 movdqu 0x20($inp),$xt2
907 movdqu 0x30($inp),$xt3
908 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
913 movdqu $xt0,0x00($out)
914 movdqu 0x40($inp),$xt0
915 movdqu $xt1,0x10($out)
916 movdqu 0x50($inp),$xt1
917 movdqu $xt2,0x20($out)
918 movdqu 0x60($inp),$xt2
919 movdqu $xt3,0x30($out)
920 movdqu 0x70($inp),$xt3
921 lea 0x80($inp),$inp # size optimization
927 movdqu $xt0,0x40($out)
928 movdqu 0x00($inp),$xt0
929 movdqu $xt1,0x50($out)
930 movdqu 0x10($inp),$xt1
931 movdqu $xt2,0x60($out)
932 movdqu 0x20($inp),$xt2
933 movdqu $xt3,0x70($out)
934 lea 0x80($out),$out # size optimization
935 movdqu 0x30($inp),$xt3
941 movdqu $xt0,0x00($out)
942 movdqu 0x40($inp),$xt0
943 movdqu $xt1,0x10($out)
944 movdqu 0x50($inp),$xt1
945 movdqu $xt2,0x20($out)
946 movdqu 0x60($inp),$xt2
947 movdqu $xt3,0x30($out)
948 movdqu 0x70($inp),$xt3
949 lea 0x80($inp),$inp # inp+=64*4
954 movdqu $xt0,0x40($out)
955 movdqu $xt1,0x50($out)
956 movdqu $xt2,0x60($out)
957 movdqu $xt3,0x70($out)
958 lea 0x80($out),$out # out+=64*4
973 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
975 #movdqa $xt0,0x00(%rsp)
976 movdqa $xb0,0x10(%rsp)
977 movdqa $xc0,0x20(%rsp)
978 movdqa $xd0,0x30(%rsp)
983 movdqu 0x00($inp),$xt0 # xor with input
984 movdqu 0x10($inp),$xt1
985 movdqu 0x20($inp),$xt2
986 movdqu 0x30($inp),$xt3
987 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
991 movdqu $xt0,0x00($out)
992 movdqu $xt1,0x10($out)
993 movdqu $xt2,0x20($out)
994 movdqu $xt3,0x30($out)
997 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
998 lea 0x40($inp),$inp # inp+=64*1
1000 movdqa $xt0,0x00(%rsp)
1001 movdqa $xb1,0x10(%rsp)
1002 lea 0x40($out),$out # out+=64*1
1003 movdqa $xc1,0x20(%rsp)
1004 sub \$64,$len # len-=64*1
1005 movdqa $xd1,0x30(%rsp)
1010 movdqu 0x00($inp),$xt0 # xor with input
1011 movdqu 0x10($inp),$xt1
1012 movdqu 0x20($inp),$xt2
1013 movdqu 0x30($inp),$xt3
1014 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1019 movdqu $xt0,0x00($out)
1020 movdqu 0x40($inp),$xt0
1021 movdqu $xt1,0x10($out)
1022 movdqu 0x50($inp),$xt1
1023 movdqu $xt2,0x20($out)
1024 movdqu 0x60($inp),$xt2
1025 movdqu $xt3,0x30($out)
1026 movdqu 0x70($inp),$xt3
1027 pxor 0x10(%rsp),$xt0
1031 movdqu $xt0,0x40($out)
1032 movdqu $xt1,0x50($out)
1033 movdqu $xt2,0x60($out)
1034 movdqu $xt3,0x70($out)
1037 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1038 lea 0x80($inp),$inp # inp+=64*2
1040 movdqa $xt0,0x00(%rsp)
1041 movdqa $xb2,0x10(%rsp)
1042 lea 0x80($out),$out # out+=64*2
1043 movdqa $xc2,0x20(%rsp)
1044 sub \$128,$len # len-=64*2
1045 movdqa $xd2,0x30(%rsp)
1050 movdqu 0x00($inp),$xt0 # xor with input
1051 movdqu 0x10($inp),$xt1
1052 movdqu 0x20($inp),$xt2
1053 movdqu 0x30($inp),$xt3
1054 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1059 movdqu $xt0,0x00($out)
1060 movdqu 0x40($inp),$xt0
1061 movdqu $xt1,0x10($out)
1062 movdqu 0x50($inp),$xt1
1063 movdqu $xt2,0x20($out)
1064 movdqu 0x60($inp),$xt2
1065 movdqu $xt3,0x30($out)
1066 movdqu 0x70($inp),$xt3
1067 lea 0x80($inp),$inp # size optimization
1068 pxor 0x10(%rsp),$xt0
1073 movdqu $xt0,0x40($out)
1074 movdqu 0x00($inp),$xt0
1075 movdqu $xt1,0x50($out)
1076 movdqu 0x10($inp),$xt1
1077 movdqu $xt2,0x60($out)
1078 movdqu 0x20($inp),$xt2
1079 movdqu $xt3,0x70($out)
1080 lea 0x80($out),$out # size optimization
1081 movdqu 0x30($inp),$xt3
1082 pxor 0x20(%rsp),$xt0
1086 movdqu $xt0,0x00($out)
1087 movdqu $xt1,0x10($out)
1088 movdqu $xt2,0x20($out)
1089 movdqu $xt3,0x30($out)
1092 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1093 lea 0x40($inp),$inp # inp+=64*3
1095 movdqa $xt0,0x00(%rsp)
1096 movdqa $xb3,0x10(%rsp)
1097 lea 0x40($out),$out # out+=64*3
1098 movdqa $xc3,0x20(%rsp)
1099 sub \$192,$len # len-=64*3
1100 movdqa $xd3,0x30(%rsp)
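	# Editorial note: whatever full 64-byte blocks remained were
	# consumed above; the leftover partial block has its keystream
	# parked at 0x00(%rsp) through 0x30(%rsp) for the byte-by-byte loop
	# that follows.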
1103 movzb ($inp,%r10),%eax
1104 movzb (%rsp,%r10),%ecx
1107 mov %al,-1($out,%r10)
1113 $code.=<<___ if ($win64);
1114 lea 0x140+0x30(%rsp),%r11
1115 movaps -0x30(%r11),%xmm6
1116 movaps -0x20(%r11),%xmm7
1117 movaps -0x10(%r11),%xmm8
1118 movaps 0x00(%r11),%xmm9
1119 movaps 0x10(%r11),%xmm10
1120 movaps 0x20(%r11),%xmm11
1121 movaps 0x30(%r11),%xmm12
1122 movaps 0x40(%r11),%xmm13
1123 movaps 0x50(%r11),%xmm14
1124 movaps 0x60(%r11),%xmm15
1127 add \$0x148+$xframe,%rsp
1129 .size ChaCha20_4x,.-ChaCha20_4x
1133 ########################################################################
1134 # XOP code path that handles all lengths.
1136 # There is some "anomaly" observed depending on instruction size or
1137 # alignment. If you look closely at the code below you'll notice that
1138 # the argument order sometimes varies. The order affects instruction
1139 # encoding by making it larger, and such fiddling gives a 5% performance
1140 # improvement. This is on FX-4100...
1142 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1143 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1144 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1145 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1147 sub XOP_lane_ROUND {
1148 my ($a0,$b0,$c0,$d0)=@_;
1149 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1150 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1151 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1152 my @x=map("\"$_\"",@xx);
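# Editorial note: the XOP-specific win here is vprotd, which performs each
# of the four ChaCha rotations (16, 12, 8, 7) in a single instruction
# instead of the pshufb or shift/shift/or sequences the SSSE3 and AVX2
# paths need; that is the main reason this path reaches ~2.20 cpb on
# Bulldozer (note (iv) above).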
1155 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1156 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1157 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1158 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1159 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1160 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1161 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1162 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1163 "&vprotd (@x[$d0],@x[$d0],16)",
1164 "&vprotd (@x[$d1],@x[$d1],16)",
1165 "&vprotd (@x[$d2],@x[$d2],16)",
1166 "&vprotd (@x[$d3],@x[$d3],16)",
1168 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1169 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1170 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1171 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1172 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1173 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1174 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1175 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1176 "&vprotd (@x[$b0],@x[$b0],12)",
1177 "&vprotd (@x[$b1],@x[$b1],12)",
1178 "&vprotd (@x[$b2],@x[$b2],12)",
1179 "&vprotd (@x[$b3],@x[$b3],12)",
1181 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1182 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1183 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1184 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1185 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1186 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1187 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1188 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1189 "&vprotd (@x[$d0],@x[$d0],8)",
1190 "&vprotd (@x[$d1],@x[$d1],8)",
1191 "&vprotd (@x[$d2],@x[$d2],8)",
1192 "&vprotd (@x[$d3],@x[$d3],8)",
1194 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1195 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1196 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1197 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1198 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1199 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1200 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1201 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1202 "&vprotd (@x[$b0],@x[$b0],7)",
1203 "&vprotd (@x[$b1],@x[$b1],7)",
1204 "&vprotd (@x[$b2],@x[$b2],7)",
1205 "&vprotd (@x[$b3],@x[$b3],7)"
1209 my $xframe = $win64 ? 0xa0 : 0;
1212 .type ChaCha20_4xop,\@function,5
1216 lea -0x78(%rsp),%r11
1217 sub \$0x148+$xframe,%rsp
1219 ################ stack layout
1220 # +0x00 SIMD equivalent of @x[8-11]
1222 # +0x40 constant copy of key[0-2] smashed by lanes
1224 # +0x100 SIMD counters (with nonce smashed by lanes)
1227 $code.=<<___ if ($win64);
1228 movaps %xmm6,-0x30(%r11)
1229 movaps %xmm7,-0x20(%r11)
1230 movaps %xmm8,-0x10(%r11)
1231 movaps %xmm9,0x00(%r11)
1232 movaps %xmm10,0x10(%r11)
1233 movaps %xmm11,0x20(%r11)
1234 movaps %xmm12,0x30(%r11)
1235 movaps %xmm13,0x40(%r11)
1236 movaps %xmm14,0x50(%r11)
1237 movaps %xmm15,0x60(%r11)
1242 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1243 vmovdqu ($key),$xb3 # key[1]
1244 vmovdqu 16($key),$xt3 # key[2]
1245 vmovdqu ($counter),$xd3 # key[3]
1246 lea 0x100(%rsp),%rcx # size optimization
1248 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1249 vpshufd \$0x55,$xa3,$xa1
1250 vmovdqa $xa0,0x40(%rsp) # ... and offload
1251 vpshufd \$0xaa,$xa3,$xa2
1252 vmovdqa $xa1,0x50(%rsp)
1253 vpshufd \$0xff,$xa3,$xa3
1254 vmovdqa $xa2,0x60(%rsp)
1255 vmovdqa $xa3,0x70(%rsp)
1257 vpshufd \$0x00,$xb3,$xb0
1258 vpshufd \$0x55,$xb3,$xb1
1259 vmovdqa $xb0,0x80-0x100(%rcx)
1260 vpshufd \$0xaa,$xb3,$xb2
1261 vmovdqa $xb1,0x90-0x100(%rcx)
1262 vpshufd \$0xff,$xb3,$xb3
1263 vmovdqa $xb2,0xa0-0x100(%rcx)
1264 vmovdqa $xb3,0xb0-0x100(%rcx)
1266 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1267 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1268 vmovdqa $xt0,0xc0-0x100(%rcx)
1269 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1270 vmovdqa $xt1,0xd0-0x100(%rcx)
1271 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1272 vmovdqa $xt2,0xe0-0x100(%rcx)
1273 vmovdqa $xt3,0xf0-0x100(%rcx)
1275 vpshufd \$0x00,$xd3,$xd0
1276 vpshufd \$0x55,$xd3,$xd1
1277 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1278 vpshufd \$0xaa,$xd3,$xd2
1279 vmovdqa $xd1,0x110-0x100(%rcx)
1280 vpshufd \$0xff,$xd3,$xd3
1281 vmovdqa $xd2,0x120-0x100(%rcx)
1282 vmovdqa $xd3,0x130-0x100(%rcx)
1288 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1289 vmovdqa 0x50(%rsp),$xa1
1290 vmovdqa 0x60(%rsp),$xa2
1291 vmovdqa 0x70(%rsp),$xa3
1292 vmovdqa 0x80-0x100(%rcx),$xb0
1293 vmovdqa 0x90-0x100(%rcx),$xb1
1294 vmovdqa 0xa0-0x100(%rcx),$xb2
1295 vmovdqa 0xb0-0x100(%rcx),$xb3
1296 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1297 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1298 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1299 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1300 vmovdqa 0x100-0x100(%rcx),$xd0
1301 vmovdqa 0x110-0x100(%rcx),$xd1
1302 vmovdqa 0x120-0x100(%rcx),$xd2
1303 vmovdqa 0x130-0x100(%rcx),$xd3
1304 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1308 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1314 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1315 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1320 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1321 vpaddd 0x50(%rsp),$xa1,$xa1
1322 vpaddd 0x60(%rsp),$xa2,$xa2
1323 vpaddd 0x70(%rsp),$xa3,$xa3
1325 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1326 vmovdqa $xt3,0x30(%rsp)
1328 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1329 vpunpckldq $xa3,$xa2,$xt3
1330 vpunpckhdq $xa1,$xa0,$xa0
1331 vpunpckhdq $xa3,$xa2,$xa2
1332 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1333 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1334 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1335 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1337 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1339 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1340 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1341 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1342 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1344 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1345 vmovdqa $xa1,0x10(%rsp)
1346 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1347 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1349 vpunpckldq $xb1,$xb0,$xt2
1350 vpunpckldq $xb3,$xb2,$xt3
1351 vpunpckhdq $xb1,$xb0,$xb0
1352 vpunpckhdq $xb3,$xb2,$xb2
1353 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1354 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1355 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1356 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1358 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1359 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1361 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1362 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1363 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1364 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1366 vpunpckldq $xc1,$xc0,$xt2
1367 vpunpckldq $xc3,$xc2,$xt3
1368 vpunpckhdq $xc1,$xc0,$xc0
1369 vpunpckhdq $xc3,$xc2,$xc2
1370 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1371 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1372 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1373 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1375 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1377 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1378 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1379 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1380 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1382 vpunpckldq $xd1,$xd0,$xt2
1383 vpunpckldq $xd3,$xd2,$xt3
1384 vpunpckhdq $xd1,$xd0,$xd0
1385 vpunpckhdq $xd3,$xd2,$xd2
1386 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1387 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1388 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1389 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1391 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1392 ($xa0,$xa1)=($xt2,$xt3);
1394 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1395 vmovdqa 0x10(%rsp),$xa1
1400 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1401 vpxor 0x10($inp),$xb0,$xb0
1402 vpxor 0x20($inp),$xc0,$xc0
1403 vpxor 0x30($inp),$xd0,$xd0
1404 vpxor 0x40($inp),$xa1,$xa1
1405 vpxor 0x50($inp),$xb1,$xb1
1406 vpxor 0x60($inp),$xc1,$xc1
1407 vpxor 0x70($inp),$xd1,$xd1
1408 lea 0x80($inp),$inp # size optimization
1409 vpxor 0x00($inp),$xa2,$xa2
1410 vpxor 0x10($inp),$xb2,$xb2
1411 vpxor 0x20($inp),$xc2,$xc2
1412 vpxor 0x30($inp),$xd2,$xd2
1413 vpxor 0x40($inp),$xa3,$xa3
1414 vpxor 0x50($inp),$xb3,$xb3
1415 vpxor 0x60($inp),$xc3,$xc3
1416 vpxor 0x70($inp),$xd3,$xd3
1417 lea 0x80($inp),$inp # inp+=64*4
1419 vmovdqu $xa0,0x00($out)
1420 vmovdqu $xb0,0x10($out)
1421 vmovdqu $xc0,0x20($out)
1422 vmovdqu $xd0,0x30($out)
1423 vmovdqu $xa1,0x40($out)
1424 vmovdqu $xb1,0x50($out)
1425 vmovdqu $xc1,0x60($out)
1426 vmovdqu $xd1,0x70($out)
1427 lea 0x80($out),$out # size optimization
1428 vmovdqu $xa2,0x00($out)
1429 vmovdqu $xb2,0x10($out)
1430 vmovdqu $xc2,0x20($out)
1431 vmovdqu $xd2,0x30($out)
1432 vmovdqu $xa3,0x40($out)
1433 vmovdqu $xb3,0x50($out)
1434 vmovdqu $xc3,0x60($out)
1435 vmovdqu $xd3,0x70($out)
1436 lea 0x80($out),$out # out+=64*4
1446 jae .L192_or_more4xop
1448 jae .L128_or_more4xop
1450 jae .L64_or_more4xop
1453 vmovdqa $xa0,0x00(%rsp)
1454 vmovdqa $xb0,0x10(%rsp)
1455 vmovdqa $xc0,0x20(%rsp)
1456 vmovdqa $xd0,0x30(%rsp)
1461 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1462 vpxor 0x10($inp),$xb0,$xb0
1463 vpxor 0x20($inp),$xc0,$xc0
1464 vpxor 0x30($inp),$xd0,$xd0
1465 vmovdqu $xa0,0x00($out)
1466 vmovdqu $xb0,0x10($out)
1467 vmovdqu $xc0,0x20($out)
1468 vmovdqu $xd0,0x30($out)
1471 lea 0x40($inp),$inp # inp+=64*1
1472 vmovdqa $xa1,0x00(%rsp)
1474 vmovdqa $xb1,0x10(%rsp)
1475 lea 0x40($out),$out # out+=64*1
1476 vmovdqa $xc1,0x20(%rsp)
1477 sub \$64,$len # len-=64*1
1478 vmovdqa $xd1,0x30(%rsp)
1483 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1484 vpxor 0x10($inp),$xb0,$xb0
1485 vpxor 0x20($inp),$xc0,$xc0
1486 vpxor 0x30($inp),$xd0,$xd0
1487 vpxor 0x40($inp),$xa1,$xa1
1488 vpxor 0x50($inp),$xb1,$xb1
1489 vpxor 0x60($inp),$xc1,$xc1
1490 vpxor 0x70($inp),$xd1,$xd1
1492 vmovdqu $xa0,0x00($out)
1493 vmovdqu $xb0,0x10($out)
1494 vmovdqu $xc0,0x20($out)
1495 vmovdqu $xd0,0x30($out)
1496 vmovdqu $xa1,0x40($out)
1497 vmovdqu $xb1,0x50($out)
1498 vmovdqu $xc1,0x60($out)
1499 vmovdqu $xd1,0x70($out)
1502 lea 0x80($inp),$inp # inp+=64*2
1503 vmovdqa $xa2,0x00(%rsp)
1505 vmovdqa $xb2,0x10(%rsp)
1506 lea 0x80($out),$out # out+=64*2
1507 vmovdqa $xc2,0x20(%rsp)
1508 sub \$128,$len # len-=64*2
1509 vmovdqa $xd2,0x30(%rsp)
1514 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1515 vpxor 0x10($inp),$xb0,$xb0
1516 vpxor 0x20($inp),$xc0,$xc0
1517 vpxor 0x30($inp),$xd0,$xd0
1518 vpxor 0x40($inp),$xa1,$xa1
1519 vpxor 0x50($inp),$xb1,$xb1
1520 vpxor 0x60($inp),$xc1,$xc1
1521 vpxor 0x70($inp),$xd1,$xd1
1522 lea 0x80($inp),$inp # size optimization
1523 vpxor 0x00($inp),$xa2,$xa2
1524 vpxor 0x10($inp),$xb2,$xb2
1525 vpxor 0x20($inp),$xc2,$xc2
1526 vpxor 0x30($inp),$xd2,$xd2
1528 vmovdqu $xa0,0x00($out)
1529 vmovdqu $xb0,0x10($out)
1530 vmovdqu $xc0,0x20($out)
1531 vmovdqu $xd0,0x30($out)
1532 vmovdqu $xa1,0x40($out)
1533 vmovdqu $xb1,0x50($out)
1534 vmovdqu $xc1,0x60($out)
1535 vmovdqu $xd1,0x70($out)
1536 lea 0x80($out),$out # size optimization
1537 vmovdqu $xa2,0x00($out)
1538 vmovdqu $xb2,0x10($out)
1539 vmovdqu $xc2,0x20($out)
1540 vmovdqu $xd2,0x30($out)
1543 lea 0x40($inp),$inp # inp+=64*3
1544 vmovdqa $xa2,0x00(%rsp)
1546 vmovdqa $xb2,0x10(%rsp)
1547 lea 0x40($out),$out # out+=64*3
1548 vmovdqa $xc2,0x20(%rsp)
1549 sub \$192,$len # len-=64*3
1550 vmovdqa $xd2,0x30(%rsp)
1553 movzb ($inp,%r10),%eax
1554 movzb (%rsp,%r10),%ecx
1557 mov %al,-1($out,%r10)
1564 $code.=<<___ if ($win64);
1565 lea 0x140+0x30(%rsp),%r11
1566 movaps -0x30(%r11),%xmm6
1567 movaps -0x20(%r11),%xmm7
1568 movaps -0x10(%r11),%xmm8
1569 movaps 0x00(%r11),%xmm9
1570 movaps 0x10(%r11),%xmm10
1571 movaps 0x20(%r11),%xmm11
1572 movaps 0x30(%r11),%xmm12
1573 movaps 0x40(%r11),%xmm13
1574 movaps 0x50(%r11),%xmm14
1575 movaps 0x60(%r11),%xmm15
1578 add \$0x148+$xframe,%rsp
1580 .size ChaCha20_4xop,.-ChaCha20_4xop
1584 ########################################################################
1587 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1588 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1589 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1590 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1592 sub AVX2_lane_ROUND {
1593 my ($a0,$b0,$c0,$d0)=@_;
1594 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1595 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1596 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1597 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1598 my @x=map("\"$_\"",@xx);
1600 # Consider the order in which variables are addressed by their
1605 # 0 4 8 12 < even round
1609 # 0 5 10 15 < odd round
1614 # 'a', 'b' and 'd's are permanently allocated in registers,
1615 # @x[0..7,12..15], while 'c's are maintained in memory. If
1616 # you observe the 'c' column, you'll notice that a pair of 'c's is
1617 # invariant between rounds. This means that we have to reload
1618 # them once per round, in the middle. This is why you'll see
1619 # a bunch of 'c' stores and loads in the middle, but none at
1620 # the beginning or end.
1623 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1624 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1625 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1626 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1627 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1628 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1630 "&vpaddd ($xc,$xc,@x[$d0])",
1631 "&vpxor (@x[$b0],$xc,@x[$b0])",
1632 "&vpslld ($t0,@x[$b0],12)",
1633 "&vpsrld (@x[$b0],@x[$b0],20)",
1634 "&vpor (@x[$b0],$t0,@x[$b0])",
1635 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1636 "&vpaddd ($xc_,$xc_,@x[$d1])",
1637 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1638 "&vpslld ($t1,@x[$b1],12)",
1639 "&vpsrld (@x[$b1],@x[$b1],20)",
1640 "&vpor (@x[$b1],$t1,@x[$b1])",
1642 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1643 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1644 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1645 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1646 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1647 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1649 "&vpaddd ($xc,$xc,@x[$d0])",
1650 "&vpxor (@x[$b0],$xc,@x[$b0])",
1651 "&vpslld ($t1,@x[$b0],7)",
1652 "&vpsrld (@x[$b0],@x[$b0],25)",
1653 "&vpor (@x[$b0],$t1,@x[$b0])",
1654 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1655 "&vpaddd ($xc_,$xc_,@x[$d1])",
1656 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1657 "&vpslld ($t0,@x[$b1],7)",
1658 "&vpsrld (@x[$b1],@x[$b1],25)",
1659 "&vpor (@x[$b1],$t0,@x[$b1])",
1661 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1662 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1663 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1664 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1666 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1667 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1668 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1669 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1670 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1671 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1673 "&vpaddd ($xc,$xc,@x[$d2])",
1674 "&vpxor (@x[$b2],$xc,@x[$b2])",
1675 "&vpslld ($t0,@x[$b2],12)",
1676 "&vpsrld (@x[$b2],@x[$b2],20)",
1677 "&vpor (@x[$b2],$t0,@x[$b2])",
1678 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1679 "&vpaddd ($xc_,$xc_,@x[$d3])",
1680 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1681 "&vpslld ($t1,@x[$b3],12)",
1682 "&vpsrld (@x[$b3],@x[$b3],20)",
1683 "&vpor (@x[$b3],$t1,@x[$b3])",
1685 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1686 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1687 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1688 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1689 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1690 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1692 "&vpaddd ($xc,$xc,@x[$d2])",
1693 "&vpxor (@x[$b2],$xc,@x[$b2])",
1694 "&vpslld ($t1,@x[$b2],7)",
1695 "&vpsrld (@x[$b2],@x[$b2],25)",
1696 "&vpor (@x[$b2],$t1,@x[$b2])",
1697 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1698 "&vpaddd ($xc_,$xc_,@x[$d3])",
1699 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1700 "&vpslld ($t0,@x[$b3],7)",
1701 "&vpsrld (@x[$b3],@x[$b3],25)",
1702 "&vpor (@x[$b3],$t0,@x[$b3])"
1706 my $xframe = $win64 ? 0xb0 : 8;
1709 .type ChaCha20_8x,\@function,5
1714 sub \$0x280+$xframe,%rsp
1717 $code.=<<___ if ($win64);
1718 lea 0x290+0x30(%rsp),%r11
1719 movaps %xmm6,-0x30(%r11)
1720 movaps %xmm7,-0x20(%r11)
1721 movaps %xmm8,-0x10(%r11)
1722 movaps %xmm9,0x00(%r11)
1723 movaps %xmm10,0x10(%r11)
1724 movaps %xmm11,0x20(%r11)
1725 movaps %xmm12,0x30(%r11)
1726 movaps %xmm13,0x40(%r11)
1727 movaps %xmm14,0x50(%r11)
1728 movaps %xmm15,0x60(%r11)
1732 mov %r10,0x280(%rsp)
1734 ################ stack layout
1735 # +0x00 SIMD equivalent of @x[8-11]
1737 # +0x80 constant copy of key[0-2] smashed by lanes
1739 # +0x200 SIMD counters (with nonce smashed by lanes)
1743 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1744 vbroadcasti128 ($key),$xb3 # key[1]
1745 vbroadcasti128 16($key),$xt3 # key[2]
1746 vbroadcasti128 ($counter),$xd3 # key[3]
1747 lea 0x100(%rsp),%rcx # size optimization
1748 lea 0x200(%rsp),%rax # size optimization
1749 lea .Lrot16(%rip),%r10
1750 lea .Lrot24(%rip),%r11
1752 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1753 vpshufd \$0x55,$xa3,$xa1
1754 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1755 vpshufd \$0xaa,$xa3,$xa2
1756 vmovdqa $xa1,0xa0-0x100(%rcx)
1757 vpshufd \$0xff,$xa3,$xa3
1758 vmovdqa $xa2,0xc0-0x100(%rcx)
1759 vmovdqa $xa3,0xe0-0x100(%rcx)
1761 vpshufd \$0x00,$xb3,$xb0
1762 vpshufd \$0x55,$xb3,$xb1
1763 vmovdqa $xb0,0x100-0x100(%rcx)
1764 vpshufd \$0xaa,$xb3,$xb2
1765 vmovdqa $xb1,0x120-0x100(%rcx)
1766 vpshufd \$0xff,$xb3,$xb3
1767 vmovdqa $xb2,0x140-0x100(%rcx)
1768 vmovdqa $xb3,0x160-0x100(%rcx)
1770 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1771 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1772 vmovdqa $xt0,0x180-0x200(%rax)
1773 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1774 vmovdqa $xt1,0x1a0-0x200(%rax)
1775 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1776 vmovdqa $xt2,0x1c0-0x200(%rax)
1777 vmovdqa $xt3,0x1e0-0x200(%rax)
1779 vpshufd \$0x00,$xd3,$xd0
1780 vpshufd \$0x55,$xd3,$xd1
1781 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1782 vpshufd \$0xaa,$xd3,$xd2
1783 vmovdqa $xd1,0x220-0x200(%rax)
1784 vpshufd \$0xff,$xd3,$xd3
1785 vmovdqa $xd2,0x240-0x200(%rax)
1786 vmovdqa $xd3,0x260-0x200(%rax)
1792 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1793 vmovdqa 0xa0-0x100(%rcx),$xa1
1794 vmovdqa 0xc0-0x100(%rcx),$xa2
1795 vmovdqa 0xe0-0x100(%rcx),$xa3
1796 vmovdqa 0x100-0x100(%rcx),$xb0
1797 vmovdqa 0x120-0x100(%rcx),$xb1
1798 vmovdqa 0x140-0x100(%rcx),$xb2
1799 vmovdqa 0x160-0x100(%rcx),$xb3
1800 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1801 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1802 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1803 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1804 vmovdqa 0x200-0x200(%rax),$xd0
1805 vmovdqa 0x220-0x200(%rax),$xd1
1806 vmovdqa 0x240-0x200(%rax),$xd2
1807 vmovdqa 0x260-0x200(%rax),$xd3
1808 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1811 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1812 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1813 vbroadcasti128 (%r10),$xt3
1814 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1821 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1822 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1827 lea 0x200(%rsp),%rax # size optimization
1828 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1829 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1830 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1831 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1833 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1834 vpunpckldq $xa3,$xa2,$xt3
1835 vpunpckhdq $xa1,$xa0,$xa0
1836 vpunpckhdq $xa3,$xa2,$xa2
1837 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1838 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1839 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1840 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1842 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1844 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1845 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1846 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1847 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1849 vpunpckldq $xb1,$xb0,$xt2
1850 vpunpckldq $xb3,$xb2,$xt3
1851 vpunpckhdq $xb1,$xb0,$xb0
1852 vpunpckhdq $xb3,$xb2,$xb2
1853 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1854 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1855 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1856 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1858 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1860 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1861 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1862 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1863 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1864 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1865 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1866 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1867 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1869 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
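# Editorial note: with 256-bit registers the dword/qword transpose above
# only interleaves within each 128-bit half, so the vperm2i128 step (0x20
# selects the low halves, 0x31 the high halves) is needed to finish
# separating the eight blocks before the keystream is XORed out.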
1870 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1872 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1873 vmovdqa $xa1,0x20(%rsp)
1874 vmovdqa 0x40(%rsp),$xc2 # $xa0
1875 vmovdqa 0x60(%rsp),$xc3 # $xa1
1877 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1878 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1879 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1880 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1882 vpunpckldq $xc1,$xc0,$xt2
1883 vpunpckldq $xc3,$xc2,$xt3
1884 vpunpckhdq $xc1,$xc0,$xc0
1885 vpunpckhdq $xc3,$xc2,$xc2
1886 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1887 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1888 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1889 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1891 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1893 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1894 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1895 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1896 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1898 vpunpckldq $xd1,$xd0,$xt2
1899 vpunpckldq $xd3,$xd2,$xt3
1900 vpunpckhdq $xd1,$xd0,$xd0
1901 vpunpckhdq $xd3,$xd2,$xd2
1902 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1903 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1904 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1905 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1907 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1909 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1910 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1911 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1912 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1913 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1914 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1915 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1916 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1918 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1919 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1920 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1921 ($xa0,$xa1)=($xt2,$xt3);
1923 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1924 vmovdqa 0x20(%rsp),$xa1
1929 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1930 vpxor 0x20($inp),$xb0,$xb0
1931 vpxor 0x40($inp),$xc0,$xc0
1932 vpxor 0x60($inp),$xd0,$xd0
1933 lea 0x80($inp),$inp # size optimization
1934 vmovdqu $xa0,0x00($out)
1935 vmovdqu $xb0,0x20($out)
1936 vmovdqu $xc0,0x40($out)
1937 vmovdqu $xd0,0x60($out)
1938 lea 0x80($out),$out # size optimization
1940 vpxor 0x00($inp),$xa1,$xa1
1941 vpxor 0x20($inp),$xb1,$xb1
1942 vpxor 0x40($inp),$xc1,$xc1
1943 vpxor 0x60($inp),$xd1,$xd1
1944 lea 0x80($inp),$inp # size optimization
1945 vmovdqu $xa1,0x00($out)
1946 vmovdqu $xb1,0x20($out)
1947 vmovdqu $xc1,0x40($out)
1948 vmovdqu $xd1,0x60($out)
1949 lea 0x80($out),$out # size optimization
1951 vpxor 0x00($inp),$xa2,$xa2
1952 vpxor 0x20($inp),$xb2,$xb2
1953 vpxor 0x40($inp),$xc2,$xc2
1954 vpxor 0x60($inp),$xd2,$xd2
1955 lea 0x80($inp),$inp # size optimization
1956 vmovdqu $xa2,0x00($out)
1957 vmovdqu $xb2,0x20($out)
1958 vmovdqu $xc2,0x40($out)
1959 vmovdqu $xd2,0x60($out)
1960 lea 0x80($out),$out # size optimization
1962 vpxor 0x00($inp),$xa3,$xa3
1963 vpxor 0x20($inp),$xb3,$xb3
1964 vpxor 0x40($inp),$xc3,$xc3
1965 vpxor 0x60($inp),$xd3,$xd3
1966 lea 0x80($inp),$inp # size optimization
1967 vmovdqu $xa3,0x00($out)
1968 vmovdqu $xb3,0x20($out)
1969 vmovdqu $xc3,0x40($out)
1970 vmovdqu $xd3,0x60($out)
1971 lea 0x80($out),$out # size optimization
1995 vmovdqa $xa0,0x00(%rsp)
1996 vmovdqa $xb0,0x20(%rsp)
2001 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2002 vpxor 0x20($inp),$xb0,$xb0
2003 vmovdqu $xa0,0x00($out)
2004 vmovdqu $xb0,0x20($out)
2007 lea 0x40($inp),$inp # inp+=64*1
2009 vmovdqa $xc0,0x00(%rsp)
2010 lea 0x40($out),$out # out+=64*1
2011 sub \$64,$len # len-=64*1
2012 vmovdqa $xd0,0x20(%rsp)
2017 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2018 vpxor 0x20($inp),$xb0,$xb0
2019 vpxor 0x40($inp),$xc0,$xc0
2020 vpxor 0x60($inp),$xd0,$xd0
2021 vmovdqu $xa0,0x00($out)
2022 vmovdqu $xb0,0x20($out)
2023 vmovdqu $xc0,0x40($out)
2024 vmovdqu $xd0,0x60($out)
2027 lea 0x80($inp),$inp # inp+=64*2
2029 vmovdqa $xa1,0x00(%rsp)
2030 lea 0x80($out),$out # out+=64*2
2031 sub \$128,$len # len-=64*2
2032 vmovdqa $xb1,0x20(%rsp)
2037 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2038 vpxor 0x20($inp),$xb0,$xb0
2039 vpxor 0x40($inp),$xc0,$xc0
2040 vpxor 0x60($inp),$xd0,$xd0
2041 vpxor 0x80($inp),$xa1,$xa1
2042 vpxor 0xa0($inp),$xb1,$xb1
2043 vmovdqu $xa0,0x00($out)
2044 vmovdqu $xb0,0x20($out)
2045 vmovdqu $xc0,0x40($out)
2046 vmovdqu $xd0,0x60($out)
2047 vmovdqu $xa1,0x80($out)
2048 vmovdqu $xb1,0xa0($out)
2051 lea 0xc0($inp),$inp # inp+=64*3
2053 vmovdqa $xc1,0x00(%rsp)
2054 lea 0xc0($out),$out # out+=64*3
2055 sub \$192,$len # len-=64*3
2056 vmovdqa $xd1,0x20(%rsp)
2061 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2062 vpxor 0x20($inp),$xb0,$xb0
2063 vpxor 0x40($inp),$xc0,$xc0
2064 vpxor 0x60($inp),$xd0,$xd0
2065 vpxor 0x80($inp),$xa1,$xa1
2066 vpxor 0xa0($inp),$xb1,$xb1
2067 vpxor 0xc0($inp),$xc1,$xc1
2068 vpxor 0xe0($inp),$xd1,$xd1
2069 vmovdqu $xa0,0x00($out)
2070 vmovdqu $xb0,0x20($out)
2071 vmovdqu $xc0,0x40($out)
2072 vmovdqu $xd0,0x60($out)
2073 vmovdqu $xa1,0x80($out)
2074 vmovdqu $xb1,0xa0($out)
2075 vmovdqu $xc1,0xc0($out)
2076 vmovdqu $xd1,0xe0($out)
2079 lea 0x100($inp),$inp # inp+=64*4
2081 vmovdqa $xa2,0x00(%rsp)
2082 lea 0x100($out),$out # out+=64*4
2083 sub \$256,$len # len-=64*4
2084 vmovdqa $xb2,0x20(%rsp)
2089 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2090 vpxor 0x20($inp),$xb0,$xb0
2091 vpxor 0x40($inp),$xc0,$xc0
2092 vpxor 0x60($inp),$xd0,$xd0
2093 vpxor 0x80($inp),$xa1,$xa1
2094 vpxor 0xa0($inp),$xb1,$xb1
2095 vpxor 0xc0($inp),$xc1,$xc1
2096 vpxor 0xe0($inp),$xd1,$xd1
2097 vpxor 0x100($inp),$xa2,$xa2
2098 vpxor 0x120($inp),$xb2,$xb2
2099 vmovdqu $xa0,0x00($out)
2100 vmovdqu $xb0,0x20($out)
2101 vmovdqu $xc0,0x40($out)
2102 vmovdqu $xd0,0x60($out)
2103 vmovdqu $xa1,0x80($out)
2104 vmovdqu $xb1,0xa0($out)
2105 vmovdqu $xc1,0xc0($out)
2106 vmovdqu $xd1,0xe0($out)
2107 vmovdqu $xa2,0x100($out)
2108 vmovdqu $xb2,0x120($out)
2111 lea 0x140($inp),$inp # inp+=64*5
2113 vmovdqa $xc2,0x00(%rsp)
2114 lea 0x140($out),$out # out+=64*5
2115 sub \$320,$len # len-=64*5
2116 vmovdqa $xd2,0x20(%rsp)
2121 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2122 vpxor 0x20($inp),$xb0,$xb0
2123 vpxor 0x40($inp),$xc0,$xc0
2124 vpxor 0x60($inp),$xd0,$xd0
2125 vpxor 0x80($inp),$xa1,$xa1
2126 vpxor 0xa0($inp),$xb1,$xb1
2127 vpxor 0xc0($inp),$xc1,$xc1
2128 vpxor 0xe0($inp),$xd1,$xd1
2129 vpxor 0x100($inp),$xa2,$xa2
2130 vpxor 0x120($inp),$xb2,$xb2
2131 vpxor 0x140($inp),$xc2,$xc2
2132 vpxor 0x160($inp),$xd2,$xd2
2133 vmovdqu $xa0,0x00($out)
2134 vmovdqu $xb0,0x20($out)
2135 vmovdqu $xc0,0x40($out)
2136 vmovdqu $xd0,0x60($out)
2137 vmovdqu $xa1,0x80($out)
2138 vmovdqu $xb1,0xa0($out)
2139 vmovdqu $xc1,0xc0($out)
2140 vmovdqu $xd1,0xe0($out)
2141 vmovdqu $xa2,0x100($out)
2142 vmovdqu $xb2,0x120($out)
2143 vmovdqu $xc2,0x140($out)
2144 vmovdqu $xd2,0x160($out)
2147 lea 0x180($inp),$inp # inp+=64*6
2149 vmovdqa $xa3,0x00(%rsp)
2150 lea 0x180($out),$out # out+=64*6
2151 sub \$384,$len # len-=64*6
2152 vmovdqa $xb3,0x20(%rsp)
2157 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2158 vpxor 0x20($inp),$xb0,$xb0
2159 vpxor 0x40($inp),$xc0,$xc0
2160 vpxor 0x60($inp),$xd0,$xd0
2161 vpxor 0x80($inp),$xa1,$xa1
2162 vpxor 0xa0($inp),$xb1,$xb1
2163 vpxor 0xc0($inp),$xc1,$xc1
2164 vpxor 0xe0($inp),$xd1,$xd1
2165 vpxor 0x100($inp),$xa2,$xa2
2166 vpxor 0x120($inp),$xb2,$xb2
2167 vpxor 0x140($inp),$xc2,$xc2
2168 vpxor 0x160($inp),$xd2,$xd2
2169 vpxor 0x180($inp),$xa3,$xa3
2170 vpxor 0x1a0($inp),$xb3,$xb3
2171 vmovdqu $xa0,0x00($out)
2172 vmovdqu $xb0,0x20($out)
2173 vmovdqu $xc0,0x40($out)
2174 vmovdqu $xd0,0x60($out)
2175 vmovdqu $xa1,0x80($out)
2176 vmovdqu $xb1,0xa0($out)
2177 vmovdqu $xc1,0xc0($out)
2178 vmovdqu $xd1,0xe0($out)
2179 vmovdqu $xa2,0x100($out)
2180 vmovdqu $xb2,0x120($out)
2181 vmovdqu $xc2,0x140($out)
2182 vmovdqu $xd2,0x160($out)
2183 vmovdqu $xa3,0x180($out)
2184 vmovdqu $xb3,0x1a0($out)
2187 lea 0x1c0($inp),$inp # inp+=64*7
2189 vmovdqa $xc3,0x00(%rsp)
2190 lea 0x1c0($out),$out # out+=64*7
2191 sub \$448,$len # len-=64*7
2192 vmovdqa $xd3,0x20(%rsp)
2195 movzb ($inp,%r10),%eax
2196 movzb (%rsp,%r10),%ecx
2199 mov %al,-1($out,%r10)
2206 $code.=<<___ if ($win64);
2207 lea 0x290+0x30(%rsp),%r11
2208 movaps -0x30(%r11),%xmm6
2209 movaps -0x20(%r11),%xmm7
2210 movaps -0x10(%r11),%xmm8
2211 movaps 0x00(%r11),%xmm9
2212 movaps 0x10(%r11),%xmm10
2213 movaps 0x20(%r11),%xmm11
2214 movaps 0x30(%r11),%xmm12
2215 movaps 0x40(%r11),%xmm13
2216 movaps 0x50(%r11),%xmm14
2217 movaps 0x60(%r11),%xmm15
2220 mov 0x280(%rsp),%rsp
2222 .size ChaCha20_8x,.-ChaCha20_8x
2226 foreach (split("\n",$code)) {
2227 s/\`([^\`]*)\`/eval $1/geo;