x86[_64] assembly pack: add ChaCha20 and Poly1305 modules.
author Andy Polyakov <appro@openssl.org>
Sun, 13 Dec 2015 20:40:20 +0000 (21:40 +0100)
committer Andy Polyakov <appro@openssl.org>
Wed, 10 Feb 2016 09:31:14 +0000 (10:31 +0100)
Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/chacha/Makefile.in
crypto/chacha/asm/chacha-x86.pl [new file with mode: 0755]
crypto/chacha/asm/chacha-x86_64.pl [new file with mode: 0755]
crypto/perlasm/x86gas.pl
crypto/poly1305/Makefile.in
crypto/poly1305/asm/poly1305-x86.pl [new file with mode: 0755]
crypto/poly1305/asm/poly1305-x86_64.pl [new file with mode: 0755]
test/evptests.txt

index 8987a85..6fb63c1 100644 (file)
@@ -36,6 +36,11 @@ lib: $(LIBOBJ)
        $(RANLIB) $(LIB) || echo Never mind.
        @touch lib
 
+chacha-x86.s:          asm/chacha-x86.pl
+       $(PERL) asm/chacha-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+chacha-x86_64.s:       asm/chacha-x86_64.pl
+       $(PERL) asm/chacha-x86_64.pl $(PERLASM_SCHEME) > $@
+
 chacha-%.S:    asm/chacha-%.pl;        $(PERL) $< $(PERLASM_SCHEME) $@
 
 files:
diff --git a/crypto/chacha/asm/chacha-x86.pl b/crypto/chacha/asm/chacha-x86.pl
new file mode 100755 (executable)
index 0000000..5d097ad
--- /dev/null
@@ -0,0 +1,1128 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# January 2015
+#
+# ChaCha20 for x86.
+#
+# Performance in cycles per byte out of large buffer.
+#
+#              1xIALU/gcc      4xSSSE3
+# Pentium      17.5/+80%
+# PIII         14.2/+60%
+# P4           18.6/+84%
+# Core2                9.56/+89%       4.83
+# Westmere     9.50/+45%       3.35
+# Sandy Bridge 10.5/+47%       3.20
+# Haswell      8.15/+50%       2.83
+# Silvermont   17.4/+36%       8.35
+# Sledgehammer 10.2/+54%
+# Bulldozer    13.4/+50%       4.38(*)
+#
+# (*)  Bulldozer actually executes 4xXOP code path that delivers 3.55;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
+
+# Vector code paths are emitted only when the build passes
+# -DOPENSSL_IA32_SSE2 on the perlasm command line ($xmm gates them
+# below, e.g. the run-time SSSE3 dispatch in ChaCha20_ctr32).
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+# Probe the assembler/toolchain (gas, nasm, ml, clang) for AVX
+# support.  NOTE(review): $ymm is computed here but not consumed in
+# this chunk — presumably it gates AVX-era code further down in the
+# file; confirm against the remainder of the module.
+$ymm=1 if ($xmm &&
+               `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+                       =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+               $1>=2.19);      # first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
+               `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+               $1>=2.03);      # first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
+               `ml 2>&1` =~ /Version ([0-9]+)\./ &&
+               $1>=10);        # first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm &&
+               `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
+               $2>=3.0);       # first version supporting AVX
+
+# Scalar register set: $a is pinned to eax, while each of b, c, d
+# has a shadow register ($b_, $c_, $d_) that QUARTERROUND swaps with
+# the primary on every invocation.
+$a="eax";
+($b,$b_)=("ebx","ebp");
+($c,$c_)=("ecx","esi");
+($d,$d_)=("edx","edi");
+
+# Emit one scalar ChaCha quarter-round over state words $ai,$bi,$ci,
+# $di (dword indices into the working state at esp[0..15]).  $i is
+# this quarter-round's position within the double round (0..7); it
+# selects which neighbouring words are spilled to / reloaded from the
+# stack so that memory traffic overlaps the ALU work.  The opening
+# &add($a,$b) of the round is emitted by the caller (marked
+# "elsewhere"), and the routine finishes by swapping b/c/d with
+# their shadow registers.
+sub QUARTERROUND {
+my ($ai,$bi,$ci,$di,$i)=@_;
+my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));        # next
+my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));        # previous
+
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+
+       # At the boundaries between column rounds ($i<4) and diagonal
+       # rounds ($i>=4) the simple +1/-1 neighbour indices computed
+       # above cross layouts; patch them up for $i==0,3,4,7.
+       if ($i==0) {
+            my $j=4;
+           ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
+       } elsif ($i==3) {
+            my $j=0;
+           ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
+       } elsif ($i==4) {
+            my $j=4;
+           ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
+       } elsif ($i==7) {
+            my $j=0;
+           ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
+       }
+
+       #&add   ($a,$b);                        # see elsewhere
+       &xor    ($d,$a);
+        &mov   (&DWP(4*$cp,"esp"),$c_)         if ($ai>0 && $ai<3);
+       &rol    ($d,16);
+        &mov   (&DWP(4*$bp,"esp"),$b_)         if ($i!=0);
+       &add    ($c,$d);
+        &mov   ($c_,&DWP(4*$cn,"esp"))         if ($ai>0 && $ai<3);
+       &xor    ($b,$c);
+        &mov   ($d_,&DWP(4*$dn,"esp"))         if ($di!=$dn);
+       &rol    ($b,12);
+        &mov   ($b_,&DWP(4*$bn,"esp"))         if ($i<7);
+        &mov   ($b_,&DWP(128,"esp"))           if ($i==7);     # loop counter
+       &add    ($a,$b);
+       &xor    ($d,$a);
+       &mov    (&DWP(4*$ai,"esp"),$a);
+       &rol    ($d,8);
+       &mov    ($a,&DWP(4*$an,"esp"));
+       &add    ($c,$d);
+       &mov    (&DWP(4*$di,"esp"),$d)          if ($di!=$dn);
+       &mov    ($d_,$d)                        if ($di==$dn);
+       &xor    ($b,$c);
+        &add   ($a,$b_)                        if ($i<7);      # elsewhere
+       &rol    ($b,7);
+
+       ($b,$b_)=($b_,$b);
+       ($c,$c_)=($c_,$c);
+       ($d,$d_)=($d_,$d);
+}
+
+# ChaCha20_ctr32(out, inp, len, key, counter) — integer-only code
+# path, with a run-time dispatch to the SSSE3 version when the module
+# was built with SSE2 support and the CPU advertises FXSR + SSSE3.
+# Frame: 33 dwords; esp[0..15] is the working state, esp+64+4*4 ..
+# esp+64+4*15 caches the key and counter/nonce material, and esp+128
+# spills the round-loop counter.
+&static_label("ssse3_shortcut");
+&static_label("xop_shortcut");
+&static_label("ssse3_data");
+&static_label("pic_point");
+
+&function_begin("ChaCha20_ctr32");
+if ($xmm) {
+       &call   (&label("pic_point"));
+&set_label("pic_point");
+       &blindpop("eax");
+       &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
+       &test   (&DWP(0,"ebp"),1<<24);          # test FXSR bit
+       &jz     (&label("x86"));
+       &test   (&DWP(4,"ebp"),1<<9);           # test SSSE3 bit
+       &jz     (&label("x86"));
+       &jmp    (&label("ssse3_shortcut"));
+&set_label("x86");
+}
+       &mov    ("esi",&wparam(3));             # key
+       &mov    ("edi",&wparam(4));             # counter and nonce
+
+       &stack_push(33);
+
+       &mov    ("eax",&DWP(4*0,"esi"));        # copy key
+       &mov    ("ebx",&DWP(4*1,"esi"));
+       &mov    ("ecx",&DWP(4*2,"esi"));
+       &mov    ("edx",&DWP(4*3,"esi"));
+       &mov    (&DWP(64+4*4,"esp"),"eax");
+       &mov    (&DWP(64+4*5,"esp"),"ebx");
+       &mov    (&DWP(64+4*6,"esp"),"ecx");
+       &mov    (&DWP(64+4*7,"esp"),"edx");
+       &mov    ("eax",&DWP(4*4,"esi"));
+       &mov    ("ebx",&DWP(4*5,"esi"));
+       &mov    ("ecx",&DWP(4*6,"esi"));
+       &mov    ("edx",&DWP(4*7,"esi"));
+       &mov    (&DWP(64+4*8,"esp"),"eax");
+       &mov    (&DWP(64+4*9,"esp"),"ebx");
+       &mov    (&DWP(64+4*10,"esp"),"ecx");
+       &mov    (&DWP(64+4*11,"esp"),"edx");
+       &mov    ("eax",&DWP(4*0,"edi"));        # copy counter and nonce
+       &mov    ("ebx",&DWP(4*1,"edi"));
+       &mov    ("ecx",&DWP(4*2,"edi"));
+       &mov    ("edx",&DWP(4*3,"edi"));
+       # counter word is stored pre-decremented; each pass through
+       # "entry" re-increments it (see &add($d,1) below)
+       &sub    ("eax",1);
+       &mov    (&DWP(64+4*12,"esp"),"eax");
+       &mov    (&DWP(64+4*13,"esp"),"ebx");
+       &mov    (&DWP(64+4*14,"esp"),"ecx");
+       &mov    (&DWP(64+4*15,"esp"),"edx");
+       &jmp    (&label("entry"));
+
+&set_label("outer_loop",16);
+       &mov    (&wparam(1),$b);                # save input
+       &mov    (&wparam(0),$a);                # save output
+       &mov    (&wparam(2),$c);                # save len
+&set_label("entry");
+       &mov    ($a,0x61707865);
+       &mov    (&DWP(4*1,"esp"),0x3320646e);
+       &mov    (&DWP(4*2,"esp"),0x79622d32);
+       &mov    (&DWP(4*3,"esp"),0x6b206574);
+
+       &mov    ($b, &DWP(64+4*5,"esp"));       # copy key material
+       &mov    ($b_,&DWP(64+4*6,"esp"));
+       &mov    ($c, &DWP(64+4*10,"esp"));
+       &mov    ($c_,&DWP(64+4*11,"esp"));
+       &mov    ($d, &DWP(64+4*13,"esp"));
+       &mov    ($d_,&DWP(64+4*14,"esp"));
+       &mov    (&DWP(4*5,"esp"),$b);
+       &mov    (&DWP(4*6,"esp"),$b_);
+       &mov    (&DWP(4*10,"esp"),$c);
+       &mov    (&DWP(4*11,"esp"),$c_);
+       &mov    (&DWP(4*13,"esp"),$d);
+       &mov    (&DWP(4*14,"esp"),$d_);
+
+       &mov    ($b, &DWP(64+4*7,"esp"));
+       &mov    ($d_,&DWP(64+4*15,"esp"));
+       &mov    ($d, &DWP(64+4*12,"esp"));
+       &mov    ($b_,&DWP(64+4*4,"esp"));
+       &mov    ($c, &DWP(64+4*8,"esp"));
+       &mov    ($c_,&DWP(64+4*9,"esp"));
+       &add    ($d,1);                         # counter value
+       &mov    (&DWP(4*7,"esp"),$b);
+       &mov    (&DWP(4*15,"esp"),$d_);
+       &mov    (&DWP(64+4*12,"esp"),$d);       # save counter value
+
+       &mov    ($b,10);                        # loop counter
+       &jmp    (&label("loop"));
+
+&set_label("loop",16);
+       # esp+128 holds the round-loop counter: it is spilled here and
+       # reloaded into the shadow by QUARTERROUND's $i==7 step, so all
+       # eight state-word registers stay live during the rounds.
+       &add    ($a,$b_);                       # elsewhere
+       &mov    (&DWP(128,"esp"),$b);           # save loop counter
+       &mov    ($b,$b_);
+       &QUARTERROUND(0, 4, 8, 12, 0);
+       &QUARTERROUND(1, 5, 9, 13, 1);
+       &QUARTERROUND(2, 6,10, 14, 2);
+       &QUARTERROUND(3, 7,11, 15, 3);
+       &QUARTERROUND(0, 5,10, 15, 4);
+       &QUARTERROUND(1, 6,11, 12, 5);
+       &QUARTERROUND(2, 7, 8, 13, 6);
+       &QUARTERROUND(3, 4, 9, 14, 7);
+       &dec    ($b);
+       &jnz    (&label("loop"));
+
+       # len lives in wparam(2): the original argument on the first
+       # pass, the remaining count saved at outer_loop afterwards.
+       &mov    ($b,&wparam(2));                # load len
+
+       &add    ($a,0x61707865);                # accumulate key material
+       &add    ($b_,&DWP(64+4*4,"esp"));
+       &add    ($c, &DWP(64+4*8,"esp"));
+       &add    ($c_,&DWP(64+4*9,"esp"));
+
+       &cmp    ($b,64);
+       &jb     (&label("tail"));
+
+       &mov    ($b,&wparam(1));                # load input pointer
+       &add    ($d, &DWP(64+4*12,"esp"));
+       &add    ($d_,&DWP(64+4*14,"esp"));
+
+       &xor    ($a, &DWP(4*0,$b));             # xor with input
+       &xor    ($b_,&DWP(4*4,$b));
+       &mov    (&DWP(4*0,"esp"),$a);
+       &mov    ($a,&wparam(0));                # load output pointer
+       &xor    ($c, &DWP(4*8,$b));
+       &xor    ($c_,&DWP(4*9,$b));
+       &xor    ($d, &DWP(4*12,$b));
+       &xor    ($d_,&DWP(4*14,$b));
+       &mov    (&DWP(4*4,$a),$b_);             # write output
+       &mov    (&DWP(4*8,$a),$c);
+       &mov    (&DWP(4*9,$a),$c_);
+       &mov    (&DWP(4*12,$a),$d);
+       &mov    (&DWP(4*14,$a),$d_);
+
+       &mov    ($b_,&DWP(4*1,"esp"));
+       &mov    ($c, &DWP(4*2,"esp"));
+       &mov    ($c_,&DWP(4*3,"esp"));
+       &mov    ($d, &DWP(4*5,"esp"));
+       &mov    ($d_,&DWP(4*6,"esp"));
+       &add    ($b_,0x3320646e);               # accumulate key material
+       &add    ($c, 0x79622d32);
+       &add    ($c_,0x6b206574);
+       &add    ($d, &DWP(64+4*5,"esp"));
+       &add    ($d_,&DWP(64+4*6,"esp"));
+       &xor    ($b_,&DWP(4*1,$b));
+       &xor    ($c, &DWP(4*2,$b));
+       &xor    ($c_,&DWP(4*3,$b));
+       &xor    ($d, &DWP(4*5,$b));
+       &xor    ($d_,&DWP(4*6,$b));
+       &mov    (&DWP(4*1,$a),$b_);
+       &mov    (&DWP(4*2,$a),$c);
+       &mov    (&DWP(4*3,$a),$c_);
+       &mov    (&DWP(4*5,$a),$d);
+       &mov    (&DWP(4*6,$a),$d_);
+
+       &mov    ($b_,&DWP(4*7,"esp"));
+       &mov    ($c, &DWP(4*10,"esp"));
+       &mov    ($c_,&DWP(4*11,"esp"));
+       &mov    ($d, &DWP(4*13,"esp"));
+       &mov    ($d_,&DWP(4*15,"esp"));
+       &add    ($b_,&DWP(64+4*7,"esp"));
+       &add    ($c, &DWP(64+4*10,"esp"));
+       &add    ($c_,&DWP(64+4*11,"esp"));
+       &add    ($d, &DWP(64+4*13,"esp"));
+       &add    ($d_,&DWP(64+4*15,"esp"));
+       &xor    ($b_,&DWP(4*7,$b));
+       &xor    ($c, &DWP(4*10,$b));
+       &xor    ($c_,&DWP(4*11,$b));
+       &xor    ($d, &DWP(4*13,$b));
+       &xor    ($d_,&DWP(4*15,$b));
+       &lea    ($b,&DWP(4*16,$b));
+       &mov    (&DWP(4*7,$a),$b_);
+       &mov    ($b_,&DWP(4*0,"esp"));
+       &mov    (&DWP(4*10,$a),$c);
+       &mov    ($c,&wparam(2));                # len
+       &mov    (&DWP(4*11,$a),$c_);
+       &mov    (&DWP(4*13,$a),$d);
+       &mov    (&DWP(4*15,$a),$d_);
+       &mov    (&DWP(4*0,$a),$b_);
+       &lea    ($a,&DWP(4*16,$a));
+       &sub    ($c,64);
+       &jnz    (&label("outer_loop"));
+
+       &jmp    (&label("done"));
+
+&set_label("tail");
+       # len < 64: finish accumulating the keystream block on the
+       # stack, then xor it into the output byte by byte.
+       &add    ($d, &DWP(64+4*12,"esp"));
+       &add    ($d_,&DWP(64+4*14,"esp"));
+       &mov    (&DWP(4*0,"esp"),$a);
+       &mov    (&DWP(4*4,"esp"),$b_);
+       &mov    (&DWP(4*8,"esp"),$c);
+       &mov    (&DWP(4*9,"esp"),$c_);
+       &mov    (&DWP(4*12,"esp"),$d);
+       &mov    (&DWP(4*14,"esp"),$d_);
+
+       &mov    ($b_,&DWP(4*1,"esp"));
+       &mov    ($c, &DWP(4*2,"esp"));
+       &mov    ($c_,&DWP(4*3,"esp"));
+       &mov    ($d, &DWP(4*5,"esp"));
+       &mov    ($d_,&DWP(4*6,"esp"));
+       &add    ($b_,0x3320646e);               # accumulate key material
+       &add    ($c, 0x79622d32);
+       &add    ($c_,0x6b206574);
+       &add    ($d, &DWP(64+4*5,"esp"));
+       &add    ($d_,&DWP(64+4*6,"esp"));
+       &mov    (&DWP(4*1,"esp"),$b_);
+       &mov    (&DWP(4*2,"esp"),$c);
+       &mov    (&DWP(4*3,"esp"),$c_);
+       &mov    (&DWP(4*5,"esp"),$d);
+       &mov    (&DWP(4*6,"esp"),$d_);
+
+       &mov    ($b_,&DWP(4*7,"esp"));
+       &mov    ($c, &DWP(4*10,"esp"));
+       &mov    ($c_,&DWP(4*11,"esp"));
+       &mov    ($d, &DWP(4*13,"esp"));
+       &mov    ($d_,&DWP(4*15,"esp"));
+       &add    ($b_,&DWP(64+4*7,"esp"));
+       &add    ($c, &DWP(64+4*10,"esp"));
+       &add    ($c_,&DWP(64+4*11,"esp"));
+       &add    ($d, &DWP(64+4*13,"esp"));
+       &add    ($d_,&DWP(64+4*15,"esp"));
+       &mov    (&DWP(4*7,"esp"),$b_);
+       &mov    ($b_,&wparam(1));               # load input
+       &mov    (&DWP(4*10,"esp"),$c);
+       &mov    ($c,&wparam(0));                # load output
+       &mov    (&DWP(4*11,"esp"),$c_);
+       &xor    ($c_,$c_);
+       &mov    (&DWP(4*13,"esp"),$d);
+       &mov    (&DWP(4*15,"esp"),$d_);
+
+       &xor    ("eax","eax");
+       &xor    ("edx","edx");
+&set_label("tail_loop");
+       &movb   ("al",&DWP(0,$c_,$b_));
+       &movb   ("dl",&DWP(0,"esp",$c_));
+       &lea    ($c_,&DWP(1,$c_));
+       &xor    ("al","dl");
+       &mov    (&DWP(-1,$c,$c_),"al");
+       &dec    ($b);
+       &jnz    (&label("tail_loop"));
+
+&set_label("done");
+       &stack_pop(33);
+&function_end("ChaCha20_ctr32");
+
+if ($xmm) {
+my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
+my ($out,$inp,$len)=("edi","esi","ecx");
+
+# SSSE3 4-way quarter-round: each xmm register carries one state word
+# for four independent blocks.  The primary/shadow interleaving and
+# the $i-dependent spill/reload scheme mirror the scalar QUARTERROUND
+# above, with the 16-word state held at ebx-128.  Rotations by 16 and
+# 8 use the pshufb tables at eax; rotations by 12 and 7 use
+# shift-shift-or.
+sub QUARTERROUND_SSSE3 {
+my ($ai,$bi,$ci,$di,$i)=@_;
+my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));        # next
+my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));        # previous
+
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+
+       # patch up neighbour indices at column/diagonal boundaries
+       if ($i==0) {
+            my $j=4;
+           ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
+       } elsif ($i==3) {
+            my $j=0;
+           ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
+       } elsif ($i==4) {
+            my $j=4;
+           ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
+       } elsif ($i==7) {
+            my $j=0;
+           ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
+       }
+
+       #&paddd ($xa,$xb);                      # see elsewhere
+       #&pxor  ($xd,$xa);                      # see elsewhere
+        &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)   if ($ai>0 && $ai<3);
+       &pshufb ($xd,&QWP(0,"eax"));            # rot16
+        &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)   if ($i!=0);
+       &paddd  ($xc,$xd);
+        &movdqa($xc_,&QWP(16*$cn-128,"ebx"))   if ($ai>0 && $ai<3);
+       &pxor   ($xb,$xc);
+        &movdqa($xb_,&QWP(16*$bn-128,"ebx"))   if ($i<7);
+       &movdqa ($xa_,$xb);                     # borrow as temporary
+       &pslld  ($xb,12);
+       &psrld  ($xa_,20);
+       &por    ($xb,$xa_);
+        &movdqa($xa_,&QWP(16*$an-128,"ebx"));
+       &paddd  ($xa,$xb);
+        &movdqa($xd_,&QWP(16*$dn-128,"ebx"))   if ($di!=$dn);
+       &pxor   ($xd,$xa);
+       &movdqa (&QWP(16*$ai-128,"ebx"),$xa);
+       &pshufb ($xd,&QWP(16,"eax"));           # rot8
+       &paddd  ($xc,$xd);
+       &movdqa (&QWP(16*$di-128,"ebx"),$xd)    if ($di!=$dn);
+       &movdqa ($xd_,$xd)                      if ($di==$dn);
+       &pxor   ($xb,$xc);
+        &paddd ($xa_,$xb_)                     if ($i<7);      # elsewhere
+       &movdqa ($xa,$xb);                      # borrow as temporary
+       &pslld  ($xb,7);
+       &psrld  ($xa,25);
+        &pxor  ($xd_,$xa_)                     if ($i<7);      # elsewhere
+       &por    ($xb,$xa);
+
+       ($xa,$xa_)=($xa_,$xa);
+       ($xb,$xb_)=($xb_,$xb);
+       ($xc,$xc_)=($xc_,$xc);
+       ($xd,$xd_)=($xd_,$xd);
+}
+
+# ChaCha20_ssse3(out, inp, len, key, counter) — SSSE3 code path.
+# When len >= 256 it processes four 64-byte blocks in parallel, one
+# xmm vector per state word (state at ebx-128, key material copy at
+# ebp-128), then finishes any remainder in the single-block "1x"
+# loop, which is also entered directly for len < 256.  Frame: 131
+# dwords, esp aligned to 64, original esp saved at esp+512.
+&function_begin("ChaCha20_ssse3");
+&set_label("ssse3_shortcut");
+       &test           (&DWP(4,"ebp"),1<<11);          # test XOP bit
+       &jnz            (&label("xop_shortcut"));
+
+       &mov            ($out,&wparam(0));
+       &mov            ($inp,&wparam(1));
+       &mov            ($len,&wparam(2));
+       &mov            ("edx",&wparam(3));             # key
+       &mov            ("ebx",&wparam(4));             # counter and nonce
+
+       &mov            ("ebp","esp");
+       &stack_push     (131);
+       &and            ("esp",-64);
+       &mov            (&DWP(512,"esp"),"ebp");
+
+       &lea            ("eax",&DWP(&label("ssse3_data")."-".
+                                   &label("pic_point"),"eax"));
+       &movdqu         ("xmm3",&QWP(0,"ebx"));         # counter and nonce
+
+       &cmp            ($len,64*4);
+       &jb             (&label("1x"));
+
+       &mov            (&DWP(512+4,"esp"),"edx");      # offload pointers
+       &mov            (&DWP(512+8,"esp"),"ebx");
+       &sub            ($len,64*4);                    # bias len
+       &lea            ("ebp",&DWP(256+128,"esp"));    # size optimization
+
+       # Broadcast every state word across a vector of its own; the
+       # counter lane gets per-lane offsets {0,1,2,3} and is
+       # pre-biased by -4 so each outer iteration can add "four".
+       &movdqu         ("xmm7",&DWP(0,"edx"));         # key
+       &pshufd         ("xmm0","xmm3",0x00);
+       &pshufd         ("xmm1","xmm3",0x55);
+       &pshufd         ("xmm2","xmm3",0xaa);
+       &pshufd         ("xmm3","xmm3",0xff);
+        &paddd         ("xmm0",&QWP(16*3,"eax"));      # fix counters
+       &pshufd         ("xmm4","xmm7",0x00);
+       &pshufd         ("xmm5","xmm7",0x55);
+        &psubd         ("xmm0",&QWP(16*4,"eax"));
+       &pshufd         ("xmm6","xmm7",0xaa);
+       &pshufd         ("xmm7","xmm7",0xff);
+       &movdqa         (&QWP(16*12-128,"ebp"),"xmm0");
+       &movdqa         (&QWP(16*13-128,"ebp"),"xmm1");
+       &movdqa         (&QWP(16*14-128,"ebp"),"xmm2");
+       &movdqa         (&QWP(16*15-128,"ebp"),"xmm3");
+        &movdqu        ("xmm3",&DWP(16,"edx"));        # key
+       &movdqa         (&QWP(16*4-128,"ebp"),"xmm4");
+       &movdqa         (&QWP(16*5-128,"ebp"),"xmm5");
+       &movdqa         (&QWP(16*6-128,"ebp"),"xmm6");
+       &movdqa         (&QWP(16*7-128,"ebp"),"xmm7");
+        &movdqa        ("xmm7",&DWP(16*2,"eax"));      # sigma
+        &lea           ("ebx",&DWP(128,"esp"));        # size optimization
+
+       &pshufd         ("xmm0","xmm3",0x00);
+       &pshufd         ("xmm1","xmm3",0x55);
+       &pshufd         ("xmm2","xmm3",0xaa);
+       &pshufd         ("xmm3","xmm3",0xff);
+       &pshufd         ("xmm4","xmm7",0x00);
+       &pshufd         ("xmm5","xmm7",0x55);
+       &pshufd         ("xmm6","xmm7",0xaa);
+       &pshufd         ("xmm7","xmm7",0xff);
+       &movdqa         (&QWP(16*8-128,"ebp"),"xmm0");
+       &movdqa         (&QWP(16*9-128,"ebp"),"xmm1");
+       &movdqa         (&QWP(16*10-128,"ebp"),"xmm2");
+       &movdqa         (&QWP(16*11-128,"ebp"),"xmm3");
+       &movdqa         (&QWP(16*0-128,"ebp"),"xmm4");
+       &movdqa         (&QWP(16*1-128,"ebp"),"xmm5");
+       &movdqa         (&QWP(16*2-128,"ebp"),"xmm6");
+       &movdqa         (&QWP(16*3-128,"ebp"),"xmm7");
+
+       &lea            ($inp,&DWP(128,$inp));          # size optimization
+       &lea            ($out,&DWP(128,$out));          # size optimization
+       &jmp            (&label("outer_loop"));
+
+&set_label("outer_loop",16);
+       #&movdqa        ("xmm0",&QWP(16*0-128,"ebp"));  # copy key material
+       &movdqa         ("xmm1",&QWP(16*1-128,"ebp"));
+       &movdqa         ("xmm2",&QWP(16*2-128,"ebp"));
+       &movdqa         ("xmm3",&QWP(16*3-128,"ebp"));
+       #&movdqa        ("xmm4",&QWP(16*4-128,"ebp"));
+       &movdqa         ("xmm5",&QWP(16*5-128,"ebp"));
+       &movdqa         ("xmm6",&QWP(16*6-128,"ebp"));
+       &movdqa         ("xmm7",&QWP(16*7-128,"ebp"));
+       #&movdqa        (&QWP(16*0-128,"ebx"),"xmm0");
+       &movdqa         (&QWP(16*1-128,"ebx"),"xmm1");
+       &movdqa         (&QWP(16*2-128,"ebx"),"xmm2");
+       &movdqa         (&QWP(16*3-128,"ebx"),"xmm3");
+       #&movdqa        (&QWP(16*4-128,"ebx"),"xmm4");
+       &movdqa         (&QWP(16*5-128,"ebx"),"xmm5");
+       &movdqa         (&QWP(16*6-128,"ebx"),"xmm6");
+       &movdqa         (&QWP(16*7-128,"ebx"),"xmm7");
+       #&movdqa        ("xmm0",&QWP(16*8-128,"ebp"));
+       #&movdqa        ("xmm1",&QWP(16*9-128,"ebp"));
+       &movdqa         ("xmm2",&QWP(16*10-128,"ebp"));
+       &movdqa         ("xmm3",&QWP(16*11-128,"ebp"));
+       &movdqa         ("xmm4",&QWP(16*12-128,"ebp"));
+       &movdqa         ("xmm5",&QWP(16*13-128,"ebp"));
+       &movdqa         ("xmm6",&QWP(16*14-128,"ebp"));
+       &movdqa         ("xmm7",&QWP(16*15-128,"ebp"));
+       &paddd          ("xmm4",&QWP(16*4,"eax"));      # counter value
+       #&movdqa        (&QWP(16*8-128,"ebx"),"xmm0");
+       #&movdqa        (&QWP(16*9-128,"ebx"),"xmm1");
+       &movdqa         (&QWP(16*10-128,"ebx"),"xmm2");
+       &movdqa         (&QWP(16*11-128,"ebx"),"xmm3");
+       &movdqa         (&QWP(16*12-128,"ebx"),"xmm4");
+       &movdqa         (&QWP(16*13-128,"ebx"),"xmm5");
+       &movdqa         (&QWP(16*14-128,"ebx"),"xmm6");
+       &movdqa         (&QWP(16*15-128,"ebx"),"xmm7");
+       &movdqa         (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value
+
+       &movdqa         ($xa, &QWP(16*0-128,"ebp"));
+       &movdqa         ($xd, "xmm4");
+       &movdqa         ($xb_,&QWP(16*4-128,"ebp"));
+       &movdqa         ($xc, &QWP(16*8-128,"ebp"));
+       &movdqa         ($xc_,&QWP(16*9-128,"ebp"));
+
+       &mov            ("edx",10);                     # loop counter
+       &nop            ();
+
+&set_label("loop",16);
+       &paddd          ($xa,$xb_);                     # elsewhere
+       &movdqa         ($xb,$xb_);
+       &pxor           ($xd,$xa);                      # elsewhere
+       &QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
+       &QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
+       &QUARTERROUND_SSSE3(2, 6,10, 14, 2);
+       &QUARTERROUND_SSSE3(3, 7,11, 15, 3);
+       &QUARTERROUND_SSSE3(0, 5,10, 15, 4);
+       &QUARTERROUND_SSSE3(1, 6,11, 12, 5);
+       &QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
+       &QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
+       &dec            ("edx");
+       &jnz            (&label("loop"));
+
+       &movdqa         (&QWP(16*4-128,"ebx"),$xb_);
+       &movdqa         (&QWP(16*8-128,"ebx"),$xc);
+       &movdqa         (&QWP(16*9-128,"ebx"),$xc_);
+       &movdqa         (&QWP(16*12-128,"ebx"),$xd);
+       &movdqa         (&QWP(16*14-128,"ebx"),$xd_);
+
+    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
+
+       #&movdqa        ($xa0,&QWP(16*0-128,"ebx"));    # it's there
+       &movdqa         ($xa1,&QWP(16*1-128,"ebx"));
+       &movdqa         ($xa2,&QWP(16*2-128,"ebx"));
+       &movdqa         ($xa3,&QWP(16*3-128,"ebx"));
+
+    # Accumulate key material, transpose four words back into
+    # block-sequential order, and xor 64 bytes (16 per block) per
+    # pass over the four lane groups.
+    for($i=0;$i<256;$i+=64) {
+       &paddd          ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
+       &paddd          ($xa1,&QWP($i+16*1-128,"ebp"));
+       &paddd          ($xa2,&QWP($i+16*2-128,"ebp"));
+       &paddd          ($xa3,&QWP($i+16*3-128,"ebp"));
+
+       &movdqa         ($xt2,$xa0);            # "de-interlace" data
+       &punpckldq      ($xa0,$xa1);
+       &movdqa         ($xt3,$xa2);
+       &punpckldq      ($xa2,$xa3);
+       &punpckhdq      ($xt2,$xa1);
+       &punpckhdq      ($xt3,$xa3);
+       &movdqa         ($xa1,$xa0);
+       &punpcklqdq     ($xa0,$xa2);            # "a0"
+       &movdqa         ($xa3,$xt2);
+       &punpcklqdq     ($xt2,$xt3);            # "a2"
+       &punpckhqdq     ($xa1,$xa2);            # "a1"
+       &punpckhqdq     ($xa3,$xt3);            # "a3"
+
+       #($xa2,$xt2)=($xt2,$xa2);
+
+       &movdqu         ($xt0,&QWP(64*0-128,$inp));     # load input
+       &movdqu         ($xt1,&QWP(64*1-128,$inp));
+       &movdqu         ($xa2,&QWP(64*2-128,$inp));
+       &movdqu         ($xt3,&QWP(64*3-128,$inp));
+       &lea            ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
+       &pxor           ($xt0,$xa0);
+       &movdqa         ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
+       &pxor           ($xt1,$xa1);
+       &movdqa         ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
+       &pxor           ($xt2,$xa2);
+       &movdqa         ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
+       &pxor           ($xt3,$xa3);
+       &movdqa         ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
+       &movdqu         (&QWP(64*0-128,$out),$xt0);     # store output
+       &movdqu         (&QWP(64*1-128,$out),$xt1);
+       &movdqu         (&QWP(64*2-128,$out),$xt2);
+       &movdqu         (&QWP(64*3-128,$out),$xt3);
+       &lea            ($out,&QWP($i<192?16:(64*4-16*3),$out));
+    }
+       &sub            ($len,64*4);
+       &jnc            (&label("outer_loop"));
+
+       &add            ($len,64*4);
+       &jz             (&label("done"));
+
+       # fewer than 256 bytes remain: undo pointer bias and rebuild
+       # the counter/nonce vector, then fall into the 1x path
+       &mov            ("ebx",&DWP(512+8,"esp"));      # restore pointers
+       &lea            ($inp,&DWP(-128,$inp));
+       &mov            ("edx",&DWP(512+4,"esp"));
+       &lea            ($out,&DWP(-128,$out));
+
+       &movd           ("xmm2",&DWP(16*12-128,"ebp")); # counter value
+       &movdqu         ("xmm3",&QWP(0,"ebx"));
+       &paddd          ("xmm2",&QWP(16*6,"eax"));      # +four
+       &pand           ("xmm3",&QWP(16*7,"eax"));
+       &por            ("xmm3","xmm2");                # counter value
+{
+# NOTE: bare "xmm$_" names, consistent with the maps at the top of
+# this block and the XOP block — the perlasm backend supplies any
+# flavour-specific register decoration.
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
+
+sub SSSE3ROUND {       # critical path is 20 "SIMD ticks" per round
+       &paddd          ($a,$b);
+       &pxor           ($d,$a);
+       &pshufb         ($d,$rot16);
+
+       &paddd          ($c,$d);
+       &pxor           ($b,$c);
+       &movdqa         ($t,$b);
+       &psrld          ($b,20);
+       &pslld          ($t,12);
+       &por            ($b,$t);
+
+       &paddd          ($a,$b);
+       &pxor           ($d,$a);
+       &pshufb         ($d,$rot24);
+
+       &paddd          ($c,$d);
+       &pxor           ($b,$c);
+       &movdqa         ($t,$b);
+       &psrld          ($b,25);
+       &pslld          ($t,7);
+       &por            ($b,$t);
+}
+
+# single 64-byte block at a time (also the entry point for len < 256)
+&set_label("1x");
+       &movdqa         ($a,&QWP(16*2,"eax"));          # sigma
+       &movdqu         ($b,&QWP(0,"edx"));
+       &movdqu         ($c,&QWP(16,"edx"));
+       #&movdqu        ($d,&QWP(0,"ebx"));             # already loaded
+       &movdqa         ($rot16,&QWP(0,"eax"));
+       &movdqa         ($rot24,&QWP(16,"eax"));
+       &mov            (&DWP(16*3,"esp"),"ebp");
+
+       &movdqa         (&QWP(16*0,"esp"),$a);
+       &movdqa         (&QWP(16*1,"esp"),$b);
+       &movdqa         (&QWP(16*2,"esp"),$c);
+       &movdqa         (&QWP(16*3,"esp"),$d);
+       &mov            ("edx",10);
+       &jmp            (&label("loop1x"));
+
+&set_label("outer1x",16);
+       &movdqa         ($d,&QWP(16*5,"eax"));          # one
+       &movdqa         ($a,&QWP(16*0,"esp"));
+       &movdqa         ($b,&QWP(16*1,"esp"));
+       &movdqa         ($c,&QWP(16*2,"esp"));
+       &paddd          ($d,&QWP(16*3,"esp"));
+       &mov            ("edx",10);
+       &movdqa         (&QWP(16*3,"esp"),$d);
+       &jmp            (&label("loop1x"));
+
+&set_label("loop1x",16);
+       &SSSE3ROUND();
+       &pshufd ($c,$c,0b01001110);
+       &pshufd ($b,$b,0b00111001);
+       &pshufd ($d,$d,0b10010011);
+       &nop    ();
+
+       &SSSE3ROUND();
+       &pshufd ($c,$c,0b01001110);
+       &pshufd ($b,$b,0b10010011);
+       &pshufd ($d,$d,0b00111001);
+
+       &dec            ("edx");
+       &jnz            (&label("loop1x"));
+
+       &paddd          ($a,&QWP(16*0,"esp"));
+       &paddd          ($b,&QWP(16*1,"esp"));
+       &paddd          ($c,&QWP(16*2,"esp"));
+       &paddd          ($d,&QWP(16*3,"esp"));
+
+       &cmp            ($len,64);
+       &jb             (&label("tail"));
+
+       &movdqu         ($t,&QWP(16*0,$inp));
+       &movdqu         ($t1,&QWP(16*1,$inp));
+       &pxor           ($a,$t);                # xor with input
+       &movdqu         ($t,&QWP(16*2,$inp));
+       &pxor           ($b,$t1);
+       &movdqu         ($t1,&QWP(16*3,$inp));
+       &pxor           ($c,$t);
+       &pxor           ($d,$t1);
+       &lea            ($inp,&DWP(16*4,$inp)); # inp+=64
+
+       &movdqu         (&QWP(16*0,$out),$a);   # write output
+       &movdqu         (&QWP(16*1,$out),$b);
+       &movdqu         (&QWP(16*2,$out),$c);
+       &movdqu         (&QWP(16*3,$out),$d);
+       &lea            ($out,&DWP(16*4,$out)); # inp+=64
+
+       &sub            ($len,64);
+       &jnz            (&label("outer1x"));
+
+       &jmp            (&label("done"));
+
+&set_label("tail");
+       # len < 64: park the keystream block on the stack and xor it
+       # into the output byte by byte
+       &movdqa         (&QWP(16*0,"esp"),$a);
+       &movdqa         (&QWP(16*1,"esp"),$b);
+       &movdqa         (&QWP(16*2,"esp"),$c);
+       &movdqa         (&QWP(16*3,"esp"),$d);
+
+       &xor            ("eax","eax");
+       &xor            ("edx","edx");
+       &xor            ("ebp","ebp");
+
+&set_label("tail_loop");
+       &movb           ("al",&BP(0,"esp","ebp"));
+       &movb           ("dl",&BP(0,$inp,"ebp"));
+       &lea            ("ebp",&DWP(1,"ebp"));
+       &xor            ("al","dl");
+       &movb           (&BP(-1,$out,"ebp"),"al");
+       &dec            ($len);
+       &jnz            (&label("tail_loop"));
+}
+&set_label("done");
+       &mov            ("esp",&DWP(512,"esp"));
+&function_end("ChaCha20_ssse3");
+
+&align (64);
+&set_label("ssse3_data");
+# pshufb masks: rotate each dword left by 16, then left by 8
+&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
+&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
+# ChaCha sigma constants ("expand 32-byte k")
+&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
+# per-lane counter offsets and the all-lanes "+4" increment (4x path)
+&data_word(0,1,2,3);
+&data_word(4,4,4,4);
+# counter-word-only "+1" and "+4", and a mask preserving the nonce
+&data_word(1,0,0,0);
+&data_word(4,0,0,0);
+&data_word(0,-1,-1,-1);
+&align (64);
+}
+&asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+if ($xmm) {
+my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
+my ($out,$inp,$len)=("edi","esi","ecx");
+
+# XOP 4-way quarter-round: same primary/shadow interleaving scheme
+# as QUARTERROUND_SSSE3, but all four rotations use vprotd and the
+# three-operand AVX/XOP forms, which removes the shift/or sequences
+# and the temporary-register moves.
+sub QUARTERROUND_XOP {
+my ($ai,$bi,$ci,$di,$i)=@_;
+my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));        # next
+my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));        # previous
+
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+
+       # patch up neighbour indices at column/diagonal boundaries
+       if ($i==0) {
+            my $j=4;
+           ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
+       } elsif ($i==3) {
+            my $j=0;
+           ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
+       } elsif ($i==4) {
+            my $j=4;
+           ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
+       } elsif ($i==7) {
+            my $j=0;
+           ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
+       }
+
+       #&vpaddd        ($xa,$xa,$xb);                  # see elsewhere
+       #&vpxor         ($xd,$xd,$xa);                  # see elsewhere
+        &vmovdqa       (&QWP(16*$cp-128,"ebx"),$xc_)   if ($ai>0 && $ai<3);
+       &vprotd         ($xd,$xd,16);
+        &vmovdqa       (&QWP(16*$bp-128,"ebx"),$xb_)   if ($i!=0);
+       &vpaddd         ($xc,$xc,$xd);
+        &vmovdqa       ($xc_,&QWP(16*$cn-128,"ebx"))   if ($ai>0 && $ai<3);
+       &vpxor          ($xb,$i!=0?$xb:$xb_,$xc);
+        &vmovdqa       ($xa_,&QWP(16*$an-128,"ebx"));
+       &vprotd         ($xb,$xb,12);
+        &vmovdqa       ($xb_,&QWP(16*$bn-128,"ebx"))   if ($i<7);
+       &vpaddd         ($xa,$xa,$xb);
+        &vmovdqa       ($xd_,&QWP(16*$dn-128,"ebx"))   if ($di!=$dn);
+       &vpxor          ($xd,$xd,$xa);
+        &vpaddd        ($xa_,$xa_,$xb_)                if ($i<7);      # elsewhere
+       &vprotd         ($xd,$xd,8);
+       &vmovdqa        (&QWP(16*$ai-128,"ebx"),$xa);
+       &vpaddd         ($xc,$xc,$xd);
+       &vmovdqa        (&QWP(16*$di-128,"ebx"),$xd)    if ($di!=$dn);
+       &vpxor          ($xb,$xb,$xc);
+        &vpxor         ($xd_,$di==$dn?$xd:$xd_,$xa_)   if ($i<7);      # elsewhere
+       &vprotd         ($xb,$xb,7);
+
+       ($xa,$xa_)=($xa_,$xa);
+       ($xb,$xb_)=($xb_,$xb);
+       ($xc,$xc_)=($xc_,$xc);
+       ($xd,$xd_)=($xd_,$xd);
+}
+
+&function_begin("ChaCha20_xop");
+&set_label("xop_shortcut");
+# Args: (out, inp, len, key, counter||nonce).  NOTE(review): the &lea
+# below computes the ssse3_data address from "eax" relative to "pic_point";
+# eax is presumably populated by the ChaCha20_ssse3 entry that dispatches
+# to this shortcut — confirm against the part of the file above this hunk.
+       &mov            ($out,&wparam(0));
+       &mov            ($inp,&wparam(1));
+       &mov            ($len,&wparam(2));
+       &mov            ("edx",&wparam(3));             # key
+       &mov            ("ebx",&wparam(4));             # counter and nonce
+       &vzeroupper     ();
+
+# Build a 64-byte-aligned 524-byte frame; the caller's stack pointer is
+# preserved at 512(%esp) for the epilogue.
+       &mov            ("ebp","esp");
+       &stack_push     (131);
+       &and            ("esp",-64);
+       &mov            (&DWP(512,"esp"),"ebp");
+
+       &lea            ("eax",&DWP(&label("ssse3_data")."-".
+                                   &label("pic_point"),"eax"));
+       &vmovdqu        ("xmm3",&QWP(0,"ebx"));         # counter and nonce
+
+# Fewer than 256 bytes: fall through to the 1x (single-block) path.
+       &cmp            ($len,64*4);
+       &jb             (&label("1x"));
+
+       &mov            (&DWP(512+4,"esp"),"edx");      # offload pointers
+       &mov            (&DWP(512+8,"esp"),"ebx");
+       &sub            ($len,64*4);                    # bias len
+       &lea            ("ebp",&DWP(256+128,"esp"));    # size optimization
+
+# Splat each 32-bit state word across a whole xmm register, giving four
+# independent keystream lanes; counters are pre-biased by -4 (16*4(eax))
+# because the outer loop adds 4 on every trip, including the first.
+# NOTE(review): &DWP is used for some 128-bit vmovdqu/vmovdqa operands
+# below while surrounding code uses &QWP; harmless for gas output but
+# verify it does not mis-size operands for the nasm/masm backends.
+       &vmovdqu        ("xmm7",&DWP(0,"edx"));         # key
+       &vpshufd        ("xmm0","xmm3",0x00);
+       &vpshufd        ("xmm1","xmm3",0x55);
+       &vpshufd        ("xmm2","xmm3",0xaa);
+       &vpshufd        ("xmm3","xmm3",0xff);
+        &vpaddd        ("xmm0","xmm0",&QWP(16*3,"eax"));       # fix counters
+       &vpshufd        ("xmm4","xmm7",0x00);
+       &vpshufd        ("xmm5","xmm7",0x55);
+        &vpsubd        ("xmm0","xmm0",&QWP(16*4,"eax"));
+       &vpshufd        ("xmm6","xmm7",0xaa);
+       &vpshufd        ("xmm7","xmm7",0xff);
+       &vmovdqa        (&QWP(16*12-128,"ebp"),"xmm0");
+       &vmovdqa        (&QWP(16*13-128,"ebp"),"xmm1");
+       &vmovdqa        (&QWP(16*14-128,"ebp"),"xmm2");
+       &vmovdqa        (&QWP(16*15-128,"ebp"),"xmm3");
+        &vmovdqu       ("xmm3",&DWP(16,"edx"));        # key
+       &vmovdqa        (&QWP(16*4-128,"ebp"),"xmm4");
+       &vmovdqa        (&QWP(16*5-128,"ebp"),"xmm5");
+       &vmovdqa        (&QWP(16*6-128,"ebp"),"xmm6");
+       &vmovdqa        (&QWP(16*7-128,"ebp"),"xmm7");
+        &vmovdqa       ("xmm7",&DWP(16*2,"eax"));      # sigma
+        &lea           ("ebx",&DWP(128,"esp"));        # size optimization
+
+       &vpshufd        ("xmm0","xmm3",0x00);
+       &vpshufd        ("xmm1","xmm3",0x55);
+       &vpshufd        ("xmm2","xmm3",0xaa);
+       &vpshufd        ("xmm3","xmm3",0xff);
+       &vpshufd        ("xmm4","xmm7",0x00);
+       &vpshufd        ("xmm5","xmm7",0x55);
+       &vpshufd        ("xmm6","xmm7",0xaa);
+       &vpshufd        ("xmm7","xmm7",0xff);
+       &vmovdqa        (&QWP(16*8-128,"ebp"),"xmm0");
+       &vmovdqa        (&QWP(16*9-128,"ebp"),"xmm1");
+       &vmovdqa        (&QWP(16*10-128,"ebp"),"xmm2");
+       &vmovdqa        (&QWP(16*11-128,"ebp"),"xmm3");
+       &vmovdqa        (&QWP(16*0-128,"ebp"),"xmm4");
+       &vmovdqa        (&QWP(16*1-128,"ebp"),"xmm5");
+       &vmovdqa        (&QWP(16*2-128,"ebp"),"xmm6");
+       &vmovdqa        (&QWP(16*3-128,"ebp"),"xmm7");
+
+       &lea            ($inp,&DWP(128,$inp));          # size optimization
+       &lea            ($out,&DWP(128,$out));          # size optimization
+       &jmp            (&label("outer_loop"));
+# Per-4x-block setup: copy the pre-computed key material from the "ebp"
+# frame into the working frame at "ebx", bumping the four counter lanes by
+# 4 each trip.  The commented-out moves are slots whose values are already
+# live in registers across iterations.
+&set_label("outer_loop",32);
+       #&vmovdqa       ("xmm0",&QWP(16*0-128,"ebp"));  # copy key material
+       &vmovdqa        ("xmm1",&QWP(16*1-128,"ebp"));
+       &vmovdqa        ("xmm2",&QWP(16*2-128,"ebp"));
+       &vmovdqa        ("xmm3",&QWP(16*3-128,"ebp"));
+       #&vmovdqa       ("xmm4",&QWP(16*4-128,"ebp"));
+       &vmovdqa        ("xmm5",&QWP(16*5-128,"ebp"));
+       &vmovdqa        ("xmm6",&QWP(16*6-128,"ebp"));
+       &vmovdqa        ("xmm7",&QWP(16*7-128,"ebp"));
+       #&vmovdqa       (&QWP(16*0-128,"ebx"),"xmm0");
+       &vmovdqa        (&QWP(16*1-128,"ebx"),"xmm1");
+       &vmovdqa        (&QWP(16*2-128,"ebx"),"xmm2");
+       &vmovdqa        (&QWP(16*3-128,"ebx"),"xmm3");
+       #&vmovdqa       (&QWP(16*4-128,"ebx"),"xmm4");
+       &vmovdqa        (&QWP(16*5-128,"ebx"),"xmm5");
+       &vmovdqa        (&QWP(16*6-128,"ebx"),"xmm6");
+       &vmovdqa        (&QWP(16*7-128,"ebx"),"xmm7");
+       #&vmovdqa       ("xmm0",&QWP(16*8-128,"ebp"));
+       #&vmovdqa       ("xmm1",&QWP(16*9-128,"ebp"));
+       &vmovdqa        ("xmm2",&QWP(16*10-128,"ebp"));
+       &vmovdqa        ("xmm3",&QWP(16*11-128,"ebp"));
+       &vmovdqa        ("xmm4",&QWP(16*12-128,"ebp"));
+       &vmovdqa        ("xmm5",&QWP(16*13-128,"ebp"));
+       &vmovdqa        ("xmm6",&QWP(16*14-128,"ebp"));
+       &vmovdqa        ("xmm7",&QWP(16*15-128,"ebp"));
+       &vpaddd         ("xmm4","xmm4",&QWP(16*4,"eax"));       # counter value
+       #&vmovdqa       (&QWP(16*8-128,"ebx"),"xmm0");
+       #&vmovdqa       (&QWP(16*9-128,"ebx"),"xmm1");
+       &vmovdqa        (&QWP(16*10-128,"ebx"),"xmm2");
+       &vmovdqa        (&QWP(16*11-128,"ebx"),"xmm3");
+       &vmovdqa        (&QWP(16*12-128,"ebx"),"xmm4");
+       &vmovdqa        (&QWP(16*13-128,"ebx"),"xmm5");
+       &vmovdqa        (&QWP(16*14-128,"ebx"),"xmm6");
+       &vmovdqa        (&QWP(16*15-128,"ebx"),"xmm7");
+       &vmovdqa        (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value
+
+# Pre-load the first column's vectors for the software-pipelined
+# QUARTERROUND_XOP sequence.
+       &vmovdqa        ($xa, &QWP(16*0-128,"ebp"));
+       &vmovdqa        ($xd, "xmm4");
+       &vmovdqa        ($xb_,&QWP(16*4-128,"ebp"));
+       &vmovdqa        ($xc, &QWP(16*8-128,"ebp"));
+       &vmovdqa        ($xc_,&QWP(16*9-128,"ebp"));
+
+       &mov            ("edx",10);                     # loop counter
+       &nop            ();
+# 10 double rounds over the 4x-interleaved state, then transpose
+# ("de-interlace") the four keystream lanes 64 bytes at a time and xor
+# with the input.
+&set_label("loop",32);
+       &vpaddd         ($xa,$xa,$xb_);                 # elsewhere
+       &vpxor          ($xd,$xd,$xa);                  # elsewhere
+       &QUARTERROUND_XOP(0, 4, 8, 12, 0);
+       &QUARTERROUND_XOP(1, 5, 9, 13, 1);
+       &QUARTERROUND_XOP(2, 6,10, 14, 2);
+       &QUARTERROUND_XOP(3, 7,11, 15, 3);
+       &QUARTERROUND_XOP(0, 5,10, 15, 4);
+       &QUARTERROUND_XOP(1, 6,11, 12, 5);
+       &QUARTERROUND_XOP(2, 7, 8, 13, 6);
+       &QUARTERROUND_XOP(3, 4, 9, 14, 7);
+       &dec            ("edx");
+       &jnz            (&label("loop"));
+
+# Flush the state words still held in registers back to the working frame.
+       &vmovdqa        (&QWP(16*4-128,"ebx"),$xb_);
+       &vmovdqa        (&QWP(16*8-128,"ebx"),$xc);
+       &vmovdqa        (&QWP(16*9-128,"ebx"),$xc_);
+       &vmovdqa        (&QWP(16*12-128,"ebx"),$xd);
+       &vmovdqa        (&QWP(16*14-128,"ebx"),$xd_);
+
+    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
+
+       #&vmovdqa       ($xa0,&QWP(16*0-128,"ebx"));    # it's there
+       &vmovdqa        ($xa1,&QWP(16*1-128,"ebx"));
+       &vmovdqa        ($xa2,&QWP(16*2-128,"ebx"));
+       &vmovdqa        ($xa3,&QWP(16*3-128,"ebx"));
+
+    for($i=0;$i<256;$i+=64) {
+       &vpaddd         ($xa0,$xa0,&QWP($i+16*0-128,"ebp"));    # accumulate key material
+       &vpaddd         ($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
+       &vpaddd         ($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
+       &vpaddd         ($xa3,$xa3,&QWP($i+16*3-128,"ebp"));
+
+       &vpunpckldq     ($xt2,$xa0,$xa1);       # "de-interlace" data
+       &vpunpckldq     ($xt3,$xa2,$xa3);
+       &vpunpckhdq     ($xa0,$xa0,$xa1);
+       &vpunpckhdq     ($xa2,$xa2,$xa3);
+       &vpunpcklqdq    ($xa1,$xt2,$xt3);       # "a0"
+       &vpunpckhqdq    ($xt2,$xt2,$xt3);       # "a1"
+       &vpunpcklqdq    ($xt3,$xa0,$xa2);       # "a2"
+       &vpunpckhqdq    ($xa3,$xa0,$xa2);       # "a3"
+
+       # NOTE(review): the two &lea operands below use &QWP where &DWP is
+       # the customary tag for address arithmetic; confirm it is size-
+       # agnostic for all perlasm output flavours.
+       &vpxor          ($xt0,$xa1,&QWP(64*0-128,$inp));
+       &vpxor          ($xt1,$xt2,&QWP(64*1-128,$inp));
+       &vpxor          ($xt2,$xt3,&QWP(64*2-128,$inp));
+       &vpxor          ($xt3,$xa3,&QWP(64*3-128,$inp));
+       &lea            ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
+       &vmovdqa        ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
+       &vmovdqa        ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
+       &vmovdqa        ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
+       &vmovdqa        ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
+       &vmovdqu        (&QWP(64*0-128,$out),$xt0);     # store output
+       &vmovdqu        (&QWP(64*1-128,$out),$xt1);
+       &vmovdqu        (&QWP(64*2-128,$out),$xt2);
+       &vmovdqu        (&QWP(64*3-128,$out),$xt3);
+       &lea            ($out,&QWP($i<192?16:(64*4-16*3),$out));
+    }
+       &sub            ($len,64*4);
+       &jnc            (&label("outer_loop"));
+
+# Less than 256 bytes left: un-bias len, restore pointers and fall into
+# the 1x path with a counter advanced by four and the original nonce.
+       &add            ($len,64*4);
+       &jz             (&label("done"));
+
+       &mov            ("ebx",&DWP(512+8,"esp"));      # restore pointers
+       &lea            ($inp,&DWP(-128,$inp));
+       &mov            ("edx",&DWP(512+4,"esp"));
+       &lea            ($out,&DWP(-128,$out));
+
+       &vmovd          ("xmm2",&DWP(16*12-128,"ebp")); # counter value
+       &vmovdqu        ("xmm3",&QWP(0,"ebx"));
+       &vpaddd         ("xmm2","xmm2",&QWP(16*6,"eax"));# +four
+       &vpand          ("xmm3","xmm3",&QWP(16*7,"eax"));
+       &vpor           ("xmm3","xmm3","xmm2");         # counter value
+{
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
+
+sub XOPROUND {
+       &vpaddd         ($a,$a,$b);
+       &vpxor          ($d,$d,$a);
+       &vprotd         ($d,$d,16);
+
+       &vpaddd         ($c,$c,$d);
+       &vpxor          ($b,$b,$c);
+       &vprotd         ($b,$b,12);
+
+       &vpaddd         ($a,$a,$b);
+       &vpxor          ($d,$d,$a);
+       &vprotd         ($d,$d,8);
+
+       &vpaddd         ($c,$c,$d);
+       &vpxor          ($b,$b,$c);
+       &vprotd         ($b,$b,7);
+}
+
+&set_label("1x");
+       &vmovdqa        ($a,&QWP(16*2,"eax"));          # sigma
+       &vmovdqu        ($b,&QWP(0,"edx"));
+       &vmovdqu        ($c,&QWP(16,"edx"));
+       #&vmovdqu       ($d,&QWP(0,"ebx"));             # already loaded
+       &vmovdqa        ($rot16,&QWP(0,"eax"));
+       &vmovdqa        ($rot24,&QWP(16,"eax"));
+       &mov            (&DWP(16*3,"esp"),"ebp");
+
+       &vmovdqa        (&QWP(16*0,"esp"),$a);
+       &vmovdqa        (&QWP(16*1,"esp"),$b);
+       &vmovdqa        (&QWP(16*2,"esp"),$c);
+       &vmovdqa        (&QWP(16*3,"esp"),$d);
+       &mov            ("edx",10);
+       &jmp            (&label("loop1x"));
+
+&set_label("outer1x",16);
+       &vmovdqa        ($d,&QWP(16*5,"eax"));          # one
+       &vmovdqa        ($a,&QWP(16*0,"esp"));
+       &vmovdqa        ($b,&QWP(16*1,"esp"));
+       &vmovdqa        ($c,&QWP(16*2,"esp"));
+       &vpaddd         ($d,$d,&QWP(16*3,"esp"));
+       &mov            ("edx",10);
+       &vmovdqa        (&QWP(16*3,"esp"),$d);
+       &jmp            (&label("loop1x"));
+
+&set_label("loop1x",16);
+       &XOPROUND();
+       &vpshufd        ($c,$c,0b01001110);
+       &vpshufd        ($b,$b,0b00111001);
+       &vpshufd        ($d,$d,0b10010011);
+
+       &XOPROUND();
+       &vpshufd        ($c,$c,0b01001110);
+       &vpshufd        ($b,$b,0b10010011);
+       &vpshufd        ($d,$d,0b00111001);
+
+       &dec            ("edx");
+       &jnz            (&label("loop1x"));
+
+       &vpaddd         ($a,$a,&QWP(16*0,"esp"));
+       &vpaddd         ($b,$b,&QWP(16*1,"esp"));
+       &vpaddd         ($c,$c,&QWP(16*2,"esp"));
+       &vpaddd         ($d,$d,&QWP(16*3,"esp"));
+
+       &cmp            ($len,64);
+       &jb             (&label("tail"));
+
+       &vpxor          ($a,$a,&QWP(16*0,$inp));        # xor with input
+       &vpxor          ($b,$b,&QWP(16*1,$inp));
+       &vpxor          ($c,$c,&QWP(16*2,$inp));
+       &vpxor          ($d,$d,&QWP(16*3,$inp));
+       &lea            ($inp,&DWP(16*4,$inp));         # inp+=64
+
+       &vmovdqu        (&QWP(16*0,$out),$a);           # write output
+       &vmovdqu        (&QWP(16*1,$out),$b);
+       &vmovdqu        (&QWP(16*2,$out),$c);
+       &vmovdqu        (&QWP(16*3,$out),$d);
+       &lea            ($out,&DWP(16*4,$out));         # inp+=64
+
+       &sub            ($len,64);
+       &jnz            (&label("outer1x"));
+
+       &jmp            (&label("done"));
+
+&set_label("tail");
+       &vmovdqa        (&QWP(16*0,"esp"),$a);
+       &vmovdqa        (&QWP(16*1,"esp"),$b);
+       &vmovdqa        (&QWP(16*2,"esp"),$c);
+       &vmovdqa        (&QWP(16*3,"esp"),$d);
+
+       &xor            ("eax","eax");
+       &xor            ("edx","edx");
+       &xor            ("ebp","ebp");
+
+&set_label("tail_loop");
+       &movb           ("al",&BP(0,"esp","ebp"));
+       &movb           ("dl",&BP(0,$inp,"ebp"));
+       &lea            ("ebp",&DWP(1,"ebp"));
+       &xor            ("al","dl");
+       &movb           (&BP(-1,$out,"ebp"),"al");
+       &dec            ($len);
+       &jnz            (&label("tail_loop"));
+}
+&set_label("done");
+       &vzeroupper     ();
+       &mov            ("esp",&DWP(512,"esp"));
+&function_end("ChaCha20_xop");
+}
+
+&asm_finish();
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl
new file mode 100755 (executable)
index 0000000..41dbef5
--- /dev/null
@@ -0,0 +1,2234 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# November 2014
+#
+# ChaCha20 for x86_64.
+#
+# Performance in cycles per byte out of large buffer.
+#
+#              IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     8xAVX2
+#
+# P4           9.48/+99%       -/22.7(ii)      -
+# Core2                7.83/+55%       7.90/8.08       4.35
+# Westmere     7.19/+50%       5.60/6.70       3.00
+# Sandy Bridge 8.31/+42%       5.45/6.76       2.72
+# Ivy Bridge   6.71/+46%       5.40/6.49       2.41
+# Haswell      5.92/+43%       5.20/6.45       2.42        1.23
+# Silvermont   12.0/+33%       7.75/7.40       7.03(iii)
+# Sledgehammer 7.28/+52%       -/14.2(ii)      -
+# Bulldozer    9.66/+28%       9.85/11.1       3.06(iv)
+# VIA Nano     10.5/+46%       6.72/8.60       6.05
+#
+# (i)  compared to older gcc 3.x one can observe >2x improvement on
+#      most platforms;
+# (ii) as it can be seen, SSE2 performance is too low on legacy
+#      processors; NxSSE2 results are naturally better, but not
+#      impressively better than IALU ones, which is why you won't
+#      find SSE2 code below;
+# (iii)        this is not optimal result for Atom because of MSROM
+#      limitations, SSE2 can do better, but gain is considered too
+#      low to justify the [maintenance] effort;
+# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
+
+# Standard x86_64 perlasm preamble: pick output flavour, locate the
+# xlate translator, and probe the toolchain for SIMD support.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# $avx: 0 = integer/SSSE3 only, 1 = AVX/XOP paths, 2 = AVX2 path —
+# gated on the assembler (gas/nasm/ml64) or clang version being new
+# enough to encode the instructions.
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+       $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+       $avx = ($2>=3.0) + ($2>3.0);
+}
+
+# Pipe generated code through the flavour-specific translator.
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# input parameter block
+($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
+
+# Constant pool: counter increments, byte-shuffle masks for the 16/24-bit
+# SSSE3 rotates, and the "expand 32-byte k" sigma constant.
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.align 64
+.Lzero:
+.long  0,0,0,0
+.Lone:
+.long  1,0,0,0
+.Linc:
+.long  0,1,2,3
+.Lfour:
+.long  4,4,4,4
+.Lincy:
+.long  0,2,4,6,1,3,5,7
+.Leight:
+.long  8,8,8,8,8,8,8,8
+.Lrot16:
+.byte  0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte  0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Lsigma:
+.asciz "expand 32-byte k"
+.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Thunk any undefined &opcode(args...) call into a "\topcode\targs" line
+# appended to $code, quoting bare numeric arguments as immediates.
+sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+# Scalar register file for the integer path.  @x[8..11] are "%nox"
+# placeholders: those four 'c' words live on the stack, with the active
+# pair cached in @t (%esi/%edi) by ROUND().
+@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
+    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
+@t=("%esi","%edi");
+
+# Emit one integer double-quarter-round batch: four quarter-rounds
+# (Q1..Q4) over columns $a0..$d3, two at a time, with the 'c' register
+# pair spilled/reloaded mid-way.
+sub ROUND {                    # critical path is 24 cycles per round
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my ($xc,$xc_)=map("\"$_\"",@t);
+my @x=map("\"$_\"",@x);
+
+       # Consider order in which variables are addressed by their
+       # index:
+       #
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+       #
+       # 'a', 'b' and 'd's are permanently allocated in registers,
+       # @x[0..7,12..15], while 'c's are maintained in memory. If
+       # you observe 'c' column, you'll notice that pair of 'c's is
+       # invariant between rounds. This means that we have to reload
+       # them once per round, in the middle. This is why you'll see
+       # bunch of 'c' stores and loads in the middle, but none in
+       # the beginning or end.
+
+       # Normally instructions would be interleaved to favour in-order
+       # execution. Generally out-of-order cores manage it gracefully,
+       # but not this time for some reason. As in-order execution
+       # cores are dying breed, old Atom is the only one around,
+       # instructions are left uninterleaved. Besides, Atom is better
+       # off executing 1xSSSE3 code anyway...
+
+       # Returns a list of instruction strings so the caller can eval
+       # them one by one (and could reorder/interleave if desired).
+       (
+       "&add   (@x[$a0],@x[$b0])",     # Q1
+       "&xor   (@x[$d0],@x[$a0])",
+       "&rol   (@x[$d0],16)",
+        "&add  (@x[$a1],@x[$b1])",     # Q2
+        "&xor  (@x[$d1],@x[$a1])",
+        "&rol  (@x[$d1],16)",
+
+       "&add   ($xc,@x[$d0])",
+       "&xor   (@x[$b0],$xc)",
+       "&rol   (@x[$b0],12)",
+        "&add  ($xc_,@x[$d1])",
+        "&xor  (@x[$b1],$xc_)",
+        "&rol  (@x[$b1],12)",
+
+       "&add   (@x[$a0],@x[$b0])",
+       "&xor   (@x[$d0],@x[$a0])",
+       "&rol   (@x[$d0],8)",
+        "&add  (@x[$a1],@x[$b1])",
+        "&xor  (@x[$d1],@x[$a1])",
+        "&rol  (@x[$d1],8)",
+
+       "&add   ($xc,@x[$d0])",
+       "&xor   (@x[$b0],$xc)",
+       "&rol   (@x[$b0],7)",
+        "&add  ($xc_,@x[$d1])",
+        "&xor  (@x[$b1],$xc_)",
+        "&rol  (@x[$b1],7)",
+
+       "&mov   (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
+        "&mov  (\"4*$c1(%rsp)\",$xc_)",
+       "&mov   ($xc,\"4*$c2(%rsp)\")",
+        "&mov  ($xc_,\"4*$c3(%rsp)\")",
+
+       "&add   (@x[$a2],@x[$b2])",     # Q3
+       "&xor   (@x[$d2],@x[$a2])",
+       "&rol   (@x[$d2],16)",
+        "&add  (@x[$a3],@x[$b3])",     # Q4
+        "&xor  (@x[$d3],@x[$a3])",
+        "&rol  (@x[$d3],16)",
+
+       "&add   ($xc,@x[$d2])",
+       "&xor   (@x[$b2],$xc)",
+       "&rol   (@x[$b2],12)",
+        "&add  ($xc_,@x[$d3])",
+        "&xor  (@x[$b3],$xc_)",
+        "&rol  (@x[$b3],12)",
+
+       "&add   (@x[$a2],@x[$b2])",
+       "&xor   (@x[$d2],@x[$a2])",
+       "&rol   (@x[$d2],8)",
+        "&add  (@x[$a3],@x[$b3])",
+        "&xor  (@x[$d3],@x[$a3])",
+        "&rol  (@x[$d3],8)",
+
+       "&add   ($xc,@x[$d2])",
+       "&xor   (@x[$b2],$xc)",
+       "&rol   (@x[$b2],7)",
+        "&add  ($xc_,@x[$d3])",
+        "&xor  (@x[$b3],$xc_)",
+        "&rol  (@x[$b3],7)"
+       );
+}
+
+########################################################################
+# Generic code path that handles all lengths on pre-SSSE3 processors.
+$code.=<<___;
+.globl ChaCha20_ctr32
+.type  ChaCha20_ctr32,\@function,5
+.align 64
+ChaCha20_ctr32:
+       mov     OPENSSL_ia32cap_P+4(%rip),%r10
+       test    \$`1<<(41-32)`,%r10d
+       jnz     .LChaCha20_ssse3
+
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$64+24,%rsp
+
+       #movdqa .Lsigma(%rip),%xmm0
+       movdqu  ($key),%xmm1
+       movdqu  16($key),%xmm2
+       movdqu  ($counter),%xmm3
+       movdqa  .Lone(%rip),%xmm4
+
+       #movdqa %xmm0,4*0(%rsp)         # key[0]
+       movdqa  %xmm1,4*4(%rsp)         # key[1]
+       movdqa  %xmm2,4*8(%rsp)         # key[2]
+       movdqa  %xmm3,4*12(%rsp)        # key[3]
+       mov     $len,%rbp               # reassign $len
+       jmp     .Loop_outer
+
+.align 32
+.Loop_outer:
+       mov     \$0x61707865,@x[0]      # 'expa'
+       mov     \$0x3320646e,@x[1]      # 'nd 3'
+       mov     \$0x79622d32,@x[2]      # '2-by'
+       mov     \$0x6b206574,@x[3]      # 'te k'
+       mov     4*4(%rsp),@x[4]
+       mov     4*5(%rsp),@x[5]
+       mov     4*6(%rsp),@x[6]
+       mov     4*7(%rsp),@x[7]
+       movd    %xmm3,@x[12]
+       mov     4*13(%rsp),@x[13]
+       mov     4*14(%rsp),@x[14]
+       mov     4*15(%rsp),@x[15]
+
+       mov     %rbp,64+0(%rsp)         # save len
+       mov     \$10,%ebp
+       mov     $inp,64+8(%rsp)         # save inp
+       movq    %xmm2,%rsi              # "@x[8]"
+       mov     $out,64+16(%rsp)        # save out
+       mov     %rsi,%rdi
+       shr     \$32,%rdi               # "@x[9]"
+       jmp     .Loop
+
+.align 32
+.Loop:
+___
+       foreach (&ROUND (0, 4, 8,12)) { eval; }
+       foreach (&ROUND (0, 5,10,15)) { eval; }
+       &dec    ("%ebp");
+       &jnz    (".Loop");
+
+# Finalize one 64-byte block: re-add the saved input state (feed-forward),
+# xor a full block with the input, or spill the keystream and handle the
+# final partial block a byte at a time.  In this path len lives in %rbp
+# and %edx (@x[3]) is free once the state is spilled, so the tail loop's
+# use of %edx/%rbp is safe here (unlike the SSSE3 path where len is %rdx).
+$code.=<<___;
+       mov     @t[1],4*9(%rsp)         # modulo-scheduled
+       mov     @t[0],4*8(%rsp)
+       mov     64(%rsp),%rbp           # load len
+       movdqa  %xmm2,%xmm1
+       mov     64+8(%rsp),$inp         # load inp
+       paddd   %xmm4,%xmm3             # increment counter
+       mov     64+16(%rsp),$out        # load out
+
+       add     \$0x61707865,@x[0]      # 'expa'
+       add     \$0x3320646e,@x[1]      # 'nd 3'
+       add     \$0x79622d32,@x[2]      # '2-by'
+       add     \$0x6b206574,@x[3]      # 'te k'
+       add     4*4(%rsp),@x[4]
+       add     4*5(%rsp),@x[5]
+       add     4*6(%rsp),@x[6]
+       add     4*7(%rsp),@x[7]
+       add     4*12(%rsp),@x[12]
+       add     4*13(%rsp),@x[13]
+       add     4*14(%rsp),@x[14]
+       add     4*15(%rsp),@x[15]
+       paddd   4*8(%rsp),%xmm1
+
+       cmp     \$64,%rbp
+       jb      .Ltail
+
+       xor     4*0($inp),@x[0]         # xor with input
+       xor     4*1($inp),@x[1]
+       xor     4*2($inp),@x[2]
+       xor     4*3($inp),@x[3]
+       xor     4*4($inp),@x[4]
+       xor     4*5($inp),@x[5]
+       xor     4*6($inp),@x[6]
+       xor     4*7($inp),@x[7]
+       movdqu  4*8($inp),%xmm0
+       xor     4*12($inp),@x[12]
+       xor     4*13($inp),@x[13]
+       xor     4*14($inp),@x[14]
+       xor     4*15($inp),@x[15]
+       lea     4*16($inp),$inp         # inp+=64
+       pxor    %xmm1,%xmm0
+
+       movdqa  %xmm2,4*8(%rsp)
+       movd    %xmm3,4*12(%rsp)
+
+       mov     @x[0],4*0($out)         # write output
+       mov     @x[1],4*1($out)
+       mov     @x[2],4*2($out)
+       mov     @x[3],4*3($out)
+       mov     @x[4],4*4($out)
+       mov     @x[5],4*5($out)
+       mov     @x[6],4*6($out)
+       mov     @x[7],4*7($out)
+       movdqu  %xmm0,4*8($out)
+       mov     @x[12],4*12($out)
+       mov     @x[13],4*13($out)
+       mov     @x[14],4*14($out)
+       mov     @x[15],4*15($out)
+       lea     4*16($out),$out         # out+=64
+
+       sub     \$64,%rbp
+       jnz     .Loop_outer
+
+       jmp     .Ldone
+
+.align 16
+.Ltail:
+       mov     @x[0],4*0(%rsp)
+       xor     %rbx,%rbx
+       mov     @x[1],4*1(%rsp)
+       mov     @x[2],4*2(%rsp)
+       mov     @x[3],4*3(%rsp)
+       mov     @x[4],4*4(%rsp)
+       mov     @x[5],4*5(%rsp)
+       mov     @x[6],4*6(%rsp)
+       mov     @x[7],4*7(%rsp)
+       movdqa  %xmm1,4*8(%rsp)
+       mov     @x[12],4*12(%rsp)
+       mov     @x[13],4*13(%rsp)
+       mov     @x[14],4*14(%rsp)
+       mov     @x[15],4*15(%rsp)
+
+.Loop_tail:
+       movzb   ($inp,%rbx),%eax
+       movzb   (%rsp,%rbx),%edx
+       lea     1(%rbx),%rbx
+       xor     %edx,%eax
+       mov     %al,-1($out,%rbx)
+       dec     %rbp
+       jnz     .Loop_tail
+
+.Ldone:
+       add     \$64+24,%rsp
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       ret
+.size  ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+########################################################################
+# SSSE3 code path that handles shorter lengths
+{
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
+
+sub SSSE3ROUND {       # critical path is 20 "SIMD ticks" per round
+       &paddd  ($a,$b);
+       &pxor   ($d,$a);
+       &pshufb ($d,$rot16);
+
+       &paddd  ($c,$d);
+       &pxor   ($b,$c);
+       &movdqa ($t,$b);
+       &psrld  ($b,20);
+       &pslld  ($t,12);
+       &por    ($b,$t);
+
+       &paddd  ($a,$b);
+       &pxor   ($d,$a);
+       &pshufb ($d,$rot24);
+
+       &paddd  ($c,$d);
+       &pxor   ($b,$c);
+       &movdqa ($t,$b);
+       &psrld  ($b,25);
+       &pslld  ($t,7);
+       &por    ($b,$t);
+}
+
+my $xframe = $win64 ? 32+32+8 : 24;
+
+$code.=<<___;
+.type  ChaCha20_ssse3,\@function,5
+.align 32
+ChaCha20_ssse3:
+.LChaCha20_ssse3:
+___
+$code.=<<___   if ($avx);
+       test    \$`1<<(43-32)`,%r10d
+       jnz     .LChaCha20_4xop         # XOP is fastest even if we use 1/4
+___
+$code.=<<___;
+       cmp     \$128,$len              # we might throw away some data,
+       ja      .LChaCha20_4x           # but overall it won't be slower
+
+.Ldo_sse3_after_all:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       sub     \$64+$xframe,%rsp
+___
+$code.=<<___   if ($win64);
+       movaps  %xmm6,64+32(%rsp)
+       movaps  %xmm7,64+48(%rsp)
+___
+$code.=<<___;
+       movdqa  .Lsigma(%rip),$a
+       movdqu  ($key),$b
+       movdqu  16($key),$c
+       movdqu  ($counter),$d
+       movdqa  .Lrot16(%rip),$rot16
+       movdqa  .Lrot24(%rip),$rot24
+
+       movdqa  $a,0x00(%rsp)
+       movdqa  $b,0x10(%rsp)
+       movdqa  $c,0x20(%rsp)
+       movdqa  $d,0x30(%rsp)
+       mov     \$10,%ebp
+       jmp     .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+       movdqa  .Lone(%rip),$d
+       movdqa  0x00(%rsp),$a
+       movdqa  0x10(%rsp),$b
+       movdqa  0x20(%rsp),$c
+       paddd   0x30(%rsp),$d
+       mov     \$10,%ebp
+       movdqa  $d,0x30(%rsp)
+       jmp     .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+___
+       &SSSE3ROUND();
+       &pshufd ($c,$c,0b01001110);
+       &pshufd ($b,$b,0b00111001);
+       &pshufd ($d,$d,0b10010011);
+       &nop    ();
+
+       &SSSE3ROUND();
+       &pshufd ($c,$c,0b01001110);
+       &pshufd ($b,$b,0b10010011);
+       &pshufd ($d,$d,0b00111001);
+
+       &dec    ("%ebp");
+       &jnz    (".Loop_ssse3");
+
+$code.=<<___;
+       paddd   0x00(%rsp),$a
+       paddd   0x10(%rsp),$b
+       paddd   0x20(%rsp),$c
+       paddd   0x30(%rsp),$d
+
+       cmp     \$64,$len
+       jb      .Ltail_ssse3
+
+       movdqu  0x00($inp),$t
+       movdqu  0x10($inp),$t1
+       pxor    $t,$a                   # xor with input
+       movdqu  0x20($inp),$t
+       pxor    $t1,$b
+       movdqu  0x30($inp),$t1
+       lea     0x40($inp),$inp         # inp+=64
+       pxor    $t,$c
+       pxor    $t1,$d
+
+       movdqu  $a,0x00($out)           # write output
+       movdqu  $b,0x10($out)
+       movdqu  $c,0x20($out)
+       movdqu  $d,0x30($out)
+       lea     0x40($out),$out         # out+=64
+
+       sub     \$64,$len
+       jnz     .Loop_outer_ssse3
+
+       jmp     .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+       movdqa  $a,0x00(%rsp)
+       movdqa  $b,0x10(%rsp)
+       movdqa  $c,0x20(%rsp)
+       movdqa  $d,0x30(%rsp)
+       xor     %rbx,%rbx
+
+.Loop_tail_ssse3:
+       movzb   ($inp,%rbx),%eax
+       movzb   (%rsp,%rbx),%edx
+       lea     1(%rbx),%rbx
+       xor     %edx,%eax
+       mov     %al,-1($out,%rbx)
+       inc     %rbp
+       jnz     .Loop_tail_ssse3
+
+.Ldone_ssse3:
+___
+$code.=<<___   if ($win64);
+       movaps  64+32(%rsp),%xmm6
+       movaps  64+48(%rsp),%xmm7
+___
+$code.=<<___;
+       add     \$64+$xframe,%rsp
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       ret
+.size  ChaCha20_ssse3,.-ChaCha20_ssse3
+___
+}
+
+########################################################################
+# SSSE3 code path that handles longer messages.
+{
+# assign variables to favor Atom front-end
+my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
+    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
+my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+       "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
+
+# Emit one ChaCha double-round step for the 4x SSSE3 path, returned as
+# a list of instruction strings which the caller evals.  The arguments
+# are @xx indices (a,b,c,d) of the first quarter-round; the other three
+# quarter-rounds are derived by rotating the low two bits of each index
+# within its 4-aligned row, so passing (0,4,8,12) yields the even-round
+# columns and (0,5,10,15) the odd-round diagonals.
+# The caller is expected to have %r10 -> .Lrot16 and %r11 -> .Lrot24
+# (pshufb tables for the 16- and 8-bit rotates) and $t1 ($xt3)
+# preloaded with the rot16 table on entry.
+sub SSSE3_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
+my @x=map("\"$_\"",@xx);
+
+       # Consider order in which variables are addressed by their
+       # index:
+       #
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+       #
+       # 'a', 'b' and 'd's are permanently allocated in registers,
+       # @x[0..7,12..15], while 'c's are maintained in memory. If
+       # you observe 'c' column, you'll notice that pair of 'c's is
+       # invariant between rounds. This means that we have to reload
+       # them once per round, in the middle. This is why you'll see
+       # bunch of 'c' stores and loads in the middle, but none in
+       # the beginning or end.
+
+       (
+       "&paddd         (@x[$a0],@x[$b0])",     # Q1
+        "&paddd        (@x[$a1],@x[$b1])",     # Q2
+       "&pxor          (@x[$d0],@x[$a0])",
+        "&pxor         (@x[$d1],@x[$a1])",
+       "&pshufb        (@x[$d0],$t1)",
+        "&pshufb       (@x[$d1],$t1)",
+
+       "&paddd         ($xc,@x[$d0])",
+        "&paddd        ($xc_,@x[$d1])",
+       "&pxor          (@x[$b0],$xc)",
+        "&pxor         (@x[$b1],$xc_)",
+       "&movdqa        ($t0,@x[$b0])",
+       "&pslld         (@x[$b0],12)",
+       "&psrld         ($t0,20)",
+        "&movdqa       ($t1,@x[$b1])",
+        "&pslld        (@x[$b1],12)",
+       "&por           (@x[$b0],$t0)",
+        "&psrld        ($t1,20)",
+       "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
+        "&por          (@x[$b1],$t1)",
+
+       "&paddd         (@x[$a0],@x[$b0])",
+        "&paddd        (@x[$a1],@x[$b1])",
+       "&pxor          (@x[$d0],@x[$a0])",
+        "&pxor         (@x[$d1],@x[$a1])",
+       "&pshufb        (@x[$d0],$t0)",
+        "&pshufb       (@x[$d1],$t0)",
+
+       "&paddd         ($xc,@x[$d0])",
+        "&paddd        ($xc_,@x[$d1])",
+       "&pxor          (@x[$b0],$xc)",
+        "&pxor         (@x[$b1],$xc_)",
+       "&movdqa        ($t1,@x[$b0])",
+       "&pslld         (@x[$b0],7)",
+       "&psrld         ($t1,25)",
+        "&movdqa       ($t0,@x[$b1])",
+        "&pslld        (@x[$b1],7)",
+       "&por           (@x[$b0],$t1)",
+        "&psrld        ($t0,25)",
+       "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
+        "&por          (@x[$b1],$t0)",
+
+       "&movdqa        (\"`16*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
+        "&movdqa       (\"`16*($c1-8)`(%rsp)\",$xc_)",
+       "&movdqa        ($xc,\"`16*($c2-8)`(%rsp)\")",
+        "&movdqa       ($xc_,\"`16*($c3-8)`(%rsp)\")",
+
+       "&paddd         (@x[$a2],@x[$b2])",     # Q3
+        "&paddd        (@x[$a3],@x[$b3])",     # Q4
+       "&pxor          (@x[$d2],@x[$a2])",
+        "&pxor         (@x[$d3],@x[$a3])",
+       "&pshufb        (@x[$d2],$t1)",
+        "&pshufb       (@x[$d3],$t1)",
+
+       "&paddd         ($xc,@x[$d2])",
+        "&paddd        ($xc_,@x[$d3])",
+       "&pxor          (@x[$b2],$xc)",
+        "&pxor         (@x[$b3],$xc_)",
+       "&movdqa        ($t0,@x[$b2])",
+       "&pslld         (@x[$b2],12)",
+       "&psrld         ($t0,20)",
+        "&movdqa       ($t1,@x[$b3])",
+        "&pslld        (@x[$b3],12)",
+       "&por           (@x[$b2],$t0)",
+        "&psrld        ($t1,20)",
+       "&movdqa        ($t0,'(%r11)')",        # .Lrot24(%rip)
+        "&por          (@x[$b3],$t1)",
+
+       "&paddd         (@x[$a2],@x[$b2])",
+        "&paddd        (@x[$a3],@x[$b3])",
+       "&pxor          (@x[$d2],@x[$a2])",
+        "&pxor         (@x[$d3],@x[$a3])",
+       "&pshufb        (@x[$d2],$t0)",
+        "&pshufb       (@x[$d3],$t0)",
+
+       "&paddd         ($xc,@x[$d2])",
+        "&paddd        ($xc_,@x[$d3])",
+       "&pxor          (@x[$b2],$xc)",
+        "&pxor         (@x[$b3],$xc_)",
+       "&movdqa        ($t1,@x[$b2])",
+       "&pslld         (@x[$b2],7)",
+       "&psrld         ($t1,25)",
+        "&movdqa       ($t0,@x[$b3])",
+        "&pslld        (@x[$b3],7)",
+       "&por           (@x[$b2],$t1)",
+        "&psrld        ($t0,25)",
+       "&movdqa        ($t1,'(%r10)')",        # .Lrot16(%rip)
+        "&por          (@x[$b3],$t0)"
+       );
+}
+
+# Extra frame for the ten 16-byte xmm save slots on Win64 (xmm6-15 are
+# non-volatile under that ABI); zero elsewhere.
+my $xframe = $win64 ? 0xa0 : 0;
+
+$code.=<<___;
+.type  ChaCha20_4x,\@function,5
+.align 32
+ChaCha20_4x:
+.LChaCha20_4x:
+       mov             %r10,%r11
+___
+# If AVX2 is available, defer to the wider 8x code path instead.
+$code.=<<___   if ($avx>1);
+       shr             \$32,%r10               # OPENSSL_ia32cap_P+8
+       test            \$`1<<5`,%r10           # test AVX2
+       jnz             .LChaCha20_8x
+___
+# Short inputs (<=192 bytes) on a CPU that looks like Atom (MOVBE
+# without XSAVE) are routed back to the 1x SSSE3 path.
+$code.=<<___;
+       cmp             \$192,$len
+       ja              .Lproceed4x
+
+       and             \$`1<<26|1<<22`,%r11    # isolate XSAVE+MOVBE
+       cmp             \$`1<<22`,%r11          # check for MOVBE without XSAVE
+       je              .Ldo_sse3_after_all     # to detect Atom
+
+.Lproceed4x:
+       lea             -0x78(%rsp),%r11
+       sub             \$0x148+$xframe,%rsp
+___
+       ################ stack layout
+       # +0x00         SIMD equivalent of @x[8-12]
+       # ...
+       # +0x40         constant copy of key[0-2] smashed by lanes
+       # ...
+       # +0x100        SIMD counters (with nonce smashed by lanes)
+       # ...
+       # +0x140
+# Win64: preserve non-volatile xmm6-15 above the local frame.
+$code.=<<___   if ($win64);
+       movaps          %xmm6,-0x30(%r11)
+       movaps          %xmm7,-0x20(%r11)
+       movaps          %xmm8,-0x10(%r11)
+       movaps          %xmm9,0x00(%r11)
+       movaps          %xmm10,0x10(%r11)
+       movaps          %xmm11,0x20(%r11)
+       movaps          %xmm12,0x30(%r11)
+       movaps          %xmm13,0x40(%r11)
+       movaps          %xmm14,0x50(%r11)
+       movaps          %xmm15,0x60(%r11)
+___
+# Load key material and "smash" each 32-bit state word across a full
+# XMM register (one register per state word, one lane per block), then
+# stash the constant copies on the stack.  Lane counters get .Linc so
+# lane j processes block counter+j; counters are saved to their slot
+# only at .Loop_enter4x, after the first increment.
+$code.=<<___;
+       movdqa          .Lsigma(%rip),$xa3      # key[0]
+       movdqu          ($key),$xb3             # key[1]
+       movdqu          16($key),$xt3           # key[2]
+       movdqu          ($counter),$xd3         # key[3]
+       lea             0x100(%rsp),%rcx        # size optimization
+       lea             .Lrot16(%rip),%r10
+       lea             .Lrot24(%rip),%r11
+
+       pshufd          \$0x00,$xa3,$xa0        # smash key by lanes...
+       pshufd          \$0x55,$xa3,$xa1
+       movdqa          $xa0,0x40(%rsp)         # ... and offload
+       pshufd          \$0xaa,$xa3,$xa2
+       movdqa          $xa1,0x50(%rsp)
+       pshufd          \$0xff,$xa3,$xa3
+       movdqa          $xa2,0x60(%rsp)
+       movdqa          $xa3,0x70(%rsp)
+
+       pshufd          \$0x00,$xb3,$xb0
+       pshufd          \$0x55,$xb3,$xb1
+       movdqa          $xb0,0x80-0x100(%rcx)
+       pshufd          \$0xaa,$xb3,$xb2
+       movdqa          $xb1,0x90-0x100(%rcx)
+       pshufd          \$0xff,$xb3,$xb3
+       movdqa          $xb2,0xa0-0x100(%rcx)
+       movdqa          $xb3,0xb0-0x100(%rcx)
+
+       pshufd          \$0x00,$xt3,$xt0        # "$xc0"
+       pshufd          \$0x55,$xt3,$xt1        # "$xc1"
+       movdqa          $xt0,0xc0-0x100(%rcx)
+       pshufd          \$0xaa,$xt3,$xt2        # "$xc2"
+       movdqa          $xt1,0xd0-0x100(%rcx)
+       pshufd          \$0xff,$xt3,$xt3        # "$xc3"
+       movdqa          $xt2,0xe0-0x100(%rcx)
+       movdqa          $xt3,0xf0-0x100(%rcx)
+
+       pshufd          \$0x00,$xd3,$xd0
+       pshufd          \$0x55,$xd3,$xd1
+       paddd           .Linc(%rip),$xd0        # don't save counters yet
+       pshufd          \$0xaa,$xd3,$xd2
+       movdqa          $xd1,0x110-0x100(%rcx)
+       pshufd          \$0xff,$xd3,$xd3
+       movdqa          $xd2,0x120-0x100(%rcx)
+       movdqa          $xd3,0x130-0x100(%rcx)
+
+       jmp             .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+       movdqa          0x40(%rsp),$xa0         # re-load smashed key
+       movdqa          0x50(%rsp),$xa1
+       movdqa          0x60(%rsp),$xa2
+       movdqa          0x70(%rsp),$xa3
+       movdqa          0x80-0x100(%rcx),$xb0
+       movdqa          0x90-0x100(%rcx),$xb1
+       movdqa          0xa0-0x100(%rcx),$xb2
+       movdqa          0xb0-0x100(%rcx),$xb3
+       movdqa          0xc0-0x100(%rcx),$xt0   # "$xc0"
+       movdqa          0xd0-0x100(%rcx),$xt1   # "$xc1"
+       movdqa          0xe0-0x100(%rcx),$xt2   # "$xc2"
+       movdqa          0xf0-0x100(%rcx),$xt3   # "$xc3"
+       movdqa          0x100-0x100(%rcx),$xd0
+       movdqa          0x110-0x100(%rcx),$xd1
+       movdqa          0x120-0x100(%rcx),$xd2
+       movdqa          0x130-0x100(%rcx),$xd3
+       paddd           .Lfour(%rip),$xd0       # next SIMD counters
+
+.Loop_enter4x:
+       movdqa          $xt2,0x20(%rsp)         # SIMD equivalent of "@x[10]"
+       movdqa          $xt3,0x30(%rsp)         # SIMD equivalent of "@x[11]"
+       movdqa          (%r10),$xt3             # .Lrot16(%rip)
+       mov             \$10,%eax
+       movdqa          $xd0,0x100-0x100(%rcx)  # save SIMD counters
+       jmp             .Loop4x
+
+.align 32
+.Loop4x:
+___
+       # 10 iterations of (even,odd) double rounds = 20 ChaCha rounds.
+       foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
+       foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+       dec             %eax
+       jnz             .Loop4x
+
+       paddd           0x40(%rsp),$xa0         # accumulate key material
+       paddd           0x50(%rsp),$xa1
+       paddd           0x60(%rsp),$xa2
+       paddd           0x70(%rsp),$xa3
+
+       movdqa          $xa0,$xt2               # "de-interlace" data
+       punpckldq       $xa1,$xa0
+       movdqa          $xa2,$xt3
+       punpckldq       $xa3,$xa2
+       punpckhdq       $xa1,$xt2
+       punpckhdq       $xa3,$xt3
+       movdqa          $xa0,$xa1
+       punpcklqdq      $xa2,$xa0               # "a0"
+       movdqa          $xt2,$xa3
+       punpcklqdq      $xt3,$xt2               # "a2"
+       punpckhqdq      $xa2,$xa1               # "a1"
+       punpckhqdq      $xt3,$xa3               # "a3"
+___
+       # Perl-level rename: after the transpose "a2" lives in $xt2, so
+       # swap the variables rather than moving data.
+       ($xa2,$xt2)=($xt2,$xa2);
+$code.=<<___;
+       paddd           0x80-0x100(%rcx),$xb0
+       paddd           0x90-0x100(%rcx),$xb1
+       paddd           0xa0-0x100(%rcx),$xb2
+       paddd           0xb0-0x100(%rcx),$xb3
+
+       movdqa          $xa0,0x00(%rsp)         # offload $xaN
+       movdqa          $xa1,0x10(%rsp)
+       movdqa          0x20(%rsp),$xa0         # "xc2"
+       movdqa          0x30(%rsp),$xa1         # "xc3"
+
+       movdqa          $xb0,$xt2
+       punpckldq       $xb1,$xb0
+       movdqa          $xb2,$xt3
+       punpckldq       $xb3,$xb2
+       punpckhdq       $xb1,$xt2
+       punpckhdq       $xb3,$xt3
+       movdqa          $xb0,$xb1
+       punpcklqdq      $xb2,$xb0               # "b0"
+       movdqa          $xt2,$xb3
+       punpcklqdq      $xt3,$xt2               # "b2"
+       punpckhqdq      $xb2,$xb1               # "b1"
+       punpckhqdq      $xt3,$xb3               # "b3"
+___
+       # Same renaming trick for the "b" row, and name the registers
+       # that currently hold the "c" row for the next heredoc.
+       ($xb2,$xt2)=($xt2,$xb2);
+       my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
+$code.=<<___;
+       paddd           0xc0-0x100(%rcx),$xc0
+       paddd           0xd0-0x100(%rcx),$xc1
+       paddd           0xe0-0x100(%rcx),$xc2
+       paddd           0xf0-0x100(%rcx),$xc3
+
+       movdqa          $xa2,0x20(%rsp)         # keep offloading $xaN
+       movdqa          $xa3,0x30(%rsp)
+
+       movdqa          $xc0,$xt2
+       punpckldq       $xc1,$xc0
+       movdqa          $xc2,$xt3
+       punpckldq       $xc3,$xc2
+       punpckhdq       $xc1,$xt2
+       punpckhdq       $xc3,$xt3
+       movdqa          $xc0,$xc1
+       punpcklqdq      $xc2,$xc0               # "c0"
+       movdqa          $xt2,$xc3
+       punpcklqdq      $xt3,$xt2               # "c2"
+       punpckhqdq      $xc2,$xc1               # "c1"
+       punpckhqdq      $xt3,$xc3               # "c3"
+___
+       ($xc2,$xt2)=($xt2,$xc2);
+       ($xt0,$xt1)=($xa2,$xa3);                # use $xaN as temporary
+$code.=<<___;
+       paddd           0x100-0x100(%rcx),$xd0
+       paddd           0x110-0x100(%rcx),$xd1
+       paddd           0x120-0x100(%rcx),$xd2
+       paddd           0x130-0x100(%rcx),$xd3
+
+       movdqa          $xd0,$xt2
+       punpckldq       $xd1,$xd0
+       movdqa          $xd2,$xt3
+       punpckldq       $xd3,$xd2
+       punpckhdq       $xd1,$xt2
+       punpckhdq       $xd3,$xt3
+       movdqa          $xd0,$xd1
+       punpcklqdq      $xd2,$xd0               # "d0"
+       movdqa          $xt2,$xd3
+       punpcklqdq      $xt3,$xt2               # "d2"
+       punpckhqdq      $xd2,$xd1               # "d1"
+       punpckhqdq      $xt3,$xd3               # "d3"
+___
+       ($xd2,$xt2)=($xt2,$xd2);
+# Fast path: whole multiples of 256 bytes are xor'ed straight through.
+# Tails of 1-3 whole blocks reuse the .L64/.L128/.L192_or_more4x
+# ladders; any sub-64-byte remainder is handled byte-by-byte via a
+# keystream copy at the bottom of the frame (.Loop_tail4x).
+$code.=<<___;
+       cmp             \$64*4,$len
+       jb              .Ltail4x
+
+       movdqu          0x00($inp),$xt0         # xor with input
+       movdqu          0x10($inp),$xt1
+       movdqu          0x20($inp),$xt2
+       movdqu          0x30($inp),$xt3
+       pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
+       pxor            $xb0,$xt1
+       pxor            $xc0,$xt2
+       pxor            $xd0,$xt3
+
+        movdqu         $xt0,0x00($out)
+       movdqu          0x40($inp),$xt0
+        movdqu         $xt1,0x10($out)
+       movdqu          0x50($inp),$xt1
+        movdqu         $xt2,0x20($out)
+       movdqu          0x60($inp),$xt2
+        movdqu         $xt3,0x30($out)
+       movdqu          0x70($inp),$xt3
+       lea             0x80($inp),$inp         # size optimization
+       pxor            0x10(%rsp),$xt0
+       pxor            $xb1,$xt1
+       pxor            $xc1,$xt2
+       pxor            $xd1,$xt3
+
+        movdqu         $xt0,0x40($out)
+       movdqu          0x00($inp),$xt0
+        movdqu         $xt1,0x50($out)
+       movdqu          0x10($inp),$xt1
+        movdqu         $xt2,0x60($out)
+       movdqu          0x20($inp),$xt2
+        movdqu         $xt3,0x70($out)
+        lea            0x80($out),$out         # size optimization
+       movdqu          0x30($inp),$xt3
+       pxor            0x20(%rsp),$xt0
+       pxor            $xb2,$xt1
+       pxor            $xc2,$xt2
+       pxor            $xd2,$xt3
+
+        movdqu         $xt0,0x00($out)
+       movdqu          0x40($inp),$xt0
+        movdqu         $xt1,0x10($out)
+       movdqu          0x50($inp),$xt1
+        movdqu         $xt2,0x20($out)
+       movdqu          0x60($inp),$xt2
+        movdqu         $xt3,0x30($out)
+       movdqu          0x70($inp),$xt3
+       lea             0x80($inp),$inp         # inp+=64*4
+       pxor            0x30(%rsp),$xt0
+       pxor            $xb3,$xt1
+       pxor            $xc3,$xt2
+       pxor            $xd3,$xt3
+       movdqu          $xt0,0x40($out)
+       movdqu          $xt1,0x50($out)
+       movdqu          $xt2,0x60($out)
+       movdqu          $xt3,0x70($out)
+       lea             0x80($out),$out         # out+=64*4
+
+       sub             \$64*4,$len
+       jnz             .Loop_outer4x
+
+       jmp             .Ldone4x
+
+.Ltail4x:
+       cmp             \$192,$len
+       jae             .L192_or_more4x
+       cmp             \$128,$len
+       jae             .L128_or_more4x
+       cmp             \$64,$len
+       jae             .L64_or_more4x
+
+       #movdqa         0x00(%rsp),$xt0         # $xaN is offloaded, remember?
+       xor             %r10,%r10
+       #movdqa         $xt0,0x00(%rsp)
+       movdqa          $xb0,0x10(%rsp)
+       movdqa          $xc0,0x20(%rsp)
+       movdqa          $xd0,0x30(%rsp)
+       jmp             .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+       movdqu          0x00($inp),$xt0         # xor with input
+       movdqu          0x10($inp),$xt1
+       movdqu          0x20($inp),$xt2
+       movdqu          0x30($inp),$xt3
+       pxor            0x00(%rsp),$xt0         # $xaxN is offloaded, remember?
+       pxor            $xb0,$xt1
+       pxor            $xc0,$xt2
+       pxor            $xd0,$xt3
+       movdqu          $xt0,0x00($out)
+       movdqu          $xt1,0x10($out)
+       movdqu          $xt2,0x20($out)
+       movdqu          $xt3,0x30($out)
+       je              .Ldone4x
+
+       movdqa          0x10(%rsp),$xt0         # $xaN is offloaded, remember?
+       lea             0x40($inp),$inp         # inp+=64*1
+       xor             %r10,%r10
+       movdqa          $xt0,0x00(%rsp)
+       movdqa          $xb1,0x10(%rsp)
+       lea             0x40($out),$out         # out+=64*1
+       movdqa          $xc1,0x20(%rsp)
+       sub             \$64,$len               # len-=64*1
+       movdqa          $xd1,0x30(%rsp)
+       jmp             .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+       movdqu          0x00($inp),$xt0         # xor with input
+       movdqu          0x10($inp),$xt1
+       movdqu          0x20($inp),$xt2
+       movdqu          0x30($inp),$xt3
+       pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
+       pxor            $xb0,$xt1
+       pxor            $xc0,$xt2
+       pxor            $xd0,$xt3
+
+        movdqu         $xt0,0x00($out)
+       movdqu          0x40($inp),$xt0
+        movdqu         $xt1,0x10($out)
+       movdqu          0x50($inp),$xt1
+        movdqu         $xt2,0x20($out)
+       movdqu          0x60($inp),$xt2
+        movdqu         $xt3,0x30($out)
+       movdqu          0x70($inp),$xt3
+       pxor            0x10(%rsp),$xt0
+       pxor            $xb1,$xt1
+       pxor            $xc1,$xt2
+       pxor            $xd1,$xt3
+       movdqu          $xt0,0x40($out)
+       movdqu          $xt1,0x50($out)
+       movdqu          $xt2,0x60($out)
+       movdqu          $xt3,0x70($out)
+       je              .Ldone4x
+
+       movdqa          0x20(%rsp),$xt0         # $xaN is offloaded, remember?
+       lea             0x80($inp),$inp         # inp+=64*2
+       xor             %r10,%r10
+       movdqa          $xt0,0x00(%rsp)
+       movdqa          $xb2,0x10(%rsp)
+       lea             0x80($out),$out         # out+=64*2
+       movdqa          $xc2,0x20(%rsp)
+       sub             \$128,$len              # len-=64*2
+       movdqa          $xd2,0x30(%rsp)
+       jmp             .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+       movdqu          0x00($inp),$xt0         # xor with input
+       movdqu          0x10($inp),$xt1
+       movdqu          0x20($inp),$xt2
+       movdqu          0x30($inp),$xt3
+       pxor            0x00(%rsp),$xt0         # $xaN is offloaded, remember?
+       pxor            $xb0,$xt1
+       pxor            $xc0,$xt2
+       pxor            $xd0,$xt3
+
+        movdqu         $xt0,0x00($out)
+       movdqu          0x40($inp),$xt0
+        movdqu         $xt1,0x10($out)
+       movdqu          0x50($inp),$xt1
+        movdqu         $xt2,0x20($out)
+       movdqu          0x60($inp),$xt2
+        movdqu         $xt3,0x30($out)
+       movdqu          0x70($inp),$xt3
+       lea             0x80($inp),$inp         # size optimization
+       pxor            0x10(%rsp),$xt0
+       pxor            $xb1,$xt1
+       pxor            $xc1,$xt2
+       pxor            $xd1,$xt3
+
+        movdqu         $xt0,0x40($out)
+       movdqu          0x00($inp),$xt0
+        movdqu         $xt1,0x50($out)
+       movdqu          0x10($inp),$xt1
+        movdqu         $xt2,0x60($out)
+       movdqu          0x20($inp),$xt2
+        movdqu         $xt3,0x70($out)
+        lea            0x80($out),$out         # size optimization
+       movdqu          0x30($inp),$xt3
+       pxor            0x20(%rsp),$xt0
+       pxor            $xb2,$xt1
+       pxor            $xc2,$xt2
+       pxor            $xd2,$xt3
+       movdqu          $xt0,0x00($out)
+       movdqu          $xt1,0x10($out)
+       movdqu          $xt2,0x20($out)
+       movdqu          $xt3,0x30($out)
+       je              .Ldone4x
+
+       movdqa          0x30(%rsp),$xt0         # $xaN is offloaded, remember?
+       lea             0x40($inp),$inp         # inp+=64*3
+       xor             %r10,%r10
+       movdqa          $xt0,0x00(%rsp)
+       movdqa          $xb3,0x10(%rsp)
+       lea             0x40($out),$out         # out+=64*3
+       movdqa          $xc3,0x20(%rsp)
+       sub             \$192,$len              # len-=64*3
+       movdqa          $xd3,0x30(%rsp)
+
+.Loop_tail4x:
+       movzb           ($inp,%r10),%eax
+       movzb           (%rsp,%r10),%ecx
+       lea             1(%r10),%r10
+       xor             %ecx,%eax
+       mov             %al,-1($out,%r10)
+       dec             $len
+       jnz             .Loop_tail4x
+
+.Ldone4x:
+___
+# Restore the xmm6-15 registers saved in the prologue (Win64 only).
+$code.=<<___   if ($win64);
+       lea             0x140+0x30(%rsp),%r11
+       movaps          -0x30(%r11),%xmm6
+       movaps          -0x20(%r11),%xmm7
+       movaps          -0x10(%r11),%xmm8
+       movaps          0x00(%r11),%xmm9
+       movaps          0x10(%r11),%xmm10
+       movaps          0x20(%r11),%xmm11
+       movaps          0x30(%r11),%xmm12
+       movaps          0x40(%r11),%xmm13
+       movaps          0x50(%r11),%xmm14
+       movaps          0x60(%r11),%xmm15
+___
+$code.=<<___;
+       add             \$0x148+$xframe,%rsp
+       ret
+.size  ChaCha20_4x,.-ChaCha20_4x
+___
+}
+
+########################################################################
+# XOP code path that handles all lengths.
+if ($avx) {
+# There is some "anomaly" observed depending on instructions' size or
+# alignment. If you look closely at below code you'll notice that
+# sometimes argument order varies. The order affects instruction
+# encoding by making it larger, and such fiddling gives 5% performance
+# improvement. This is on FX-4100...
+
+# Unlike the SSSE3 path, XOP's non-destructive 3-operand forms leave
+# enough registers to keep all 16 state words resident, so every @xx
+# slot is a real register here (no "%nox" placeholders).
+my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
+    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
+my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+        $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
+
+# Emit one ChaCha double-round step for the 4x XOP path as a list of
+# instruction strings (caller evals them).  Index derivation is the
+# same rotate-within-row scheme as SSSE3_lane_ROUND; the rotates use
+# XOP's vprotd instead of shift/or or pshufb, and all four
+# quarter-rounds are interleaved since nothing spills to the stack.
+sub XOP_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("\"$_\"",@xx);
+
+       (
+       "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
+        "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
+         "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",     # Q3
+          "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",     # Q4
+       "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
+        "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
+         "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
+          "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
+       "&vprotd        (@x[$d0],@x[$d0],16)",
+        "&vprotd       (@x[$d1],@x[$d1],16)",
+         "&vprotd      (@x[$d2],@x[$d2],16)",
+          "&vprotd     (@x[$d3],@x[$d3],16)",
+
+       "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
+        "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
+         "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
+          "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
+       "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
+        "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
+         "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
+          "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
+       "&vprotd        (@x[$b0],@x[$b0],12)",
+        "&vprotd       (@x[$b1],@x[$b1],12)",
+         "&vprotd      (@x[$b2],@x[$b2],12)",
+          "&vprotd     (@x[$b3],@x[$b3],12)",
+
+       "&vpaddd        (@x[$a0],@x[$b0],@x[$a0])",     # flip
+        "&vpaddd       (@x[$a1],@x[$b1],@x[$a1])",     # flip
+         "&vpaddd      (@x[$a2],@x[$a2],@x[$b2])",
+          "&vpaddd     (@x[$a3],@x[$a3],@x[$b3])",
+       "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
+        "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
+         "&vpxor       (@x[$d2],@x[$a2],@x[$d2])",
+          "&vpxor      (@x[$d3],@x[$a3],@x[$d3])",
+       "&vprotd        (@x[$d0],@x[$d0],8)",
+        "&vprotd       (@x[$d1],@x[$d1],8)",
+         "&vprotd      (@x[$d2],@x[$d2],8)",
+          "&vprotd     (@x[$d3],@x[$d3],8)",
+
+       "&vpaddd        (@x[$c0],@x[$c0],@x[$d0])",
+        "&vpaddd       (@x[$c1],@x[$c1],@x[$d1])",
+         "&vpaddd      (@x[$c2],@x[$c2],@x[$d2])",
+          "&vpaddd     (@x[$c3],@x[$c3],@x[$d3])",
+       "&vpxor         (@x[$b0],@x[$c0],@x[$b0])",
+        "&vpxor        (@x[$b1],@x[$c1],@x[$b1])",
+         "&vpxor       (@x[$b2],@x[$b2],@x[$c2])",     # flip
+          "&vpxor      (@x[$b3],@x[$b3],@x[$c3])",     # flip
+       "&vprotd        (@x[$b0],@x[$b0],7)",
+        "&vprotd       (@x[$b1],@x[$b1],7)",
+         "&vprotd      (@x[$b2],@x[$b2],7)",
+          "&vprotd     (@x[$b3],@x[$b3],7)"
+       );
+}
+
+# Extra frame for xmm6-15 save slots on Win64; zero elsewhere.
+my $xframe = $win64 ? 0xa0 : 0;
+
+$code.=<<___;
+.type  ChaCha20_4xop,\@function,5
+.align 32
+ChaCha20_4xop:
+.LChaCha20_4xop:
+       lea             -0x78(%rsp),%r11
+       sub             \$0x148+$xframe,%rsp
+___
+       ################ stack layout
+       # +0x00         SIMD equivalent of @x[8-12]
+       # ...
+       # +0x40         constant copy of key[0-2] smashed by lanes
+       # ...
+       # +0x100        SIMD counters (with nonce smashed by lanes)
+       # ...
+       # +0x140
+# Win64: preserve non-volatile xmm6-15 above the local frame.
+$code.=<<___   if ($win64);
+       movaps          %xmm6,-0x30(%r11)
+       movaps          %xmm7,-0x20(%r11)
+       movaps          %xmm8,-0x10(%r11)
+       movaps          %xmm9,0x00(%r11)
+       movaps          %xmm10,0x10(%r11)
+       movaps          %xmm11,0x20(%r11)
+       movaps          %xmm12,0x30(%r11)
+       movaps          %xmm13,0x40(%r11)
+       movaps          %xmm14,0x50(%r11)
+       movaps          %xmm15,0x60(%r11)
+___
+# Same "smash key by lanes" setup as ChaCha20_4x, in AVX/XOP encoding;
+# vzeroupper avoids AVX/SSE transition penalties.
+$code.=<<___;
+       vzeroupper
+
+       vmovdqa         .Lsigma(%rip),$xa3      # key[0]
+       vmovdqu         ($key),$xb3             # key[1]
+       vmovdqu         16($key),$xt3           # key[2]
+       vmovdqu         ($counter),$xd3         # key[3]
+       lea             0x100(%rsp),%rcx        # size optimization
+
+       vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
+       vpshufd         \$0x55,$xa3,$xa1
+       vmovdqa         $xa0,0x40(%rsp)         # ... and offload
+       vpshufd         \$0xaa,$xa3,$xa2
+       vmovdqa         $xa1,0x50(%rsp)
+       vpshufd         \$0xff,$xa3,$xa3
+       vmovdqa         $xa2,0x60(%rsp)
+       vmovdqa         $xa3,0x70(%rsp)
+
+       vpshufd         \$0x00,$xb3,$xb0
+       vpshufd         \$0x55,$xb3,$xb1
+       vmovdqa         $xb0,0x80-0x100(%rcx)
+       vpshufd         \$0xaa,$xb3,$xb2
+       vmovdqa         $xb1,0x90-0x100(%rcx)
+       vpshufd         \$0xff,$xb3,$xb3
+       vmovdqa         $xb2,0xa0-0x100(%rcx)
+       vmovdqa         $xb3,0xb0-0x100(%rcx)
+
+       vpshufd         \$0x00,$xt3,$xt0        # "$xc0"
+       vpshufd         \$0x55,$xt3,$xt1        # "$xc1"
+       vmovdqa         $xt0,0xc0-0x100(%rcx)
+       vpshufd         \$0xaa,$xt3,$xt2        # "$xc2"
+       vmovdqa         $xt1,0xd0-0x100(%rcx)
+       vpshufd         \$0xff,$xt3,$xt3        # "$xc3"
+       vmovdqa         $xt2,0xe0-0x100(%rcx)
+       vmovdqa         $xt3,0xf0-0x100(%rcx)
+
+       vpshufd         \$0x00,$xd3,$xd0
+       vpshufd         \$0x55,$xd3,$xd1
+       vpaddd          .Linc(%rip),$xd0,$xd0   # don't save counters yet
+       vpshufd         \$0xaa,$xd3,$xd2
+       vmovdqa         $xd1,0x110-0x100(%rcx)
+       vpshufd         \$0xff,$xd3,$xd3
+       vmovdqa         $xd2,0x120-0x100(%rcx)
+       vmovdqa         $xd3,0x130-0x100(%rcx)
+
+       jmp             .Loop_enter4xop
+
+.align 32
+.Loop_outer4xop:
+       vmovdqa         0x40(%rsp),$xa0         # re-load smashed key
+       vmovdqa         0x50(%rsp),$xa1
+       vmovdqa         0x60(%rsp),$xa2
+       vmovdqa         0x70(%rsp),$xa3
+       vmovdqa         0x80-0x100(%rcx),$xb0
+       vmovdqa         0x90-0x100(%rcx),$xb1
+       vmovdqa         0xa0-0x100(%rcx),$xb2
+       vmovdqa         0xb0-0x100(%rcx),$xb3
+       vmovdqa         0xc0-0x100(%rcx),$xt0   # "$xc0"
+       vmovdqa         0xd0-0x100(%rcx),$xt1   # "$xc1"
+       vmovdqa         0xe0-0x100(%rcx),$xt2   # "$xc2"
+       vmovdqa         0xf0-0x100(%rcx),$xt3   # "$xc3"
+       vmovdqa         0x100-0x100(%rcx),$xd0
+       vmovdqa         0x110-0x100(%rcx),$xd1
+       vmovdqa         0x120-0x100(%rcx),$xd2
+       vmovdqa         0x130-0x100(%rcx),$xd3
+       vpaddd          .Lfour(%rip),$xd0,$xd0  # next SIMD counters
+
+.Loop_enter4xop:
+       mov             \$10,%eax
+       vmovdqa         $xd0,0x100-0x100(%rcx)  # save SIMD counters
+       jmp             .Loop4xop
+
+.align 32
+.Loop4xop:
+___
+       # 10 iterations of (even,odd) double rounds = 20 ChaCha rounds.
+       foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
+       foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+       dec             %eax
+       jnz             .Loop4xop
+
+       vpaddd          0x40(%rsp),$xa0,$xa0    # accumulate key material
+       vpaddd          0x50(%rsp),$xa1,$xa1
+       vpaddd          0x60(%rsp),$xa2,$xa2
+       vpaddd          0x70(%rsp),$xa3,$xa3
+
+       vmovdqa         $xt2,0x20(%rsp)         # offload $xc2,3
+       vmovdqa         $xt3,0x30(%rsp)
+
+       vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
+       vpunpckldq      $xa3,$xa2,$xt3
+       vpunpckhdq      $xa1,$xa0,$xa0
+       vpunpckhdq      $xa3,$xa2,$xa2
+       vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
+       vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
+       vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
+___
+        # Perl-level rename so the variables track which physical
+        # register holds each transposed row (no data movement).
+        ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
+$code.=<<___;
+       vpaddd          0x80-0x100(%rcx),$xb0,$xb0
+       vpaddd          0x90-0x100(%rcx),$xb1,$xb1
+       vpaddd          0xa0-0x100(%rcx),$xb2,$xb2
+       vpaddd          0xb0-0x100(%rcx),$xb3,$xb3
+
+       vmovdqa         $xa0,0x00(%rsp)         # offload $xa0,1
+       vmovdqa         $xa1,0x10(%rsp)
+       vmovdqa         0x20(%rsp),$xa0         # "xc2"
+       vmovdqa         0x30(%rsp),$xa1         # "xc3"
+
+       vpunpckldq      $xb1,$xb0,$xt2
+       vpunpckldq      $xb3,$xb2,$xt3
+       vpunpckhdq      $xb1,$xb0,$xb0
+       vpunpckhdq      $xb3,$xb2,$xb2
+       vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
+       vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
+       vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
+___
+       ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
+       my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
+$code.=<<___;
+       vpaddd          0xc0-0x100(%rcx),$xc0,$xc0
+       vpaddd          0xd0-0x100(%rcx),$xc1,$xc1
+       vpaddd          0xe0-0x100(%rcx),$xc2,$xc2
+       vpaddd          0xf0-0x100(%rcx),$xc3,$xc3
+
+       vpunpckldq      $xc1,$xc0,$xt2
+       vpunpckldq      $xc3,$xc2,$xt3
+       vpunpckhdq      $xc1,$xc0,$xc0
+       vpunpckhdq      $xc3,$xc2,$xc2
+       vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
+       vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
+       vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
+___
+       ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
+$code.=<<___;
+       vpaddd          0x100-0x100(%rcx),$xd0,$xd0
+       vpaddd          0x110-0x100(%rcx),$xd1,$xd1
+       vpaddd          0x120-0x100(%rcx),$xd2,$xd2
+       vpaddd          0x130-0x100(%rcx),$xd3,$xd3
+
+       vpunpckldq      $xd1,$xd0,$xt2
+       vpunpckldq      $xd3,$xd2,$xt3
+       vpunpckhdq      $xd1,$xd0,$xd0
+       vpunpckhdq      $xd3,$xd2,$xd2
+       vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
+       vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
+       vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
+___
+       ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
+       ($xa0,$xa1)=($xt2,$xt3);
+$code.=<<___;
+       vmovdqa         0x00(%rsp),$xa0         # restore $xa0,1
+       vmovdqa         0x10(%rsp),$xa1
+
+       cmp             \$64*4,$len
+       jb              .Ltail4xop
+
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x10($inp),$xb0,$xb0
+       vpxor           0x20($inp),$xc0,$xc0
+       vpxor           0x30($inp),$xd0,$xd0
+       vpxor           0x40($inp),$xa1,$xa1
+       vpxor           0x50($inp),$xb1,$xb1
+       vpxor           0x60($inp),$xc1,$xc1
+       vpxor           0x70($inp),$xd1,$xd1
+       lea             0x80($inp),$inp         # size optimization
+       vpxor           0x00($inp),$xa2,$xa2
+       vpxor           0x10($inp),$xb2,$xb2
+       vpxor           0x20($inp),$xc2,$xc2
+       vpxor           0x30($inp),$xd2,$xd2
+       vpxor           0x40($inp),$xa3,$xa3
+       vpxor           0x50($inp),$xb3,$xb3
+       vpxor           0x60($inp),$xc3,$xc3
+       vpxor           0x70($inp),$xd3,$xd3
+       lea             0x80($inp),$inp         # inp+=64*4
+
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x10($out)
+       vmovdqu         $xc0,0x20($out)
+       vmovdqu         $xd0,0x30($out)
+       vmovdqu         $xa1,0x40($out)
+       vmovdqu         $xb1,0x50($out)
+       vmovdqu         $xc1,0x60($out)
+       vmovdqu         $xd1,0x70($out)
+       lea             0x80($out),$out         # size optimization
+       vmovdqu         $xa2,0x00($out)
+       vmovdqu         $xb2,0x10($out)
+       vmovdqu         $xc2,0x20($out)
+       vmovdqu         $xd2,0x30($out)
+       vmovdqu         $xa3,0x40($out)
+       vmovdqu         $xb3,0x50($out)
+       vmovdqu         $xc3,0x60($out)
+       vmovdqu         $xd3,0x70($out)
+       lea             0x80($out),$out         # out+=64*4
+
+       sub             \$64*4,$len
+       jnz             .Loop_outer4xop
+
+       jmp             .Ldone4xop
+
+.align 32
+.Ltail4xop:
+       cmp             \$192,$len
+       jae             .L192_or_more4xop
+       cmp             \$128,$len
+       jae             .L128_or_more4xop
+       cmp             \$64,$len
+       jae             .L64_or_more4xop
+
+       xor             %r10,%r10
+       vmovdqa         $xa0,0x00(%rsp)
+       vmovdqa         $xb0,0x10(%rsp)
+       vmovdqa         $xc0,0x20(%rsp)
+       vmovdqa         $xd0,0x30(%rsp)
+       jmp             .Loop_tail4xop
+
+.align 32
+.L64_or_more4xop:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x10($inp),$xb0,$xb0
+       vpxor           0x20($inp),$xc0,$xc0
+       vpxor           0x30($inp),$xd0,$xd0
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x10($out)
+       vmovdqu         $xc0,0x20($out)
+       vmovdqu         $xd0,0x30($out)
+       je              .Ldone4xop
+
+       lea             0x40($inp),$inp         # inp+=64*1
+       vmovdqa         $xa1,0x00(%rsp)
+       xor             %r10,%r10
+       vmovdqa         $xb1,0x10(%rsp)
+       lea             0x40($out),$out         # out+=64*1
+       vmovdqa         $xc1,0x20(%rsp)
+       sub             \$64,$len               # len-=64*1
+       vmovdqa         $xd1,0x30(%rsp)
+       jmp             .Loop_tail4xop
+
+.align 32
+.L128_or_more4xop:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x10($inp),$xb0,$xb0
+       vpxor           0x20($inp),$xc0,$xc0
+       vpxor           0x30($inp),$xd0,$xd0
+       vpxor           0x40($inp),$xa1,$xa1
+       vpxor           0x50($inp),$xb1,$xb1
+       vpxor           0x60($inp),$xc1,$xc1
+       vpxor           0x70($inp),$xd1,$xd1
+
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x10($out)
+       vmovdqu         $xc0,0x20($out)
+       vmovdqu         $xd0,0x30($out)
+       vmovdqu         $xa1,0x40($out)
+       vmovdqu         $xb1,0x50($out)
+       vmovdqu         $xc1,0x60($out)
+       vmovdqu         $xd1,0x70($out)
+       je              .Ldone4xop
+
+       lea             0x80($inp),$inp         # inp+=64*2
+       vmovdqa         $xa2,0x00(%rsp)
+       xor             %r10,%r10
+       vmovdqa         $xb2,0x10(%rsp)
+       lea             0x80($out),$out         # out+=64*2
+       vmovdqa         $xc2,0x20(%rsp)
+       sub             \$128,$len              # len-=64*2
+       vmovdqa         $xd2,0x30(%rsp)
+       jmp             .Loop_tail4xop
+
+.align 32
+.L192_or_more4xop:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x10($inp),$xb0,$xb0
+       vpxor           0x20($inp),$xc0,$xc0
+       vpxor           0x30($inp),$xd0,$xd0
+       vpxor           0x40($inp),$xa1,$xa1
+       vpxor           0x50($inp),$xb1,$xb1
+       vpxor           0x60($inp),$xc1,$xc1
+       vpxor           0x70($inp),$xd1,$xd1
+       lea             0x80($inp),$inp         # size optimization
+       vpxor           0x00($inp),$xa2,$xa2
+       vpxor           0x10($inp),$xb2,$xb2
+       vpxor           0x20($inp),$xc2,$xc2
+       vpxor           0x30($inp),$xd2,$xd2
+
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x10($out)
+       vmovdqu         $xc0,0x20($out)
+       vmovdqu         $xd0,0x30($out)
+       vmovdqu         $xa1,0x40($out)
+       vmovdqu         $xb1,0x50($out)
+       vmovdqu         $xc1,0x60($out)
+       vmovdqu         $xd1,0x70($out)
+       lea             0x80($out),$out         # size optimization
+       vmovdqu         $xa2,0x00($out)
+       vmovdqu         $xb2,0x10($out)
+       vmovdqu         $xc2,0x20($out)
+       vmovdqu         $xd2,0x30($out)
+       je              .Ldone4xop
+
+       lea             0x40($inp),$inp         # inp+=64*3
+       vmovdqa         $xa3,0x00(%rsp)         # stash *4th* keystream block: blocks 0-2
+       xor             %r10,%r10               # were consumed above, $x?2 now hold ciphertext
+       vmovdqa         $xb3,0x10(%rsp)
+       lea             0x40($out),$out         # out+=64*3
+       vmovdqa         $xc3,0x20(%rsp)
+       sub             \$192,$len              # len-=64*3
+       vmovdqa         $xd3,0x30(%rsp)
+
+.Loop_tail4xop:
+       movzb           ($inp,%r10),%eax
+       movzb           (%rsp,%r10),%ecx
+       lea             1(%r10),%r10
+       xor             %ecx,%eax
+       mov             %al,-1($out,%r10)
+       dec             $len
+       jnz             .Loop_tail4xop
+
+.Ldone4xop:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea             0x140+0x30(%rsp),%r11
+       movaps          -0x30(%r11),%xmm6
+       movaps          -0x20(%r11),%xmm7
+       movaps          -0x10(%r11),%xmm8
+       movaps          0x00(%r11),%xmm9
+       movaps          0x10(%r11),%xmm10
+       movaps          0x20(%r11),%xmm11
+       movaps          0x30(%r11),%xmm12
+       movaps          0x40(%r11),%xmm13
+       movaps          0x50(%r11),%xmm14
+       movaps          0x60(%r11),%xmm15
+___
+$code.=<<___;
+       add             \$0x148+$xframe,%rsp
+       ret
+.size  ChaCha20_4xop,.-ChaCha20_4xop
+___
+}
+
+########################################################################
+# AVX2 code path
+if ($avx>1) {
+my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
+    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
+my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+       "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
+
+sub AVX2_lane_ROUND {                  # returns a list of "&insn(...)" strings; caller evals them
+my ($a0,$b0,$c0,$d0)=@_;               # @x indices of the first quarter-round's a/b/c/d
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));        # advance low 2 bits mod 4: Q2 indices
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));        # Q3 indices
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));        # Q4 indices
+my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);       # quoted ymm names: two in-register 'c's + scratch
+my @x=map("\"$_\"",@xx);
+
+       # Consider order in which variables are addressed by their
+       # index:
+       #
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+       #
+       # 'a', 'b' and 'd's are permanently allocated in registers,
+       # @x[0..7,12..15], while 'c's are maintained in memory. If
+       # you observe 'c' column, you'll notice that pair of 'c's is
+       # invariant between rounds. This means that we have to reload
+       # them once per round, in the middle. This is why you'll see
+       # bunch of 'c' stores and loads in the middle, but none in
+       # the beginning or end.
+
+       (
+       "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",     # Q1
+       "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
+       "&vpshufb       (@x[$d0],@x[$d0],$t1)",         # d = (d^a)<<<16, $t1 holds .Lrot16
+        "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",     # Q2
+        "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
+        "&vpshufb      (@x[$d1],@x[$d1],$t1)",
+
+       "&vpaddd        ($xc,$xc,@x[$d0])",
+       "&vpxor         (@x[$b0],$xc,@x[$b0])",
+       "&vpslld        ($t0,@x[$b0],12)",
+       "&vpsrld        (@x[$b0],@x[$b0],20)",
+       "&vpor          (@x[$b0],$t0,@x[$b0])",         # b = (b^c)<<<12
+       "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
+        "&vpaddd       ($xc_,$xc_,@x[$d1])",
+        "&vpxor        (@x[$b1],$xc_,@x[$b1])",
+        "&vpslld       ($t1,@x[$b1],12)",
+        "&vpsrld       (@x[$b1],@x[$b1],20)",
+        "&vpor         (@x[$b1],$t1,@x[$b1])",
+
+       "&vpaddd        (@x[$a0],@x[$a0],@x[$b0])",
+       "&vpxor         (@x[$d0],@x[$a0],@x[$d0])",
+       "&vpshufb       (@x[$d0],@x[$d0],$t0)",         # byte-shuffle rotate with $t0 = .Lrot24
+        "&vpaddd       (@x[$a1],@x[$a1],@x[$b1])",
+        "&vpxor        (@x[$d1],@x[$a1],@x[$d1])",
+        "&vpshufb      (@x[$d1],@x[$d1],$t0)",
+
+       "&vpaddd        ($xc,$xc,@x[$d0])",
+       "&vpxor         (@x[$b0],$xc,@x[$b0])",
+       "&vpslld        ($t1,@x[$b0],7)",
+       "&vpsrld        (@x[$b0],@x[$b0],25)",
+       "&vpor          (@x[$b0],$t1,@x[$b0])",         # b = (b^c)<<<7
+       "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
+        "&vpaddd       ($xc_,$xc_,@x[$d1])",
+        "&vpxor        (@x[$b1],$xc_,@x[$b1])",
+        "&vpslld       ($t0,@x[$b1],7)",
+        "&vpsrld       (@x[$b1],@x[$b1],25)",
+        "&vpor         (@x[$b1],$t0,@x[$b1])",
+
+       "&vmovdqa       (\"`32*($c0-8)`(%rsp)\",$xc)",  # reload pair of 'c's
+        "&vmovdqa      (\"`32*($c1-8)`(%rsp)\",$xc_)",
+       "&vmovdqa       ($xc,\"`32*($c2-8)`(%rsp)\")",
+        "&vmovdqa      ($xc_,\"`32*($c3-8)`(%rsp)\")",
+
+       "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",     # Q3
+       "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
+       "&vpshufb       (@x[$d2],@x[$d2],$t1)",
+        "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",     # Q4
+        "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
+        "&vpshufb      (@x[$d3],@x[$d3],$t1)",
+
+       "&vpaddd        ($xc,$xc,@x[$d2])",
+       "&vpxor         (@x[$b2],$xc,@x[$b2])",
+       "&vpslld        ($t0,@x[$b2],12)",
+       "&vpsrld        (@x[$b2],@x[$b2],20)",
+       "&vpor          (@x[$b2],$t0,@x[$b2])",
+       "&vbroadcasti128($t0,'(%r11)')",                # .Lrot24(%rip)
+        "&vpaddd       ($xc_,$xc_,@x[$d3])",
+        "&vpxor        (@x[$b3],$xc_,@x[$b3])",
+        "&vpslld       ($t1,@x[$b3],12)",
+        "&vpsrld       (@x[$b3],@x[$b3],20)",
+        "&vpor         (@x[$b3],$t1,@x[$b3])",
+
+       "&vpaddd        (@x[$a2],@x[$a2],@x[$b2])",
+       "&vpxor         (@x[$d2],@x[$a2],@x[$d2])",
+       "&vpshufb       (@x[$d2],@x[$d2],$t0)",
+        "&vpaddd       (@x[$a3],@x[$a3],@x[$b3])",
+        "&vpxor        (@x[$d3],@x[$a3],@x[$d3])",
+        "&vpshufb      (@x[$d3],@x[$d3],$t0)",
+
+       "&vpaddd        ($xc,$xc,@x[$d2])",
+       "&vpxor         (@x[$b2],$xc,@x[$b2])",
+       "&vpslld        ($t1,@x[$b2],7)",
+       "&vpsrld        (@x[$b2],@x[$b2],25)",
+       "&vpor          (@x[$b2],$t1,@x[$b2])",
+       "&vbroadcasti128($t1,'(%r10)')",                # .Lrot16(%rip)
+        "&vpaddd       ($xc_,$xc_,@x[$d3])",
+        "&vpxor        (@x[$b3],$xc_,@x[$b3])",
+        "&vpslld       ($t0,@x[$b3],7)",
+        "&vpsrld       (@x[$b3],@x[$b3],25)",
+        "&vpor         (@x[$b3],$t0,@x[$b3])"
+       );
+}
+
+my $xframe = $win64 ? 0xb0 : 8;                # extra stack past 0x280 data area: xmm6-15 spill on Win64
+
+$code.=<<___;
+.type  ChaCha20_8x,\@function,5
+.align 32
+ChaCha20_8x:
+.LChaCha20_8x:
+       mov             %rsp,%r10               # keep caller %rsp; frame is re-aligned below
+       sub             \$0x280+$xframe,%rsp
+       and             \$-32,%rsp              # 32-byte align for ymm vmovdqa to/from stack
+___
+$code.=<<___   if ($win64);
+       lea             0x290+0x30(%rsp),%r11   # xmm6-15 are callee-saved on Win64
+       movaps          %xmm6,-0x30(%r11)
+       movaps          %xmm7,-0x20(%r11)
+       movaps          %xmm8,-0x10(%r11)
+       movaps          %xmm9,0x00(%r11)
+       movaps          %xmm10,0x10(%r11)
+       movaps          %xmm11,0x20(%r11)
+       movaps          %xmm12,0x30(%r11)
+       movaps          %xmm13,0x40(%r11)
+       movaps          %xmm14,0x50(%r11)
+       movaps          %xmm15,0x60(%r11)
+___
+$code.=<<___;
+       vzeroupper
+       mov             %r10,0x280(%rsp)
+
+       ################ stack layout
+       # +0x00         SIMD equivalent of @x[8..11] (the in-memory 'c' rows)
+       # ...
+       # +0x80         constant copy of key[0-2] smashed by lanes
+       # ...
+       # +0x200        SIMD counters (with nonce smashed by lanes)
+       # ...
+       # +0x280        saved %rsp
+
+       vbroadcasti128  .Lsigma(%rip),$xa3      # key[0]
+       vbroadcasti128  ($key),$xb3             # key[1]
+       vbroadcasti128  16($key),$xt3           # key[2]
+       vbroadcasti128  ($counter),$xd3         # key[3]
+       lea             0x100(%rsp),%rcx        # size optimization
+       lea             0x200(%rsp),%rax        # size optimization
+       lea             .Lrot16(%rip),%r10
+       lea             .Lrot24(%rip),%r11
+
+       vpshufd         \$0x00,$xa3,$xa0        # smash key by lanes...
+       vpshufd         \$0x55,$xa3,$xa1
+       vmovdqa         $xa0,0x80-0x100(%rcx)   # ... and offload
+       vpshufd         \$0xaa,$xa3,$xa2
+       vmovdqa         $xa1,0xa0-0x100(%rcx)
+       vpshufd         \$0xff,$xa3,$xa3
+       vmovdqa         $xa2,0xc0-0x100(%rcx)
+       vmovdqa         $xa3,0xe0-0x100(%rcx)
+
+       vpshufd         \$0x00,$xb3,$xb0
+       vpshufd         \$0x55,$xb3,$xb1
+       vmovdqa         $xb0,0x100-0x100(%rcx)
+       vpshufd         \$0xaa,$xb3,$xb2
+       vmovdqa         $xb1,0x120-0x100(%rcx)
+       vpshufd         \$0xff,$xb3,$xb3
+       vmovdqa         $xb2,0x140-0x100(%rcx)
+       vmovdqa         $xb3,0x160-0x100(%rcx)
+
+       vpshufd         \$0x00,$xt3,$xt0        # "xc0"
+       vpshufd         \$0x55,$xt3,$xt1        # "xc1"
+       vmovdqa         $xt0,0x180-0x200(%rax)
+       vpshufd         \$0xaa,$xt3,$xt2        # "xc2"
+       vmovdqa         $xt1,0x1a0-0x200(%rax)
+       vpshufd         \$0xff,$xt3,$xt3        # "xc3"
+       vmovdqa         $xt2,0x1c0-0x200(%rax)
+       vmovdqa         $xt3,0x1e0-0x200(%rax)
+
+       vpshufd         \$0x00,$xd3,$xd0
+       vpshufd         \$0x55,$xd3,$xd1
+       vpaddd          .Lincy(%rip),$xd0,$xd0  # don't save counters yet
+       vpshufd         \$0xaa,$xd3,$xd2
+       vmovdqa         $xd1,0x220-0x200(%rax)
+       vpshufd         \$0xff,$xd3,$xd3
+       vmovdqa         $xd2,0x240-0x200(%rax)
+       vmovdqa         $xd3,0x260-0x200(%rax)
+
+       jmp             .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+       vmovdqa         0x80-0x100(%rcx),$xa0   # re-load smashed key
+       vmovdqa         0xa0-0x100(%rcx),$xa1
+       vmovdqa         0xc0-0x100(%rcx),$xa2
+       vmovdqa         0xe0-0x100(%rcx),$xa3
+       vmovdqa         0x100-0x100(%rcx),$xb0
+       vmovdqa         0x120-0x100(%rcx),$xb1
+       vmovdqa         0x140-0x100(%rcx),$xb2
+       vmovdqa         0x160-0x100(%rcx),$xb3
+       vmovdqa         0x180-0x200(%rax),$xt0  # "xc0"
+       vmovdqa         0x1a0-0x200(%rax),$xt1  # "xc1"
+       vmovdqa         0x1c0-0x200(%rax),$xt2  # "xc2"
+       vmovdqa         0x1e0-0x200(%rax),$xt3  # "xc3"
+       vmovdqa         0x200-0x200(%rax),$xd0
+       vmovdqa         0x220-0x200(%rax),$xd1
+       vmovdqa         0x240-0x200(%rax),$xd2
+       vmovdqa         0x260-0x200(%rax),$xd3
+       vpaddd          .Leight(%rip),$xd0,$xd0 # next SIMD counters
+
+.Loop_enter8x:
+       vmovdqa         $xt2,0x40(%rsp)         # SIMD equivalent of "@x[10]"
+       vmovdqa         $xt3,0x60(%rsp)         # SIMD equivalent of "@x[11]"
+       vbroadcasti128  (%r10),$xt3             # load .Lrot16 into $t1 scratch
+       vmovdqa         $xd0,0x200-0x200(%rax)  # save SIMD counters
+       mov             \$10,%eax               # 10 iterations x 2 rounds = 20 ChaCha rounds
+       jmp             .Loop8x
+
+.align 32
+.Loop8x:
+___
+       foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }        # even (column) round
+       foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }        # odd (diagonal) round
+$code.=<<___;
+       dec             %eax
+       jnz             .Loop8x
+
+       lea             0x200(%rsp),%rax        # size optimization (%eax was the loop counter)
+       vpaddd          0x80-0x100(%rcx),$xa0,$xa0      # accumulate key
+       vpaddd          0xa0-0x100(%rcx),$xa1,$xa1
+       vpaddd          0xc0-0x100(%rcx),$xa2,$xa2
+       vpaddd          0xe0-0x100(%rcx),$xa3,$xa3
+
+       vpunpckldq      $xa1,$xa0,$xt2          # "de-interlace" data
+       vpunpckldq      $xa3,$xa2,$xt3
+       vpunpckhdq      $xa1,$xa0,$xa0
+       vpunpckhdq      $xa3,$xa2,$xa2
+       vpunpcklqdq     $xt3,$xt2,$xa1          # "a0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "a1"
+       vpunpcklqdq     $xa2,$xa0,$xa3          # "a2"
+       vpunpckhqdq     $xa2,$xa0,$xa0          # "a3"
+___
+       ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);  # perl-level rename to match new contents
+$code.=<<___;
+       vpaddd          0x100-0x100(%rcx),$xb0,$xb0
+       vpaddd          0x120-0x100(%rcx),$xb1,$xb1
+       vpaddd          0x140-0x100(%rcx),$xb2,$xb2
+       vpaddd          0x160-0x100(%rcx),$xb3,$xb3
+
+       vpunpckldq      $xb1,$xb0,$xt2
+       vpunpckldq      $xb3,$xb2,$xt3
+       vpunpckhdq      $xb1,$xb0,$xb0
+       vpunpckhdq      $xb3,$xb2,$xb2
+       vpunpcklqdq     $xt3,$xt2,$xb1          # "b0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "b1"
+       vpunpcklqdq     $xb2,$xb0,$xb3          # "b2"
+       vpunpckhqdq     $xb2,$xb0,$xb0          # "b3"
+___
+       ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
+$code.=<<___;
+       vperm2i128      \$0x20,$xb0,$xa0,$xt3   # "de-interlace" further
+       vperm2i128      \$0x31,$xb0,$xa0,$xb0
+       vperm2i128      \$0x20,$xb1,$xa1,$xa0
+       vperm2i128      \$0x31,$xb1,$xa1,$xb1
+       vperm2i128      \$0x20,$xb2,$xa2,$xa1
+       vperm2i128      \$0x31,$xb2,$xa2,$xb2
+       vperm2i128      \$0x20,$xb3,$xa3,$xa2
+       vperm2i128      \$0x31,$xb3,$xa3,$xb3
+___
+       ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
+       my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
+$code.=<<___;
+       vmovdqa         $xa0,0x00(%rsp)         # offload $xaN
+       vmovdqa         $xa1,0x20(%rsp)
+       vmovdqa         0x40(%rsp),$xc2         # $xa0
+       vmovdqa         0x60(%rsp),$xc3         # $xa1
+
+       vpaddd          0x180-0x200(%rax),$xc0,$xc0
+       vpaddd          0x1a0-0x200(%rax),$xc1,$xc1
+       vpaddd          0x1c0-0x200(%rax),$xc2,$xc2
+       vpaddd          0x1e0-0x200(%rax),$xc3,$xc3
+
+       vpunpckldq      $xc1,$xc0,$xt2
+       vpunpckldq      $xc3,$xc2,$xt3
+       vpunpckhdq      $xc1,$xc0,$xc0
+       vpunpckhdq      $xc3,$xc2,$xc2
+       vpunpcklqdq     $xt3,$xt2,$xc1          # "c0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "c1"
+       vpunpcklqdq     $xc2,$xc0,$xc3          # "c2"
+       vpunpckhqdq     $xc2,$xc0,$xc0          # "c3"
+___
+       ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
+$code.=<<___;
+       vpaddd          0x200-0x200(%rax),$xd0,$xd0
+       vpaddd          0x220-0x200(%rax),$xd1,$xd1
+       vpaddd          0x240-0x200(%rax),$xd2,$xd2
+       vpaddd          0x260-0x200(%rax),$xd3,$xd3
+
+       vpunpckldq      $xd1,$xd0,$xt2
+       vpunpckldq      $xd3,$xd2,$xt3
+       vpunpckhdq      $xd1,$xd0,$xd0
+       vpunpckhdq      $xd3,$xd2,$xd2
+       vpunpcklqdq     $xt3,$xt2,$xd1          # "d0"
+       vpunpckhqdq     $xt3,$xt2,$xt2          # "d1"
+       vpunpcklqdq     $xd2,$xd0,$xd3          # "d2"
+       vpunpckhqdq     $xd2,$xd0,$xd0          # "d3"
+___
+       ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
+$code.=<<___;
+       vperm2i128      \$0x20,$xd0,$xc0,$xt3   # "de-interlace" further
+       vperm2i128      \$0x31,$xd0,$xc0,$xd0
+       vperm2i128      \$0x20,$xd1,$xc1,$xc0
+       vperm2i128      \$0x31,$xd1,$xc1,$xd1
+       vperm2i128      \$0x20,$xd2,$xc2,$xc1
+       vperm2i128      \$0x31,$xd2,$xc2,$xd2
+       vperm2i128      \$0x20,$xd3,$xc3,$xc2
+       vperm2i128      \$0x31,$xd3,$xc3,$xd3
+___
+       ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
+       ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
+       ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
+       ($xa0,$xa1)=($xt2,$xt3);
+$code.=<<___;
+       vmovdqa         0x00(%rsp),$xa0         # $xaN was offloaded, remember?
+       vmovdqa         0x20(%rsp),$xa1
+
+       cmp             \$64*8,$len             # at least one full 512-byte chunk?
+       jb              .Ltail8x
+
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       lea             0x80($inp),$inp         # size optimization
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       lea             0x80($out),$out         # size optimization
+
+       vpxor           0x00($inp),$xa1,$xa1
+       vpxor           0x20($inp),$xb1,$xb1
+       vpxor           0x40($inp),$xc1,$xc1
+       vpxor           0x60($inp),$xd1,$xd1
+       lea             0x80($inp),$inp         # size optimization
+       vmovdqu         $xa1,0x00($out)
+       vmovdqu         $xb1,0x20($out)
+       vmovdqu         $xc1,0x40($out)
+       vmovdqu         $xd1,0x60($out)
+       lea             0x80($out),$out         # size optimization
+
+       vpxor           0x00($inp),$xa2,$xa2
+       vpxor           0x20($inp),$xb2,$xb2
+       vpxor           0x40($inp),$xc2,$xc2
+       vpxor           0x60($inp),$xd2,$xd2
+       lea             0x80($inp),$inp         # size optimization
+       vmovdqu         $xa2,0x00($out)
+       vmovdqu         $xb2,0x20($out)
+       vmovdqu         $xc2,0x40($out)
+       vmovdqu         $xd2,0x60($out)
+       lea             0x80($out),$out         # size optimization
+
+       vpxor           0x00($inp),$xa3,$xa3
+       vpxor           0x20($inp),$xb3,$xb3
+       vpxor           0x40($inp),$xc3,$xc3
+       vpxor           0x60($inp),$xd3,$xd3
+       lea             0x80($inp),$inp         # size optimization
+       vmovdqu         $xa3,0x00($out)
+       vmovdqu         $xb3,0x20($out)
+       vmovdqu         $xc3,0x40($out)
+       vmovdqu         $xd3,0x60($out)
+       lea             0x80($out),$out         # size optimization
+
+       sub             \$64*8,$len
+       jnz             .Loop_outer8x
+
+       jmp             .Ldone8x
+
+.Ltail8x:                                      # 0x40-byte keystream pairs in stream order:
+       cmp             \$448,$len              # [a0 b0][c0 d0][a1 b1][c1 d1][a2 b2]...
+       jae             .L448_or_more8x
+       cmp             \$384,$len
+       jae             .L384_or_more8x
+       cmp             \$320,$len
+       jae             .L320_or_more8x
+       cmp             \$256,$len
+       jae             .L256_or_more8x
+       cmp             \$192,$len
+       jae             .L192_or_more8x
+       cmp             \$128,$len
+       jae             .L128_or_more8x
+       cmp             \$64,$len
+       jae             .L64_or_more8x
+
+       xor             %r10,%r10
+       vmovdqa         $xa0,0x00(%rsp)
+       vmovdqa         $xb0,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       je              .Ldone8x
+
+       lea             0x40($inp),$inp         # inp+=64*1
+       xor             %r10,%r10
+       vmovdqa         $xc0,0x00(%rsp)
+       lea             0x40($out),$out         # out+=64*1
+       sub             \$64,$len               # len-=64*1
+       vmovdqa         $xd0,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       je              .Ldone8x
+
+       lea             0x80($inp),$inp         # inp+=64*2
+       xor             %r10,%r10
+       vmovdqa         $xa1,0x00(%rsp)
+       lea             0x80($out),$out         # out+=64*2
+       sub             \$128,$len              # len-=64*2
+       vmovdqa         $xb1,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       vpxor           0x80($inp),$xa1,$xa1
+       vpxor           0xa0($inp),$xb1,$xb1
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       vmovdqu         $xa1,0x80($out)
+       vmovdqu         $xb1,0xa0($out)
+       je              .Ldone8x
+
+       lea             0xc0($inp),$inp         # inp+=64*3
+       xor             %r10,%r10
+       vmovdqa         $xc1,0x00(%rsp)
+       lea             0xc0($out),$out         # out+=64*3
+       sub             \$192,$len              # len-=64*3
+       vmovdqa         $xd1,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       vpxor           0x80($inp),$xa1,$xa1
+       vpxor           0xa0($inp),$xb1,$xb1
+       vpxor           0xc0($inp),$xc1,$xc1
+       vpxor           0xe0($inp),$xd1,$xd1
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       vmovdqu         $xa1,0x80($out)
+       vmovdqu         $xb1,0xa0($out)
+       vmovdqu         $xc1,0xc0($out)
+       vmovdqu         $xd1,0xe0($out)
+       je              .Ldone8x
+
+       lea             0x100($inp),$inp        # inp+=64*4
+       xor             %r10,%r10
+       vmovdqa         $xa2,0x00(%rsp)
+       lea             0x100($out),$out        # out+=64*4
+       sub             \$256,$len              # len-=64*4
+       vmovdqa         $xb2,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       vpxor           0x80($inp),$xa1,$xa1
+       vpxor           0xa0($inp),$xb1,$xb1
+       vpxor           0xc0($inp),$xc1,$xc1
+       vpxor           0xe0($inp),$xd1,$xd1
+       vpxor           0x100($inp),$xa2,$xa2
+       vpxor           0x120($inp),$xb2,$xb2
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       vmovdqu         $xa1,0x80($out)
+       vmovdqu         $xb1,0xa0($out)
+       vmovdqu         $xc1,0xc0($out)
+       vmovdqu         $xd1,0xe0($out)
+       vmovdqu         $xa2,0x100($out)
+       vmovdqu         $xb2,0x120($out)
+       je              .Ldone8x
+
+       lea             0x140($inp),$inp        # inp+=64*5
+       xor             %r10,%r10
+       vmovdqa         $xc2,0x00(%rsp)
+       lea             0x140($out),$out        # out+=64*5
+       sub             \$320,$len              # len-=64*5
+       vmovdqa         $xd2,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       vpxor           0x80($inp),$xa1,$xa1
+       vpxor           0xa0($inp),$xb1,$xb1
+       vpxor           0xc0($inp),$xc1,$xc1
+       vpxor           0xe0($inp),$xd1,$xd1
+       vpxor           0x100($inp),$xa2,$xa2
+       vpxor           0x120($inp),$xb2,$xb2
+       vpxor           0x140($inp),$xc2,$xc2
+       vpxor           0x160($inp),$xd2,$xd2
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       vmovdqu         $xa1,0x80($out)
+       vmovdqu         $xb1,0xa0($out)
+       vmovdqu         $xc1,0xc0($out)
+       vmovdqu         $xd1,0xe0($out)
+       vmovdqu         $xa2,0x100($out)
+       vmovdqu         $xb2,0x120($out)
+       vmovdqu         $xc2,0x140($out)
+       vmovdqu         $xd2,0x160($out)
+       je              .Ldone8x
+
+       lea             0x180($inp),$inp        # inp+=64*6
+       xor             %r10,%r10
+       vmovdqa         $xa3,0x00(%rsp)
+       lea             0x180($out),$out        # out+=64*6
+       sub             \$384,$len              # len-=64*6
+       vmovdqa         $xb3,0x20(%rsp)
+       jmp             .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+       vpxor           0x00($inp),$xa0,$xa0    # xor with input
+       vpxor           0x20($inp),$xb0,$xb0
+       vpxor           0x40($inp),$xc0,$xc0
+       vpxor           0x60($inp),$xd0,$xd0
+       vpxor           0x80($inp),$xa1,$xa1
+       vpxor           0xa0($inp),$xb1,$xb1
+       vpxor           0xc0($inp),$xc1,$xc1
+       vpxor           0xe0($inp),$xd1,$xd1
+       vpxor           0x100($inp),$xa2,$xa2
+       vpxor           0x120($inp),$xb2,$xb2
+       vpxor           0x140($inp),$xc2,$xc2
+       vpxor           0x160($inp),$xd2,$xd2
+       vpxor           0x180($inp),$xa3,$xa3
+       vpxor           0x1a0($inp),$xb3,$xb3
+       vmovdqu         $xa0,0x00($out)
+       vmovdqu         $xb0,0x20($out)
+       vmovdqu         $xc0,0x40($out)
+       vmovdqu         $xd0,0x60($out)
+       vmovdqu         $xa1,0x80($out)
+       vmovdqu         $xb1,0xa0($out)
+       vmovdqu         $xc1,0xc0($out)
+       vmovdqu         $xd1,0xe0($out)
+       vmovdqu         $xa2,0x100($out)
+       vmovdqu         $xb2,0x120($out)
+       vmovdqu         $xc2,0x140($out)
+       vmovdqu         $xd2,0x160($out)
+       vmovdqu         $xa3,0x180($out)
+       vmovdqu         $xb3,0x1a0($out)
+       je              .Ldone8x
+
+       lea             0x1c0($inp),$inp        # inp+=64*7
+       xor             %r10,%r10
+       vmovdqa         $xc3,0x00(%rsp)
+       lea             0x1c0($out),$out        # out+=64*7
+       sub             \$448,$len              # len-=64*7
+       vmovdqa         $xd3,0x20(%rsp)
+
+.Loop_tail8x:                                  # byte-wise xor of last $len (<64) bytes
+       movzb           ($inp,%r10),%eax        # against keystream stashed at (%rsp)
+       movzb           (%rsp,%r10),%ecx
+       lea             1(%r10),%r10
+       xor             %ecx,%eax
+       mov             %al,-1($out,%r10)
+       dec             $len
+       jnz             .Loop_tail8x
+
+.Ldone8x:
+       vzeroall                                # zero entire ymm register bank
+___
+$code.=<<___   if ($win64);
+       lea             0x290+0x30(%rsp),%r11   # restore Win64 callee-saved xmm6-15
+       movaps          -0x30(%r11),%xmm6
+       movaps          -0x20(%r11),%xmm7
+       movaps          -0x10(%r11),%xmm8
+       movaps          0x00(%r11),%xmm9
+       movaps          0x10(%r11),%xmm10
+       movaps          0x20(%r11),%xmm11
+       movaps          0x30(%r11),%xmm12
+       movaps          0x40(%r11),%xmm13
+       movaps          0x50(%r11),%xmm14
+       movaps          0x60(%r11),%xmm15
+___
+$code.=<<___;
+       mov             0x280(%rsp),%rsp        # restore original %rsp saved in prologue
+       ret
+.size  ChaCha20_8x,.-ChaCha20_8x
+___
+}
+
+foreach (split("\n",$code)) {                  # post-process and emit the generated assembly
+       s/\`([^\`]*)\`/eval $1/geo;             # expand compile-time `...` arithmetic, e.g. 32*($c0-8)
+
+       s/%x#%y/%x/go;                          # "down-shift": "%x#%ymmN" -> "%xmmN"
+
+       print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";        # surface buffered-write failures (e.g. ENOSPC)
index 63b2301..9ee6fa3 100644 (file)
@@ -17,7 +17,7 @@ sub opsize()
 { my $reg=shift;
     if    ($reg =~ m/^%e/o)            { "l"; }
     elsif ($reg =~ m/^%[a-d][hl]$/o)   { "b"; }
-    elsif ($reg =~ m/^%[xm]/o)         { undef; }
+    elsif ($reg =~ m/^%[yxm]/o)                { undef; }
     else                               { "w"; }
 }
 
index 9d74865..c848843 100644 (file)
@@ -38,6 +38,10 @@ lib: $(LIBOBJ)
 
 poly1305-sparcv9.S:    asm/poly1305-sparcv9.pl
        $(PERL) asm/poly1305-sparcv9.pl > $@
+poly1305-x86.s:                asm/poly1305-x86.pl
+       $(PERL) asm/poly1305-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+poly1305-x86_64.s:     asm/poly1305-x86_64.pl
+       $(PERL) asm/poly1305-x86_64.pl $(PERLASM_SCHEME) > $@
 
 poly1305-%.S:  asm/poly1305-%.pl;      $(PERL) $< $(PERLASM_SCHEME) $@
 
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl
new file mode 100755 (executable)
index 0000000..7c1aee5
--- /dev/null
@@ -0,0 +1,1794 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86.
+#
+# April 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+#              IALU/gcc-3.4(*) SSE2(**)        AVX2
+# Pentium      15.7/+80%       -
+# PIII         6.21/+90%       -
+# P4           19.8/+40%       3.24
+# Core 2       4.85/+90%       1.80
+# Westmere     4.58/+100%      1.43
+# Sandy Bridge 3.90/+100%      1.36
+# Haswell      3.88/+70%       1.18            0.72
+# Silvermont   11.0/+40%       4.80
+# VIA Nano     6.71/+90%       2.47
+# Sledgehammer 3.51/+180%      4.27
+# Bulldozer    4.53/+140%      1.31
+#
+# (*)  gcc 4.8 for some reason generated worse code;
+# (**) besides SSE2 there are floating-point and AVX options; FP
+#      is deemed unnecessary, because pre-SSE2 processors are too
+#      old to care about, while it's not the fastest option on
+#      SSE2-capable ones; AVX is omitted, because it doesn't give
+#      a lot of improvement, 5-10% depending on processor;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386");
+
+$sse2=$avx=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+if ($sse2) {
+       &static_label("const_sse2");
+       &static_label("enter_blocks");
+       &static_label("enter_emit");
+       &external_label("OPENSSL_ia32cap_P");
+
+       if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+                       =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+               $avx = ($1>=2.19) + ($1>=2.22);
+       }
+
+       if (!$avx && $ARGV[0] eq "win32n" &&
+          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+               $avx = ($1>=2.09) + ($1>=2.10);
+       }
+
+       if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+               $avx = ($2>=3.0) + ($2>3.0);
+       }
+}
+
+########################################################################
+# Layout of opaque area is following.
+#
+#      unsigned __int32 h[5];          # current hash value base 2^32
+#      unsigned __int32 pad;           # is_base2_26 in vector context
+#      unsigned __int32 r[4];          # key value base 2^32
+
+&align(64);
+&function_begin("poly1305_init");
+       &mov    ("edi",&wparam(0));             # context
+       &mov    ("esi",&wparam(1));             # key
+       &mov    ("ebp",&wparam(2));             # function table
+
+       &xor    ("eax","eax");
+       &mov    (&DWP(4*0,"edi"),"eax");        # zero hash value
+       &mov    (&DWP(4*1,"edi"),"eax");
+       &mov    (&DWP(4*2,"edi"),"eax");
+       &mov    (&DWP(4*3,"edi"),"eax");
+       &mov    (&DWP(4*4,"edi"),"eax");
+       &mov    (&DWP(4*5,"edi"),"eax");        # is_base2_26
+
+       &cmp    ("esi",0);
+       &je     (&label("nokey"));
+
+    if ($sse2) {
+       &call   (&label("pic_point"));
+    &set_label("pic_point");
+       &blindpop("ebx");
+
+       &lea    ("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
+       &lea    ("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));
+
+       &picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
+       &mov    ("ecx",&DWP(0,"edi"));
+       &and    ("ecx",1<<26|1<<24);
+       &cmp    ("ecx",1<<26|1<<24);            # SSE2 and XMM?
+       &jne    (&label("no_sse2"));
+
+       &lea    ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
+       &lea    ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));
+
+      if ($avx>1) {
+       &mov    ("ecx",&DWP(8,"edi"));
+       &test   ("ecx",1<<5);                   # AVX2?
+       &jz     (&label("no_sse2"));
+
+       &lea    ("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
+      }
+    &set_label("no_sse2");
+       &mov    ("edi",&wparam(0));             # reload context
+       &mov    (&DWP(0,"ebp"),"eax");          # fill function table
+       &mov    (&DWP(4,"ebp"),"edx");
+    }
+
+       &mov    ("eax",&DWP(4*0,"esi"));        # load input key
+       &mov    ("ebx",&DWP(4*1,"esi"));
+       &mov    ("ecx",&DWP(4*2,"esi"));
+       &mov    ("edx",&DWP(4*3,"esi"));
+       &and    ("eax",0x0fffffff);
+       &and    ("ebx",0x0ffffffc);
+       &and    ("ecx",0x0ffffffc);
+       &and    ("edx",0x0ffffffc);
+       &mov    (&DWP(4*6,"edi"),"eax");
+       &mov    (&DWP(4*7,"edi"),"ebx");
+       &mov    (&DWP(4*8,"edi"),"ecx");
+       &mov    (&DWP(4*9,"edi"),"edx");
+
+       &mov    ("eax",$sse2);
+&set_label("nokey");
+&function_end("poly1305_init");
+
+($h0,$h1,$h2,$h3,$h4,
+ $d0,$d1,$d2,$d3,
+ $r0,$r1,$r2,$r3,
+     $s1,$s2,$s3)=map(4*$_,(0..15));
+
+&function_begin("poly1305_blocks");
+       &mov    ("edi",&wparam(0));             # ctx
+       &mov    ("esi",&wparam(1));             # inp
+       &mov    ("ecx",&wparam(2));             # len
+&set_label("enter_blocks");
+       &and    ("ecx",-15);
+       &jz     (&label("nodata"));
+
+       &stack_push(16);
+       &mov    ("eax",&DWP(4*6,"edi"));        # r0
+       &mov    ("ebx",&DWP(4*7,"edi"));        # r1
+        &lea   ("ebp",&DWP(0,"esi","ecx"));    # end of input
+       &mov    ("ecx",&DWP(4*8,"edi"));        # r2
+       &mov    ("edx",&DWP(4*9,"edi"));        # r3
+
+       &mov    (&wparam(2),"ebp");
+       &mov    ("ebp","esi");
+
+       &mov    (&DWP($r0,"esp"),"eax");        # r0
+       &mov    ("eax","ebx");
+       &shr    ("eax",2);
+       &mov    (&DWP($r1,"esp"),"ebx");        # r1
+       &add    ("eax","ebx");                  # s1
+       &mov    ("ebx","ecx");
+       &shr    ("ebx",2);
+       &mov    (&DWP($r2,"esp"),"ecx");        # r2
+       &add    ("ebx","ecx");                  # s2
+       &mov    ("ecx","edx");
+       &shr    ("ecx",2);
+       &mov    (&DWP($r3,"esp"),"edx");        # r3
+       &add    ("ecx","edx");                  # s3
+       &mov    (&DWP($s1,"esp"),"eax");        # s1
+       &mov    (&DWP($s2,"esp"),"ebx");        # s2
+       &mov    (&DWP($s3,"esp"),"ecx");        # s3
+
+       &mov    ("eax",&DWP(4*0,"edi"));        # load hash value
+       &mov    ("ebx",&DWP(4*1,"edi"));
+       &mov    ("ecx",&DWP(4*2,"edi"));
+       &mov    ("esi",&DWP(4*3,"edi"));
+       &mov    ("edi",&DWP(4*4,"edi"));
+       &jmp    (&label("loop"));
+
+&set_label("loop",32);
+       &add    ("eax",&DWP(4*0,"ebp"));        # accumulate input
+       &adc    ("ebx",&DWP(4*1,"ebp"));
+       &adc    ("ecx",&DWP(4*2,"ebp"));
+       &adc    ("esi",&DWP(4*3,"ebp"));
+       &lea    ("ebp",&DWP(4*4,"ebp"));
+       &adc    ("edi",&wparam(3));             # padbit
+
+       &mov    (&DWP($h0,"esp"),"eax");        # put aside hash[+inp]
+       &mov    (&DWP($h3,"esp"),"esi");
+
+       &mul    (&DWP($r0,"esp"));              # h0*r0
+        &mov   (&DWP($h4,"esp"),"edi");
+       &mov    ("edi","eax");
+       &mov    ("eax","ebx");                  # h1
+       &mov    ("esi","edx");
+       &mul    (&DWP($s3,"esp"));              # h1*s3
+       &add    ("edi","eax");
+       &mov    ("eax","ecx");                  # h2
+       &adc    ("esi","edx");
+       &mul    (&DWP($s2,"esp"));              # h2*s2
+       &add    ("edi","eax");
+       &mov    ("eax",&DWP($h3,"esp"));
+       &adc    ("esi","edx");
+       &mul    (&DWP($s1,"esp"));              # h3*s1
+       &add    ("edi","eax");
+        &mov   ("eax",&DWP($h0,"esp"));
+       &adc    ("esi","edx");
+
+       &mul    (&DWP($r1,"esp"));              # h0*r1
+        &mov   (&DWP($d0,"esp"),"edi");
+       &xor    ("edi","edi");
+       &add    ("esi","eax");
+       &mov    ("eax","ebx");                  # h1
+       &adc    ("edi","edx");
+       &mul    (&DWP($r0,"esp"));              # h1*r0
+       &add    ("esi","eax");
+       &mov    ("eax","ecx");                  # h2
+       &adc    ("edi","edx");
+       &mul    (&DWP($s3,"esp"));              # h2*s3
+       &add    ("esi","eax");
+       &mov    ("eax",&DWP($h3,"esp"));
+       &adc    ("edi","edx");
+       &mul    (&DWP($s2,"esp"));              # h3*s2
+       &add    ("esi","eax");
+       &mov    ("eax",&DWP($h4,"esp"));
+       &adc    ("edi","edx");
+       &imul   ("eax",&DWP($s1,"esp"));        # h4*s1
+       &add    ("esi","eax");
+        &mov   ("eax",&DWP($h0,"esp"));
+       &adc    ("edi",0);
+
+       &mul    (&DWP($r2,"esp"));              # h0*r2
+        &mov   (&DWP($d1,"esp"),"esi");
+       &xor    ("esi","esi");
+       &add    ("edi","eax");
+       &mov    ("eax","ebx");                  # h1
+       &adc    ("esi","edx");
+       &mul    (&DWP($r1,"esp"));              # h1*r1
+       &add    ("edi","eax");
+       &mov    ("eax","ecx");                  # h2
+       &adc    ("esi","edx");
+       &mul    (&DWP($r0,"esp"));              # h2*r0
+       &add    ("edi","eax");
+       &mov    ("eax",&DWP($h3,"esp"));
+       &adc    ("esi","edx");
+       &mul    (&DWP($s3,"esp"));              # h3*s3
+       &add    ("edi","eax");
+       &mov    ("eax",&DWP($h4,"esp"));
+       &adc    ("esi","edx");
+       &imul   ("eax",&DWP($s2,"esp"));        # h4*s2
+       &add    ("edi","eax");
+        &mov   ("eax",&DWP($h0,"esp"));
+       &adc    ("esi",0);
+
+       &mul    (&DWP($r3,"esp"));              # h0*r3
+        &mov   (&DWP($d2,"esp"),"edi");
+       &xor    ("edi","edi");
+       &add    ("esi","eax");
+       &mov    ("eax","ebx");                  # h1
+       &adc    ("edi","edx");
+       &mul    (&DWP($r2,"esp"));              # h1*r2
+       &add    ("esi","eax");
+       &mov    ("eax","ecx");                  # h2
+       &adc    ("edi","edx");
+       &mul    (&DWP($r1,"esp"));              # h2*r1
+       &add    ("esi","eax");
+       &mov    ("eax",&DWP($h3,"esp"));
+       &adc    ("edi","edx");
+       &mul    (&DWP($r0,"esp"));              # h3*r0
+       &add    ("esi","eax");
+        &mov   ("ecx",&DWP($h4,"esp"));
+       &adc    ("edi","edx");
+
+       &mov    ("edx","ecx");
+       &imul   ("ecx",&DWP($s3,"esp"));        # h4*s3
+       &add    ("esi","ecx");
+        &mov   ("eax",&DWP($d0,"esp"));
+       &adc    ("edi",0);
+
+       &imul   ("edx",&DWP($r0,"esp"));        # h4*r0
+       &add    ("edx","edi");
+
+       &mov    ("ebx",&DWP($d1,"esp"));
+       &mov    ("ecx",&DWP($d2,"esp"));
+
+       &mov    ("edi","edx");                  # last reduction step
+       &shr    ("edx",2);
+       &and    ("edi",3);
+       &lea    ("edx",&DWP(0,"edx","edx",4));  # *5
+       &add    ("eax","edx");
+       &adc    ("ebx",0);
+       &adc    ("ecx",0);
+       &adc    ("esi",0);
+
+       &cmp    ("ebp",&wparam(2));             # done yet?
+       &jne    (&label("loop"));
+
+       &mov    ("edx",&wparam(0));             # ctx
+       &stack_pop(16);
+       &mov    (&DWP(4*0,"edx"),"eax");        # store hash value
+       &mov    (&DWP(4*1,"edx"),"ebx");
+       &mov    (&DWP(4*2,"edx"),"ecx");
+       &mov    (&DWP(4*3,"edx"),"esi");
+       &mov    (&DWP(4*4,"edx"),"edi");
+&set_label("nodata");
+&function_end("poly1305_blocks");
+
+&function_begin("poly1305_emit");
+       &mov    ("ebp",&wparam(0));             # context
+&set_label("enter_emit");
+       &mov    ("edi",&wparam(1));             # output
+       &mov    ("eax",&DWP(4*0,"ebp"));        # load hash value
+       &mov    ("ebx",&DWP(4*1,"ebp"));
+       &mov    ("ecx",&DWP(4*2,"ebp"));
+       &mov    ("edx",&DWP(4*3,"ebp"));
+       &mov    ("esi",&DWP(4*4,"ebp"));
+
+       &add    ("eax",5);                      # compare to modulus
+       &adc    ("ebx",0);
+       &adc    ("ecx",0);
+       &adc    ("edx",0);
+       &adc    ("esi",0);
+       &shr    ("esi",2);                      # did it carry/borrow?
+       &neg    ("esi");                        # do we choose hash-modulus?
+
+       &and    ("eax","esi");
+       &and    ("ebx","esi");
+       &and    ("ecx","esi");
+       &and    ("edx","esi");
+       &mov    (&DWP(4*0,"edi"),"eax");
+       &mov    (&DWP(4*1,"edi"),"ebx");
+       &mov    (&DWP(4*2,"edi"),"ecx");
+       &mov    (&DWP(4*3,"edi"),"edx");
+
+       &not    ("esi");                        # or original hash value?
+       &mov    ("eax",&DWP(4*0,"ebp"));
+       &mov    ("ebx",&DWP(4*1,"ebp"));
+       &mov    ("ecx",&DWP(4*2,"ebp"));
+       &mov    ("edx",&DWP(4*3,"ebp"));
+       &mov    ("ebp",&wparam(2));
+       &and    ("eax","esi");
+       &and    ("ebx","esi");
+       &and    ("ecx","esi");
+       &and    ("edx","esi");
+       &or     ("eax",&DWP(4*0,"edi"));
+       &or     ("ebx",&DWP(4*1,"edi"));
+       &or     ("ecx",&DWP(4*2,"edi"));
+       &or     ("edx",&DWP(4*3,"edi"));
+
+       &add    ("eax",&DWP(4*0,"ebp"));        # accumulate key
+       &adc    ("ebx",&DWP(4*1,"ebp"));
+       &adc    ("ecx",&DWP(4*2,"ebp"));
+       &adc    ("edx",&DWP(4*3,"ebp"));
+
+       &mov    (&DWP(4*0,"edi"),"eax");
+       &mov    (&DWP(4*1,"edi"),"ebx");
+       &mov    (&DWP(4*2,"edi"),"ecx");
+       &mov    (&DWP(4*3,"edi"),"edx");
+&function_end("poly1305_emit");
+
+if ($sse2) {
+########################################################################
+# Layout of opaque area is following.
+#
+#      unsigned __int32 h[5];          # current hash value base 2^26
+#      unsigned __int32 is_base2_26;
+#      unsigned __int32 r[4];          # key value base 2^32
+#      unsigned __int32 pad[2];
+#      struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
+#
+# where r^n are base 2^26 digits of degrees of multiplier key. There are
+# 5 digits, but the last four are interleaved with multiples of 5,
+# totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
+
+my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
+my $MASK=$T2;  # borrow and keep in mind
+
+&align (32);
+&function_begin_B("_poly1305_init_sse2");
+       &movdqu         ($D4,&QWP(4*6,"edi"));          # key base 2^32
+       &lea            ("edi",&DWP(16*3,"edi"));       # size optimization
+       &mov            ("ebp","esp");
+       &sub            ("esp",16*(9+5));
+       &and            ("esp",-16);
+
+       #&pand          ($D4,&QWP(96,"ebx"));           # magic mask
+       &movq           ($MASK,&QWP(64,"ebx"));
+
+       &movdqa         ($D0,$D4);
+       &movdqa         ($D1,$D4);
+       &movdqa         ($D2,$D4);
+
+       &pand           ($D0,$MASK);                    # -> base 2^26
+       &psrlq          ($D1,26);
+       &psrldq         ($D2,6);
+       &pand           ($D1,$MASK);
+       &movdqa         ($D3,$D2);
+       &psrlq          ($D2,4);
+       &psrlq          ($D3,30);
+       &pand           ($D2,$MASK);
+       &pand           ($D3,$MASK);
+       &psrldq         ($D4,13);
+
+       &lea            ("edx",&DWP(16*9,"esp"));       # size optimization
+       &mov            ("ecx",2);
+&set_label("square");
+       &movdqa         (&QWP(16*0,"esp"),$D0);
+       &movdqa         (&QWP(16*1,"esp"),$D1);
+       &movdqa         (&QWP(16*2,"esp"),$D2);
+       &movdqa         (&QWP(16*3,"esp"),$D3);
+       &movdqa         (&QWP(16*4,"esp"),$D4);
+
+       &movdqa         ($T1,$D1);
+       &movdqa         ($T0,$D2);
+       &pslld          ($T1,2);
+       &pslld          ($T0,2);
+       &paddd          ($T1,$D1);                      # *5
+       &paddd          ($T0,$D2);                      # *5
+       &movdqa         (&QWP(16*5,"esp"),$T1);
+       &movdqa         (&QWP(16*6,"esp"),$T0);
+       &movdqa         ($T1,$D3);
+       &movdqa         ($T0,$D4);
+       &pslld          ($T1,2);
+       &pslld          ($T0,2);
+       &paddd          ($T1,$D3);                      # *5
+       &paddd          ($T0,$D4);                      # *5
+       &movdqa         (&QWP(16*7,"esp"),$T1);
+       &movdqa         (&QWP(16*8,"esp"),$T0);
+
+       &pshufd         ($T1,$D0,0b01000100);
+       &movdqa         ($T0,$D1);
+       &pshufd         ($D1,$D1,0b01000100);
+       &pshufd         ($D2,$D2,0b01000100);
+       &pshufd         ($D3,$D3,0b01000100);
+       &pshufd         ($D4,$D4,0b01000100);
+       &movdqa         (&QWP(16*0,"edx"),$T1);
+       &movdqa         (&QWP(16*1,"edx"),$D1);
+       &movdqa         (&QWP(16*2,"edx"),$D2);
+       &movdqa         (&QWP(16*3,"edx"),$D3);
+       &movdqa         (&QWP(16*4,"edx"),$D4);
+
+       ################################################################
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       &pmuludq        ($D4,$D0);                      # h4*r0
+       &pmuludq        ($D3,$D0);                      # h3*r0
+       &pmuludq        ($D2,$D0);                      # h2*r0
+       &pmuludq        ($D1,$D0);                      # h1*r0
+       &pmuludq        ($D0,$T1);                      # h0*r0
+
+sub pmuladd {
+my $load = shift;
+my $base = shift; $base = "esp" if (!defined($base));
+
+       ################################################################
+       # As for choice to "rotate" $T0-$T2 in order to move paddq
+       # past next multiplication. While it makes code harder to read
+       # and doesn't have significant effect on most processors, it
+       # makes a lot of difference on Atom, up to 30% improvement.
+
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&QWP(16*3,$base));         # r1*h3
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&QWP(16*2,$base));         # r1*h2
+       &paddq          ($D4,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&QWP(16*1,$base));         # r1*h1
+       &paddq          ($D3,$T1);
+       &$load          ($T1,5);                        # s1
+       &pmuludq        ($T0,&QWP(16*0,$base));         # r1*h0
+       &paddq          ($D2,$T2);
+       &pmuludq        ($T1,&QWP(16*4,$base));         # s1*h4
+        &$load         ($T2,2);                        # r2^n
+       &paddq          ($D1,$T0);
+
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&QWP(16*2,$base));         # r2*h2
+        &paddq         ($D0,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&QWP(16*1,$base));         # r2*h1
+       &paddq          ($D4,$T2);
+       &$load          ($T2,6);                        # s2^n
+       &pmuludq        ($T1,&QWP(16*0,$base));         # r2*h0
+       &paddq          ($D3,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&QWP(16*4,$base));         # s2*h4
+       &paddq          ($D2,$T1);
+       &pmuludq        ($T0,&QWP(16*3,$base));         # s2*h3
+        &$load         ($T1,3);                        # r3^n
+       &paddq          ($D1,$T2);
+
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&QWP(16*1,$base));         # r3*h1
+        &paddq         ($D0,$T0);
+       &$load          ($T0,7);                        # s3^n
+       &pmuludq        ($T2,&QWP(16*0,$base));         # r3*h0
+       &paddq          ($D4,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&QWP(16*4,$base));         # s3*h4
+       &paddq          ($D3,$T2);
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&QWP(16*3,$base));         # s3*h3
+       &paddq          ($D2,$T0);
+       &pmuludq        ($T2,&QWP(16*2,$base));         # s3*h2
+        &$load         ($T0,4);                        # r4^n
+       &paddq          ($D1,$T1);
+
+       &$load          ($T1,8);                        # s4^n
+       &pmuludq        ($T0,&QWP(16*0,$base));         # r4*h0
+        &paddq         ($D0,$T2);
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&QWP(16*4,$base));         # s4*h4
+       &paddq          ($D4,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&QWP(16*1,$base));         # s4*h1
+       &paddq          ($D3,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&QWP(16*2,$base));         # s4*h2
+       &paddq          ($D0,$T2);
+       &pmuludq        ($T1,&QWP(16*3,$base));         # s4*h3
+        &movdqa        ($MASK,&QWP(64,"ebx"));
+       &paddq          ($D1,$T0);
+       &paddq          ($D2,$T1);
+}
+       &pmuladd        (sub {  my ($reg,$i)=@_;
+                               &movdqa ($reg,&QWP(16*$i,"esp"));
+                            },"edx");
+
+sub lazy_reduction {
+       ################################################################
+       # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+       # and P. Schwabe
+
+        &movdqa        ($T0,$D3);
+        &pand          ($D3,$MASK);
+        &psrlq         ($T0,26);
+        &paddq         ($T0,$D4);                      # h3 -> h4
+       &movdqa         ($T1,$D0);
+       &pand           ($D0,$MASK);
+       &psrlq          ($T1,26);
+        &movdqa        ($D4,$T0);
+       &paddq          ($T1,$D1);                      # h0 -> h1
+        &psrlq         ($T0,26);
+        &pand          ($D4,$MASK);
+       &movdqa         ($D1,$T1);
+       &psrlq          ($T1,26);
+        &paddd         ($D0,$T0);                      # favour paddd when
+                                                       # possible, because
+                                                       # paddq is "broken"
+                                                       # on Atom
+       &pand           ($D1,$MASK);
+       &paddq          ($T1,$D2);                      # h1 -> h2
+        &psllq         ($T0,2);
+       &movdqa         ($D2,$T1);
+       &psrlq          ($T1,26);
+        &paddd         ($T0,$D0);                      # h4 -> h0
+       &pand           ($D2,$MASK);
+       &paddd          ($T1,$D3);                      # h2 -> h3
+        &movdqa        ($D0,$T0);
+        &psrlq         ($T0,26);
+       &movdqa         ($D3,$T1);
+       &psrlq          ($T1,26);
+        &pand          ($D0,$MASK);
+        &paddd         ($D1,$T0);                      # h0 -> h1
+       &pand           ($D3,$MASK);
+       &paddd          ($D4,$T1);                      # h3 -> h4
+}
+       &lazy_reduction ();
+
+       &dec            ("ecx");
+       &jz             (&label("square_break"));
+
+       &punpcklqdq     ($D0,&QWP(16*0,"esp"));         # 0:r^1:0:r^2
+       &punpcklqdq     ($D1,&QWP(16*1,"esp"));
+       &punpcklqdq     ($D2,&QWP(16*2,"esp"));
+       &punpcklqdq     ($D3,&QWP(16*3,"esp"));
+       &punpcklqdq     ($D4,&QWP(16*4,"esp"));
+       &jmp            (&label("square"));
+
+&set_label("square_break");
+       &psllq          ($D0,32);                       # -> r^3:0:r^4:0
+       &psllq          ($D1,32);
+       &psllq          ($D2,32);
+       &psllq          ($D3,32);
+       &psllq          ($D4,32);
+       &por            ($D0,&QWP(16*0,"esp"));         # r^3:r^1:r^4:r^2
+       &por            ($D1,&QWP(16*1,"esp"));
+       &por            ($D2,&QWP(16*2,"esp"));
+       &por            ($D3,&QWP(16*3,"esp"));
+       &por            ($D4,&QWP(16*4,"esp"));
+
+       &pshufd         ($D0,$D0,0b10001101);           # -> r^1:r^2:r^3:r^4
+       &pshufd         ($D1,$D1,0b10001101);
+       &pshufd         ($D2,$D2,0b10001101);
+       &pshufd         ($D3,$D3,0b10001101);
+       &pshufd         ($D4,$D4,0b10001101);
+
+       &movdqu         (&QWP(16*0,"edi"),$D0);         # save the table
+       &movdqu         (&QWP(16*1,"edi"),$D1);
+       &movdqu         (&QWP(16*2,"edi"),$D2);
+       &movdqu         (&QWP(16*3,"edi"),$D3);
+       &movdqu         (&QWP(16*4,"edi"),$D4);
+
+       &movdqa         ($T1,$D1);
+       &movdqa         ($T0,$D2);
+       &pslld          ($T1,2);
+       &pslld          ($T0,2);
+       &paddd          ($T1,$D1);                      # *5
+       &paddd          ($T0,$D2);                      # *5
+       &movdqu         (&QWP(16*5,"edi"),$T1);
+       &movdqu         (&QWP(16*6,"edi"),$T0);
+       &movdqa         ($T1,$D3);
+       &movdqa         ($T0,$D4);
+       &pslld          ($T1,2);
+       &pslld          ($T0,2);
+       &paddd          ($T1,$D3);                      # *5
+       &paddd          ($T0,$D4);                      # *5
+       &movdqu         (&QWP(16*7,"edi"),$T1);
+       &movdqu         (&QWP(16*8,"edi"),$T0);
+
+       &mov            ("esp","ebp");
+       &lea            ("edi",&DWP(-16*3,"edi"));      # size de-optimization
+       &ret            ();
+&function_end_B("_poly1305_init_sse2");
+
+&align (32);
+&function_begin("_poly1305_blocks_sse2");
+       &mov    ("edi",&wparam(0));                     # ctx
+       &mov    ("esi",&wparam(1));                     # inp
+       &mov    ("ecx",&wparam(2));                     # len
+
+       &mov    ("eax",&DWP(4*5,"edi"));                # is_base2_26
+       &and    ("ecx",-16);
+       &jz     (&label("nodata"));
+       &cmp    ("ecx",64);
+       &jae    (&label("enter_sse2"));
+       &test   ("eax","eax");                          # is_base2_26?
+       &jz     (&label("enter_blocks"));
+
+&set_label("enter_sse2",16);
+       &call   (&label("pic_point"));
+&set_label("pic_point");
+       &blindpop("ebx");
+       &lea    ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
+
+       &test   ("eax","eax");                          # is_base2_26?
+       &jnz    (&label("base2_26"));
+
+       &call   ("_poly1305_init_sse2");
+
+       ################################################# base 2^32 -> base 2^26
+       &mov    ("eax",&DWP(0,"edi"));
+       &mov    ("ecx",&DWP(3,"edi"));
+       &mov    ("edx",&DWP(6,"edi"));
+       &mov    ("esi",&DWP(9,"edi"));
+       &mov    ("ebp",&DWP(13,"edi"));
+       &mov    (&DWP(4*5,"edi"),1);                    # is_base2_26
+
+       &shr    ("ecx",2);
+       &and    ("eax",0x3ffffff);
+       &shr    ("edx",4);
+       &and    ("ecx",0x3ffffff);
+       &shr    ("esi",6);
+       &and    ("edx",0x3ffffff);
+
+       &movd   ($D0,"eax");
+       &movd   ($D1,"ecx");
+       &movd   ($D2,"edx");
+       &movd   ($D3,"esi");
+       &movd   ($D4,"ebp");
+
+       &mov    ("esi",&wparam(1));                     # [reload] inp
+       &mov    ("ecx",&wparam(2));                     # [reload] len
+       &jmp    (&label("base2_32"));
+
+&set_label("base2_26",16);
+       &movd   ($D0,&DWP(4*0,"edi"));                  # load hash value
+       &movd   ($D1,&DWP(4*1,"edi"));
+       &movd   ($D2,&DWP(4*2,"edi"));
+       &movd   ($D3,&DWP(4*3,"edi"));
+       &movd   ($D4,&DWP(4*4,"edi"));
+       &movdqa ($MASK,&QWP(64,"ebx"));
+
+&set_label("base2_32");
+       &mov    ("eax",&wparam(3));                     # padbit
+       &mov    ("ebp","esp");
+
+       &sub    ("esp",16*(5+5+5+9+9));
+       &and    ("esp",-16);
+
+       &lea    ("edi",&DWP(16*3,"edi"));               # size optimization
+       &shl    ("eax",24);                             # padbit
+
+       &test   ("ecx",31);
+       &jz     (&label("even"));
+
+       ################################################################
+       # process single block, with SSE2, because it's still faster
+       # even though half of result is discarded
+
+       &movdqu         ($T1,&QWP(0,"esi"));            # input
+       &lea            ("esi",&DWP(16,"esi"));
+
+       &movdqa         ($T0,$T1);                      # -> base 2^26 ...
+       &pand           ($T1,$MASK);
+       &paddd          ($D0,$T1);                      # ... and accumulate
+
+       &movdqa         ($T1,$T0);
+       &psrlq          ($T0,26);
+       &psrldq         ($T1,6);
+       &pand           ($T0,$MASK);
+       &paddd          ($D1,$T0);
+
+       &movdqa         ($T0,$T1);
+       &psrlq          ($T1,4);
+       &pand           ($T1,$MASK);
+       &paddd          ($D2,$T1);
+
+       &movdqa         ($T1,$T0);
+       &psrlq          ($T0,30);
+       &pand           ($T0,$MASK);
+       &psrldq         ($T1,7);
+       &paddd          ($D3,$T0);
+
+       &movd           ($T0,"eax");                    # padbit
+       &paddd          ($D4,$T1);
+        &movd          ($T1,&DWP(16*0+12,"edi"));      # r0
+       &paddd          ($D4,$T0);
+
+       &movdqa         (&QWP(16*0,"esp"),$D0);
+       &movdqa         (&QWP(16*1,"esp"),$D1);
+       &movdqa         (&QWP(16*2,"esp"),$D2);
+       &movdqa         (&QWP(16*3,"esp"),$D3);
+       &movdqa         (&QWP(16*4,"esp"),$D4);
+
+       ################################################################
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       # the five h*r0 products are emitted here; the remaining twenty
+       # limb products come from &pmuladd (defined earlier in the file),
+       # whose callback below fetches r1..s4 words from the context.
+       &pmuludq        ($D0,$T1);                      # h4*r0
+       &pmuludq        ($D1,$T1);                      # h3*r0
+       &pmuludq        ($D2,$T1);                      # h2*r0
+        &movd          ($T0,&DWP(16*1+12,"edi"));      # r1
+       &pmuludq        ($D3,$T1);                      # h1*r0
+       &pmuludq        ($D4,$T1);                      # h0*r0
+
+       &pmuladd        (sub {  my ($reg,$i)=@_;
+                               &movd ($reg,&DWP(16*$i+12,"edi"));
+                            });
+
+       &lazy_reduction ();
+
+       &sub            ("ecx",16);
+       &jz             (&label("done"));
+
+&set_label("even");
+       &lea            ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
+       &lea            ("eax",&DWP(-16*2,"esi"));
+       &sub            ("ecx",64);
+
+       ################################################################
+       # expand and copy pre-calculated table to stack
+       # each 16-byte table entry holds r^1:r^2:r^3:r^4; the pshufd
+       # pair splits it so that the r^3:r^4 copies land at "edx" and
+       # the r^1:r^2 copies at "edx"-16*9
+
+       &movdqu         ($T0,&QWP(16*0,"edi"));         # r^1:r^2:r^3:r^4
+       &pshufd         ($T1,$T0,0b01000100);           # duplicate r^3:r^4
+       &cmovb          ("esi","eax");
+       &pshufd         ($T0,$T0,0b11101110);           # duplicate r^1:r^2
+       &movdqa         (&QWP(16*0,"edx"),$T1);
+       &lea            ("eax",&DWP(16*10,"esp"));
+       &movdqu         ($T1,&QWP(16*1,"edi"));
+       &movdqa         (&QWP(16*(0-9),"edx"),$T0);
+       &pshufd         ($T0,$T1,0b01000100);
+       &pshufd         ($T1,$T1,0b11101110);
+       &movdqa         (&QWP(16*1,"edx"),$T0);
+       &movdqu         ($T0,&QWP(16*2,"edi"));
+       &movdqa         (&QWP(16*(1-9),"edx"),$T1);
+       &pshufd         ($T1,$T0,0b01000100);
+       &pshufd         ($T0,$T0,0b11101110);
+       &movdqa         (&QWP(16*2,"edx"),$T1);
+       &movdqu         ($T1,&QWP(16*3,"edi"));
+       &movdqa         (&QWP(16*(2-9),"edx"),$T0);
+       &pshufd         ($T0,$T1,0b01000100);
+       &pshufd         ($T1,$T1,0b11101110);
+       &movdqa         (&QWP(16*3,"edx"),$T0);
+       &movdqu         ($T0,&QWP(16*4,"edi"));
+       &movdqa         (&QWP(16*(3-9),"edx"),$T1);
+       &pshufd         ($T1,$T0,0b01000100);
+       &pshufd         ($T0,$T0,0b11101110);
+       &movdqa         (&QWP(16*4,"edx"),$T1);
+       &movdqu         ($T1,&QWP(16*5,"edi"));
+       &movdqa         (&QWP(16*(4-9),"edx"),$T0);
+       &pshufd         ($T0,$T1,0b01000100);
+       &pshufd         ($T1,$T1,0b11101110);
+       &movdqa         (&QWP(16*5,"edx"),$T0);
+       &movdqu         ($T0,&QWP(16*6,"edi"));
+       &movdqa         (&QWP(16*(5-9),"edx"),$T1);
+       &pshufd         ($T1,$T0,0b01000100);
+       &pshufd         ($T0,$T0,0b11101110);
+       &movdqa         (&QWP(16*6,"edx"),$T1);
+       &movdqu         ($T1,&QWP(16*7,"edi"));
+       &movdqa         (&QWP(16*(6-9),"edx"),$T0);
+       &pshufd         ($T0,$T1,0b01000100);
+       &pshufd         ($T1,$T1,0b11101110);
+       &movdqa         (&QWP(16*7,"edx"),$T0);
+       &movdqu         ($T0,&QWP(16*8,"edi"));
+       &movdqa         (&QWP(16*(7-9),"edx"),$T1);
+       &pshufd         ($T1,$T0,0b01000100);
+       &pshufd         ($T0,$T0,0b11101110);
+       &movdqa         (&QWP(16*8,"edx"),$T1);
+       &movdqa         (&QWP(16*(8-9),"edx"),$T0);
+
+# Emit code that loads two 16-byte Poly1305 blocks from "esi"+$inpbase,
+# advances "esi", and splits the 256 input bits into five vectors of
+# 26-bit limbs ($T0,$T1,$D2,$D3,$D4), OR-ing the padbit (at "ebx") into
+# the top limb.  $D2..$D4 (and, when $offbase is non-zero, $D0/$D1) are
+# spilled to the stack frame first so their registers can be reused.
+sub load_input {
+my ($inpbase,$offbase)=@_;
+
+       &movdqu         ($T0,&QWP($inpbase+0,"esi"));   # load input
+       &movdqu         ($T1,&QWP($inpbase+16,"esi"));
+       &lea            ("esi",&DWP(16*2,"esi"));
+
+       &movdqa         (&QWP($offbase+16*2,"esp"),$D2);
+       &movdqa         (&QWP($offbase+16*3,"esp"),$D3);
+       &movdqa         (&QWP($offbase+16*4,"esp"),$D4);
+
+       &movdqa         ($D2,$T0);                      # splat input
+       &movdqa         ($D3,$T1);
+       &psrldq         ($D2,6);
+       &psrldq         ($D3,6);
+       &movdqa         ($D4,$T0);
+       &punpcklqdq     ($D2,$D3);                      # 2:3
+       &punpckhqdq     ($D4,$T1);                      # 4
+       &punpcklqdq     ($T0,$T1);                      # 0:1
+
+       # shift/mask the packed qwords down to 26-bit limbs
+       &movdqa         ($D3,$D2);
+       &psrlq          ($D2,4);
+       &psrlq          ($D3,30);
+       &movdqa         ($T1,$T0);
+       &psrlq          ($D4,40);                       # 4
+       &psrlq          ($T1,26);
+       &pand           ($T0,$MASK);                    # 0
+       &pand           ($T1,$MASK);                    # 1
+       &pand           ($D2,$MASK);                    # 2
+       &pand           ($D3,$MASK);                    # 3
+       &por            ($D4,&QWP(0,"ebx"));            # padbit, yes, always
+
+       &movdqa         (&QWP($offbase+16*0,"esp"),$D0) if ($offbase);
+       &movdqa         (&QWP($offbase+16*1,"esp"),$D1) if ($offbase);
+}
+       &load_input     (16*2,16*5);
+
+       # carry flag from the "sub ecx,64" above decides whether a full
+       # 64-byte iteration of the main loop is possible
+       &jbe            (&label("skip_loop"));
+       &jmp            (&label("loop"));
+
+&set_label("loop",32);
+       ################################################################
+       # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+       # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+       #   \___________________/
+       # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+       # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+       #   \___________________/ \____________________/
+       ################################################################
+
+       &movdqa         ($T2,&QWP(16*(0-9),"edx"));     # r0^2
+       &movdqa         (&QWP(16*1,"eax"),$T1);
+       &movdqa         (&QWP(16*2,"eax"),$D2);
+       &movdqa         (&QWP(16*3,"eax"),$D3);
+       &movdqa         (&QWP(16*4,"eax"),$D4);
+
+       ################################################################
+       # d4 = h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
+       # d3 = h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
+       # d2 = h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
+       # d1 = h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
+       # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+       &movdqa         ($D1,$T0);
+       &pmuludq        ($T0,$T2);                      # h0*r0
+       &movdqa         ($D0,$T1);
+       &pmuludq        ($T1,$T2);                      # h1*r0
+       &pmuludq        ($D2,$T2);                      # h2*r0
+       &pmuludq        ($D3,$T2);                      # h3*r0
+       &pmuludq        ($D4,$T2);                      # h4*r0
+
+# Emit the remaining 20 of the 25 limb products (the diagonal h*r0 ones
+# are issued by the caller) and accumulate them into $D0..$D4.  $addr is
+# a callback mapping an index to a memory operand for the expanded
+# table: 1..4 select r1..r4, 5..8 select s1..s4 (i.e. 5*r1..5*r4).
+# Saved limbs h1..h4 are pulled back from the "eax" spill area; $MASK is
+# reloaded at the end for the reduction that follows.  Instruction order
+# interleaves loads with multiplies on purpose -- do not reorder.
+sub pmuladd_alt {
+my $addr = shift;
+
+       &pmuludq        ($D0,&$addr(8));                # h1*s4
+       &movdqa         ($T2,$D1);
+       &pmuludq        ($D1,&$addr(1));                # h0*r1
+       &paddq          ($D0,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&$addr(2));                # h0*r2
+       &paddq          ($D1,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&$addr(3));                # h0*r3
+       &paddq          ($D2,$T2);
+        &movdqa        ($T2,&QWP(16*1,"eax"));         # pull h1
+       &pmuludq        ($T1,&$addr(4));                # h0*r4
+       &paddq          ($D3,$T0);
+
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&$addr(1));                # h1*r1
+        &paddq         ($D4,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&$addr(2));                # h1*r2
+       &paddq          ($D2,$T2);
+       &movdqa         ($T2,&QWP(16*2,"eax"));         # pull h2
+       &pmuludq        ($T1,&$addr(3));                # h1*r3
+       &paddq          ($D3,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&$addr(7));                # h2*s3
+       &paddq          ($D4,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&$addr(8));                # h2*s4
+       &paddq          ($D0,$T2);
+
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&$addr(1));                # h2*r1
+        &paddq         ($D1,$T0);
+       &movdqa         ($T0,&QWP(16*3,"eax"));         # pull h3
+       &pmuludq        ($T2,&$addr(2));                # h2*r2
+       &paddq          ($D3,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&$addr(6));                # h3*s2
+       &paddq          ($D4,$T2);
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&$addr(7));                # h3*s3
+       &paddq          ($D0,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&$addr(8));                # h3*s4
+       &paddq          ($D1,$T1);
+
+       &movdqa         ($T1,&QWP(16*4,"eax"));         # pull h4
+       &pmuludq        ($T0,&$addr(1));                # h3*r1
+        &paddq         ($D2,$T2);
+       &movdqa         ($T2,$T1);
+       &pmuludq        ($T1,&$addr(8));                # h4*s4
+       &paddq          ($D4,$T0);
+       &movdqa         ($T0,$T2);
+       &pmuludq        ($T2,&$addr(5));                # h4*s1
+       &paddq          ($D3,$T1);
+       &movdqa         ($T1,$T0);
+       &pmuludq        ($T0,&$addr(6));                # h4*s2
+       &paddq          ($D0,$T2);
+        &movdqa        ($MASK,&QWP(64,"ebx"));
+       &pmuludq        ($T1,&$addr(7));                # h4*s3
+       &paddq          ($D1,$T0);
+       &paddq          ($D2,$T1);
+}
+       &pmuladd_alt    (sub {  my $i=shift; &QWP(16*($i-9),"edx");     });
+
+       &load_input     (-16*2,0);
+       &lea            ("eax",&DWP(-16*2,"esi"));
+       &sub            ("ecx",64);
+
+       &paddd          ($T0,&QWP(16*(5+0),"esp"));     # add hash value
+       &paddd          ($T1,&QWP(16*(5+1),"esp"));
+       &paddd          ($D2,&QWP(16*(5+2),"esp"));
+       &paddd          ($D3,&QWP(16*(5+3),"esp"));
+       &paddd          ($D4,&QWP(16*(5+4),"esp"));
+
+       &cmovb          ("esi","eax");
+       &lea            ("eax",&DWP(16*10,"esp"));
+
+       &movdqa         ($T2,&QWP(16*0,"edx"));         # r0^4
+       &movdqa         (&QWP(16*1,"esp"),$D1);
+       &movdqa         (&QWP(16*1,"eax"),$T1);
+       &movdqa         (&QWP(16*2,"eax"),$D2);
+       &movdqa         (&QWP(16*3,"eax"),$D3);
+       &movdqa         (&QWP(16*4,"eax"),$D4);
+
+       ################################################################
+       # d4 += h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
+       # d3 += h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
+       # d2 += h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
+       # d1 += h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
+       # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+       &movdqa         ($D1,$T0);
+       &pmuludq        ($T0,$T2);                      # h0*r0
+       &paddq          ($T0,$D0);
+       &movdqa         ($D0,$T1);
+       &pmuludq        ($T1,$T2);                      # h1*r0
+       &pmuludq        ($D2,$T2);                      # h2*r0
+       &pmuludq        ($D3,$T2);                      # h3*r0
+       &pmuludq        ($D4,$T2);                      # h4*r0
+
+       &paddq          ($T1,&QWP(16*1,"esp"));
+       &paddq          ($D2,&QWP(16*2,"esp"));
+       &paddq          ($D3,&QWP(16*3,"esp"));
+       &paddq          ($D4,&QWP(16*4,"esp"));
+
+       &pmuladd_alt    (sub {  my $i=shift; &QWP(16*$i,"edx"); });
+
+       &lazy_reduction ();
+
+       &load_input     (16*2,16*5);
+
+       &ja             (&label("loop"));
+
+&set_label("skip_loop");
+       ################################################################
+       # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+        &pshufd        ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
+       &add            ("ecx",32);
+       &jnz            (&label("long_tail"));
+
+       &paddd          ($T0,$D0);                      # add hash value
+       &paddd          ($T1,$D1);
+       &paddd          ($D2,&QWP(16*7,"esp"));
+       &paddd          ($D3,&QWP(16*8,"esp"));
+       &paddd          ($D4,&QWP(16*9,"esp"));
+
+&set_label("long_tail");
+
+       &movdqa         (&QWP(16*0,"eax"),$T0);
+       &movdqa         (&QWP(16*1,"eax"),$T1);
+       &movdqa         (&QWP(16*2,"eax"),$D2);
+       &movdqa         (&QWP(16*3,"eax"),$D3);
+       &movdqa         (&QWP(16*4,"eax"),$D4);
+
+       ################################################################
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       &pmuludq        ($T0,$T2);                      # h0*r0
+       &pmuludq        ($T1,$T2);                      # h1*r0
+       &pmuludq        ($D2,$T2);                      # h2*r0
+       &movdqa         ($D0,$T0);
+        &pshufd        ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
+       &pmuludq        ($D3,$T2);                      # h3*r0
+       &movdqa         ($D1,$T1);
+       &pmuludq        ($D4,$T2);                      # h4*r0
+
+       &pmuladd        (sub {  my ($reg,$i)=@_;
+                               &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
+                            },"eax");
+
+       &jz             (&label("short_tail"));
+
+       &load_input     (-16*2,0);
+
+        &pshufd        ($T2,&QWP(16*0,"edx"),0x10);    # r0^n
+       &paddd          ($T0,&QWP(16*5,"esp"));         # add hash value
+       &paddd          ($T1,&QWP(16*6,"esp"));
+       &paddd          ($D2,&QWP(16*7,"esp"));
+       &paddd          ($D3,&QWP(16*8,"esp"));
+       &paddd          ($D4,&QWP(16*9,"esp"));
+
+       ################################################################
+       # multiply inp[0:1] by r^4:r^3 and accumulate
+
+       &movdqa         (&QWP(16*0,"esp"),$T0);
+       &pmuludq        ($T0,$T2);                      # h0*r0
+       &movdqa         (&QWP(16*1,"esp"),$T1);
+       &pmuludq        ($T1,$T2);                      # h1*r0
+       &paddq          ($D0,$T0);
+       &movdqa         ($T0,$D2);
+       &pmuludq        ($D2,$T2);                      # h2*r0
+       &paddq          ($D1,$T1);
+       &movdqa         ($T1,$D3);
+       &pmuludq        ($D3,$T2);                      # h3*r0
+       &paddq          ($D2,&QWP(16*2,"esp"));
+       &movdqa         (&QWP(16*2,"esp"),$T0);
+        &pshufd        ($T0,&QWP(16*1,"edx"),0x10);    # r1^n
+       &paddq          ($D3,&QWP(16*3,"esp"));
+       &movdqa         (&QWP(16*3,"esp"),$T1);
+       &movdqa         ($T1,$D4);
+       &pmuludq        ($D4,$T2);                      # h4*r0
+       &paddq          ($D4,&QWP(16*4,"esp"));
+       &movdqa         (&QWP(16*4,"esp"),$T1);
+
+       &pmuladd        (sub {  my ($reg,$i)=@_;
+                               &pshufd ($reg,&QWP(16*$i,"edx"),0x10);
+                            });
+
+&set_label("short_tail");
+
+       &lazy_reduction ();
+
+       ################################################################
+       # horizontal addition
+       # fold the two 64-bit halves of each vector together so each
+       # limb ends up in the low dword of its register
+
+       &pshufd         ($T1,$D0,0b01001110);
+       &pshufd         ($T0,$D1,0b01001110);
+       &paddd          ($D0,$T1);
+       &pshufd         ($T1,$D2,0b01001110);
+       &paddd          ($D1,$T0);
+       &pshufd         ($T0,$D3,0b01001110);
+       &paddd          ($D2,$T1);
+       &pshufd         ($T1,$D4,0b01001110);
+       &paddd          ($D3,$T0);
+       &paddd          ($D4,$T1);
+
+&set_label("done");
+       &movd           (&DWP(-16*3+4*0,"edi"),$D0);    # store hash value
+       &movd           (&DWP(-16*3+4*1,"edi"),$D1);
+       &movd           (&DWP(-16*3+4*2,"edi"),$D2);
+       &movd           (&DWP(-16*3+4*3,"edi"),$D3);
+       &movd           (&DWP(-16*3+4*4,"edi"),$D4);
+&set_label("nodata");
+       &mov    ("esp","ebp");
+&function_end("_poly1305_blocks_sse2");
+
+&align (32);
+# Emit the final 16-byte tag.  If the context is in base 2^26
+# (ctx->is_base2_26 set), the five 26-bit limbs are first recombined
+# into four 32-bit words; otherwise control falls through to the
+# scalar "enter_emit" path (defined elsewhere in this file).  The
+# carry above 2^130 is folded back in (*5), the result is
+# conditionally reduced mod 2^130-5 in constant time (mask select,
+# no branches), the 128-bit value at wparam(2) ("key") is added, and
+# the tag is stored at wparam(1) ("output").
+&function_begin("_poly1305_emit_sse2");
+       &mov    ("ebp",&wparam(0));             # context
+
+       &cmp    (&DWP(4*5,"ebp"),0);            # is_base2_26?
+       &je     (&label("enter_emit"));
+
+       &mov    ("eax",&DWP(4*0,"ebp"));        # load hash value
+       &mov    ("edi",&DWP(4*1,"ebp"));
+       &mov    ("ecx",&DWP(4*2,"ebp"));
+       &mov    ("edx",&DWP(4*3,"ebp"));
+       &mov    ("esi",&DWP(4*4,"ebp"));
+
+       &mov    ("ebx","edi");                  # base 2^26 -> base 2^32
+       &shl    ("edi",26);
+       &shr    ("ebx",6);
+       &add    ("eax","edi");
+       &mov    ("edi","ecx");
+       &adc    ("ebx",0);
+
+       &shl    ("edi",20);
+       &shr    ("ecx",12);
+       &add    ("ebx","edi");
+       &mov    ("edi","edx");
+       &adc    ("ecx",0);
+
+       &shl    ("edi",14);
+       &shr    ("edx",18);
+       &add    ("ecx","edi");
+       &mov    ("edi","esi");
+       &adc    ("edx",0);
+
+       &shl    ("edi",8);
+       &shr    ("esi",24);
+       &add    ("edx","edi");
+       &adc    ("esi",0);                      # can be partially reduced
+
+       &mov    ("edi","esi");                  # final reduction
+       &and    ("esi",3);
+       &shr    ("edi",2);
+       &lea    ("ebp",&DWP(0,"edi","edi",4));  # *5
+        &mov   ("edi",&wparam(1));             # output
+       &add    ("eax","ebp");                  # note: '&' sigil added for
+        &mov   ("ebp",&wparam(2));             # key  # consistency with the
+       &adc    ("ebx",0);                      # rest of the file
+       &adc    ("ecx",0);
+       &adc    ("edx",0);
+
+       &movd   ($D0,"eax");                    # offload original hash value
+       &add    ("eax",5);                      # compare to modulus
+       &movd   ($D1,"ebx");
+       &adc    ("ebx",0);
+       &movd   ($D2,"ecx");
+       &adc    ("ecx",0);
+       &movd   ($D3,"edx");
+       &adc    ("edx",0);
+       &adc    ("esi",0);
+       &shr    ("esi",2);                      # did it carry/borrow?
+
+       # branch-free select between (hash+5-2^130) and hash, driven by
+       # the carry out of bit 130
+       &neg    ("esi");                        # do we choose (hash-modulus) ...
+       &and    ("eax","esi");
+       &and    ("ebx","esi");
+       &and    ("ecx","esi");
+       &and    ("edx","esi");
+       &mov    (&DWP(4*0,"edi"),"eax");
+       &movd   ("eax",$D0);
+       &mov    (&DWP(4*1,"edi"),"ebx");
+       &movd   ("ebx",$D1);
+       &mov    (&DWP(4*2,"edi"),"ecx");
+       &movd   ("ecx",$D2);
+       &mov    (&DWP(4*3,"edi"),"edx");
+       &movd   ("edx",$D3);
+
+       &not    ("esi");                        # ... or original hash value?
+       &and    ("eax","esi");
+       &and    ("ebx","esi");
+       &or     ("eax",&DWP(4*0,"edi"));
+       &and    ("ecx","esi");
+       &or     ("ebx",&DWP(4*1,"edi"));
+       &and    ("edx","esi");
+       &or     ("ecx",&DWP(4*2,"edi"));
+       &or     ("edx",&DWP(4*3,"edi"));
+
+       &add    ("eax",&DWP(4*0,"ebp"));        # accumulate key
+       &adc    ("ebx",&DWP(4*1,"ebp"));
+       &mov    (&DWP(4*0,"edi"),"eax");
+       &adc    ("ecx",&DWP(4*2,"ebp"));
+       &mov    (&DWP(4*1,"edi"),"ebx");
+       &adc    ("edx",&DWP(4*3,"ebp"));
+       &mov    (&DWP(4*2,"edi"),"ecx");
+       &mov    (&DWP(4*3,"edi"),"edx");
+&function_end("_poly1305_emit_sse2");
+
+if ($avx>1) {
+########################################################################
+# Note that poly1305_init_avx2 operates on %xmm, I could have used
+# poly1305_init_sse2...
+
+&align (32);
+# Pre-compute the key powers needed by the 4-way AVX2 code path.
+# The base-2^32 key at ctx+4*6 is converted to five 26-bit limbs,
+# then the "square" loop (two passes, ecx=2) builds the higher powers
+# by repeated multiplication, producing the r^1:r^2:r^3:r^4 table
+# which is stored at "edi" together with the *5 multiples (s1..s4).
+# Operates on %xmm registers only (see the note above its caller).
+&function_begin_B("_poly1305_init_avx2");
+       &vmovdqu        ($D4,&QWP(4*6,"edi"));          # key base 2^32
+       &lea            ("edi",&DWP(16*3,"edi"));       # size optimization
+       &mov            ("ebp","esp");
+       &sub            ("esp",16*(9+5));
+       &and            ("esp",-16);
+
+       #&vpand         ($D4,$D4,&QWP(96,"ebx"));       # magic mask
+       &vmovdqa        ($MASK,&QWP(64,"ebx"));
+
+       &vpand          ($D0,$D4,$MASK);                # -> base 2^26
+       &vpsrlq         ($D1,$D4,26);
+       &vpsrldq        ($D3,$D4,6);
+       &vpand          ($D1,$D1,$MASK);
+       &vpsrlq         ($D2,$D3,4);                    # ';' was missing; the
+                                                       # stray '&' chain still
+                                                       # worked, but fragile
+       &vpsrlq         ($D3,$D3,30);
+       &vpand          ($D2,$D2,$MASK);
+       &vpand          ($D3,$D3,$MASK);
+       &vpsrldq        ($D4,$D4,13);
+
+       &lea            ("edx",&DWP(16*9,"esp"));       # size optimization
+       &mov            ("ecx",2);
+&set_label("square");
+       &vmovdqa        (&QWP(16*0,"esp"),$D0);
+       &vmovdqa        (&QWP(16*1,"esp"),$D1);
+       &vmovdqa        (&QWP(16*2,"esp"),$D2);
+       &vmovdqa        (&QWP(16*3,"esp"),$D3);
+       &vmovdqa        (&QWP(16*4,"esp"),$D4);
+
+       &vpslld         ($T1,$D1,2);
+       &vpslld         ($T0,$D2,2);
+       &vpaddd         ($T1,$T1,$D1);                  # *5
+       &vpaddd         ($T0,$T0,$D2);                  # *5
+       &vmovdqa        (&QWP(16*5,"esp"),$T1);
+       &vmovdqa        (&QWP(16*6,"esp"),$T0);
+       &vpslld         ($T1,$D3,2);
+       &vpslld         ($T0,$D4,2);
+       &vpaddd         ($T1,$T1,$D3);                  # *5
+       &vpaddd         ($T0,$T0,$D4);                  # *5
+       &vmovdqa        (&QWP(16*7,"esp"),$T1);
+       &vmovdqa        (&QWP(16*8,"esp"),$T0);
+
+       &vpshufd        ($T0,$D0,0b01000100);
+       &vmovdqa        ($T1,$D1);
+       &vpshufd        ($D1,$D1,0b01000100);
+       &vpshufd        ($D2,$D2,0b01000100);
+       &vpshufd        ($D3,$D3,0b01000100);
+       &vpshufd        ($D4,$D4,0b01000100);
+       &vmovdqa        (&QWP(16*0,"edx"),$T0);
+       &vmovdqa        (&QWP(16*1,"edx"),$D1);
+       &vmovdqa        (&QWP(16*2,"edx"),$D2);
+       &vmovdqa        (&QWP(16*3,"edx"),$D3);
+       &vmovdqa        (&QWP(16*4,"edx"),$D4);
+
+       ################################################################
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       &vpmuludq       ($D4,$D4,$D0);                  # h4*r0
+       &vpmuludq       ($D3,$D3,$D0);                  # h3*r0
+       &vpmuludq       ($D2,$D2,$D0);                  # h2*r0
+       &vpmuludq       ($D1,$D1,$D0);                  # h1*r0
+       &vpmuludq       ($D0,$T0,$D0);                  # h0*r0
+
+       &vpmuludq       ($T0,$T1,&QWP(16*3,"edx"));     # r1*h3
+       &vpaddq         ($D4,$D4,$T0);
+       &vpmuludq       ($T2,$T1,&QWP(16*2,"edx"));     # r1*h2
+       &vpaddq         ($D3,$D3,$T2);
+       &vpmuludq       ($T0,$T1,&QWP(16*1,"edx"));     # r1*h1
+       &vpaddq         ($D2,$D2,$T0);
+       &vmovdqa        ($T2,&QWP(16*5,"esp"));         # s1
+       &vpmuludq       ($T1,$T1,&QWP(16*0,"edx"));     # r1*h0
+       &vpaddq         ($D1,$D1,$T1);
+        &vmovdqa       ($T0,&QWP(16*2,"esp"));         # r2
+       &vpmuludq       ($T2,$T2,&QWP(16*4,"edx"));     # s1*h4
+       &vpaddq         ($D0,$D0,$T2);
+
+       &vpmuludq       ($T1,$T0,&QWP(16*2,"edx"));     # r2*h2
+       &vpaddq         ($D4,$D4,$T1);
+       &vpmuludq       ($T2,$T0,&QWP(16*1,"edx"));     # r2*h1
+       &vpaddq         ($D3,$D3,$T2);
+       &vmovdqa        ($T1,&QWP(16*6,"esp"));         # s2
+       &vpmuludq       ($T0,$T0,&QWP(16*0,"edx"));     # r2*h0
+       &vpaddq         ($D2,$D2,$T0);
+       &vpmuludq       ($T2,$T1,&QWP(16*4,"edx"));     # s2*h4
+       &vpaddq         ($D1,$D1,$T2);
+        &vmovdqa       ($T0,&QWP(16*3,"esp"));         # r3
+       &vpmuludq       ($T1,$T1,&QWP(16*3,"edx"));     # s2*h3
+       &vpaddq         ($D0,$D0,$T1);
+
+       &vpmuludq       ($T2,$T0,&QWP(16*1,"edx"));     # r3*h1
+       &vpaddq         ($D4,$D4,$T2);
+       &vmovdqa        ($T1,&QWP(16*7,"esp"));         # s3
+       &vpmuludq       ($T0,$T0,&QWP(16*0,"edx"));     # r3*h0
+       &vpaddq         ($D3,$D3,$T0);
+       &vpmuludq       ($T2,$T1,&QWP(16*4,"edx"));     # s3*h4
+       &vpaddq         ($D2,$D2,$T2);
+       &vpmuludq       ($T0,$T1,&QWP(16*3,"edx"));     # s3*h3
+       &vpaddq         ($D1,$D1,$T0);
+        &vmovdqa       ($T2,&QWP(16*4,"esp"));         # r4
+       &vpmuludq       ($T1,$T1,&QWP(16*2,"edx"));     # s3*h2
+       &vpaddq         ($D0,$D0,$T1);
+
+       &vmovdqa        ($T0,&QWP(16*8,"esp"));         # s4
+       &vpmuludq       ($T2,$T2,&QWP(16*0,"edx"));     # r4*h0
+       &vpaddq         ($D4,$D4,$T2);
+       &vpmuludq       ($T1,$T0,&QWP(16*4,"edx"));     # s4*h4
+       &vpaddq         ($D3,$D3,$T1);
+       &vpmuludq       ($T2,$T0,&QWP(16*1,"edx"));     # s4*h1
+       &vpaddq         ($D0,$D0,$T2);
+       &vpmuludq       ($T1,$T0,&QWP(16*2,"edx"));     # s4*h2
+       &vpaddq         ($D1,$D1,$T1);
+        &vmovdqa       ($MASK,&QWP(64,"ebx"));
+       &vpmuludq       ($T0,$T0,&QWP(16*3,"edx"));     # s4*h3
+       &vpaddq         ($D2,$D2,$T0);
+
+       ################################################################
+       # lazy reduction
+        &vpsrlq        ($T0,$D3,26);
+        &vpand         ($D3,$D3,$MASK);
+       &vpsrlq         ($T1,$D0,26);
+       &vpand          ($D0,$D0,$MASK);
+        &vpaddq        ($D4,$D4,$T0);                  # h3 -> h4
+       &vpaddq         ($D1,$D1,$T1);                  # h0 -> h1
+        &vpsrlq        ($T0,$D4,26);
+        &vpand         ($D4,$D4,$MASK);
+       &vpsrlq         ($T1,$D1,26);
+       &vpand          ($D1,$D1,$MASK);
+       &vpaddq         ($D2,$D2,$T1);                  # h1 -> h2
+        &vpaddd        ($D0,$D0,$T0);
+        &vpsllq        ($T0,$T0,2);
+       &vpsrlq         ($T1,$D2,26);
+       &vpand          ($D2,$D2,$MASK);
+        &vpaddd        ($D0,$D0,$T0);                  # h4 -> h0
+       &vpaddd         ($D3,$D3,$T1);                  # h2 -> h3
+       &vpsrlq         ($T1,$D3,26);
+        &vpsrlq        ($T0,$D0,26);
+        &vpand         ($D0,$D0,$MASK);
+       &vpand          ($D3,$D3,$MASK);
+        &vpaddd        ($D1,$D1,$T0);                  # h0 -> h1
+       &vpaddd         ($D4,$D4,$T1);                  # h3 -> h4
+
+       &dec            ("ecx");
+       &jz             (&label("square_break"));
+
+       # interleave the fresh power with the saved one and go around
+       # once more to produce the next pair of powers
+       &vpunpcklqdq    ($D0,$D0,&QWP(16*0,"esp"));     # 0:r^1:0:r^2
+       &vpunpcklqdq    ($D1,$D1,&QWP(16*1,"esp"));
+       &vpunpcklqdq    ($D2,$D2,&QWP(16*2,"esp"));
+       &vpunpcklqdq    ($D3,$D3,&QWP(16*3,"esp"));
+       &vpunpcklqdq    ($D4,$D4,&QWP(16*4,"esp"));
+       &jmp            (&label("square"));
+
+&set_label("square_break");
+       &vpsllq         ($D0,$D0,32);                   # -> r^3:0:r^4:0
+       &vpsllq         ($D1,$D1,32);
+       &vpsllq         ($D2,$D2,32);
+       &vpsllq         ($D3,$D3,32);
+       &vpsllq         ($D4,$D4,32);
+       &vpor           ($D0,$D0,&QWP(16*0,"esp"));     # r^3:r^1:r^4:r^2
+       &vpor           ($D1,$D1,&QWP(16*1,"esp"));
+       &vpor           ($D2,$D2,&QWP(16*2,"esp"));
+       &vpor           ($D3,$D3,&QWP(16*3,"esp"));
+       &vpor           ($D4,$D4,&QWP(16*4,"esp"));
+
+       &vpshufd        ($D0,$D0,0b10001101);           # -> r^1:r^2:r^3:r^4
+       &vpshufd        ($D1,$D1,0b10001101);
+       &vpshufd        ($D2,$D2,0b10001101);
+       &vpshufd        ($D3,$D3,0b10001101);
+       &vpshufd        ($D4,$D4,0b10001101);
+
+       &vmovdqu        (&QWP(16*0,"edi"),$D0);         # save the table
+       &vmovdqu        (&QWP(16*1,"edi"),$D1);
+       &vmovdqu        (&QWP(16*2,"edi"),$D2);
+       &vmovdqu        (&QWP(16*3,"edi"),$D3);
+       &vmovdqu        (&QWP(16*4,"edi"),$D4);
+
+       &vpslld         ($T1,$D1,2);
+       &vpslld         ($T0,$D2,2);
+       &vpaddd         ($T1,$T1,$D1);                  # *5
+       &vpaddd         ($T0,$T0,$D2);                  # *5
+       &vmovdqu        (&QWP(16*5,"edi"),$T1);
+       &vmovdqu        (&QWP(16*6,"edi"),$T0);
+       &vpslld         ($T1,$D3,2);
+       &vpslld         ($T0,$D4,2);
+       &vpaddd         ($T1,$T1,$D3);                  # *5
+       &vpaddd         ($T0,$T0,$D4);                  # *5
+       &vmovdqu        (&QWP(16*7,"edi"),$T1);
+       &vmovdqu        (&QWP(16*8,"edi"),$T0);
+
+       &mov            ("esp","ebp");
+       &lea            ("edi",&DWP(-16*3,"edi"));      # size de-optimization
+       &ret            ();
+&function_end_B("_poly1305_init_avx2");
+
+########################################################################
+# now it's time to switch to %ymm
+
+my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
+my $MASK=$T2;
+
+# map a %ymm register name to its %xmm counterpart (low 128 bits)
+sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
+
+&align (32);
+# AVX2 block-processing path: handles input in 64-byte (4-block)
+# chunks; inputs shorter than 64 bytes that are already in base 2^26
+# fall back to "enter_blocks" (defined elsewhere in this file).
+&function_begin("_poly1305_blocks_avx2");
+       &mov    ("edi",&wparam(0));                     # ctx
+       &mov    ("esi",&wparam(1));                     # inp
+       &mov    ("ecx",&wparam(2));                     # len
+
+       &mov    ("eax",&DWP(4*5,"edi"));                # is_base2_26
+       &and    ("ecx",-16);
+       &jz     (&label("nodata"));
+       &cmp    ("ecx",64);
+       &jae    (&label("enter_avx2"));
+       &test   ("eax","eax");                          # is_base2_26?
+       &jz     (&label("enter_blocks"));
+
+&set_label("enter_avx2",16);
+       &vzeroupper     ();
+
+       &call   (&label("pic_point"));
+&set_label("pic_point");
+       &blindpop("ebx");
+       &lea    ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
+
+       &test   ("eax","eax");                          # is_base2_26?
+       &jnz    (&label("base2_26"));
+
+       &call   ("_poly1305_init_avx2");
+
+       ################################################# base 2^32 -> base 2^26
+       &mov    ("eax",&DWP(0,"edi"));
+       &mov    ("ecx",&DWP(3,"edi"));
+       &mov    ("edx",&DWP(6,"edi"));
+       &mov    ("esi",&DWP(9,"edi"));
+       &mov    ("ebp",&DWP(13,"edi"));
+
+       &shr    ("ecx",2);
+       &and    ("eax",0x3ffffff);
+       &shr    ("edx",4);
+       &and    ("ecx",0x3ffffff);
+       &shr    ("esi",6);
+       &and    ("edx",0x3ffffff);
+
+       &mov    (&DWP(4*0,"edi"),"eax");
+       &mov    (&DWP(4*1,"edi"),"ecx");
+       &mov    (&DWP(4*2,"edi"),"edx");
+       &mov    (&DWP(4*3,"edi"),"esi");
+       &mov    (&DWP(4*4,"edi"),"ebp");
+       &mov    (&DWP(4*5,"edi"),1);                    # is_base2_26
+
+       &mov    ("esi",&wparam(1));                     # [reload] inp
+       &mov    ("ecx",&wparam(2));                     # [reload] len
+
+&set_label("base2_26");
+       &mov    ("eax",&wparam(3));                     # padbit
+       &mov    ("ebp","esp");
+
+       &sub    ("esp",32*(5+9));
+       &and    ("esp",-512);                           # ensure that frame
+                                                       # doesn't cross page
+                                                       # boundary, which is
+                                                       # essential for
+                                                       # misaligned 32-byte
+                                                       # loads
+
+       ################################################################
+        # expand and copy pre-calculated table to stack
+
+       &vmovdqu        (&X($D0),&QWP(16*(3+0),"edi"));
+       &lea            ("edx",&DWP(32*5+128,"esp"));   # +128 size optimization
+       &vmovdqu        (&X($D1),&QWP(16*(3+1),"edi"));
+       &vmovdqu        (&X($D2),&QWP(16*(3+2),"edi"));
+       &vmovdqu        (&X($D3),&QWP(16*(3+3),"edi"));
+       &vmovdqu        (&X($D4),&QWP(16*(3+4),"edi"));
+       &lea            ("edi",&DWP(16*3,"edi"));       # size optimization
+       &vpermq         ($D0,$D0,0b01000000);           # 00001234 -> 12343434
+       &vpermq         ($D1,$D1,0b01000000);
+       &vpermq         ($D2,$D2,0b01000000);
+       &vpermq         ($D3,$D3,0b01000000);
+       &vpermq         ($D4,$D4,0b01000000);
+       &vpshufd        ($D0,$D0,0b11001000);           # 12343434 -> 14243444
+       &vpshufd        ($D1,$D1,0b11001000);
+       &vpshufd        ($D2,$D2,0b11001000);
+       &vpshufd        ($D3,$D3,0b11001000);
+       &vpshufd        ($D4,$D4,0b11001000);
+       &vmovdqa        (&QWP(32*0-128,"edx"),$D0);
+       &vmovdqu        (&X($D0),&QWP(16*5,"edi"));
+       &vmovdqa        (&QWP(32*1-128,"edx"),$D1);
+       &vmovdqu        (&X($D1),&QWP(16*6,"edi"));
+       &vmovdqa        (&QWP(32*2-128,"edx"),$D2);
+       &vmovdqu        (&X($D2),&QWP(16*7,"edi"));
+       &vmovdqa        (&QWP(32*3-128,"edx"),$D3);
+       &vmovdqu        (&X($D3),&QWP(16*8,"edi"));
+       &vmovdqa        (&QWP(32*4-128,"edx"),$D4);
+       &vpermq         ($D0,$D0,0b01000000);
+       &vpermq         ($D1,$D1,0b01000000);
+       &vpermq         ($D2,$D2,0b01000000);
+       &vpermq         ($D3,$D3,0b01000000);
+       &vpshufd        ($D0,$D0,0b11001000);
+       &vpshufd        ($D1,$D1,0b11001000);
+       &vpshufd        ($D2,$D2,0b11001000);
+       &vpshufd        ($D3,$D3,0b11001000);
+       &vmovdqa        (&QWP(32*5-128,"edx"),$D0);
+       &vmovd          (&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
+       &vmovdqa        (&QWP(32*6-128,"edx"),$D1);
+       &vmovd          (&X($D1),&DWP(-16*3+4*1,"edi"));
+       &vmovdqa        (&QWP(32*7-128,"edx"),$D2);
+       &vmovd          (&X($D2),&DWP(-16*3+4*2,"edi"));
+       &vmovdqa        (&QWP(32*8-128,"edx"),$D3);
+       &vmovd          (&X($D3),&DWP(-16*3+4*3,"edi"));
+       &vmovd          (&X($D4),&DWP(-16*3+4*4,"edi"));
+       &vmovdqa        ($MASK,&QWP(64,"ebx"));
+       &neg            ("eax");                        # padbit
+
+       # handle a ragged (non-multiple-of-64) head: 1..3 leading blocks
+       # are dispatched to "one"/"two"/the vinserti128 path below, each
+       # selecting the matching padbit set and table offset
+       &test           ("ecx",63);
+       &jz             (&label("even"));
+
+       &mov            ("edx","ecx");
+       &and            ("ecx",-64);
+       &and            ("edx",63);
+
+       &vmovdqu        (&X($T0),&QWP(16*0,"esi"));
+       &cmp            ("edx",32);
+       &jb             (&label("one"));
+
+       &vmovdqu        (&X($T1),&QWP(16*1,"esi"));
+       &je             (&label("two"));
+
+       &vinserti128    ($T0,$T0,&QWP(16*2,"esi"),1);
+       &lea            ("esi",&DWP(16*3,"esi"));
+       &lea            ("ebx",&DWP(8,"ebx"));          # three padbits
+       &lea            ("edx",&DWP(32*5+128+8,"esp")); # --:r^1:r^2:r^3 (*)
+       &jmp            (&label("tail"));
+
+&set_label("two");
+       &lea            ("esi",&DWP(16*2,"esi"));
+       &lea            ("ebx",&DWP(16,"ebx"));         # two padbits
+       &lea            ("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
+       &jmp            (&label("tail"));
+
+&set_label("one");
+       &lea            ("esi",&DWP(16*1,"esi"));
+       &vpxor          ($T1,$T1,$T1);
+       &lea            ("ebx",&DWP(32,"ebx","eax",8)); # one or no padbits
+       &lea            ("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
+       &jmp            (&label("tail"));
+
+# (*)  spots marked with '--' are data from next table entry, but they
+#      are multiplied by 0 and therefore rendered insignificant
+
+&set_label("even",32);
+       &vmovdqu        (&X($T0),&QWP(16*0,"esi"));     # load input
+       &vmovdqu        (&X($T1),&QWP(16*1,"esi"));
+       &vinserti128    ($T0,$T0,&QWP(16*2,"esi"),1);
+       &vinserti128    ($T1,$T1,&QWP(16*3,"esi"),1);
+       &lea            ("esi",&DWP(16*4,"esi"));
+       &sub            ("ecx",64);
+       &jz             (&label("tail"));
+
+&set_label("loop");
+       ################################################################
+       # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
+       # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
+       # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
+       # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
+       #   \________/ \_______/
+       ################################################################
+
+sub vsplat_input {
+# Emits AVX2 code that converts 64 bytes of input held in $T0:$T1 (four
+# 16-byte Poly1305 blocks with lanes interleaved) into base-2^26 limb
+# vectors, after spilling the current hash limbs $D0-$D2 to the stack.
+# Every &-call appends one instruction to the output; statement order is
+# significant and must not be changed.
+       &vmovdqa        (&QWP(32*2,"esp"),$D2);
+       &vpsrldq        ($D2,$T0,6);                    # splat input
+       &vmovdqa        (&QWP(32*0,"esp"),$D0);
+       &vpsrldq        ($D0,$T1,6);
+       &vmovdqa        (&QWP(32*1,"esp"),$D1);
+       &vpunpckhqdq    ($D1,$T0,$T1);                  # 4
+       &vpunpcklqdq    ($T0,$T0,$T1);                  # 0:1
+       &vpunpcklqdq    ($D2,$D2,$D0);                  # 2:3
+
+       # Shift/mask the packed qwords into 26-bit limbs (limb 4 gets the
+       # explicit padbit table entry OR-ed in at 0(%ebx)).
+       &vpsrlq         ($D0,$D2,30);
+       &vpsrlq         ($D2,$D2,4);
+       &vpsrlq         ($T1,$T0,26);
+       &vpsrlq         ($D1,$D1,40);                   # 4
+       &vpand          ($D2,$D2,$MASK);                # 2
+       &vpand          ($T0,$T0,$MASK);                # 0
+       &vpand          ($T1,$T1,$MASK);                # 1
+       &vpand          ($D0,$D0,$MASK);                # 3 (*)
+       &vpor           ($D1,$D1,&QWP(0,"ebx"));        # padbit, yes, always
+
+       # (*)   note that output is counterintuitive, inp[3:4] is
+       #       returned in $D1-2, while $D3-4 are preserved;
+}
+       &vsplat_input   ();
+
+sub vpmuladd {
+my $addr = shift;      # closure mapping table index 0..8 -> r0,r1,s1,... operand
+
+# Emits one AVX2 "add hash, multiply by key powers" round.  $addr(i)
+# yields the memory operand for the i-th interleaved key element
+# (r0,r1,r2,r3,r4,s1,s2,s3,s4).  Fix vs. original: the three bare
+# QWP(...) calls now use &QWP(...) for consistency with every other
+# QWP/DWP call in this file (behavior is identical, QWP was already
+# resolvable at run time).
+       &vpaddq         ($D2,$D2,&QWP(32*2,"esp"));     # add hash value
+       &vpaddq         ($T0,$T0,&QWP(32*0,"esp"));
+       &vpaddq         ($T1,$T1,&QWP(32*1,"esp"));
+       &vpaddq         ($D0,$D0,$D3);
+       &vpaddq         ($D1,$D1,$D4);
+
+       ################################################################
+       # d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0   + h4*5*r4
+       # d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1   + h4*r0
+       # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
+       # d1 = h2*5*r4 + h0*r1 + h1*r0   + h3*5*r3 + h4*5*r2
+       # d2 = h2*r0   + h0*r2 + h1*r1   + h3*5*r4 + h4*5*r3
+
+       &vpmuludq       ($D3,$D2,&$addr(1));            # d3 = h2*r1
+        &vmovdqa       (&QWP(32*1,"esp"),$T1);
+       &vpmuludq       ($D4,$D2,&$addr(2));            # d4 = h2*r2
+        &vmovdqa       (&QWP(32*3,"esp"),$D0);
+       &vpmuludq       ($D0,$D2,&$addr(7));            # d0 = h2*s3
+        &vmovdqa       (&QWP(32*4,"esp"),$D1);
+       &vpmuludq       ($D1,$D2,&$addr(8));            # d1 = h2*s4
+       &vpmuludq       ($D2,$D2,&$addr(0));            # d2 = h2*r0
+
+       &vpmuludq       ($T2,$T0,&$addr(3));            # h0*r3
+       &vpaddq         ($D3,$D3,$T2);                  # d3 += h0*r3
+       &vpmuludq       ($T1,$T0,&$addr(4));            # h0*r4
+       &vpaddq         ($D4,$D4,$T1);                  # d4 + h0*r4
+       &vpmuludq       ($T2,$T0,&$addr(0));            # h0*r0
+       &vpaddq         ($D0,$D0,$T2);                  # d0 + h0*r0
+        &vmovdqa       ($T2,&QWP(32*1,"esp"));         # h1
+       &vpmuludq       ($T1,$T0,&$addr(1));            # h0*r1
+       &vpaddq         ($D1,$D1,$T1);                  # d1 += h0*r1
+       &vpmuludq       ($T0,$T0,&$addr(2));            # h0*r2
+       &vpaddq         ($D2,$D2,$T0);                  # d2 += h0*r2
+
+       &vpmuludq       ($T1,$T2,&$addr(2));            # h1*r2
+       &vpaddq         ($D3,$D3,$T1);                  # d3 += h1*r2
+       &vpmuludq       ($T0,$T2,&$addr(3));            # h1*r3
+       &vpaddq         ($D4,$D4,$T0);                  # d4 += h1*r3
+       &vpmuludq       ($T1,$T2,&$addr(8));            # h1*s4
+       &vpaddq         ($D0,$D0,$T1);                  # d0 += h1*s4
+        &vmovdqa       ($T1,&QWP(32*3,"esp"));         # h3
+       &vpmuludq       ($T0,$T2,&$addr(0));            # h1*r0
+       &vpaddq         ($D1,$D1,$T0);                  # d1 += h1*r0
+       &vpmuludq       ($T2,$T2,&$addr(1));            # h1*r1
+       &vpaddq         ($D2,$D2,$T2);                  # d2 += h1*r1
+
+       &vpmuludq       ($T0,$T1,&$addr(0));            # h3*r0
+       &vpaddq         ($D3,$D3,$T0);                  # d3 += h3*r0
+       &vpmuludq       ($T2,$T1,&$addr(1));            # h3*r1
+       &vpaddq         ($D4,$D4,$T2);                  # d4 += h3*r1
+       &vpmuludq       ($T0,$T1,&$addr(6));            # h3*s2
+       &vpaddq         ($D0,$D0,$T0);                  # d0 += h3*s2
+        &vmovdqa       ($T0,&QWP(32*4,"esp"));         # h4
+       &vpmuludq       ($T2,$T1,&$addr(7));            # h3*s3
+       &vpaddq         ($D1,$D1,$T2);                  # d1+= h3*s3
+       &vpmuludq       ($T1,$T1,&$addr(8));            # h3*s4
+       &vpaddq         ($D2,$D2,$T1);                  # d2 += h3*s4
+
+       &vpmuludq       ($T2,$T0,&$addr(8));            # h4*s4
+       &vpaddq         ($D3,$D3,$T2);                  # d3 += h4*s4
+       &vpmuludq       ($T1,$T0,&$addr(5));            # h4*s1
+       &vpaddq         ($D0,$D0,$T1);                  # d0 += h4*s1
+       &vpmuludq       ($T2,$T0,&$addr(0));            # h4*r0
+       &vpaddq         ($D4,$D4,$T2);                  # d4 += h4*r0
+        &vmovdqa       ($MASK,&QWP(64,"ebx"));
+       &vpmuludq       ($T1,$T0,&$addr(6));            # h4*s2
+       &vpaddq         ($D1,$D1,$T1);                  # d1 += h4*s2
+       &vpmuludq       ($T0,$T0,&$addr(7));            # h4*s3
+       &vpaddq         ($D2,$D2,$T0);                  # d2 += h4*s3
+}
+       &vpmuladd       (sub {  my $i=shift; &QWP(32*$i-128,"edx");     });
+
<![CDATA[+sub vlazy_reduction {
+       ################################################################
+       # lazy reduction
+       #
+       # Emits the carry-propagation chain h3->h4->h0->h1, interleaved
+       # with h0->h1->h2->h3->h4, bringing each limb back under 2^26
+       # (plus a small carry).  The h4->h0 wrap multiplies the carry by
+       # 5 via "c + (c<<2)".  NOTE(review): the vpaddd (32-bit adds)
+       # appear intentional - at these points the addends fit in the low
+       # 32 bits of each 64-bit lane - but confirm against upstream
+       # before altering.  Instruction order is tuned; do not reorder.
+
+        &vpsrlq        ($T0,$D3,26);
+        &vpand         ($D3,$D3,$MASK);
+       &vpsrlq         ($T1,$D0,26);
+       &vpand          ($D0,$D0,$MASK);
+        &vpaddq        ($D4,$D4,$T0);                  # h3 -> h4
+       &vpaddq         ($D1,$D1,$T1);                  # h0 -> h1
+        &vpsrlq        ($T0,$D4,26);
+        &vpand         ($D4,$D4,$MASK);
+       &vpsrlq         ($T1,$D1,26);
+       &vpand          ($D1,$D1,$MASK);
+       &vpaddq         ($D2,$D2,$T1);                  # h1 -> h2
+        &vpaddd        ($D0,$D0,$T0);
+        &vpsllq        ($T0,$T0,2);
+       &vpsrlq         ($T1,$D2,26);
+       &vpand          ($D2,$D2,$MASK);
+        &vpaddd        ($D0,$D0,$T0);                  # h4 -> h0
+       &vpaddd         ($D3,$D3,$T1);                  # h2 -> h3
+       &vpsrlq         ($T1,$D3,26);
+        &vpsrlq        ($T0,$D0,26);
+        &vpand         ($D0,$D0,$MASK);
+       &vpand          ($D3,$D3,$MASK);
+        &vpaddd        ($D1,$D1,$T0);                  # h0 -> h1
+       &vpaddd         ($D4,$D4,$T1);                  # h3 -> h4
+}]]>
+       &vlazy_reduction();
+
+       &vmovdqu        (&X($T0),&QWP(16*0,"esi"));     # load input
+       &vmovdqu        (&X($T1),&QWP(16*1,"esi"));
+       &vinserti128    ($T0,$T0,&QWP(16*2,"esi"),1);
+       &vinserti128    ($T1,$T1,&QWP(16*3,"esi"),1);
+       &lea            ("esi",&DWP(16*4,"esi"));
+       &sub            ("ecx",64);
+       &jnz            (&label("loop"));
+
+&set_label("tail");
+       &vsplat_input   ();
+       &and            ("ebx",-64);                    # restore pointer
+
+       &vpmuladd       (sub {  my $i=shift; &QWP(4+32*$i-128,"edx");   });
+
+       &vlazy_reduction();
+
+       ################################################################
+       # horizontal addition
+
+       &vpsrldq        ($T0,$D0,8);
+       &vpsrldq        ($T1,$D1,8);
+       &vpaddq         ($D0,$D0,$T0);
+       &vpsrldq        ($T0,$D2,8);
+       &vpaddq         ($D1,$D1,$T1);
+       &vpsrldq        ($T1,$D3,8);
+       &vpaddq         ($D2,$D2,$T0);
+       &vpsrldq        ($T0,$D4,8);
+       &vpaddq         ($D3,$D3,$T1);
+       &vpermq         ($T1,$D0,2);                    # keep folding
+       &vpaddq         ($D4,$D4,$T0);
+       &vpermq         ($T0,$D1,2);
+       &vpaddq         ($D0,$D0,$T1);
+       &vpermq         ($T1,$D2,2);
+       &vpaddq         ($D1,$D1,$T0);
+       &vpermq         ($T0,$D3,2);
+       &vpaddq         ($D2,$D2,$T1);
+       &vpermq         ($T1,$D4,2);
+       &vpaddq         ($D3,$D3,$T0);
+       &vpaddq         ($D4,$D4,$T1);
+
+       &cmp            ("ecx",0);
+       &je             (&label("done"));
+
+       ################################################################
+       # clear all but single word
+
+       &vpshufd        (&X($D0),&X($D0),0b11111100);
+       &lea            ("edx",&DWP(32*5+128,"esp"));   # restore pointer
+       &vpshufd        (&X($D1),&X($D1),0b11111100);
+       &vpshufd        (&X($D2),&X($D2),0b11111100);
+       &vpshufd        (&X($D3),&X($D3),0b11111100);
+       &vpshufd        (&X($D4),&X($D4),0b11111100);
+       &jmp            (&label("even"));
+
+&set_label("done",16);
+       &vmovd          (&DWP(-16*3+4*0,"edi"),"xmm0"); # store hash value
+       &vmovd          (&DWP(-16*3+4*1,"edi"),"xmm1");
+       &vmovd          (&DWP(-16*3+4*2,"edi"),"xmm2");
+       &vmovd          (&DWP(-16*3+4*3,"edi"),"xmm3");
+       &vmovd          (&DWP(-16*3+4*4,"edi"),"xmm4");
+       &vzeroupper     ();
+&set_label("nodata");
+       &mov    ("esp","ebp");
+&function_end("_poly1305_blocks_avx2");
+}
+&set_label("const_sse2",64);
+       &data_word(1<<24,0,     1<<24,0,        1<<24,0,        1<<24,0);
+       &data_word(0,0,         0,0,            0,0,            0,0);
+       &data_word(0x03ffffff,0,0x03ffffff,0,   0x03ffffff,0,   0x03ffffff,0);
+       &data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
+}
+&asciz ("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&align (4);
+
+&asm_finish();
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
new file mode 100755 (executable)
index 0000000..d991365
--- /dev/null
@@ -0,0 +1,2244 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86_64.
+#
+# March 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+#              IALU/gcc-4.8(*) AVX(**)         AVX2
+# P4           4.90/+120%      -
+# Core 2       2.39/+90%       -
+# Westmere     1.86/+120%      -
+# Sandy Bridge 1.39/+140%      1.10
+# Haswell      1.10/+175%      1.11            0.65
+# Skylake      1.12/+120%      0.96            0.51
+# Silvermont   2.83/+95%       -
+# VIA Nano     1.82/+150%      -
+# Sledgehammer 1.38/+160%      -
+# Bulldozer    2.21/+130%      0.97
+#
+# (*)  improvement coefficients relative to clang are more modest and
+#      are ~50% on most processors, in both cases we are comparing to
+#      __int128 code;
<![CDATA[+# (**) An SSE2 implementation was attempted, but among non-AVX processors
+#      it was faster than integer-only code only on the older Intel P4 and
+#      Core processors - by 30-50%, with the advantage shrinking on newer
+#      parts - while being slower on contemporary ones, for example almost
+#      2x slower on Atom; as the former are naturally disappearing, SSE2
+#      was deemed unnecessary;]]>
+
<![CDATA[+# Command line: "flavour" selects the perlasm scheme (elf, macosx, nasm,
+# masm, mingw64, ...); if only a filename is given it is treated as the
+# output and the flavour is left undefined.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate the x86_64-xlate.pl translator relative to this script.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Probe the assembler for AVX support; $avx ends up 0 (none), 1 (AVX)
+# or 2 (AVX2), gating which code paths are emitted below.
+# NOTE(review): if $ENV{CC} / $ENV{ASM} are unset these backticks and
+# matches run with empty strings and simply leave $avx at 0 - harmless,
+# but emits warnings if this ever runs under "use warnings".
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+       $avx = ($1>=10) + ($1>=12);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+       $avx = ($2>=3.0) + ($2>3.0);
+}
+
+# Pipe all generated code through the translator into the output file.
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;]]>
+
<![CDATA[+# Register assignments used by all the base-2^64 code below.
+my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+my ($mac,$nonce)=($inp,$len);  # *_emit arguments
+my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
+my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
+
+sub poly1305_iteration {
+# input:       copy of $r1 in %rax, $h0-$h2, $r0-$r1
+# output:      $h0-$h2 *= $r0-$r1
+#
+# Appends to $code one Poly1305 multiply step: the 130-bit hash
+# (limbs $h0,$h1,$h2 base 2^64) is multiplied by the clamped key
+# (r0,r1, with s1 = r1 + r1>>2 pre-folding the mod 2^130-5 wrap)
+# and partially reduced at the end via the mask-and-shift-by-2 trick.
+# Clobbers %rax, %rdx, $d1-$d3.
+$code.=<<___;
+       mulq    $h0                     # h0*r1
+       mov     %rax,$d2
+        mov    $r0,%rax
+       mov     %rdx,$d3
+
+       mulq    $h0                     # h0*r0
+       mov     %rax,$h0                # future $h0
+        mov    $r0,%rax
+       mov     %rdx,$d1
+
+       mulq    $h1                     # h1*r0
+       add     %rax,$d2
+        mov    $s1,%rax
+       adc     %rdx,$d3
+
+       mulq    $h1                     # h1*s1
+        mov    $h2,$h1                 # borrow $h1
+       add     %rax,$h0
+       adc     %rdx,$d1
+
+       imulq   $s1,$h1                 # h2*s1
+       add     $h1,$d2
+        mov    $d1,$h1
+       adc     \$0,$d3
+
+       imulq   $r0,$h2                 # h2*r0
+       add     $d2,$h1
+       mov     \$-4,%rax               # mask value
+       adc     $h2,$d3
+
+       and     $d3,%rax                # last reduction step
+       mov     $d3,$h2
+       shr     \$2,$d3
+       and     \$3,$h2
+       add     $d3,%rax
+       add     %rax,$h0
+       adc     \$0,$h1
+___
+}]]>
+
+########################################################################
+# Layout of opaque area is following.
+#
+#      unsigned __int64 h[3];          # current hash value base 2^64
+#      unsigned __int64 r[2];          # key value base 2^64
+
<![CDATA[+# Emit the integer-only (base 2^64) entry points.  The heredoc bodies
+# are assembly passed through verbatim - only Perl-level comments are
+# added between the statements here.
+$code.=<<___;
+.text
+
+.extern        OPENSSL_ia32cap_P
+
+.globl poly1305_init
+.type  poly1305_init,\@function,2
+.align 32
+poly1305_init:
+       xor     %rax,%rax
+       mov     %rax,0($ctx)            # initialize hash value
+       mov     %rax,8($ctx)
+       mov     %rax,16($ctx)
+
+       cmp     \$0,$inp
+       je      .Lno_key
+
+       lea     poly1305_blocks(%rip),%r10
+       lea     poly1305_emit(%rip),%r11
+___
+# If AVX code was compiled in, probe OPENSSL_ia32cap_P at run time and
+# substitute the vector implementations before the function-pointer
+# pair is stored at (%rdx).
+$code.=<<___   if ($avx);
+       mov     OPENSSL_ia32cap_P+4(%rip),%r9
+       lea     poly1305_blocks_avx(%rip),%rax
+       lea     poly1305_emit_avx(%rip),%rcx
+       bt      \$`60-32`,%r9           # AVX?
+       cmovc   %rax,%r10
+       cmovc   %rcx,%r11
+___
+$code.=<<___   if ($avx>1);
+       lea     poly1305_blocks_avx2(%rip),%rax
+       bt      \$`5+32`,%r9            # AVX2?
+       cmovc   %rax,%r10
+___
+$code.=<<___;
+       mov     \$0x0ffffffc0fffffff,%rax
+       mov     \$0x0ffffffc0ffffffc,%rcx
+       and     0($inp),%rax
+       and     8($inp),%rcx
+       mov     %rax,24($ctx)
+       mov     %rcx,32($ctx)
+
+       mov     %r10,0(%rdx)
+       mov     %r11,8(%rdx)
+
+       mov     \$1,%eax
+.Lno_key:
+       ret
+.size  poly1305_init,.-poly1305_init
+
+.globl poly1305_blocks
+.type  poly1305_blocks,\@function,4
+.align 32
+poly1305_blocks:
+       sub     \$16,$len               # too short?
+       jc      .Lno_data
+
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+.Lblocks_body:
+
+       mov     $len,%r15               # reassign $len
+
+       mov     24($ctx),$r0            # load r
+       mov     32($ctx),$s1
+
+       mov     0($ctx),$h0             # load hash value
+       mov     8($ctx),$h1
+       mov     16($ctx),$h2
+
+       mov     $s1,$r1
+       shr     \$2,$s1
+       mov     $r1,%rax
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+       jmp     .Loop
+
+.align 32
+.Loop:
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+       adc     $padbit,$h2
+___
+       # splice one multiply-and-reduce step into the loop body
+       &poly1305_iteration();]]>
<![CDATA[+# Loop tail of poly1305_blocks plus poly1305_emit (final tag: add 5,
+# select the reduced value iff it carried past bit 130, add the nonce).
+# Fix vs. original: "overfow" typo in the emitted comment corrected.
+$code.=<<___;
+       mov     $r1,%rax
+       sub     \$16,%r15               # len-=16
+       jnc     .Loop
+
+       mov     $h0,0($ctx)             # store hash value
+       mov     $h1,8($ctx)
+       mov     $h2,16($ctx)
+
+       mov     0(%rsp),%r15
+       mov     8(%rsp),%r14
+       mov     16(%rsp),%r13
+       mov     24(%rsp),%r12
+       mov     32(%rsp),%rbp
+       mov     40(%rsp),%rbx
+       lea     48(%rsp),%rsp
+.Lno_data:
+.Lblocks_epilogue:
+       ret
+.size  poly1305_blocks,.-poly1305_blocks
+
+.globl poly1305_emit
+.type  poly1305_emit,\@function,3
+.align 32
+poly1305_emit:
+       mov     0($ctx),%r8     # load hash value
+       mov     8($ctx),%r9
+       mov     16($ctx),%r10
+
+       mov     %r8,%rax
+       add     \$5,%r8         # compare to modulus
+       mov     %r9,%rcx
+       adc     \$0,%r9
+       adc     \$0,%r10
+       shr     \$2,%r10        # did 130-bit value overflow?
+       cmovnz  %r8,%rax
+       cmovnz  %r9,%rcx
+
+       add     0($nonce),%rax  # accumulate nonce
+       adc     8($nonce),%rcx
+       mov     %rax,0($mac)    # write result
+       mov     %rcx,8($mac)
+
+       ret
+.size  poly1305_emit,.-poly1305_emit
+___]]>
+if ($avx) {
+
+########################################################################
+# Layout of opaque area is following.
+#
+#      unsigned __int32 h[5];          # current hash value base 2^26
+#      unsigned __int32 is_base2_26;
+#      unsigned __int64 r[2];          # key value base 2^64
+#      unsigned __int64 pad;
+#      struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
+#
+# where r^n are base 2^26 digits of degrees of multiplier key. There are
+# 5 digits, but last four are interleaved with multiples of 5, totalling
+# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
+
+my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
+    map("%xmm$_",(0..15));
+
<![CDATA[+# Internal subroutine shared by the AVX paths: one base-2^64
+# multiply-and-reduce step wrapped in a callable label.
+$code.=<<___;
+.type  __poly1305_block,\@abi-omnipotent
+.align 32
+__poly1305_block:
+___
+       &poly1305_iteration();]]>
+$code.=<<___;
+       ret
+.size  __poly1305_block,.-__poly1305_block
+
+.type  __poly1305_init_avx,\@abi-omnipotent
+.align 32
+__poly1305_init_avx:
+       mov     $r0,$h0
+       mov     $r1,$h1
+       xor     $h2,$h2
+
+       lea     48+64($ctx),$ctx        # size optimization
+
+       mov     $r1,%rax
+       call    __poly1305_block        # r^2
+
+       mov     \$0x3ffffff,%eax        # save interleaved r^2 and r base 2^26
+       mov     \$0x3ffffff,%edx
+       mov     $h0,$d1
+       and     $h0#d,%eax
+       mov     $r0,$d2
+       and     $r0#d,%edx
+       mov     %eax,`16*0+0-64`($ctx)
+       shr     \$26,$d1
+       mov     %edx,`16*0+4-64`($ctx)
+       shr     \$26,$d2
+
+       mov     \$0x3ffffff,%eax
+       mov     \$0x3ffffff,%edx
+       and     $d1#d,%eax
+       and     $d2#d,%edx
+       mov     %eax,`16*1+0-64`($ctx)
+       lea     (%rax,%rax,4),%eax      # *5
+       mov     %edx,`16*1+4-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       mov     %eax,`16*2+0-64`($ctx)
+       shr     \$26,$d1
+       mov     %edx,`16*2+4-64`($ctx)
+       shr     \$26,$d2
+
+       mov     $h1,%rax
+       mov     $r1,%rdx
+       shl     \$12,%rax
+       shl     \$12,%rdx
+       or      $d1,%rax
+       or      $d2,%rdx
+       and     \$0x3ffffff,%eax
+       and     \$0x3ffffff,%edx
+       mov     %eax,`16*3+0-64`($ctx)
+       lea     (%rax,%rax,4),%eax      # *5
+       mov     %edx,`16*3+4-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       mov     %eax,`16*4+0-64`($ctx)
+       mov     $h1,$d1
+       mov     %edx,`16*4+4-64`($ctx)
+       mov     $r1,$d2
+
+       mov     \$0x3ffffff,%eax
+       mov     \$0x3ffffff,%edx
+       shr     \$14,$d1
+       shr     \$14,$d2
+       and     $d1#d,%eax
+       and     $d2#d,%edx
+       mov     %eax,`16*5+0-64`($ctx)
+       lea     (%rax,%rax,4),%eax      # *5
+       mov     %edx,`16*5+4-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       mov     %eax,`16*6+0-64`($ctx)
+       shr     \$26,$d1
+       mov     %edx,`16*6+4-64`($ctx)
+       shr     \$26,$d2
+
+       mov     $h2,%rax
+       shl     \$24,%rax
+       or      %rax,$d1
+       mov     $d1#d,`16*7+0-64`($ctx)
+       lea     ($d1,$d1,4),$d1         # *5
+       mov     $d2#d,`16*7+4-64`($ctx)
+       lea     ($d2,$d2,4),$d2         # *5
+       mov     $d1#d,`16*8+0-64`($ctx)
+       mov     $d2#d,`16*8+4-64`($ctx)
+
+       mov     $r1,%rax
+       call    __poly1305_block        # r^3
+
+       mov     \$0x3ffffff,%eax        # save r^3 base 2^26
+       mov     $h0,$d1
+       and     $h0#d,%eax
+       shr     \$26,$d1
+       mov     %eax,`16*0+12-64`($ctx)
+
+       mov     \$0x3ffffff,%edx
+       and     $d1#d,%edx
+       mov     %edx,`16*1+12-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       shr     \$26,$d1
+       mov     %edx,`16*2+12-64`($ctx)
+
+       mov     $h1,%rax
+       shl     \$12,%rax
+       or      $d1,%rax
+       and     \$0x3ffffff,%eax
+       mov     %eax,`16*3+12-64`($ctx)
+       lea     (%rax,%rax,4),%eax      # *5
+       mov     $h1,$d1
+       mov     %eax,`16*4+12-64`($ctx)
+
+       mov     \$0x3ffffff,%edx
+       shr     \$14,$d1
+       and     $d1#d,%edx
+       mov     %edx,`16*5+12-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       shr     \$26,$d1
+       mov     %edx,`16*6+12-64`($ctx)
+
+       mov     $h2,%rax
+       shl     \$24,%rax
+       or      %rax,$d1
+       mov     $d1#d,`16*7+12-64`($ctx)
+       lea     ($d1,$d1,4),$d1         # *5
+       mov     $d1#d,`16*8+12-64`($ctx)
+
+       mov     $r1,%rax
+       call    __poly1305_block        # r^4
+
+       mov     \$0x3ffffff,%eax        # save r^4 base 2^26
+       mov     $h0,$d1
+       and     $h0#d,%eax
+       shr     \$26,$d1
+       mov     %eax,`16*0+8-64`($ctx)
+
+       mov     \$0x3ffffff,%edx
+       and     $d1#d,%edx
+       mov     %edx,`16*1+8-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       shr     \$26,$d1
+       mov     %edx,`16*2+8-64`($ctx)
+
+       mov     $h1,%rax
+       shl     \$12,%rax
+       or      $d1,%rax
+       and     \$0x3ffffff,%eax
+       mov     %eax,`16*3+8-64`($ctx)
+       lea     (%rax,%rax,4),%eax      # *5
+       mov     $h1,$d1
+       mov     %eax,`16*4+8-64`($ctx)
+
+       mov     \$0x3ffffff,%edx
+       shr     \$14,$d1
+       and     $d1#d,%edx
+       mov     %edx,`16*5+8-64`($ctx)
+       lea     (%rdx,%rdx,4),%edx      # *5
+       shr     \$26,$d1
+       mov     %edx,`16*6+8-64`($ctx)
+
+       mov     $h2,%rax
+       shl     \$24,%rax
+       or      %rax,$d1
+       mov     $d1#d,`16*7+8-64`($ctx)
+       lea     ($d1,$d1,4),$d1         # *5
+       mov     $d1#d,`16*8+8-64`($ctx)
+
+       lea     -48-64($ctx),$ctx       # size [de-]optimization
+       ret
+.size  __poly1305_init_avx,.-__poly1305_init_avx
+
+.type  poly1305_blocks_avx,\@function,4
+.align 32
+poly1305_blocks_avx:
+       mov     20($ctx),%r8d           # is_base2_26
+       cmp     \$128,$len
+       jae     .Lblocks_avx
+       test    %r8d,%r8d
+       jz      poly1305_blocks
+
+.Lblocks_avx:
+       and     \$-16,$len
+       jz      .Lno_data_avx
+
+       vzeroupper
+
+       test    %r8d,%r8d
+       jz      .Lbase2_64_avx
+
+       test    \$31,$len
+       jz      .Leven_avx
+
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+.Lblocks_avx_body:
+
+       mov     $len,%r15               # reassign $len
+
+       mov     0($ctx),$d1             # load hash value
+       mov     8($ctx),$d2
+       mov     16($ctx),$h2#d
+
+       mov     24($ctx),$r0            # load r
+       mov     32($ctx),$s1
+
+       ################################# base 2^26 -> base 2^64
+       mov     $d1#d,$h0#d
+       and     \$-1<<31,$d1
+       mov     $d2,$r1                 # borrow $r1
+       mov     $d2#d,$h1#d
+       and     \$-1<<31,$d2
+
+       shr     \$6,$d1
+       shl     \$52,$r1
+       add     $d1,$h0
+       shr     \$12,$h1
+       shr     \$18,$d2
+       add     $r1,$h0
+       adc     $d2,$h1
+
+       mov     $h2,$d1
+       shl     \$40,$d1
+       shr     \$24,$h2
+       add     $d1,$h1
+       adc     \$0,$h2                 # can be partially reduced...
+
+       mov     \$-4,$d2                # ... so reduce
+       mov     $h2,$d1
+       and     $h2,$d2
+       shr     \$2,$d1
+       and     \$3,$h2
+       add     $d2,$d1                 # =*5
+       add     $d1,$h0
+       adc     \$0,$h1
+
+       mov     $s1,$r1
+       mov     $s1,%rax
+       shr     \$2,$s1
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+       adc     $padbit,$h2
+
+       call    __poly1305_block
+
+       test    $padbit,$padbit         # if $padbit is zero,
+       jz      .Lstore_base2_64_avx    # store hash in base 2^64 format
+
+       ################################# base 2^64 -> base 2^26
+       mov     $h0,%rax
+       mov     $h0,%rdx
+       shr     \$52,$h0
+       mov     $h1,$r0
+       mov     $h1,$r1
+       shr     \$26,%rdx
+       and     \$0x3ffffff,%rax        # h[0]
+       shl     \$12,$r0
+       and     \$0x3ffffff,%rdx        # h[1]
+       shr     \$14,$h1
+       or      $r0,$h0
+       shl     \$24,$h2
+       and     \$0x3ffffff,$h0         # h[2]
+       shr     \$40,$r1
+       and     \$0x3ffffff,$h1         # h[3]
+       or      $r1,$h2                 # h[4]
+
+       sub     \$16,%r15
+       jz      .Lstore_base2_26_avx
+
+       vmovd   %rax#d,$H0
+       vmovd   %rdx#d,$H1
+       vmovd   $h0#d,$H2
+       vmovd   $h1#d,$H3
+       vmovd   $h2#d,$H4
+       jmp     .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+       mov     $h0,0($ctx)
+       mov     $h1,8($ctx)
+       mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
+       jmp     .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+       mov     %rax#d,0($ctx)          # store hash value base 2^26
+       mov     %rdx#d,4($ctx)
+       mov     $h0#d,8($ctx)
+       mov     $h1#d,12($ctx)
+       mov     $h2#d,16($ctx)
+.align 16
+.Ldone_avx:
+       mov     0(%rsp),%r15
+       mov     8(%rsp),%r14
+       mov     16(%rsp),%r13
+       mov     24(%rsp),%r12
+       mov     32(%rsp),%rbp
+       mov     40(%rsp),%rbx
+       lea     48(%rsp),%rsp
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+       ret
+
+.align 32
+.Lbase2_64_avx:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+.Lbase2_64_avx_body:
+
+       mov     $len,%r15               # reassign $len
+
+       mov     24($ctx),$r0            # load r
+       mov     32($ctx),$s1
+
+       mov     0($ctx),$h0             # load hash value
+       mov     8($ctx),$h1
+       mov     16($ctx),$h2#d
+
+       mov     $s1,$r1
+       mov     $s1,%rax
+       shr     \$2,$s1
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+
+       test    \$31,$len
+       jz      .Linit_avx
+
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+       adc     $padbit,$h2
+       sub     \$16,%r15
+
+       call    __poly1305_block
+
+.Linit_avx:
+       ################################# base 2^64 -> base 2^26
+       mov     $h0,%rax
+       mov     $h0,%rdx
+       shr     \$52,$h0
+       mov     $h1,$d1
+       mov     $h1,$d2
+       shr     \$26,%rdx
+       and     \$0x3ffffff,%rax        # h[0]
+       shl     \$12,$d1
+       and     \$0x3ffffff,%rdx        # h[1]
+       shr     \$14,$h1
+       or      $d1,$h0
+       shl     \$24,$h2
+       and     \$0x3ffffff,$h0         # h[2]
+       shr     \$40,$d2
+       and     \$0x3ffffff,$h1         # h[3]
+       or      $d2,$h2                 # h[4]
+
+       vmovd   %rax#d,$H0
+       vmovd   %rdx#d,$H1
+       vmovd   $h0#d,$H2
+       vmovd   $h1#d,$H3
+       vmovd   $h2#d,$H4
+       movl    \$1,20($ctx)            # set is_base2_26
+
+       call    __poly1305_init_avx
+
+.Lproceed_avx:
+       mov     %r15,$len
+
+       mov     0(%rsp),%r15
+       mov     8(%rsp),%r14
+       mov     16(%rsp),%r13
+       mov     24(%rsp),%r12
+       mov     32(%rsp),%rbp
+       mov     40(%rsp),%rbx
+       lea     48(%rsp),%rax
+       lea     48(%rsp),%rsp
+.Lbase2_64_avx_epilogue:
+       jmp     .Ldo_avx
+
+.align 32
+.Leven_avx:
+       vmovd           4*0($ctx),$H0           # load hash value
+       vmovd           4*1($ctx),$H1
+       vmovd           4*2($ctx),$H2
+       vmovd           4*3($ctx),$H3
+       vmovd           4*4($ctx),$H4
+
+.Ldo_avx:
+___
+$code.=<<___   if (!$win64);
+       lea             -0x58(%rsp),%r11
+       sub             \$0x178,%rsp
+___
+$code.=<<___   if ($win64);
+       lea             -0xf8(%rsp),%r11
+       sub             \$0x218,%rsp
+       vmovdqa         %xmm6,0x50(%r11)
+       vmovdqa         %xmm7,0x60(%r11)
+       vmovdqa         %xmm8,0x70(%r11)
+       vmovdqa         %xmm9,0x80(%r11)
+       vmovdqa         %xmm10,0x90(%r11)
+       vmovdqa         %xmm11,0xa0(%r11)
+       vmovdqa         %xmm12,0xb0(%r11)
+       vmovdqa         %xmm13,0xc0(%r11)
+       vmovdqa         %xmm14,0xd0(%r11)
+       vmovdqa         %xmm15,0xe0(%r11)
+.Ldo_avx_body:
+___
+$code.=<<___;
+       sub             \$64,$len
+       lea             -32($inp),%rax
+       cmovc           %rax,$inp
+
+       vmovdqu         `16*3`($ctx),$D4        # preload r0^2
+       lea             `16*3+64`($ctx),$ctx    # size optimization
+       lea             .Lconst(%rip),%rcx
+
+       ################################################################
+       # load input
+       vmovdqu         16*2($inp),$T0
+       vmovdqu         16*3($inp),$T1
+       vmovdqa         64(%rcx),$MASK          # .Lmask26
+
+       vpsrldq         \$6,$T0,$T2             # splat input
+       vpsrldq         \$6,$T1,$T3
+       vpunpckhqdq     $T1,$T0,$T4             # 4
+       vpunpcklqdq     $T1,$T0,$T0             # 0:1
+       vpunpcklqdq     $T3,$T2,$T3             # 2:3
+
+       vpsrlq          \$40,$T4,$T4            # 4
+       vpsrlq          \$26,$T0,$T1
+       vpand           $MASK,$T0,$T0           # 0
+       vpsrlq          \$4,$T3,$T2
+       vpand           $MASK,$T1,$T1           # 1
+       vpsrlq          \$30,$T3,$T3
+       vpand           $MASK,$T2,$T2           # 2
+       vpand           $MASK,$T3,$T3           # 3
+       vpor            32(%rcx),$T4,$T4        # padbit, yes, always
+
+       jbe             .Lskip_loop_avx
+
+       # expand and copy pre-calculated table to stack
+       vmovdqu         `16*1-64`($ctx),$D1
+       vmovdqu         `16*2-64`($ctx),$D2
+       vpshufd         \$0xEE,$D4,$D3          # 34xx -> 3434
+       vpshufd         \$0x44,$D4,$D0          # xx12 -> 1212
+       vmovdqa         $D3,-0x90(%r11)
+       vmovdqa         $D0,0x00(%rsp)
+       vpshufd         \$0xEE,$D1,$D4
+       vmovdqu         `16*3-64`($ctx),$D0
+       vpshufd         \$0x44,$D1,$D1
+       vmovdqa         $D4,-0x80(%r11)
+       vmovdqa         $D1,0x10(%rsp)
+       vpshufd         \$0xEE,$D2,$D3
+       vmovdqu         `16*4-64`($ctx),$D1
+       vpshufd         \$0x44,$D2,$D2
+       vmovdqa         $D3,-0x70(%r11)
+       vmovdqa         $D2,0x20(%rsp)
+       vpshufd         \$0xEE,$D0,$D4
+       vmovdqu         `16*5-64`($ctx),$D2
+       vpshufd         \$0x44,$D0,$D0
+       vmovdqa         $D4,-0x60(%r11)
+       vmovdqa         $D0,0x30(%rsp)
+       vpshufd         \$0xEE,$D1,$D3
+       vmovdqu         `16*6-64`($ctx),$D0
+       vpshufd         \$0x44,$D1,$D1
+       vmovdqa         $D3,-0x50(%r11)
+       vmovdqa         $D1,0x40(%rsp)
+       vpshufd         \$0xEE,$D2,$D4
+       vmovdqu         `16*7-64`($ctx),$D1
+       vpshufd         \$0x44,$D2,$D2
+       vmovdqa         $D4,-0x40(%r11)
+       vmovdqa         $D2,0x50(%rsp)
+       vpshufd         \$0xEE,$D0,$D3
+       vmovdqu         `16*8-64`($ctx),$D2
+       vpshufd         \$0x44,$D0,$D0
+       vmovdqa         $D3,-0x30(%r11)
+       vmovdqa         $D0,0x60(%rsp)
+       vpshufd         \$0xEE,$D1,$D4
+       vpshufd         \$0x44,$D1,$D1
+       vmovdqa         $D4,-0x20(%r11)
+       vmovdqa         $D1,0x70(%rsp)
+       vpshufd         \$0xEE,$D2,$D3
+        vmovdqa        0x00(%rsp),$D4          # preload r0^2
+       vpshufd         \$0x44,$D2,$D2
+       vmovdqa         $D3,-0x10(%r11)
+       vmovdqa         $D2,0x80(%rsp)
+
+       jmp             .Loop_avx
+
+.align 32
+.Loop_avx:
+       ################################################################
+       # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+       # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+       #   \___________________/
+       # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+       # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+       #   \___________________/ \____________________/
+       #
+       # Note that we start with inp[2:3]*r^2. This is because it
+       # doesn't depend on reduction in previous iteration.
+       ################################################################
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+       #
+       # though note that $Tx and $Hx are "reversed" in this section,
+       # and $D4 is preloaded with r0^2...
+
+       vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
+       vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
+         vmovdqa       $H2,0x20(%r11)                          # offload hash
+       vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
+        vmovdqa        0x10(%rsp),$H2          # r1^2
+       vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
+       vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
+
+         vmovdqa       $H0,0x00(%r11)                          #
+       vpmuludq        0x20(%rsp),$T4,$H0      # h4*s1
+         vmovdqa       $H1,0x10(%r11)                          #
+       vpmuludq        $T3,$H2,$H1             # h3*r1
+       vpaddq          $H0,$D0,$D0             # d0 += h4*s1
+       vpaddq          $H1,$D4,$D4             # d4 += h3*r1
+         vmovdqa       $H3,0x30(%r11)                          #
+       vpmuludq        $T2,$H2,$H0             # h2*r1
+       vpmuludq        $T1,$H2,$H1             # h1*r1
+       vpaddq          $H0,$D3,$D3             # d3 += h2*r1
+        vmovdqa        0x30(%rsp),$H3          # r2^2
+       vpaddq          $H1,$D2,$D2             # d2 += h1*r1
+         vmovdqa       $H4,0x40(%r11)                          #
+       vpmuludq        $T0,$H2,$H2             # h0*r1
+        vpmuludq       $T2,$H3,$H0             # h2*r2
+       vpaddq          $H2,$D1,$D1             # d1 += h0*r1
+
+        vmovdqa        0x40(%rsp),$H4          # s2^2
+       vpaddq          $H0,$D4,$D4             # d4 += h2*r2
+       vpmuludq        $T1,$H3,$H1             # h1*r2
+       vpmuludq        $T0,$H3,$H3             # h0*r2
+       vpaddq          $H1,$D3,$D3             # d3 += h1*r2
+        vmovdqa        0x50(%rsp),$H2          # r3^2
+       vpaddq          $H3,$D2,$D2             # d2 += h0*r2
+       vpmuludq        $T4,$H4,$H0             # h4*s2
+       vpmuludq        $T3,$H4,$H4             # h3*s2
+       vpaddq          $H0,$D1,$D1             # d1 += h4*s2
+        vmovdqa        0x60(%rsp),$H3          # s3^2
+       vpaddq          $H4,$D0,$D0             # d0 += h3*s2
+
+        vmovdqa        0x80(%rsp),$H4          # s4^2
+       vpmuludq        $T1,$H2,$H1             # h1*r3
+       vpmuludq        $T0,$H2,$H2             # h0*r3
+       vpaddq          $H1,$D4,$D4             # d4 += h1*r3
+       vpaddq          $H2,$D3,$D3             # d3 += h0*r3
+       vpmuludq        $T4,$H3,$H0             # h4*s3
+       vpmuludq        $T3,$H3,$H1             # h3*s3
+       vpaddq          $H0,$D2,$D2             # d2 += h4*s3
+        vmovdqu        16*0($inp),$H0                          # load input
+       vpaddq          $H1,$D1,$D1             # d1 += h3*s3
+       vpmuludq        $T2,$H3,$H3             # h2*s3
+        vpmuludq       $T2,$H4,$T2             # h2*s4
+       vpaddq          $H3,$D0,$D0             # d0 += h2*s3
+
+        vmovdqu        16*1($inp),$H1                          #
+       vpaddq          $T2,$D1,$D1             # d1 += h2*s4
+       vpmuludq        $T3,$H4,$T3             # h3*s4
+       vpmuludq        $T4,$H4,$T4             # h4*s4
+        vpsrldq        \$6,$H0,$H2                             # splat input
+       vpaddq          $T3,$D2,$D2             # d2 += h3*s4
+       vpaddq          $T4,$D3,$D3             # d3 += h4*s4
+        vpsrldq        \$6,$H1,$H3                             #
+       vpmuludq        0x70(%rsp),$T0,$T4      # h0*r4
+       vpmuludq        $T1,$H4,$T0             # h1*s4
+        vpunpckhqdq    $H1,$H0,$H4             # 4
+       vpaddq          $T4,$D4,$D4             # d4 += h0*r4
+        vmovdqa        -0x90(%r11),$T4         # r0^4
+       vpaddq          $T0,$D0,$D0             # d0 += h1*s4
+
+       vpunpcklqdq     $H1,$H0,$H0             # 0:1
+       vpunpcklqdq     $H3,$H2,$H3             # 2:3
+
+       #vpsrlq         \$40,$H4,$H4            # 4
+       vpsrldq         \$`40/8`,$H4,$H4        # 4
+       vpsrlq          \$26,$H0,$H1
+       vpand           $MASK,$H0,$H0           # 0
+       vpsrlq          \$4,$H3,$H2
+       vpand           $MASK,$H1,$H1           # 1
+       vpand           0(%rcx),$H4,$H4         # .Lmask24
+       vpsrlq          \$30,$H3,$H3
+       vpand           $MASK,$H2,$H2           # 2
+       vpand           $MASK,$H3,$H3           # 3
+       vpor            32(%rcx),$H4,$H4        # padbit, yes, always
+
+       vpaddq          0x00(%r11),$H0,$H0      # add hash value
+       vpaddq          0x10(%r11),$H1,$H1
+       vpaddq          0x20(%r11),$H2,$H2
+       vpaddq          0x30(%r11),$H3,$H3
+       vpaddq          0x40(%r11),$H4,$H4
+
+       lea             16*2($inp),%rax
+       lea             16*4($inp),$inp
+       sub             \$64,$len
+       cmovc           %rax,$inp
+
+       ################################################################
+       # Now we accumulate (inp[0:1]+hash)*r^4
+       ################################################################
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       vpmuludq        $H0,$T4,$T0             # h0*r0
+       vpmuludq        $H1,$T4,$T1             # h1*r0
+       vpaddq          $T0,$D0,$D0
+       vpaddq          $T1,$D1,$D1
+        vmovdqa        -0x80(%r11),$T2         # r1^4
+       vpmuludq        $H2,$T4,$T0             # h2*r0
+       vpmuludq        $H3,$T4,$T1             # h3*r0
+       vpaddq          $T0,$D2,$D2
+       vpaddq          $T1,$D3,$D3
+       vpmuludq        $H4,$T4,$T4             # h4*r0
+        vpmuludq       -0x70(%r11),$H4,$T0     # h4*s1
+       vpaddq          $T4,$D4,$D4
+
+       vpaddq          $T0,$D0,$D0             # d0 += h4*s1
+       vpmuludq        $H2,$T2,$T1             # h2*r1
+       vpmuludq        $H3,$T2,$T0             # h3*r1
+       vpaddq          $T1,$D3,$D3             # d3 += h2*r1
+        vmovdqa        -0x60(%r11),$T3         # r2^4
+       vpaddq          $T0,$D4,$D4             # d4 += h3*r1
+       vpmuludq        $H1,$T2,$T1             # h1*r1
+       vpmuludq        $H0,$T2,$T2             # h0*r1
+       vpaddq          $T1,$D2,$D2             # d2 += h1*r1
+       vpaddq          $T2,$D1,$D1             # d1 += h0*r1
+
+        vmovdqa        -0x50(%r11),$T4         # s2^4
+       vpmuludq        $H2,$T3,$T0             # h2*r2
+       vpmuludq        $H1,$T3,$T1             # h1*r2
+       vpaddq          $T0,$D4,$D4             # d4 += h2*r2
+       vpaddq          $T1,$D3,$D3             # d3 += h1*r2
+        vmovdqa        -0x40(%r11),$T2         # r3^4
+       vpmuludq        $H0,$T3,$T3             # h0*r2
+       vpmuludq        $H4,$T4,$T0             # h4*s2
+       vpaddq          $T3,$D2,$D2             # d2 += h0*r2
+       vpaddq          $T0,$D1,$D1             # d1 += h4*s2
+        vmovdqa        -0x30(%r11),$T3         # s3^4
+       vpmuludq        $H3,$T4,$T4             # h3*s2
+        vpmuludq       $H1,$T2,$T1             # h1*r3
+       vpaddq          $T4,$D0,$D0             # d0 += h3*s2
+
+        vmovdqa        -0x10(%r11),$T4         # s4^4
+       vpaddq          $T1,$D4,$D4             # d4 += h1*r3
+       vpmuludq        $H0,$T2,$T2             # h0*r3
+       vpmuludq        $H4,$T3,$T0             # h4*s3
+       vpaddq          $T2,$D3,$D3             # d3 += h0*r3
+       vpaddq          $T0,$D2,$D2             # d2 += h4*s3
+        vmovdqu        16*2($inp),$T0                          # load input
+       vpmuludq        $H3,$T3,$T2             # h3*s3
+       vpmuludq        $H2,$T3,$T3             # h2*s3
+       vpaddq          $T2,$D1,$D1             # d1 += h3*s3
+        vmovdqu        16*3($inp),$T1                          #
+       vpaddq          $T3,$D0,$D0             # d0 += h2*s3
+
+       vpmuludq        $H2,$T4,$H2             # h2*s4
+       vpmuludq        $H3,$T4,$H3             # h3*s4
+        vpsrldq        \$6,$T0,$T2                             # splat input
+       vpaddq          $H2,$D1,$D1             # d1 += h2*s4
+       vpmuludq        $H4,$T4,$H4             # h4*s4
+        vpsrldq        \$6,$T1,$T3                             #
+       vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
+       vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
+       vpmuludq        -0x20(%r11),$H0,$H4     # h0*r4
+       vpmuludq        $H1,$T4,$H0
+        vpunpckhqdq    $T1,$T0,$T4             # 4
+       vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
+       vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
+
+       vpunpcklqdq     $T1,$T0,$T0             # 0:1
+       vpunpcklqdq     $T3,$T2,$T3             # 2:3
+
+       #vpsrlq         \$40,$T4,$T4            # 4
+       vpsrldq         \$`40/8`,$T4,$T4        # 4
+       vpsrlq          \$26,$T0,$T1
+        vmovdqa        0x00(%rsp),$D4          # preload r0^2
+       vpand           $MASK,$T0,$T0           # 0
+       vpsrlq          \$4,$T3,$T2
+       vpand           $MASK,$T1,$T1           # 1
+       vpand           0(%rcx),$T4,$T4         # .Lmask24
+       vpsrlq          \$30,$T3,$T3
+       vpand           $MASK,$T2,$T2           # 2
+       vpand           $MASK,$T3,$T3           # 3
+       vpor            32(%rcx),$T4,$T4        # padbit, yes, always
+
+       ################################################################
+       # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+       # and P. Schwabe
+
+       vpsrlq          \$26,$H3,$D3
+       vpand           $MASK,$H3,$H3
+       vpaddq          $D3,$H4,$H4             # h3 -> h4
+
+       vpsrlq          \$26,$H0,$D0
+       vpand           $MASK,$H0,$H0
+       vpaddq          $D0,$D1,$H1             # h0 -> h1
+
+       vpsrlq          \$26,$H4,$D0
+       vpand           $MASK,$H4,$H4
+
+       vpsrlq          \$26,$H1,$D1
+       vpand           $MASK,$H1,$H1
+       vpaddq          $D1,$H2,$H2             # h1 -> h2
+
+       vpaddq          $D0,$H0,$H0
+       vpsllq          \$2,$D0,$D0
+       vpaddq          $D0,$H0,$H0             # h4 -> h0
+
+       vpsrlq          \$26,$H2,$D2
+       vpand           $MASK,$H2,$H2
+       vpaddq          $D2,$H3,$H3             # h2 -> h3
+
+       vpsrlq          \$26,$H0,$D0
+       vpand           $MASK,$H0,$H0
+       vpaddq          $D0,$H1,$H1             # h0 -> h1
+
+       vpsrlq          \$26,$H3,$D3
+       vpand           $MASK,$H3,$H3
+       vpaddq          $D3,$H4,$H4             # h3 -> h4
+
+       ja              .Loop_avx
+
+.Lskip_loop_avx:
+       ################################################################
+       # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+       vpshufd         \$0x10,$D4,$D4          # r0^n, xx12 -> x1x2
+       add             \$32,$len
+       jnz             .Long_tail_avx
+
+       vpaddq          $H2,$T2,$T2
+       vpaddq          $H0,$T0,$T0
+       vpaddq          $H1,$T1,$T1
+       vpaddq          $H3,$T3,$T3
+       vpaddq          $H4,$T4,$T4
+
+.Long_tail_avx:
+       vmovdqa         $H2,0x20(%r11)
+       vmovdqa         $H0,0x00(%r11)
+       vmovdqa         $H1,0x10(%r11)
+       vmovdqa         $H3,0x30(%r11)
+       vmovdqa         $H4,0x40(%r11)
+
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
+       vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
+        vpshufd        \$0x10,`16*1-64`($ctx),$H2              # r1^n
+       vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
+       vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
+       vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
+
+       vpmuludq        $T3,$H2,$H0             # h3*r1
+       vpaddq          $H0,$D4,$D4             # d4 += h3*r1
+        vpshufd        \$0x10,`16*2-64`($ctx),$H3              # s1^n
+       vpmuludq        $T2,$H2,$H1             # h2*r1
+       vpaddq          $H1,$D3,$D3             # d3 += h2*r1
+        vpshufd        \$0x10,`16*3-64`($ctx),$H4              # r2^n
+       vpmuludq        $T1,$H2,$H0             # h1*r1
+       vpaddq          $H0,$D2,$D2             # d2 += h1*r1
+       vpmuludq        $T0,$H2,$H2             # h0*r1
+       vpaddq          $H2,$D1,$D1             # d1 += h0*r1
+       vpmuludq        $T4,$H3,$H3             # h4*s1
+       vpaddq          $H3,$D0,$D0             # d0 += h4*s1
+
+        vpshufd        \$0x10,`16*4-64`($ctx),$H2              # s2^n
+       vpmuludq        $T2,$H4,$H1             # h2*r2
+       vpaddq          $H1,$D4,$D4             # d4 += h2*r2
+       vpmuludq        $T1,$H4,$H0             # h1*r2
+       vpaddq          $H0,$D3,$D3             # d3 += h1*r2
+        vpshufd        \$0x10,`16*5-64`($ctx),$H3              # r3^n
+       vpmuludq        $T0,$H4,$H4             # h0*r2
+       vpaddq          $H4,$D2,$D2             # d2 += h0*r2
+       vpmuludq        $T4,$H2,$H1             # h4*s2
+       vpaddq          $H1,$D1,$D1             # d1 += h4*s2
+        vpshufd        \$0x10,`16*6-64`($ctx),$H4              # s3^n
+       vpmuludq        $T3,$H2,$H2             # h3*s2
+       vpaddq          $H2,$D0,$D0             # d0 += h3*s2
+
+       vpmuludq        $T1,$H3,$H0             # h1*r3
+       vpaddq          $H0,$D4,$D4             # d4 += h1*r3
+       vpmuludq        $T0,$H3,$H3             # h0*r3
+       vpaddq          $H3,$D3,$D3             # d3 += h0*r3
+        vpshufd        \$0x10,`16*7-64`($ctx),$H2              # r4^n
+       vpmuludq        $T4,$H4,$H1             # h4*s3
+       vpaddq          $H1,$D2,$D2             # d2 += h4*s3
+        vpshufd        \$0x10,`16*8-64`($ctx),$H3              # s4^n
+       vpmuludq        $T3,$H4,$H0             # h3*s3
+       vpaddq          $H0,$D1,$D1             # d1 += h3*s3
+       vpmuludq        $T2,$H4,$H4             # h2*s3
+       vpaddq          $H4,$D0,$D0             # d0 += h2*s3
+
+       vpmuludq        $T0,$H2,$H2             # h0*r4
+       vpaddq          $H2,$D4,$D4             # h4 = d4 + h0*r4
+       vpmuludq        $T4,$H3,$H1             # h4*s4
+       vpaddq          $H1,$D3,$D3             # h3 = d3 + h4*s4
+       vpmuludq        $T3,$H3,$H0             # h3*s4
+       vpaddq          $H0,$D2,$D2             # h2 = d2 + h3*s4
+       vpmuludq        $T2,$H3,$H1             # h2*s4
+       vpaddq          $H1,$D1,$D1             # h1 = d1 + h2*s4
+       vpmuludq        $T1,$H3,$H3             # h1*s4
+       vpaddq          $H3,$D0,$D0             # h0 = d0 + h1*s4
+
+       jz              .Lshort_tail_avx
+
+       vmovdqu         16*0($inp),$H0          # load input
+       vmovdqu         16*1($inp),$H1
+
+       vpsrldq         \$6,$H0,$H2             # splat input
+       vpsrldq         \$6,$H1,$H3
+       vpunpckhqdq     $H1,$H0,$H4             # 4
+       vpunpcklqdq     $H1,$H0,$H0             # 0:1
+       vpunpcklqdq     $H3,$H2,$H3             # 2:3
+
+       vpsrlq          \$40,$H4,$H4            # 4
+       vpsrlq          \$26,$H0,$H1
+       vpand           $MASK,$H0,$H0           # 0
+       vpsrlq          \$4,$H3,$H2
+       vpand           $MASK,$H1,$H1           # 1
+       vpsrlq          \$30,$H3,$H3
+       vpand           $MASK,$H2,$H2           # 2
+       vpand           $MASK,$H3,$H3           # 3
+       vpor            32(%rcx),$H4,$H4        # padbit, yes, always
+
+       vpshufd         \$0x32,`16*0-64`($ctx),$T4      # r0^n, 34xx -> x3x4
+       vpaddq          0x00(%r11),$H0,$H0
+       vpaddq          0x10(%r11),$H1,$H1
+       vpaddq          0x20(%r11),$H2,$H2
+       vpaddq          0x30(%r11),$H3,$H3
+       vpaddq          0x40(%r11),$H4,$H4
+
+       ################################################################
+       # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
+
+       vpmuludq        $H0,$T4,$T0             # h0*r0
+       vpaddq          $T0,$D0,$D0             # d0 += h0*r0
+       vpmuludq        $H1,$T4,$T1             # h1*r0
+       vpaddq          $T1,$D1,$D1             # d1 += h1*r0
+       vpmuludq        $H2,$T4,$T0             # h2*r0
+       vpaddq          $T0,$D2,$D2             # d2 += h2*r0
+        vpshufd        \$0x32,`16*1-64`($ctx),$T2              # r1^n
+       vpmuludq        $H3,$T4,$T1             # h3*r0
+       vpaddq          $T1,$D3,$D3             # d3 += h3*r0
+       vpmuludq        $H4,$T4,$T4             # h4*r0
+       vpaddq          $T4,$D4,$D4             # d4 += h4*r0
+
+       vpmuludq        $H3,$T2,$T0             # h3*r1
+       vpaddq          $T0,$D4,$D4             # d4 += h3*r1
+        vpshufd        \$0x32,`16*2-64`($ctx),$T3              # s1
+       vpmuludq        $H2,$T2,$T1             # h2*r1
+       vpaddq          $T1,$D3,$D3             # d3 += h2*r1
+        vpshufd        \$0x32,`16*3-64`($ctx),$T4              # r2
+       vpmuludq        $H1,$T2,$T0             # h1*r1
+       vpaddq          $T0,$D2,$D2             # d2 += h1*r1
+       vpmuludq        $H0,$T2,$T2             # h0*r1
+       vpaddq          $T2,$D1,$D1             # d1 += h0*r1
+       vpmuludq        $H4,$T3,$T3             # h4*s1
+       vpaddq          $T3,$D0,$D0             # d0 += h4*s1
+
+        vpshufd        \$0x32,`16*4-64`($ctx),$T2              # s2
+       vpmuludq        $H2,$T4,$T1             # h2*r2
+       vpaddq          $T1,$D4,$D4             # d4 += h2*r2
+       vpmuludq        $H1,$T4,$T0             # h1*r2
+       vpaddq          $T0,$D3,$D3             # d3 += h1*r2
+        vpshufd        \$0x32,`16*5-64`($ctx),$T3              # r3
+       vpmuludq        $H0,$T4,$T4             # h0*r2
+       vpaddq          $T4,$D2,$D2             # d2 += h0*r2
+       vpmuludq        $H4,$T2,$T1             # h4*s2
+       vpaddq          $T1,$D1,$D1             # d1 += h4*s2
+        vpshufd        \$0x32,`16*6-64`($ctx),$T4              # s3
+       vpmuludq        $H3,$T2,$T2             # h3*s2
+       vpaddq          $T2,$D0,$D0             # d0 += h3*s2
+
+       vpmuludq        $H1,$T3,$T0             # h1*r3
+       vpaddq          $T0,$D4,$D4             # d4 += h1*r3
+       vpmuludq        $H0,$T3,$T3             # h0*r3
+       vpaddq          $T3,$D3,$D3             # d3 += h0*r3
+        vpshufd        \$0x32,`16*7-64`($ctx),$T2              # r4
+       vpmuludq        $H4,$T4,$T1             # h4*s3
+       vpaddq          $T1,$D2,$D2             # d2 += h4*s3
+        vpshufd        \$0x32,`16*8-64`($ctx),$T3              # s4
+       vpmuludq        $H3,$T4,$T0             # h3*s3
+       vpaddq          $T0,$D1,$D1             # d1 += h3*s3
+       vpmuludq        $H2,$T4,$T4             # h2*s3
+       vpaddq          $T4,$D0,$D0             # d0 += h2*s3
+
+       vpmuludq        $H0,$T2,$T2             # h0*r4
+       vpaddq          $T2,$D4,$D4             # d4 += h0*r4
+       vpmuludq        $H4,$T3,$T1             # h4*s4
+       vpaddq          $T1,$D3,$D3             # d3 += h4*s4
+       vpmuludq        $H3,$T3,$T0             # h3*s4
+       vpaddq          $T0,$D2,$D2             # d2 += h3*s4
+       vpmuludq        $H2,$T3,$T1             # h2*s4
+       vpaddq          $T1,$D1,$D1             # d1 += h2*s4
+       vpmuludq        $H1,$T3,$T3             # h1*s4
+       vpaddq          $T3,$D0,$D0             # d0 += h1*s4
+
+.Lshort_tail_avx:
+       ################################################################
+       # lazy reduction
+
+       vpsrlq          \$26,$D3,$H3
+       vpand           $MASK,$D3,$D3
+       vpaddq          $H3,$D4,$D4             # h3 -> h4
+
+       vpsrlq          \$26,$D0,$H0
+       vpand           $MASK,$D0,$D0
+       vpaddq          $H0,$D1,$D1             # h0 -> h1
+
+       vpsrlq          \$26,$D4,$H4
+       vpand           $MASK,$D4,$D4
+
+       vpsrlq          \$26,$D1,$H1
+       vpand           $MASK,$D1,$D1
+       vpaddq          $H1,$D2,$D2             # h1 -> h2
+
+       vpaddq          $H4,$D0,$D0
+       vpsllq          \$2,$H4,$H4
+       vpaddq          $H4,$D0,$D0             # h4 -> h0
+
+       vpsrlq          \$26,$D2,$H2
+       vpand           $MASK,$D2,$D2
+       vpaddq          $H2,$D3,$D3             # h2 -> h3
+
+       vpsrlq          \$26,$D0,$H0
+       vpand           $MASK,$D0,$D0
+       vpaddq          $H0,$D1,$D1             # h0 -> h1
+
+       vpsrlq          \$26,$D3,$H3
+       vpand           $MASK,$D3,$D3
+       vpaddq          $H3,$D4,$D4             # h3 -> h4
+
+       ################################################################
+       # horizontal addition
+
+       vpsrldq         \$8,$D2,$T2
+       vpsrldq         \$8,$D0,$T0
+       vpsrldq         \$8,$D1,$T1
+       vpsrldq         \$8,$D3,$T3
+       vpsrldq         \$8,$D4,$T4
+       vpaddq          $T2,$D2,$H2
+       vpaddq          $T0,$D0,$H0
+       vpaddq          $T1,$D1,$H1
+       vpaddq          $T3,$D3,$H3
+       vpaddq          $T4,$D4,$H4
+
+       vmovd           $H0,`4*0-48-64`($ctx)   # save partially reduced
+       vmovd           $H1,`4*1-48-64`($ctx)
+       vmovd           $H2,`4*2-48-64`($ctx)
+       vmovd           $H3,`4*3-48-64`($ctx)
+       vmovd           $H4,`4*4-48-64`($ctx)
+___
+$code.=<<___   if ($win64);
+       vmovdqa         0x50(%r11),%xmm6
+       vmovdqa         0x60(%r11),%xmm7
+       vmovdqa         0x70(%r11),%xmm8
+       vmovdqa         0x80(%r11),%xmm9
+       vmovdqa         0x90(%r11),%xmm10
+       vmovdqa         0xa0(%r11),%xmm11
+       vmovdqa         0xb0(%r11),%xmm12
+       vmovdqa         0xc0(%r11),%xmm13
+       vmovdqa         0xd0(%r11),%xmm14
+       vmovdqa         0xe0(%r11),%xmm15
+       lea             0xf8(%r11),%rsp
+.Ldo_avx_epilogue:
+___
+$code.=<<___   if (!$win64);
+       lea             0x58(%r11),%rsp
+___
+$code.=<<___;
+       vzeroupper
+       ret
+.size  poly1305_blocks_avx,.-poly1305_blocks_avx
+
+# poly1305_emit_avx(ctx, mac, nonce): final reduction and tag output for
+# the AVX path.  If the is_base2_26 flag at 20(ctx) is clear, the hash is
+# still in base 2^64 and control falls through to the scalar poly1305_emit;
+# otherwise the five 26-bit limbs are recombined into base 2^64 here.
+.type  poly1305_emit_avx,\@function,3
+.align 32
+poly1305_emit_avx:
+       cmpl    \$0,20($ctx)    # is_base2_26?
+       je      poly1305_emit
+
+       mov     0($ctx),%eax    # load hash value base 2^26
+       mov     4($ctx),%ecx
+       mov     8($ctx),%r8d
+       mov     12($ctx),%r11d
+       mov     16($ctx),%r10d
+
+       shl     \$26,%rcx       # base 2^26 -> base 2^64
+       mov     %r8,%r9
+       shl     \$52,%r8
+       add     %rcx,%rax
+       shr     \$12,%r9
+       add     %rax,%r8        # h0
+       adc     \$0,%r9
+
+       shl     \$14,%r11
+       mov     %r10,%rax
+       shr     \$24,%r10
+       add     %r11,%r9
+       shl     \$40,%rax
+       add     %rax,%r9        # h1
+       adc     \$0,%r10        # h2
+
+       mov     %r10,%rax       # could be partially reduced, so reduce
+       mov     %r10,%rcx
+       and     \$3,%r10
+       shr     \$2,%rax
+       and     \$-4,%rcx
+       add     %rcx,%rax       # (h2 >> 2) + (h2 & ~3) = (h2 >> 2)*5 folded in
+       add     %rax,%r8
+       adc     \$0,%r9
+
+       mov     %r8,%rax
+       add     \$5,%r8         # compare to modulus
+       mov     %r9,%rcx
+       adc     \$0,%r9
+       adc     \$0,%r10
+       shr     \$2,%r10        # did 130-bit value overflow?
+       cmovnz  %r8,%rax        # select h+5 iff h+5 >= 2^130
+       cmovnz  %r9,%rcx
+
+       add     0($nonce),%rax  # accumulate nonce
+       adc     8($nonce),%rcx
+       mov     %rax,0($mac)    # write result
+       mov     %rcx,8($mac)
+
+       ret
+.size  poly1305_emit_avx,.-poly1305_emit_avx
+___
+
+if ($avx>1) {
+# AVX2 code path.  Same 26-bit limb layout and register roles as the AVX
+# code above, but the whole map is re-pointed at the 256-bit %ymm bank so
+# twice as many blocks are processed per vector lane pass.
+my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+    map("%ymm$_",(0..15));
+my $S4=$MASK;          # alias: $S4 shares its register with $MASK
+
+$code.=<<___;
+.type  poly1305_blocks_avx2,\@function,4
+.align 32
+poly1305_blocks_avx2:
+       mov     20($ctx),%r8d           # is_base2_26
+       cmp     \$128,$len
+       jae     .Lblocks_avx2
+       test    %r8d,%r8d
+       jz      poly1305_blocks
+
+.Lblocks_avx2:
+       and     \$-16,$len
+       jz      .Lno_data_avx2
+
+       vzeroupper
+
+       test    %r8d,%r8d
+       jz      .Lbase2_64_avx2
+
+       test    \$63,$len
+       jz      .Leven_avx2
+
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+.Lblocks_avx2_body:
+
+       mov     $len,%r15               # reassign $len
+
+       mov     0($ctx),$d1             # load hash value
+       mov     8($ctx),$d2
+       mov     16($ctx),$h2#d
+
+       mov     24($ctx),$r0            # load r
+       mov     32($ctx),$s1
+
+       ################################# base 2^26 -> base 2^64
+       mov     $d1#d,$h0#d
+       and     \$-1<<31,$d1
+       mov     $d2,$r1                 # borrow $r1
+       mov     $d2#d,$h1#d
+       and     \$-1<<31,$d2
+
+       shr     \$6,$d1
+       shl     \$52,$r1
+       add     $d1,$h0
+       shr     \$12,$h1
+       shr     \$18,$d2
+       add     $r1,$h0
+       adc     $d2,$h1
+
+       mov     $h2,$d1
+       shl     \$40,$d1
+       shr     \$24,$h2
+       add     $d1,$h1
+       adc     \$0,$h2                 # can be partially reduced...
+
+       mov     \$-4,$d2                # ... so reduce
+       mov     $h2,$d1
+       and     $h2,$d2
+       shr     \$2,$d1
+       and     \$3,$h2
+       add     $d2,$d1                 # =*5
+       add     $d1,$h0
+       adc     \$0,$h1
+
+       mov     $s1,$r1
+       mov     $s1,%rax
+       shr     \$2,$s1
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+
+.Lbase2_26_pre_avx2:
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+       adc     $padbit,$h2
+       sub     \$16,%r15
+
+       call    __poly1305_block
+       mov     $r1,%rax
+
+       test    \$63,%r15
+       jnz     .Lbase2_26_pre_avx2
+
+       test    $padbit,$padbit         # if $padbit is zero,
+       jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
+
+       ################################# base 2^64 -> base 2^26
+       mov     $h0,%rax
+       mov     $h0,%rdx
+       shr     \$52,$h0
+       mov     $h1,$r0
+       mov     $h1,$r1
+       shr     \$26,%rdx
+       and     \$0x3ffffff,%rax        # h[0]
+       shl     \$12,$r0
+       and     \$0x3ffffff,%rdx        # h[1]
+       shr     \$14,$h1
+       or      $r0,$h0
+       shl     \$24,$h2
+       and     \$0x3ffffff,$h0         # h[2]
+       shr     \$40,$r1
+       and     \$0x3ffffff,$h1         # h[3]
+       or      $r1,$h2                 # h[4]
+
+       test    %r15,%r15
+       jz      .Lstore_base2_26_avx2
+
+       vmovd   %rax#d,%x#$H0
+       vmovd   %rdx#d,%x#$H1
+       vmovd   $h0#d,%x#$H2
+       vmovd   $h1#d,%x#$H3
+       vmovd   $h2#d,%x#$H4
+       jmp     .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+       mov     $h0,0($ctx)
+       mov     $h1,8($ctx)
+       mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
+       jmp     .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+       mov     %rax#d,0($ctx)          # store hash value base 2^26
+       mov     %rdx#d,4($ctx)
+       mov     $h0#d,8($ctx)
+       mov     $h1#d,12($ctx)
+       mov     $h2#d,16($ctx)
+.align 16
+.Ldone_avx2:
+       mov     0(%rsp),%r15
+       mov     8(%rsp),%r14
+       mov     16(%rsp),%r13
+       mov     24(%rsp),%r12
+       mov     32(%rsp),%rbp
+       mov     40(%rsp),%rbx
+       lea     48(%rsp),%rsp
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+       ret
+
+.align 32
+.Lbase2_64_avx2:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+.Lbase2_64_avx2_body:
+
+       mov     $len,%r15               # reassign $len
+
+       mov     24($ctx),$r0            # load r
+       mov     32($ctx),$s1
+
+       mov     0($ctx),$h0             # load hash value
+       mov     8($ctx),$h1
+       mov     16($ctx),$h2#d
+
+       mov     $s1,$r1
+       mov     $s1,%rax
+       shr     \$2,$s1
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+
+       test    \$63,$len
+       jz      .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+       adc     $padbit,$h2
+       sub     \$16,%r15
+
+       call    __poly1305_block
+       mov     $r1,%rax
+
+       test    \$63,%r15
+       jnz     .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+       ################################# base 2^64 -> base 2^26
+       mov     $h0,%rax
+       mov     $h0,%rdx
+       shr     \$52,$h0
+       mov     $h1,$d1
+       mov     $h1,$d2
+       shr     \$26,%rdx
+       and     \$0x3ffffff,%rax        # h[0]
+       shl     \$12,$d1
+       and     \$0x3ffffff,%rdx        # h[1]
+       shr     \$14,$h1
+       or      $d1,$h0
+       shl     \$24,$h2
+       and     \$0x3ffffff,$h0         # h[2]
+       shr     \$40,$d2
+       and     \$0x3ffffff,$h1         # h[3]
+       or      $d2,$h2                 # h[4]
+
+       vmovd   %rax#d,%x#$H0
+       vmovd   %rdx#d,%x#$H1
+       vmovd   $h0#d,%x#$H2
+       vmovd   $h1#d,%x#$H3
+       vmovd   $h2#d,%x#$H4
+       movl    \$1,20($ctx)            # set is_base2_26
+
+       call    __poly1305_init_avx
+
+.Lproceed_avx2:
+       mov     %r15,$len
+
+       mov     0(%rsp),%r15
+       mov     8(%rsp),%r14
+       mov     16(%rsp),%r13
+       mov     24(%rsp),%r12
+       mov     32(%rsp),%rbp
+       mov     40(%rsp),%rbx
+       lea     48(%rsp),%rax
+       lea     48(%rsp),%rsp
+.Lbase2_64_avx2_epilogue:
+       jmp     .Ldo_avx2
+
+.align 32
+.Leven_avx2:
+       vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
+       vmovd           4*1($ctx),%x#$H1
+       vmovd           4*2($ctx),%x#$H2
+       vmovd           4*3($ctx),%x#$H3
+       vmovd           4*4($ctx),%x#$H4
+
+.Ldo_avx2:
+___
+$code.=<<___   if (!$win64);
+       lea             -8(%rsp),%r11
+       sub             \$0x128,%rsp
+___
+$code.=<<___   if ($win64);
+       lea             -0xf8(%rsp),%r11
+       sub             \$0x1c8,%rsp
+       vmovdqa         %xmm6,0x50(%r11)
+       vmovdqa         %xmm7,0x60(%r11)
+       vmovdqa         %xmm8,0x70(%r11)
+       vmovdqa         %xmm9,0x80(%r11)
+       vmovdqa         %xmm10,0x90(%r11)
+       vmovdqa         %xmm11,0xa0(%r11)
+       vmovdqa         %xmm12,0xb0(%r11)
+       vmovdqa         %xmm13,0xc0(%r11)
+       vmovdqa         %xmm14,0xd0(%r11)
+       vmovdqa         %xmm15,0xe0(%r11)
+.Ldo_avx2_body:
+___
+$code.=<<___;
+       lea             48+64($ctx),$ctx        # size optimization
+       lea             .Lconst(%rip),%rcx
+
+       # expand and copy pre-calculated table to stack
+       vmovdqu         `16*0-64`($ctx),%x#$T2
+       and             \$-512,%rsp
+       vmovdqu         `16*1-64`($ctx),%x#$T3
+       vmovdqu         `16*2-64`($ctx),%x#$T4
+       vmovdqu         `16*3-64`($ctx),%x#$D0
+       vmovdqu         `16*4-64`($ctx),%x#$D1
+       vmovdqu         `16*5-64`($ctx),%x#$D2
+       vmovdqu         `16*6-64`($ctx),%x#$D3
+       vpermq          \$0x15,$T2,$T2          # 00003412 -> 12343434
+       vmovdqu         `16*7-64`($ctx),%x#$D4
+       vpermq          \$0x15,$T3,$T3
+       vpshufd         \$0xc8,$T2,$T2          # 12343434 -> 14243444
+       vmovdqu         `16*8-64`($ctx),%x#$MASK
+       vpermq          \$0x15,$T4,$T4
+       vpshufd         \$0xc8,$T3,$T3
+       vmovdqa         $T2,0x00(%rsp)
+       vpermq          \$0x15,$D0,$D0
+       vpshufd         \$0xc8,$T4,$T4
+       vmovdqa         $T3,0x20(%rsp)
+       vpermq          \$0x15,$D1,$D1
+       vpshufd         \$0xc8,$D0,$D0
+       vmovdqa         $T4,0x40(%rsp)
+       vpermq          \$0x15,$D2,$D2
+       vpshufd         \$0xc8,$D1,$D1
+       vmovdqa         $D0,0x60(%rsp)
+       vpermq          \$0x15,$D3,$D3
+       vpshufd         \$0xc8,$D2,$D2
+       vmovdqa         $D1,0x80(%rsp)
+       vpermq          \$0x15,$D4,$D4
+       vpshufd         \$0xc8,$D3,$D3
+       vmovdqa         $D2,0xa0(%rsp)
+       vpermq          \$0x15,$MASK,$MASK
+       vpshufd         \$0xc8,$D4,$D4
+       vmovdqa         $D3,0xc0(%rsp)
+       vpshufd         \$0xc8,$MASK,$MASK
+       vmovdqa         $D4,0xe0(%rsp)
+       vmovdqa         $MASK,0x100(%rsp)
+       vmovdqa         64(%rcx),$MASK          # .Lmask26
+
+       ################################################################
+       # load input
+       vmovdqu         16*0($inp),%x#$T0
+       vmovdqu         16*1($inp),%x#$T1
+       vinserti128     \$1,16*2($inp),$T0,$T0
+       vinserti128     \$1,16*3($inp),$T1,$T1
+       lea             16*4($inp),$inp
+
+       vpsrldq         \$6,$T0,$T2             # splat input
+       vpsrldq         \$6,$T1,$T3
+       vpunpckhqdq     $T1,$T0,$T4             # 4
+       vpunpcklqdq     $T3,$T2,$T2             # 2:3
+       vpunpcklqdq     $T1,$T0,$T0             # 0:1
+
+       vpsrlq          \$30,$T2,$T3
+       vpsrlq          \$4,$T2,$T2
+       vpsrlq          \$26,$T0,$T1
+       vpsrlq          \$40,$T4,$T4            # 4
+       vpand           $MASK,$T2,$T2           # 2
+       vpand           $MASK,$T0,$T0           # 0
+       vpand           $MASK,$T1,$T1           # 1
+       vpand           $MASK,$T3,$T3           # 3
+       vpor            32(%rcx),$T4,$T4        # padbit, yes, always
+
+       lea             0x90(%rsp),%rax         # size optimization
+       vpaddq          $H2,$T2,$H2             # accumulate input
+       sub             \$64,$len
+       jz              .Ltail_avx2
+       jmp             .Loop_avx2
+
+.align 32
+.Loop_avx2:
+       ################################################################
+       # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+       # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+       # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
+       # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
+       #   \________/\________/
+       ################################################################
+       #vpaddq         $H2,$T2,$H2             # accumulate input
+       vpaddq          $H0,$T0,$H0
+       vmovdqa         `32*0`(%rsp),$T0        # r0^4
+       vpaddq          $H1,$T1,$H1
+       vmovdqa         `32*1`(%rsp),$T1        # r1^4
+       vpaddq          $H3,$T3,$H3
+       vmovdqa         `32*3`(%rsp),$T2        # r2^4
+       vpaddq          $H4,$T4,$H4
+       vmovdqa         `32*6-0x90`(%rax),$T3   # s3^4
+       vmovdqa         `32*8-0x90`(%rax),$S4   # s4^4
+
+       # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+       #
+       # however, as h2 is "chronologically" the first one available, the
+       # corresponding operations are pulled up, so the order becomes:
+       #
+       # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
+       # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
+       # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
+       # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
+
+       vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
+       vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
+       vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
+       vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
+       vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
+
+       vpmuludq        $H0,$T1,$T4             # h0*r1
+       vpmuludq        $H1,$T1,$H2             # h1*r1, borrow $H2 as temp
+       vpaddq          $T4,$D1,$D1             # d1 += h0*r1
+       vpaddq          $H2,$D2,$D2             # d2 += h1*r1
+       vpmuludq        $H3,$T1,$T4             # h3*r1
+       vpmuludq        `32*2`(%rsp),$H4,$H2    # h4*s1
+       vpaddq          $T4,$D4,$D4             # d4 += h3*r1
+       vpaddq          $H2,$D0,$D0             # d0 += h4*s1
+        vmovdqa        `32*4-0x90`(%rax),$T1   # s2
+
+       vpmuludq        $H0,$T0,$T4             # h0*r0
+       vpmuludq        $H1,$T0,$H2             # h1*r0
+       vpaddq          $T4,$D0,$D0             # d0 += h0*r0
+       vpaddq          $H2,$D1,$D1             # d1 += h1*r0
+       vpmuludq        $H3,$T0,$T4             # h3*r0
+       vpmuludq        $H4,$T0,$H2             # h4*r0
+        vmovdqu        16*0($inp),%x#$T0       # load input
+       vpaddq          $T4,$D3,$D3             # d3 += h3*r0
+       vpaddq          $H2,$D4,$D4             # d4 += h4*r0
+        vinserti128    \$1,16*2($inp),$T0,$T0
+
+       vpmuludq        $H3,$T1,$T4             # h3*s2
+       vpmuludq        $H4,$T1,$H2             # h4*s2
+        vmovdqu        16*1($inp),%x#$T1
+       vpaddq          $T4,$D0,$D0             # d0 += h3*s2
+       vpaddq          $H2,$D1,$D1             # d1 += h4*s2
+        vmovdqa        `32*5-0x90`(%rax),$H2   # r3
+       vpmuludq        $H1,$T2,$T4             # h1*r2
+       vpmuludq        $H0,$T2,$T2             # h0*r2
+       vpaddq          $T4,$D3,$D3             # d3 += h1*r2
+       vpaddq          $T2,$D2,$D2             # d2 += h0*r2
+        vinserti128    \$1,16*3($inp),$T1,$T1
+        lea            16*4($inp),$inp
+
+       vpmuludq        $H1,$H2,$T4             # h1*r3
+       vpmuludq        $H0,$H2,$H2             # h0*r3
+        vpsrldq        \$6,$T0,$T2             # splat input
+       vpaddq          $T4,$D4,$D4             # d4 += h1*r3
+       vpaddq          $H2,$D3,$D3             # d3 += h0*r3
+       vpmuludq        $H3,$T3,$T4             # h3*s3
+       vpmuludq        $H4,$T3,$H2             # h4*s3
+        vpsrldq        \$6,$T1,$T3
+       vpaddq          $T4,$D1,$D1             # d1 += h3*s3
+       vpaddq          $H2,$D2,$D2             # d2 += h4*s3
+        vpunpckhqdq    $T1,$T0,$T4             # 4
+
+       vpmuludq        $H3,$S4,$H3             # h3*s4
+       vpmuludq        $H4,$S4,$H4             # h4*s4
+        vpunpcklqdq    $T1,$T0,$T0             # 0:1
+       vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*r4
+       vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*r4
+        vpunpcklqdq    $T3,$T2,$T3             # 2:3
+       vpmuludq        `32*7-0x90`(%rax),$H0,$H4       # h0*r4
+       vpmuludq        $H1,$S4,$H0             # h1*s4
+       vmovdqa         64(%rcx),$MASK          # .Lmask26
+       vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
+       vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
+
+       ################################################################
+       # lazy reduction (interleaved with tail of input splat)
+
+       vpsrlq          \$26,$H3,$D3
+       vpand           $MASK,$H3,$H3
+       vpaddq          $D3,$H4,$H4             # h3 -> h4
+
+       vpsrlq          \$26,$H0,$D0
+       vpand           $MASK,$H0,$H0
+       vpaddq          $D0,$D1,$H1             # h0 -> h1
+
+       vpsrlq          \$26,$H4,$D4
+       vpand           $MASK,$H4,$H4
+
+        vpsrlq         \$4,$T3,$T2
+
+       vpsrlq          \$26,$H1,$D1
+       vpand           $MASK,$H1,$H1
+       vpaddq          $D1,$H2,$H2             # h1 -> h2
+
+       vpaddq          $D4,$H0,$H0
+       vpsllq          \$2,$D4,$D4
+       vpaddq          $D4,$H0,$H0             # h4 -> h0
+
+        vpand          $MASK,$T2,$T2           # 2
+        vpsrlq         \$26,$T0,$T1
+
+       vpsrlq          \$26,$H2,$D2
+       vpand           $MASK,$H2,$H2
+       vpaddq          $D2,$H3,$H3             # h2 -> h3
+
+        vpaddq         $T2,$H2,$H2             # modulo-scheduled
+        vpsrlq         \$30,$T3,$T3
+
+       vpsrlq          \$26,$H0,$D0
+       vpand           $MASK,$H0,$H0
+       vpaddq          $D0,$H1,$H1             # h0 -> h1
+
+        vpsrlq         \$40,$T4,$T4            # 4
+
+       vpsrlq          \$26,$H3,$D3
+       vpand           $MASK,$H3,$H3
+       vpaddq          $D3,$H4,$H4             # h3 -> h4
+
+        vpand          $MASK,$T0,$T0           # 0
+        vpand          $MASK,$T1,$T1           # 1
+        vpand          $MASK,$T3,$T3           # 3
+        vpor           32(%rcx),$T4,$T4        # padbit, yes, always
+
+       sub             \$64,$len
+       jnz             .Loop_avx2
+
+       .byte           0x66,0x90
+.Ltail_avx2:
+       ################################################################
+       # While the multiplications above were by r^4 in all lanes, in the
+       # last iteration we multiply the least significant lane by r^4 and
+       # the most significant one by r, so this is a copy of the above
+       # except that references to the precomputed table are displaced by
+       # 4 bytes...
+
+       #vpaddq         $H2,$T2,$H2             # accumulate input
+       vpaddq          $H0,$T0,$H0
+       vmovdqu         `32*0+4`(%rsp),$T0      # r0^4
+       vpaddq          $H1,$T1,$H1
+       vmovdqu         `32*1+4`(%rsp),$T1      # r1^4
+       vpaddq          $H3,$T3,$H3
+       vmovdqu         `32*3+4`(%rsp),$T2      # r2^4
+       vpaddq          $H4,$T4,$H4
+       vmovdqu         `32*6+4-0x90`(%rax),$T3 # s3^4
+       vmovdqu         `32*8+4-0x90`(%rax),$S4 # s4^4
+
+       vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
+       vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
+       vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
+       vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
+       vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
+
+       vpmuludq        $H0,$T1,$T4             # h0*r1
+       vpmuludq        $H1,$T1,$H2             # h1*r1
+       vpaddq          $T4,$D1,$D1             # d1 += h0*r1
+       vpaddq          $H2,$D2,$D2             # d2 += h1*r1
+       vpmuludq        $H3,$T1,$T4             # h3*r1
+       vpmuludq        `32*2+4`(%rsp),$H4,$H2  # h4*s1
+       vpaddq          $T4,$D4,$D4             # d4 += h3*r1
+       vpaddq          $H2,$D0,$D0             # d0 += h4*s1
+
+       vpmuludq        $H0,$T0,$T4             # h0*r0
+       vpmuludq        $H1,$T0,$H2             # h1*r0
+       vpaddq          $T4,$D0,$D0             # d0 += h0*r0
+        vmovdqu        `32*4+4-0x90`(%rax),$T1 # s2
+       vpaddq          $H2,$D1,$D1             # d1 += h1*r0
+       vpmuludq        $H3,$T0,$T4             # h3*r0
+       vpmuludq        $H4,$T0,$H2             # h4*r0
+       vpaddq          $T4,$D3,$D3             # d3 += h3*r0
+       vpaddq          $H2,$D4,$D4             # d4 += h4*r0
+
+       vpmuludq        $H3,$T1,$T4             # h3*s2
+       vpmuludq        $H4,$T1,$H2             # h4*s2
+       vpaddq          $T4,$D0,$D0             # d0 += h3*s2
+       vpaddq          $H2,$D1,$D1             # d1 += h4*s2
+        vmovdqu        `32*5+4-0x90`(%rax),$H2 # r3
+       vpmuludq        $H1,$T2,$T4             # h1*r2
+       vpmuludq        $H0,$T2,$T2             # h0*r2
+       vpaddq          $T4,$D3,$D3             # d3 += h1*r2
+       vpaddq          $T2,$D2,$D2             # d2 += h0*r2
+
+       vpmuludq        $H1,$H2,$T4             # h1*r3
+       vpmuludq        $H0,$H2,$H2             # h0*r3
+       vpaddq          $T4,$D4,$D4             # d4 += h1*r3
+       vpaddq          $H2,$D3,$D3             # d3 += h0*r3
+       vpmuludq        $H3,$T3,$T4             # h3*s3
+       vpmuludq        $H4,$T3,$H2             # h4*s3
+       vpaddq          $T4,$D1,$D1             # d1 += h3*s3
+       vpaddq          $H2,$D2,$D2             # d2 += h4*s3
+
+       vpmuludq        $H3,$S4,$H3             # h3*s4
+       vpmuludq        $H4,$S4,$H4             # h4*s4
+       vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*r4
+       vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*r4
+       vpmuludq        `32*7+4-0x90`(%rax),$H0,$H4             # h0*r4
+       vpmuludq        $H1,$S4,$H0             # h1*s4
+       vmovdqa         64(%rcx),$MASK          # .Lmask26
+       vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
+       vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
+
+       ################################################################
+       # lazy reduction
+
+       vpsrlq          \$26,$H3,$D3
+       vpand           $MASK,$H3,$H3
+       vpaddq          $D3,$H4,$H4             # h3 -> h4
+
+       vpsrlq          \$26,$H0,$D0
+       vpand           $MASK,$H0,$H0
+       vpaddq          $D0,$D1,$H1             # h0 -> h1
+
+       vpsrlq          \$26,$H4,$D4
+       vpand           $MASK,$H4,$H4
+
+       vpsrlq          \$26,$H1,$D1
+       vpand           $MASK,$H1,$H1
+       vpaddq          $D1,$H2,$H2             # h1 -> h2
+
+       vpaddq          $D4,$H0,$H0
+       vpsllq          \$2,$D4,$D4
+       vpaddq          $D4,$H0,$H0             # h4 -> h0
+
+       vpsrlq          \$26,$H2,$D2
+       vpand           $MASK,$H2,$H2
+       vpaddq          $D2,$H3,$H3             # h2 -> h3
+
+       vpsrlq          \$26,$H0,$D0
+       vpand           $MASK,$H0,$H0
+       vpaddq          $D0,$H1,$H1             # h0 -> h1
+
+       vpsrlq          \$26,$H3,$D3
+       vpand           $MASK,$H3,$H3
+       vpaddq          $D3,$H4,$H4             # h3 -> h4
+
+       ################################################################
+       # horizontal addition
+
+       vpsrldq         \$8,$H2,$T2
+       vpsrldq         \$8,$H0,$T0
+       vpsrldq         \$8,$H1,$T1
+       vpsrldq         \$8,$H3,$T3
+       vpsrldq         \$8,$H4,$T4
+       vpaddq          $T2,$H2,$H2
+       vpaddq          $T0,$H0,$H0
+       vpaddq          $T1,$H1,$H1
+       vpaddq          $T3,$H3,$H3
+       vpaddq          $T4,$H4,$H4
+
+       vpermq          \$0x2,$H2,$T2
+       vpermq          \$0x2,$H0,$T0
+       vpermq          \$0x2,$H1,$T1
+       vpermq          \$0x2,$H3,$T3
+       vpermq          \$0x2,$H4,$T4
+       vpaddq          $T2,$H2,$H2
+       vpaddq          $T0,$H0,$H0
+       vpaddq          $T1,$H1,$H1
+       vpaddq          $T3,$H3,$H3
+       vpaddq          $T4,$H4,$H4
+
+       vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+       vmovd           %x#$H1,`4*1-48-64`($ctx)
+       vmovd           %x#$H2,`4*2-48-64`($ctx)
+       vmovd           %x#$H3,`4*3-48-64`($ctx)
+       vmovd           %x#$H4,`4*4-48-64`($ctx)
+___
+$code.=<<___   if ($win64);
+       vmovdqa         0x50(%r11),%xmm6
+       vmovdqa         0x60(%r11),%xmm7
+       vmovdqa         0x70(%r11),%xmm8
+       vmovdqa         0x80(%r11),%xmm9
+       vmovdqa         0x90(%r11),%xmm10
+       vmovdqa         0xa0(%r11),%xmm11
+       vmovdqa         0xb0(%r11),%xmm12
+       vmovdqa         0xc0(%r11),%xmm13
+       vmovdqa         0xd0(%r11),%xmm14
+       vmovdqa         0xe0(%r11),%xmm15
+       lea             0xf8(%r11),%rsp
+.Ldo_avx2_epilogue:
+___
+$code.=<<___   if (!$win64);
+       lea             8(%r11),%rsp
+___
+$code.=<<___;
+       vzeroupper
+       ret
+.size  poly1305_blocks_avx2,.-poly1305_blocks_avx2
+___
+}
+$code.=<<___;
+.align 64
+.Lconst:
+.Lmask24:
+.long  0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long  1<<24,0,1<<24,0,1<<24,0,1<<24,0
+.Lmask26:
+.long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lfive:
+.long  5,0,5,0,5,0,5,0
+___
+}
+
+$code.=<<___;
+.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";