ARM assembly pack: add ChaCha20 and Poly1305 modules.
author    Andy Polyakov <appro@openssl.org>
          Mon, 14 Dec 2015 17:12:07 +0000 (18:12 +0100)
committer Andy Polyakov <appro@openssl.org>
          Sat, 13 Feb 2016 11:07:45 +0000 (12:07 +0100)
Reviewed-by: Richard Levitte <levitte@openssl.org>
crypto/chacha/Makefile.in
crypto/chacha/asm/chacha-armv4.pl [new file with mode: 0755]
crypto/chacha/asm/chacha-armv8.pl [new file with mode: 0755]
crypto/poly1305/Makefile.in
crypto/poly1305/asm/poly1305-armv4.pl [new file with mode: 0755]
crypto/poly1305/asm/poly1305-armv8.pl [new file with mode: 0755]

diff --git a/crypto/chacha/Makefile.in b/crypto/chacha/Makefile.in
index 6fb63c184bfc841a913176b24eaa7734ca50b828..dd0f36ce00ece6ebac5ea3ec8f7314297bf423d7 100644 (file)
@@ -43,6 +43,9 @@ chacha-x86_64.s:      asm/chacha-x86_64.pl
 
 chacha-%.S:    asm/chacha-%.pl;        $(PERL) $< $(PERLASM_SCHEME) $@
 
+chacha-armv4.o:        chacha-armv4.S
+chacha-armv8.o:        chacha-armv8.S
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
new file mode 100755 (executable)
index 0000000..4d234b7
--- /dev/null
@@ -0,0 +1,1144 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2014
+# 
+# ChaCha20 for ARMv4.
+#
+# Performance in cycles per byte out of large buffer.
+#
+#                      IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
+#
+# Cortex-A5            19.3(*)/+95%    21.8        14.1
+# Cortex-A8            10.5(*)/+160%   13.9        6.35
+# Cortex-A9            12.9(**)/+110%  14.3        6.50
+# Cortex-A15           11.0/+40%       16.0        5.00
+# Snapdragon S4                11.5/+125%      13.6        4.90
+#
+# (*)  most "favourable" result for aligned data on little-endian
+#      processor, result for misaligned data is 10-15% lower;
+# (**) this result is a trade-off: it can be improved by 20%,
+#      but then Snapdragon S4 and Cortex-A8 results get
+#      20-25% worse;
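+#
+# The "3xNEON+1xIALU" figures correspond to the NEON code path below,
+# which interleaves three 64-byte blocks held in NEON registers with a
+# fourth block processed in the integer unit, i.e. 256 bytes of keystream
+# per iteration of .Loop_neon_outer; "IALU" is the integer-only code in
+# ChaCha20_ctr32.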
+
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+sub AUTOLOAD()         # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
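+# For example, a call such as &vshr_u32($b,$t,20) made by the code below
+# has no sub of that name, so AUTOLOAD catches it, rewrites the first
+# underscore as a dot, prefixes the bare numeric last argument with '#',
+# and appends "vshr.u32 <b>,<t>,#20" to $code (actual register names come
+# from the caller's arguments; non-numeric arguments such as 'ror#20'
+# pass through unchanged).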
+
+my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
+my @t=map("r$_",(8..11));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my $odd = $d0&1;
+my ($xc,$xc_) = (@t[0..1]);
+my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
+my @ret;
+
+       # Consider order in which variables are addressed by their
+       # index:
+       #
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+       #
+       # 'a', 'b' are permanently allocated in registers, @x[0..7],
+       # while 'c's and pair of 'd's are maintained in memory. If
+       # you observe 'c' column, you'll notice that pair of 'c's is
+       # invariant between rounds. This means that we have to reload
+       # them once per round, in the middle. This is why you'll see
+       # bunch of 'c' stores and loads in the middle, but none in
+       # the beginning or end. If you observe 'd' column, you'll
+       # notice that 15 and 13 are reused in next pair of rounds.
+       # This is why these two are chosen for offloading to memory,
+       # to make loads count more.
+                                                       push @ret,(
+       "&add   (@x[$a0],@x[$a0],@x[$b0])",
+       "&mov   ($xd,$xd,'ror#16')",
+        "&add  (@x[$a1],@x[$a1],@x[$b1])",
+        "&mov  ($xd_,$xd_,'ror#16')",
+       "&eor   ($xd,$xd,@x[$a0],'ror#16')",
+        "&eor  ($xd_,$xd_,@x[$a1],'ror#16')",
+
+       "&add   ($xc,$xc,$xd)",
+       "&mov   (@x[$b0],@x[$b0],'ror#20')",
+        "&add  ($xc_,$xc_,$xd_)",
+        "&mov  (@x[$b1],@x[$b1],'ror#20')",
+       "&eor   (@x[$b0],@x[$b0],$xc,'ror#20')",
+        "&eor  (@x[$b1],@x[$b1],$xc_,'ror#20')",
+
+       "&add   (@x[$a0],@x[$a0],@x[$b0])",
+       "&mov   ($xd,$xd,'ror#24')",
+        "&add  (@x[$a1],@x[$a1],@x[$b1])",
+        "&mov  ($xd_,$xd_,'ror#24')",
+       "&eor   ($xd,$xd,@x[$a0],'ror#24')",
+        "&eor  ($xd_,$xd_,@x[$a1],'ror#24')",
+
+       "&add   ($xc,$xc,$xd)",
+       "&mov   (@x[$b0],@x[$b0],'ror#25')"             );
+                                                       push @ret,(
+       "&str   ($xd,'[sp,#4*(16+$d0)]')",
+       "&ldr   ($xd,'[sp,#4*(16+$d2)]')"               ) if ($odd);
+                                                       push @ret,(
+        "&add  ($xc_,$xc_,$xd_)",
+        "&mov  (@x[$b1],@x[$b1],'ror#25')"             );
+                                                       push @ret,(
+        "&str  ($xd_,'[sp,#4*(16+$d1)]')",
+        "&ldr  ($xd_,'[sp,#4*(16+$d3)]')"              ) if (!$odd);
+                                                       push @ret,(
+       "&eor   (@x[$b0],@x[$b0],$xc,'ror#25')",
+        "&eor  (@x[$b1],@x[$b1],$xc_,'ror#25')"        );
+
+       $xd=@x[$d2]                                     if (!$odd);
+       $xd_=@x[$d3]                                    if ($odd);
+                                                       push @ret,(
+       "&str   ($xc,'[sp,#4*(16+$c0)]')",
+       "&ldr   ($xc,'[sp,#4*(16+$c2)]')",
+       "&add   (@x[$a2],@x[$a2],@x[$b2])",
+       "&mov   ($xd,$xd,'ror#16')",
+        "&str  ($xc_,'[sp,#4*(16+$c1)]')",
+        "&ldr  ($xc_,'[sp,#4*(16+$c3)]')",
+        "&add  (@x[$a3],@x[$a3],@x[$b3])",
+        "&mov  ($xd_,$xd_,'ror#16')",
+       "&eor   ($xd,$xd,@x[$a2],'ror#16')",
+        "&eor  ($xd_,$xd_,@x[$a3],'ror#16')",
+
+       "&add   ($xc,$xc,$xd)",
+       "&mov   (@x[$b2],@x[$b2],'ror#20')",
+        "&add  ($xc_,$xc_,$xd_)",
+        "&mov  (@x[$b3],@x[$b3],'ror#20')",
+       "&eor   (@x[$b2],@x[$b2],$xc,'ror#20')",
+        "&eor  (@x[$b3],@x[$b3],$xc_,'ror#20')",
+
+       "&add   (@x[$a2],@x[$a2],@x[$b2])",
+       "&mov   ($xd,$xd,'ror#24')",
+        "&add  (@x[$a3],@x[$a3],@x[$b3])",
+        "&mov  ($xd_,$xd_,'ror#24')",
+       "&eor   ($xd,$xd,@x[$a2],'ror#24')",
+        "&eor  ($xd_,$xd_,@x[$a3],'ror#24')",
+
+       "&add   ($xc,$xc,$xd)",
+       "&mov   (@x[$b2],@x[$b2],'ror#25')",
+        "&add  ($xc_,$xc_,$xd_)",
+        "&mov  (@x[$b3],@x[$b3],'ror#25')",
+       "&eor   (@x[$b2],@x[$b2],$xc,'ror#25')",
+        "&eor  (@x[$b3],@x[$b3],$xc_,'ror#25')"        );
+
+       @ret;
+}
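+# The two &ROUND invocations feeding .Loop below implement the column and
+# diagonal rounds of ChaCha20; the map(($_&~3)+(($_+1)&3),...) expression
+# derives the remaining index quadruples from the first one, so that
+# ROUND(0,4,8,12) also covers (1,5,9,13), (2,6,10,14), (3,7,11,15), and
+# ROUND(0,5,10,15) covers (1,6,11,12), (2,7,8,13), (3,4,9,14), matching
+# the table in the comment above. A minimal plain-Perl sketch of a single
+# quarter-round is kept here purely for cross-checking against the
+# specification; it is never called and emits no code.
+
+sub chacha_qr_ref {                    # reference only, never called
+my ($x,$a,$b,$c,$d)=@_;                # $x is a reference to the 16-word state
+my $rotl = sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n))) & 0xffffffff };
+
+    $$x[$a]=($$x[$a]+$$x[$b])&0xffffffff; $$x[$d]=&$rotl($$x[$d]^$$x[$a],16);
+    $$x[$c]=($$x[$c]+$$x[$d])&0xffffffff; $$x[$b]=&$rotl($$x[$b]^$$x[$c],12);
+    $$x[$a]=($$x[$a]+$$x[$b])&0xffffffff; $$x[$d]=&$rotl($$x[$d]^$$x[$a], 8);
+    $$x[$c]=($$x[$c]+$$x[$d])&0xffffffff; $$x[$b]=&$rotl($$x[$b]^$$x[$c], 7);
+}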
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax        unified
+.thumb
+#else
+.code  32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long  0x61707865,0x3320646e,0x79622d32,0x6b206574     @ endian-neutral
+.Lone:
+.long  1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word   OPENSSL_armcap_P-.LChaCha20_ctr32
+#else
+.word  -1
+#endif
+
+.globl ChaCha20_ctr32
+.type  ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+.LChaCha20_ctr32:
+       ldr     r12,[sp,#0]             @ pull pointer to counter and nonce
+       stmdb   sp!,{r0-r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+       sub     r14,pc,#16              @ ChaCha20_ctr32
+#else
+       adr     r14,.LChaCha20_ctr32
+#endif
+#if __ARM_MAX_ARCH__>=7
+       cmp     r2,#192                 @ test len
+       bls     .Lshort
+       ldr     r4,[r14,#-32]
+       ldr     r4,[r14,r4]
+# ifdef        __APPLE__
+       ldr     r4,[r4]
+# endif
+       tst     r4,#1
+       bne     .LChaCha20_neon
+.Lshort:
+#endif
+       ldmia   r12,{r4-r7}             @ load counter and nonce
+       sub     sp,sp,#4*(16)           @ off-load area
+       sub     r14,r14,#64             @ .Lsigma
+       stmdb   sp!,{r4-r7}             @ copy counter and nonce
+       ldmia   r3,{r4-r11}             @ load key
+       ldmia   r14,{r0-r3}             @ load sigma
+       stmdb   sp!,{r4-r11}            @ copy key
+       stmdb   sp!,{r0-r3}             @ copy sigma
+       str     r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
+       str     r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
+       b       .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+       ldmia   sp,{r0-r9}              @ load key material
+       str     @t[3],[sp,#4*(32+2)]    @ save len
+       str     r12,  [sp,#4*(32+1)]    @ save inp
+       str     r14,  [sp,#4*(32+0)]    @ save out
+.Loop_outer_enter:
+       ldr     @t[3], [sp,#4*(15)]
+       ldr     @x[12],[sp,#4*(12)]     @ modulo-scheduled load
+       ldr     @t[2], [sp,#4*(13)]
+       ldr     @x[14],[sp,#4*(14)]
+       str     @t[3], [sp,#4*(16+15)]
+       mov     @t[3],#10
+       b       .Loop
+
+.align 4
+.Loop:
+       subs    @t[3],@t[3],#1
+___
+       foreach (&ROUND(0, 4, 8,12)) { eval; }
+       foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+       bne     .Loop
+
+       ldr     @t[3],[sp,#4*(32+2)]    @ load len
+
+       str     @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
+       str     @t[1], [sp,#4*(16+9)]
+       str     @x[12],[sp,#4*(16+12)]
+       str     @t[2], [sp,#4*(16+13)]
+       str     @x[14],[sp,#4*(16+14)]
+
+       @ at this point we have first half of 512-bit result in
+       @ @x[0-7] and second half at sp+4*(16+8)
+
+       cmp     @t[3],#64               @ done yet?
+#ifdef __thumb2__
+       itete   lo
+#endif
+       addlo   r12,sp,#4*(0)           @ shortcut or ...
+       ldrhs   r12,[sp,#4*(32+1)]      @ ... load inp
+       addlo   r14,sp,#4*(0)           @ shortcut or ...
+       ldrhs   r14,[sp,#4*(32+0)]      @ ... load out
+
+       ldr     @t[0],[sp,#4*(0)]       @ load key material
+       ldr     @t[1],[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+       orr     @t[2],r12,r14
+       tst     @t[2],#3                @ are input and output aligned?
+       ldr     @t[2],[sp,#4*(2)]
+       bne     .Lunaligned
+       cmp     @t[3],#64               @ restore flags
+# else
+       ldr     @t[2],[sp,#4*(2)]
+# endif
+       ldr     @t[3],[sp,#4*(3)]
+
+       add     @x[0],@x[0],@t[0]       @ accumulate key material
+       add     @x[1],@x[1],@t[1]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[0],[r12],#16         @ load input
+       ldrhs   @t[1],[r12,#-12]
+
+       add     @x[2],@x[2],@t[2]
+       add     @x[3],@x[3],@t[3]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[2],[r12,#-8]
+       ldrhs   @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+       rev     @x[0],@x[0]
+       rev     @x[1],@x[1]
+       rev     @x[2],@x[2]
+       rev     @x[3],@x[3]
+# endif
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[0],@x[0],@t[0]       @ xor with input
+       eorhs   @x[1],@x[1],@t[1]
+        add    @t[0],sp,#4*(4)
+       str     @x[0],[r14],#16         @ store output
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[2],@x[2],@t[2]
+       eorhs   @x[3],@x[3],@t[3]
+        ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
+       str     @x[1],[r14,#-12]
+       str     @x[2],[r14,#-8]
+       str     @x[3],[r14,#-4]
+
+       add     @x[4],@x[4],@t[0]       @ accumulate key material
+       add     @x[5],@x[5],@t[1]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[0],[r12],#16         @ load input
+       ldrhs   @t[1],[r12,#-12]
+       add     @x[6],@x[6],@t[2]
+       add     @x[7],@x[7],@t[3]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[2],[r12,#-8]
+       ldrhs   @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+       rev     @x[4],@x[4]
+       rev     @x[5],@x[5]
+       rev     @x[6],@x[6]
+       rev     @x[7],@x[7]
+# endif
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[4],@x[4],@t[0]
+       eorhs   @x[5],@x[5],@t[1]
+        add    @t[0],sp,#4*(8)
+       str     @x[4],[r14],#16         @ store output
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[6],@x[6],@t[2]
+       eorhs   @x[7],@x[7],@t[3]
+       str     @x[5],[r14,#-12]
+        ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
+       str     @x[6],[r14,#-8]
+        add    @x[0],sp,#4*(16+8)
+       str     @x[7],[r14,#-4]
+
+       ldmia   @x[0],{@x[0]-@x[7]}     @ load second half
+
+       add     @x[0],@x[0],@t[0]       @ accumulate key material
+       add     @x[1],@x[1],@t[1]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[0],[r12],#16         @ load input
+       ldrhs   @t[1],[r12,#-12]
+# ifdef        __thumb2__
+       itt     hi
+# endif
+        strhi  @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
+        strhi  @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
+       add     @x[2],@x[2],@t[2]
+       add     @x[3],@x[3],@t[3]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[2],[r12,#-8]
+       ldrhs   @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+       rev     @x[0],@x[0]
+       rev     @x[1],@x[1]
+       rev     @x[2],@x[2]
+       rev     @x[3],@x[3]
+# endif
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[0],@x[0],@t[0]
+       eorhs   @x[1],@x[1],@t[1]
+        add    @t[0],sp,#4*(12)
+       str     @x[0],[r14],#16         @ store output
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[2],@x[2],@t[2]
+       eorhs   @x[3],@x[3],@t[3]
+       str     @x[1],[r14,#-12]
+        ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
+       str     @x[2],[r14,#-8]
+       str     @x[3],[r14,#-4]
+
+       add     @x[4],@x[4],@t[0]       @ accumulate key material
+       add     @x[5],@x[5],@t[1]
+# ifdef        __thumb2__
+       itt     hi
+# endif
+        addhi  @t[0],@t[0],#1          @ next counter value
+        strhi  @t[0],[sp,#4*(12)]      @ save next counter value
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[0],[r12],#16         @ load input
+       ldrhs   @t[1],[r12,#-12]
+       add     @x[6],@x[6],@t[2]
+       add     @x[7],@x[7],@t[3]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhs   @t[2],[r12,#-8]
+       ldrhs   @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+       rev     @x[4],@x[4]
+       rev     @x[5],@x[5]
+       rev     @x[6],@x[6]
+       rev     @x[7],@x[7]
+# endif
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[4],@x[4],@t[0]
+       eorhs   @x[5],@x[5],@t[1]
+# ifdef        __thumb2__
+       it      hi
+# endif
+        ldrhi  @t[0],[sp,#4*(32+2)]    @ re-load len
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       eorhs   @x[6],@x[6],@t[2]
+       eorhs   @x[7],@x[7],@t[3]
+       str     @x[4],[r14],#16         @ store output
+       str     @x[5],[r14,#-12]
+# ifdef        __thumb2__
+       it      hs
+# endif
+        subhs  @t[3],@t[0],#64         @ len-=64
+       str     @x[6],[r14,#-8]
+       str     @x[7],[r14,#-4]
+       bhi     .Loop_outer
+
+       beq     .Ldone
+# if __ARM_ARCH__<7
+       b       .Ltail
+
+.align 4
+.Lunaligned:                           @ unaligned endian-neutral path
+       cmp     @t[3],#64               @ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+       ldr     @t[3],[sp,#4*(3)]
+___
+for ($i=0;$i<16;$i+=4) {
+my $j=$i&0x7;
+
+$code.=<<___   if ($i==4);
+       add     @x[0],sp,#4*(16+8)
+___
+$code.=<<___   if ($i==8);
+       ldmia   @x[0],{@x[0]-@x[7]}             @ load second half
+# ifdef        __thumb2__
+       itt     hi
+# endif
+       strhi   @t[2],[sp,#4*(16+10)]           @ copy "@x[10]"
+       strhi   @t[3],[sp,#4*(16+11)]           @ copy "@x[11]"
+___
+$code.=<<___;
+       add     @x[$j+0],@x[$j+0],@t[0]         @ accumulate key material
+___
+$code.=<<___   if ($i==12);
+# ifdef        __thumb2__
+       itt     hi
+# endif
+       addhi   @t[0],@t[0],#1                  @ next counter value
+       strhi   @t[0],[sp,#4*(12)]              @ save next counter value
+___
+$code.=<<___;
+       add     @x[$j+1],@x[$j+1],@t[1]
+       add     @x[$j+2],@x[$j+2],@t[2]
+# ifdef        __thumb2__
+       itete   lo
+# endif
+       eorlo   @t[0],@t[0],@t[0]               @ zero or ...
+       ldrhsb  @t[0],[r12],#16                 @ ... load input
+       eorlo   @t[1],@t[1],@t[1]
+       ldrhsb  @t[1],[r12,#-12]
+
+       add     @x[$j+3],@x[$j+3],@t[3]
+# ifdef        __thumb2__
+       itete   lo
+# endif
+       eorlo   @t[2],@t[2],@t[2]
+       ldrhsb  @t[2],[r12,#-8]
+       eorlo   @t[3],@t[3],@t[3]
+       ldrhsb  @t[3],[r12,#-4]
+
+       eor     @x[$j+0],@t[0],@x[$j+0]         @ xor with input (or zero)
+       eor     @x[$j+1],@t[1],@x[$j+1]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhsb  @t[0],[r12,#-15]                @ load more input
+       ldrhsb  @t[1],[r12,#-11]
+       eor     @x[$j+2],@t[2],@x[$j+2]
+        strb   @x[$j+0],[r14],#16              @ store output
+       eor     @x[$j+3],@t[3],@x[$j+3]
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhsb  @t[2],[r12,#-7]
+       ldrhsb  @t[3],[r12,#-3]
+        strb   @x[$j+1],[r14,#-12]
+       eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
+        strb   @x[$j+2],[r14,#-8]
+       eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhsb  @t[0],[r12,#-14]                @ load more input
+       ldrhsb  @t[1],[r12,#-10]
+        strb   @x[$j+3],[r14,#-4]
+       eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
+        strb   @x[$j+0],[r14,#-15]
+       eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhsb  @t[2],[r12,#-6]
+       ldrhsb  @t[3],[r12,#-2]
+        strb   @x[$j+1],[r14,#-11]
+       eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
+        strb   @x[$j+2],[r14,#-7]
+       eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhsb  @t[0],[r12,#-13]                @ load more input
+       ldrhsb  @t[1],[r12,#-9]
+        strb   @x[$j+3],[r14,#-3]
+       eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
+        strb   @x[$j+0],[r14,#-14]
+       eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
+# ifdef        __thumb2__
+       itt     hs
+# endif
+       ldrhsb  @t[2],[r12,#-5]
+       ldrhsb  @t[3],[r12,#-1]
+        strb   @x[$j+1],[r14,#-10]
+        strb   @x[$j+2],[r14,#-6]
+       eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
+        strb   @x[$j+3],[r14,#-2]
+       eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
+        strb   @x[$j+0],[r14,#-13]
+       eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
+        strb   @x[$j+1],[r14,#-9]
+       eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
+        strb   @x[$j+2],[r14,#-5]
+        strb   @x[$j+3],[r14,#-1]
+___
+$code.=<<___   if ($i<12);
+       add     @t[0],sp,#4*(4+$i)
+       ldmia   @t[0],{@t[0]-@t[3]}             @ load key material
+___
+}
+$code.=<<___;
+# ifdef        __thumb2__
+       it      hi
+# endif
+       ldrhi   @t[0],[sp,#4*(32+2)]            @ re-load len
+# ifdef        __thumb2__
+       it      hs
+# endif
+       subhs   @t[3],@t[0],#64                 @ len-=64
+       bhi     .Loop_outer
+
+       beq     .Ldone
+#endif
+
+.Ltail:
+       ldr     r12,[sp,#4*(32+1)]      @ load inp
+       add     @t[2],sp,#4*(0)
+       ldr     r14,[sp,#4*(32+0)]      @ load out
+
+.Loop_tail:
+       ldrb    @t[0],[@t[2]],#1        @ read buffer on stack
+       ldrb    @t[1],[r12],#1          @ read input
+       subs    @t[3],@t[3],#1
+       eor     @t[0],@t[0],@t[1]
+       strb    @t[0],[r14],#1          @ store output
+       bne     .Loop_tail
+
+.Ldone:
+       add     sp,sp,#4*(32+3)
+       ldmia   sp!,{r4-r11,pc}
+.size  ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+{{{
+my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
+    map("q$_",(0..15));
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+       (
+       "&vadd_i32      ($a,$a,$b)",
+       "&veor          ($d,$d,$a)",
+       "&vrev32_16     ($d,$d)",       # vrot ($d,16)
+
+       "&vadd_i32      ($c,$c,$d)",
+       "&veor          ($t,$b,$c)",
+       "&vshr_u32      ($b,$t,20)",
+       "&vsli_32       ($b,$t,12)",
+
+       "&vadd_i32      ($a,$a,$b)",
+       "&veor          ($t,$d,$a)",
+       "&vshr_u32      ($d,$t,24)",
+       "&vsli_32       ($d,$t,8)",
+
+       "&vadd_i32      ($c,$c,$d)",
+       "&veor          ($t,$b,$c)",
+       "&vshr_u32      ($b,$t,25)",
+       "&vsli_32       ($b,$t,7)",
+
+       "&vext_8        ($c,$c,$c,8)",
+       "&vext_8        ($b,$b,$b,$odd?12:4)",
+       "&vext_8        ($d,$d,$d,$odd?4:12)"
+       );
+}
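+# NEON has no vector rotate, so each rotate-left by n above is synthesized
+# as vshr.u32 by (32-n) followed by vsli.32 (shift-left-and-insert) by n;
+# per 32-bit lane the vshr_u32(...,20)/vsli_32(...,12) pair computes
+#
+#      (t>>20) | ((t<<12) & 0xffffffff) == rotl32(t,12)
+#
+# while the rotate by 16 is done with vrev32.16, which swaps the 16-bit
+# halves of every lane. The trailing vext.8 lines rotate the lanes of $b,
+# $c and $d by one, two and three positions (4, 8 and 12 bytes) so that
+# the same quarter-round code serves both the column and the diagonal
+# arrangement of the state (the $odd argument rotates $b and $d back).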
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
+
+.type  ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+       ldr             r12,[sp,#0]             @ pull pointer to counter and nonce
+       stmdb           sp!,{r0-r2,r4-r11,lr}
+.LChaCha20_neon:
+       adr             r14,.Lsigma
+       vstmdb          sp!,{d8-d15}            @ ABI spec says so
+       stmdb           sp!,{r0-r3}
+
+       vld1.32         {$b0-$c0},[r3]          @ load key
+       ldmia           r3,{r4-r11}             @ load key
+
+       sub             sp,sp,#4*(16+16)
+       vld1.32         {$d0},[r12]             @ load counter and nonce
+       add             r12,sp,#4*8
+       ldmia           r14,{r0-r3}             @ load sigma
+       vld1.32         {$a0},[r14]!            @ load sigma
+       vld1.32         {$t0},[r14]             @ one
+       vst1.32         {$c0-$d0},[r12]         @ copy 1/2key|counter|nonce
+       vst1.32         {$a0-$b0},[sp]          @ copy sigma|1/2key
+
+       str             r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
+       str             r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
+       vshl.i32        $t1#lo,$t0#lo,#1        @ two
+       vstr            $t0#lo,[sp,#4*(16+0)]
+       vshl.i32        $t2#lo,$t0#lo,#2        @ four
+       vstr            $t1#lo,[sp,#4*(16+2)]
+       vmov            $a1,$a0
+       vstr            $t2#lo,[sp,#4*(16+4)]
+       vmov            $a2,$a0
+       vmov            $b1,$b0
+       vmov            $b2,$b0
+       b               .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+       ldmia           sp,{r0-r9}              @ load key material
+       cmp             @t[3],#64*2             @ if len<=64*2
+       bls             .Lbreak_neon            @ switch to integer-only
+       vmov            $a1,$a0
+       str             @t[3],[sp,#4*(32+2)]    @ save len
+       vmov            $a2,$a0
+       str             r12,  [sp,#4*(32+1)]    @ save inp
+       vmov            $b1,$b0
+       str             r14,  [sp,#4*(32+0)]    @ save out
+       vmov            $b2,$b0
+.Loop_neon_enter:
+       ldr             @t[3], [sp,#4*(15)]
+       vadd.i32        $d1,$d0,$t0             @ counter+1
+       ldr             @x[12],[sp,#4*(12)]     @ modulo-scheduled load
+       vmov            $c1,$c0
+       ldr             @t[2], [sp,#4*(13)]
+       vmov            $c2,$c0
+       ldr             @x[14],[sp,#4*(14)]
+       vadd.i32        $d2,$d1,$t0             @ counter+2
+       str             @t[3], [sp,#4*(16+15)]
+       mov             @t[3],#10
+       add             @x[12],@x[12],#3        @ counter+3 
+       b               .Loop_neon
+
+.align 4
+.Loop_neon:
+       subs            @t[3],@t[3],#1
+___
+       my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
+       my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
+       my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
+       my @thread3=&ROUND(0,4,8,12);
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread3));
+               eval(shift(@thread1));  eval(shift(@thread3));
+               eval(shift(@thread2));  eval(shift(@thread3));
+       }
+
+       @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
+       @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
+       @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
+       @thread3=&ROUND(0,5,10,15);
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread3));
+               eval(shift(@thread1));  eval(shift(@thread3));
+               eval(shift(@thread2));  eval(shift(@thread3));
+       }
+$code.=<<___;
+       bne             .Loop_neon
+
+       add             @t[3],sp,#32
+       vld1.32         {$t0-$t1},[sp]          @ load key material
+       vld1.32         {$t2-$t3},[@t[3]]
+
+       ldr             @t[3],[sp,#4*(32+2)]    @ load len
+
+       str             @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
+       str             @t[1], [sp,#4*(16+9)]
+       str             @x[12],[sp,#4*(16+12)]
+       str             @t[2], [sp,#4*(16+13)]
+       str             @x[14],[sp,#4*(16+14)]
+
+       @ at this point we have first half of 512-bit result in
+       @ @x[0-7] and second half at sp+4*(16+8)
+
+       ldr             r12,[sp,#4*(32+1)]      @ load inp
+       ldr             r14,[sp,#4*(32+0)]      @ load out
+
+       vadd.i32        $a0,$a0,$t0             @ accumulate key material
+       vadd.i32        $a1,$a1,$t0
+       vadd.i32        $a2,$a2,$t0
+       vldr            $t0#lo,[sp,#4*(16+0)]   @ one
+
+       vadd.i32        $b0,$b0,$t1
+       vadd.i32        $b1,$b1,$t1
+       vadd.i32        $b2,$b2,$t1
+       vldr            $t1#lo,[sp,#4*(16+2)]   @ two
+
+       vadd.i32        $c0,$c0,$t2
+       vadd.i32        $c1,$c1,$t2
+       vadd.i32        $c2,$c2,$t2
+       vadd.i32        $d1#lo,$d1#lo,$t0#lo    @ counter+1
+       vadd.i32        $d2#lo,$d2#lo,$t1#lo    @ counter+2
+
+       vadd.i32        $d0,$d0,$t3
+       vadd.i32        $d1,$d1,$t3
+       vadd.i32        $d2,$d2,$t3
+
+       cmp             @t[3],#64*4
+       blo             .Ltail_neon
+
+       vld1.8          {$t0-$t1},[r12]!        @ load input
+        mov            @t[3],sp
+       vld1.8          {$t2-$t3},[r12]!
+       veor            $a0,$a0,$t0             @ xor with input
+       veor            $b0,$b0,$t1
+       vld1.8          {$t0-$t1},[r12]!
+       veor            $c0,$c0,$t2
+       veor            $d0,$d0,$t3
+       vld1.8          {$t2-$t3},[r12]!
+
+       veor            $a1,$a1,$t0
+        vst1.8         {$a0-$b0},[r14]!        @ store output
+       veor            $b1,$b1,$t1
+       vld1.8          {$t0-$t1},[r12]!
+       veor            $c1,$c1,$t2
+        vst1.8         {$c0-$d0},[r14]!
+       veor            $d1,$d1,$t3
+       vld1.8          {$t2-$t3},[r12]!
+
+       veor            $a2,$a2,$t0
+        vld1.32        {$a0-$b0},[@t[3]]!      @ load for next iteration
+        veor           $t0#hi,$t0#hi,$t0#hi
+        vldr           $t0#lo,[sp,#4*(16+4)]   @ four
+       veor            $b2,$b2,$t1
+        vld1.32        {$c0-$d0},[@t[3]]
+       veor            $c2,$c2,$t2
+        vst1.8         {$a1-$b1},[r14]!
+       veor            $d2,$d2,$t3
+        vst1.8         {$c1-$d1},[r14]!
+
+       vadd.i32        $d0#lo,$d0#lo,$t0#lo    @ next counter value
+       vldr            $t0#lo,[sp,#4*(16+0)]   @ one
+
+       ldmia           sp,{@t[0]-@t[3]}        @ load key material
+       add             @x[0],@x[0],@t[0]       @ accumulate key material
+       ldr             @t[0],[r12],#16         @ load input
+        vst1.8         {$a2-$b2},[r14]!
+       add             @x[1],@x[1],@t[1]
+       ldr             @t[1],[r12,#-12]
+        vst1.8         {$c2-$d2},[r14]!
+       add             @x[2],@x[2],@t[2]
+       ldr             @t[2],[r12,#-8]
+       add             @x[3],@x[3],@t[3]
+       ldr             @t[3],[r12,#-4]
+# ifdef        __ARMEB__
+       rev             @x[0],@x[0]
+       rev             @x[1],@x[1]
+       rev             @x[2],@x[2]
+       rev             @x[3],@x[3]
+# endif
+       eor             @x[0],@x[0],@t[0]       @ xor with input
+        add            @t[0],sp,#4*(4)
+       eor             @x[1],@x[1],@t[1]
+       str             @x[0],[r14],#16         @ store output
+       eor             @x[2],@x[2],@t[2]
+       str             @x[1],[r14,#-12]
+       eor             @x[3],@x[3],@t[3]
+        ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
+       str             @x[2],[r14,#-8]
+       str             @x[3],[r14,#-4]
+
+       add             @x[4],@x[4],@t[0]       @ accumulate key material
+       ldr             @t[0],[r12],#16         @ load input
+       add             @x[5],@x[5],@t[1]
+       ldr             @t[1],[r12,#-12]
+       add             @x[6],@x[6],@t[2]
+       ldr             @t[2],[r12,#-8]
+       add             @x[7],@x[7],@t[3]
+       ldr             @t[3],[r12,#-4]
+# ifdef        __ARMEB__
+       rev             @x[4],@x[4]
+       rev             @x[5],@x[5]
+       rev             @x[6],@x[6]
+       rev             @x[7],@x[7]
+# endif
+       eor             @x[4],@x[4],@t[0]
+        add            @t[0],sp,#4*(8)
+       eor             @x[5],@x[5],@t[1]
+       str             @x[4],[r14],#16         @ store output
+       eor             @x[6],@x[6],@t[2]
+       str             @x[5],[r14,#-12]
+       eor             @x[7],@x[7],@t[3]
+        ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
+       str             @x[6],[r14,#-8]
+        add            @x[0],sp,#4*(16+8)
+       str             @x[7],[r14,#-4]
+
+       ldmia           @x[0],{@x[0]-@x[7]}     @ load second half
+
+       add             @x[0],@x[0],@t[0]       @ accumulate key material
+       ldr             @t[0],[r12],#16         @ load input
+       add             @x[1],@x[1],@t[1]
+       ldr             @t[1],[r12,#-12]
+# ifdef        __thumb2__
+       it      hi
+# endif
+        strhi          @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
+       add             @x[2],@x[2],@t[2]
+       ldr             @t[2],[r12,#-8]
+# ifdef        __thumb2__
+       it      hi
+# endif
+        strhi          @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
+       add             @x[3],@x[3],@t[3]
+       ldr             @t[3],[r12,#-4]
+# ifdef        __ARMEB__
+       rev             @x[0],@x[0]
+       rev             @x[1],@x[1]
+       rev             @x[2],@x[2]
+       rev             @x[3],@x[3]
+# endif
+       eor             @x[0],@x[0],@t[0]
+        add            @t[0],sp,#4*(12)
+       eor             @x[1],@x[1],@t[1]
+       str             @x[0],[r14],#16         @ store output
+       eor             @x[2],@x[2],@t[2]
+       str             @x[1],[r14,#-12]
+       eor             @x[3],@x[3],@t[3]
+        ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
+       str             @x[2],[r14,#-8]
+       str             @x[3],[r14,#-4]
+
+       add             @x[4],@x[4],@t[0]       @ accumulate key material
+        add            @t[0],@t[0],#4          @ next counter value
+       add             @x[5],@x[5],@t[1]
+        str            @t[0],[sp,#4*(12)]      @ save next counter value
+       ldr             @t[0],[r12],#16         @ load input
+       add             @x[6],@x[6],@t[2]
+        add            @x[4],@x[4],#3          @ counter+3
+       ldr             @t[1],[r12,#-12]
+       add             @x[7],@x[7],@t[3]
+       ldr             @t[2],[r12,#-8]
+       ldr             @t[3],[r12,#-4]
+# ifdef        __ARMEB__
+       rev             @x[4],@x[4]
+       rev             @x[5],@x[5]
+       rev             @x[6],@x[6]
+       rev             @x[7],@x[7]
+# endif
+       eor             @x[4],@x[4],@t[0]
+# ifdef        __thumb2__
+       it      hi
+# endif
+        ldrhi          @t[0],[sp,#4*(32+2)]    @ re-load len
+       eor             @x[5],@x[5],@t[1]
+       eor             @x[6],@x[6],@t[2]
+       str             @x[4],[r14],#16         @ store output
+       eor             @x[7],@x[7],@t[3]
+       str             @x[5],[r14,#-12]
+        sub            @t[3],@t[0],#64*4       @ len-=64*4
+       str             @x[6],[r14,#-8]
+       str             @x[7],[r14,#-4]
+       bhi             .Loop_neon_outer
+
+       b               .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+       @ harmonize NEON and integer-only stack frames: load data
+       @ from NEON frame, but save to integer-only one; distance
+       @ between the two is 4*(32+4+16-32)=4*(20).
+
+       str             @t[3], [sp,#4*(20+32+2)]        @ save len
+        add            @t[3],sp,#4*(32+4)
+       str             r12,   [sp,#4*(20+32+1)]        @ save inp
+       str             r14,   [sp,#4*(20+32+0)]        @ save out
+
+       ldr             @x[12],[sp,#4*(16+10)]
+       ldr             @x[14],[sp,#4*(16+11)]
+        vldmia         @t[3],{d8-d15}                  @ fulfill ABI requirement
+       str             @x[12],[sp,#4*(20+16+10)]       @ copy "@x[10]"
+       str             @x[14],[sp,#4*(20+16+11)]       @ copy "@x[11]"
+
+       ldr             @t[3], [sp,#4*(15)]
+       ldr             @x[12],[sp,#4*(12)]             @ modulo-scheduled load
+       ldr             @t[2], [sp,#4*(13)]
+       ldr             @x[14],[sp,#4*(14)]
+       str             @t[3], [sp,#4*(20+16+15)]
+       add             @t[3],sp,#4*(20)
+       vst1.32         {$a0-$b0},[@t[3]]!              @ copy key
+       add             sp,sp,#4*(20)                   @ switch frame
+       vst1.32         {$c0-$d0},[@t[3]]
+       mov             @t[3],#10
+       b               .Loop                           @ go integer-only
+
+.align 4
+.Ltail_neon:
+       cmp             @t[3],#64*3
+       bhs             .L192_or_more_neon
+       cmp             @t[3],#64*2
+       bhs             .L128_or_more_neon
+       cmp             @t[3],#64*1
+       bhs             .L64_or_more_neon
+
+       add             @t[0],sp,#4*(8)
+       vst1.8          {$a0-$b0},[sp]
+       add             @t[2],sp,#4*(0)
+       vst1.8          {$c0-$d0},[@t[0]]
+       b               .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+       vld1.8          {$t0-$t1},[r12]!
+       vld1.8          {$t2-$t3},[r12]!
+       veor            $a0,$a0,$t0
+       veor            $b0,$b0,$t1
+       veor            $c0,$c0,$t2
+       veor            $d0,$d0,$t3
+       vst1.8          {$a0-$b0},[r14]!
+       vst1.8          {$c0-$d0},[r14]!
+
+       beq             .Ldone_neon
+
+       add             @t[0],sp,#4*(8)
+       vst1.8          {$a1-$b1},[sp]
+       add             @t[2],sp,#4*(0)
+       vst1.8          {$c1-$d1},[@t[0]]
+       sub             @t[3],@t[3],#64*1       @ len-=64*1
+       b               .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+       vld1.8          {$t0-$t1},[r12]!
+       vld1.8          {$t2-$t3},[r12]!
+       veor            $a0,$a0,$t0
+       veor            $b0,$b0,$t1
+       vld1.8          {$t0-$t1},[r12]!
+       veor            $c0,$c0,$t2
+       veor            $d0,$d0,$t3
+       vld1.8          {$t2-$t3},[r12]!
+
+       veor            $a1,$a1,$t0
+       veor            $b1,$b1,$t1
+        vst1.8         {$a0-$b0},[r14]!
+       veor            $c1,$c1,$t2
+        vst1.8         {$c0-$d0},[r14]!
+       veor            $d1,$d1,$t3
+       vst1.8          {$a1-$b1},[r14]!
+       vst1.8          {$c1-$d1},[r14]!
+
+       beq             .Ldone_neon
+
+       add             @t[0],sp,#4*(8)
+       vst1.8          {$a2-$b2},[sp]
+       add             @t[2],sp,#4*(0)
+       vst1.8          {$c2-$d2},[@t[0]]
+       sub             @t[3],@t[3],#64*2       @ len-=64*2
+       b               .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+       vld1.8          {$t0-$t1},[r12]!
+       vld1.8          {$t2-$t3},[r12]!
+       veor            $a0,$a0,$t0
+       veor            $b0,$b0,$t1
+       vld1.8          {$t0-$t1},[r12]!
+       veor            $c0,$c0,$t2
+       veor            $d0,$d0,$t3
+       vld1.8          {$t2-$t3},[r12]!
+
+       veor            $a1,$a1,$t0
+       veor            $b1,$b1,$t1
+       vld1.8          {$t0-$t1},[r12]!
+       veor            $c1,$c1,$t2
+        vst1.8         {$a0-$b0},[r14]!
+       veor            $d1,$d1,$t3
+       vld1.8          {$t2-$t3},[r12]!
+
+       veor            $a2,$a2,$t0
+        vst1.8         {$c0-$d0},[r14]!
+       veor            $b2,$b2,$t1
+        vst1.8         {$a1-$b1},[r14]!
+       veor            $c2,$c2,$t2
+        vst1.8         {$c1-$d1},[r14]!
+       veor            $d2,$d2,$t3
+       vst1.8          {$a2-$b2},[r14]!
+       vst1.8          {$c2-$d2},[r14]!
+
+       beq             .Ldone_neon
+
+       ldmia           sp,{@t[0]-@t[3]}        @ load key material
+       add             @x[0],@x[0],@t[0]       @ accumulate key material
+        add            @t[0],sp,#4*(4)
+       add             @x[1],@x[1],@t[1]
+       add             @x[2],@x[2],@t[2]
+       add             @x[3],@x[3],@t[3]
+        ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
+
+       add             @x[4],@x[4],@t[0]       @ accumulate key material
+        add            @t[0],sp,#4*(8)
+       add             @x[5],@x[5],@t[1]
+       add             @x[6],@x[6],@t[2]
+       add             @x[7],@x[7],@t[3]
+        ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
+# ifdef        __ARMEB__
+       rev             @x[0],@x[0]
+       rev             @x[1],@x[1]
+       rev             @x[2],@x[2]
+       rev             @x[3],@x[3]
+       rev             @x[4],@x[4]
+       rev             @x[5],@x[5]
+       rev             @x[6],@x[6]
+       rev             @x[7],@x[7]
+# endif
+       stmia           sp,{@x[0]-@x[7]}
+        add            @x[0],sp,#4*(16+8)
+
+       ldmia           @x[0],{@x[0]-@x[7]}     @ load second half
+
+       add             @x[0],@x[0],@t[0]       @ accumulate key material
+        add            @t[0],sp,#4*(12)
+       add             @x[1],@x[1],@t[1]
+       add             @x[2],@x[2],@t[2]
+       add             @x[3],@x[3],@t[3]
+        ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
+
+       add             @x[4],@x[4],@t[0]       @ accumulate key material
+        add            @t[0],sp,#4*(8)
+       add             @x[5],@x[5],@t[1]
+        add            @x[4],@x[4],#3          @ counter+3
+       add             @x[6],@x[6],@t[2]
+       add             @x[7],@x[7],@t[3]
+        ldr            @t[3],[sp,#4*(32+2)]    @ re-load len
+# ifdef        __ARMEB__
+       rev             @x[0],@x[0]
+       rev             @x[1],@x[1]
+       rev             @x[2],@x[2]
+       rev             @x[3],@x[3]
+       rev             @x[4],@x[4]
+       rev             @x[5],@x[5]
+       rev             @x[6],@x[6]
+       rev             @x[7],@x[7]
+# endif
+       stmia           @t[0],{@x[0]-@x[7]}
+        add            @t[2],sp,#4*(0)
+        sub            @t[3],@t[0],#64*3       @ len-=64*3
+
+.Loop_tail_neon:
+       ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
+       ldrb            @t[1],[r12],#1          @ read input
+       subs            @t[3],@t[3],#1
+       eor             @t[0],@t[0],@t[1]
+       strb            @t[0],[r14],#1          @ store output
+       bne             .Loop_tail_neon
+
+.Ldone_neon:
+       add             sp,sp,#4*(32+4)
+       vldmia          sp,{d8-d15}
+       add             sp,sp,#4*(16+3)
+       ldmia           sp!,{r4-r11,pc}
+.size  ChaCha20_neon,.-ChaCha20_neon
+.comm  OPENSSL_armcap_P,4,4
+#endif
+___
+}}}
+
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
+
+       print $_,"\n";
+}
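+# The second substitution above resolves the q<N>#lo/q<N>#hi notation used
+# by the NEON code into the architectural D-register aliases, q<N> being
+# the pair d<2N>:d<2N+1>; e.g. with $t0 mapped to q12, "$t0#lo" becomes
+# d24 and "$t0#hi" becomes d25. The first substitution evaluates any
+# backtick-quoted arithmetic embedded in the generated text.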
+close STDOUT;
diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
new file mode 100755 (executable)
index 0000000..6ddb31f
--- /dev/null
@@ -0,0 +1,1126 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2015
+# 
+# ChaCha20 for ARMv8.
+#
+# Performance in cycles per byte out of large buffer.
+#
+#                      IALU/gcc-4.9    3xNEON+1xIALU   6xNEON+2xIALU
+#
+# Apple A7             5.50/+49%       3.33            1.70
+# Cortex-A53           8.40/+80%       4.72            4.72(*)
+# Cortex-A57           8.06/+43%       4.90            4.43(**)
+# Denver               4.50/+82%       2.63            2.67(*)
+# X-Gene               9.50/+46%       8.82            8.89(*)
+#
+# (*)  it's expected that doubling interleave factor doesn't help
+#      all processors, only those with higher NEON latency and
+#      higher instruction issue rate;
+# (**) expected improvement was actually higher;
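+#
+# "3xNEON+1xIALU" corresponds to the code path below that interleaves
+# three vector blocks with one block in general-purpose registers, i.e.
+# 256 bytes per iteration of .Loop_outer_neon; "6xNEON+2xIALU" refers to
+# the wider .L512_or_more_neon path taken for inputs of at least 512
+# bytes.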
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+sub AUTOLOAD()         # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
+
+my @x=map("x$_",(5..17,19..21));
+my @d=map("x$_",(22..28,30));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+    (
+       "&add_32        (@x[$a0],@x[$a0],@x[$b0])",
+        "&add_32       (@x[$a1],@x[$a1],@x[$b1])",
+         "&add_32      (@x[$a2],@x[$a2],@x[$b2])",
+          "&add_32     (@x[$a3],@x[$a3],@x[$b3])",
+       "&eor_32        (@x[$d0],@x[$d0],@x[$a0])",
+        "&eor_32       (@x[$d1],@x[$d1],@x[$a1])",
+         "&eor_32      (@x[$d2],@x[$d2],@x[$a2])",
+          "&eor_32     (@x[$d3],@x[$d3],@x[$a3])",
+       "&ror_32        (@x[$d0],@x[$d0],16)",
+        "&ror_32       (@x[$d1],@x[$d1],16)",
+         "&ror_32      (@x[$d2],@x[$d2],16)",
+          "&ror_32     (@x[$d3],@x[$d3],16)",
+
+       "&add_32        (@x[$c0],@x[$c0],@x[$d0])",
+        "&add_32       (@x[$c1],@x[$c1],@x[$d1])",
+         "&add_32      (@x[$c2],@x[$c2],@x[$d2])",
+          "&add_32     (@x[$c3],@x[$c3],@x[$d3])",
+       "&eor_32        (@x[$b0],@x[$b0],@x[$c0])",
+        "&eor_32       (@x[$b1],@x[$b1],@x[$c1])",
+         "&eor_32      (@x[$b2],@x[$b2],@x[$c2])",
+          "&eor_32     (@x[$b3],@x[$b3],@x[$c3])",
+       "&ror_32        (@x[$b0],@x[$b0],20)",
+        "&ror_32       (@x[$b1],@x[$b1],20)",
+         "&ror_32      (@x[$b2],@x[$b2],20)",
+          "&ror_32     (@x[$b3],@x[$b3],20)",
+
+       "&add_32        (@x[$a0],@x[$a0],@x[$b0])",
+        "&add_32       (@x[$a1],@x[$a1],@x[$b1])",
+         "&add_32      (@x[$a2],@x[$a2],@x[$b2])",
+          "&add_32     (@x[$a3],@x[$a3],@x[$b3])",
+       "&eor_32        (@x[$d0],@x[$d0],@x[$a0])",
+        "&eor_32       (@x[$d1],@x[$d1],@x[$a1])",
+         "&eor_32      (@x[$d2],@x[$d2],@x[$a2])",
+          "&eor_32     (@x[$d3],@x[$d3],@x[$a3])",
+       "&ror_32        (@x[$d0],@x[$d0],24)",
+        "&ror_32       (@x[$d1],@x[$d1],24)",
+         "&ror_32      (@x[$d2],@x[$d2],24)",
+          "&ror_32     (@x[$d3],@x[$d3],24)",
+
+       "&add_32        (@x[$c0],@x[$c0],@x[$d0])",
+        "&add_32       (@x[$c1],@x[$c1],@x[$d1])",
+         "&add_32      (@x[$c2],@x[$c2],@x[$d2])",
+          "&add_32     (@x[$c3],@x[$c3],@x[$d3])",
+       "&eor_32        (@x[$b0],@x[$b0],@x[$c0])",
+        "&eor_32       (@x[$b1],@x[$b1],@x[$c1])",
+         "&eor_32      (@x[$b2],@x[$b2],@x[$c2])",
+          "&eor_32     (@x[$b3],@x[$b3],@x[$c3])",
+       "&ror_32        (@x[$b0],@x[$b0],25)",
+        "&ror_32       (@x[$b1],@x[$b1],25)",
+         "&ror_32      (@x[$b2],@x[$b2],25)",
+          "&ror_32     (@x[$b3],@x[$b3],25)"
+    );
+}
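+# Unlike the ARMv4 version, all sixteen 32-bit state words fit in the
+# general-purpose registers @x[0..15] (x5-x17,x19-x21), so ROUND handles
+# all four columns (or diagonals) without spilling to memory. The _32
+# suffix becomes ".32" via AUTOLOAD and is what arm-xlate.pl uses to emit
+# the 32-bit, w-register form of the instruction, so for example
+# &add_32(@x[0],@x[0],@x[4]) ends up roughly as "add w5,w5,w9".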
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.extern        OPENSSL_armcap_P
+
+.align 5
+.Lsigma:
+.quad  0x3320646e61707865,0x6b20657479622d32           // endian-neutral
+.Lone:
+.long  1,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long  OPENSSL_armcap_P-.
+#else
+.quad  OPENSSL_armcap_P-.
+#endif
+.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl ChaCha20_ctr32
+.type  ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+       cbz     $len,.Labort
+       adr     @x[0],.LOPENSSL_armcap_P
+       cmp     $len,#192
+       b.lo    .Lshort
+#ifdef __ILP32__
+       ldrsw   @x[1],[@x[0]]
+#else
+       ldr     @x[1],[@x[0]]
+#endif
+       ldr     w17,[@x[1],@x[0]]
+       tst     w17,#ARMV7_NEON
+       b.ne    ChaCha20_neon
+
+.Lshort:
+       stp     x29,x30,[sp,#-96]!
+       add     x29,sp,#0
+
+       adr     @x[0],.Lsigma
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#64
+
+       ldp     @d[0],@d[1],[@x[0]]             // load sigma
+       ldp     @d[2],@d[3],[$key]              // load key
+       ldp     @d[4],@d[5],[$key,#16]
+       ldp     @d[6],@d[7],[$ctr]              // load counter
+#ifdef __ARMEB__
+       ror     @d[2],@d[2],#32
+       ror     @d[3],@d[3],#32
+       ror     @d[4],@d[4],#32
+       ror     @d[5],@d[5],#32
+       ror     @d[6],@d[6],#32
+       ror     @d[7],@d[7],#32
+#endif
+
+.Loop_outer:
+       mov.32  @x[0],@d[0]                     // unpack key block
+       lsr     @x[1],@d[0],#32
+       mov.32  @x[2],@d[1]
+       lsr     @x[3],@d[1],#32
+       mov.32  @x[4],@d[2]
+       lsr     @x[5],@d[2],#32
+       mov.32  @x[6],@d[3]
+       lsr     @x[7],@d[3],#32
+       mov.32  @x[8],@d[4]
+       lsr     @x[9],@d[4],#32
+       mov.32  @x[10],@d[5]
+       lsr     @x[11],@d[5],#32
+       mov.32  @x[12],@d[6]
+       lsr     @x[13],@d[6],#32
+       mov.32  @x[14],@d[7]
+       lsr     @x[15],@d[7],#32
+
+       mov     $ctr,#10
+       subs    $len,$len,#64
+.Loop:
+       sub     $ctr,$ctr,#1    
+___
+       foreach (&ROUND(0, 4, 8,12)) { eval; }
+       foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+       cbnz    $ctr,.Loop
+
+       add.32  @x[0],@x[0],@d[0]               // accumulate key block
+       add     @x[1],@x[1],@d[0],lsr#32
+       add.32  @x[2],@x[2],@d[1]
+       add     @x[3],@x[3],@d[1],lsr#32
+       add.32  @x[4],@x[4],@d[2]
+       add     @x[5],@x[5],@d[2],lsr#32
+       add.32  @x[6],@x[6],@d[3]
+       add     @x[7],@x[7],@d[3],lsr#32
+       add.32  @x[8],@x[8],@d[4]
+       add     @x[9],@x[9],@d[4],lsr#32
+       add.32  @x[10],@x[10],@d[5]
+       add     @x[11],@x[11],@d[5],lsr#32
+       add.32  @x[12],@x[12],@d[6]
+       add     @x[13],@x[13],@d[6],lsr#32
+       add.32  @x[14],@x[14],@d[7]
+       add     @x[15],@x[15],@d[7],lsr#32
+
+       b.lo    .Ltail
+
+       add     @x[0],@x[0],@x[1],lsl#32        // pack
+       add     @x[2],@x[2],@x[3],lsl#32
+       ldp     @x[1],@x[3],[$inp,#0]           // load input
+       add     @x[4],@x[4],@x[5],lsl#32
+       add     @x[6],@x[6],@x[7],lsl#32
+       ldp     @x[5],@x[7],[$inp,#16]
+       add     @x[8],@x[8],@x[9],lsl#32
+       add     @x[10],@x[10],@x[11],lsl#32
+       ldp     @x[9],@x[11],[$inp,#32]
+       add     @x[12],@x[12],@x[13],lsl#32
+       add     @x[14],@x[14],@x[15],lsl#32
+       ldp     @x[13],@x[15],[$inp,#48]
+       add     $inp,$inp,#64
+#ifdef __ARMEB__
+       rev     @x[0],@x[0]
+       rev     @x[2],@x[2]
+       rev     @x[4],@x[4]
+       rev     @x[6],@x[6]
+       rev     @x[8],@x[8]
+       rev     @x[10],@x[10]
+       rev     @x[12],@x[12]
+       rev     @x[14],@x[14]
+#endif
+       eor     @x[0],@x[0],@x[1]
+       eor     @x[2],@x[2],@x[3]
+       eor     @x[4],@x[4],@x[5]
+       eor     @x[6],@x[6],@x[7]
+       eor     @x[8],@x[8],@x[9]
+       eor     @x[10],@x[10],@x[11]
+       eor     @x[12],@x[12],@x[13]
+       eor     @x[14],@x[14],@x[15]
+
+       stp     @x[0],@x[2],[$out,#0]           // store output
+        add    @d[6],@d[6],#1                  // increment counter
+       stp     @x[4],@x[6],[$out,#16]
+       stp     @x[8],@x[10],[$out,#32]
+       stp     @x[12],@x[14],[$out,#48]
+       add     $out,$out,#64
+
+       b.hi    .Loop_outer
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#64
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#96
+.Labort:
+       ret
+
+.align 4
+.Ltail:
+       add     $len,$len,#64
+.Less_than_64:
+       sub     $out,$out,#1
+       add     $inp,$inp,$len
+       add     $out,$out,$len
+       add     $ctr,sp,$len
+       neg     $len,$len
+
+       add     @x[0],@x[0],@x[1],lsl#32        // pack
+       add     @x[2],@x[2],@x[3],lsl#32
+       add     @x[4],@x[4],@x[5],lsl#32
+       add     @x[6],@x[6],@x[7],lsl#32
+       add     @x[8],@x[8],@x[9],lsl#32
+       add     @x[10],@x[10],@x[11],lsl#32
+       add     @x[12],@x[12],@x[13],lsl#32
+       add     @x[14],@x[14],@x[15],lsl#32
+#ifdef __ARMEB__
+       rev     @x[0],@x[0]
+       rev     @x[2],@x[2]
+       rev     @x[4],@x[4]
+       rev     @x[6],@x[6]
+       rev     @x[8],@x[8]
+       rev     @x[10],@x[10]
+       rev     @x[12],@x[12]
+       rev     @x[14],@x[14]
+#endif
+       stp     @x[0],@x[2],[sp,#0]
+       stp     @x[4],@x[6],[sp,#16]
+       stp     @x[8],@x[10],[sp,#32]
+       stp     @x[12],@x[14],[sp,#48]
+
+.Loop_tail:
+       ldrb    w10,[$inp,$len]
+       ldrb    w11,[$ctr,$len]
+       add     $len,$len,#1
+       eor     w10,w10,w11
+       strb    w10,[$out,$len]
+       cbnz    $len,.Loop_tail
+
+       stp     xzr,xzr,[sp,#0]
+       stp     xzr,xzr,[sp,#16]
+       stp     xzr,xzr,[sp,#32]
+       stp     xzr,xzr,[sp,#48]
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#64
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#96
+       ret
+.size  ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+{{{
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
+    map("v$_.4s",(0..7,16..23));
+my (@K)=map("v$_.4s",(24..30));
+my $ONE="v31.4s";
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+       (
+       "&add           ('$a','$a','$b')",
+       "&eor           ('$d','$d','$a')",
+       "&rev32_16      ('$d','$d')",           # vrot ($d,16)
+
+       "&add           ('$c','$c','$d')",
+       "&eor           ('$t','$b','$c')",
+       "&ushr          ('$b','$t',20)",
+       "&sli           ('$b','$t',12)",
+
+       "&add           ('$a','$a','$b')",
+       "&eor           ('$t','$d','$a')",
+       "&ushr          ('$d','$t',24)",
+       "&sli           ('$d','$t',8)",
+
+       "&add           ('$c','$c','$d')",
+       "&eor           ('$t','$b','$c')",
+       "&ushr          ('$b','$t',25)",
+       "&sli           ('$b','$t',7)",
+
+       "&ext           ('$c','$c','$c',8)",
+       "&ext           ('$d','$d','$d',$odd?4:12)",
+       "&ext           ('$b','$b','$b',$odd?12:4)"
+       );
+}
+
+$code.=<<___;
+
+.type  ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+       stp     x29,x30,[sp,#-96]!
+       add     x29,sp,#0
+
+       adr     @x[0],.Lsigma
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       cmp     $len,#512
+       b.hs    .L512_or_more_neon
+
+       sub     sp,sp,#64
+
+       ldp     @d[0],@d[1],[@x[0]]             // load sigma
+       ld1     {@K[0]},[@x[0]],#16
+       ldp     @d[2],@d[3],[$key]              // load key
+       ldp     @d[4],@d[5],[$key,#16]
+       ld1     {@K[1],@K[2]},[$key]
+       ldp     @d[6],@d[7],[$ctr]              // load counter
+       ld1     {@K[3]},[$ctr]
+       ld1     {$ONE},[@x[0]]
+#ifdef __ARMEB__
+       rev64   @K[0],@K[0]
+       ror     @d[2],@d[2],#32
+       ror     @d[3],@d[3],#32
+       ror     @d[4],@d[4],#32
+       ror     @d[5],@d[5],#32
+       ror     @d[6],@d[6],#32
+       ror     @d[7],@d[7],#32
+#endif
+       add     @K[3],@K[3],$ONE                // += 1
+       add     @K[4],@K[3],$ONE
+       add     @K[5],@K[4],$ONE
+       shl     $ONE,$ONE,#2                    // 1 -> 4
+
+.Loop_outer_neon:
+       mov.32  @x[0],@d[0]                     // unpack key block
+       lsr     @x[1],@d[0],#32
+        mov    $A0,@K[0]
+       mov.32  @x[2],@d[1]
+       lsr     @x[3],@d[1],#32
+        mov    $A1,@K[0]
+       mov.32  @x[4],@d[2]
+       lsr     @x[5],@d[2],#32
+        mov    $A2,@K[0]
+       mov.32  @x[6],@d[3]
+        mov    $B0,@K[1]
+       lsr     @x[7],@d[3],#32
+        mov    $B1,@K[1]
+       mov.32  @x[8],@d[4]
+        mov    $B2,@K[1]
+       lsr     @x[9],@d[4],#32
+        mov    $D0,@K[3]
+       mov.32  @x[10],@d[5]
+        mov    $D1,@K[4]
+       lsr     @x[11],@d[5],#32
+        mov    $D2,@K[5]
+       mov.32  @x[12],@d[6]
+        mov    $C0,@K[2]
+       lsr     @x[13],@d[6],#32
+        mov    $C1,@K[2]
+       mov.32  @x[14],@d[7]
+        mov    $C2,@K[2]
+       lsr     @x[15],@d[7],#32
+
+       mov     $ctr,#10
+       subs    $len,$len,#256
+.Loop_neon:
+       sub     $ctr,$ctr,#1
+___
+       my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+       my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+       my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+       my @thread3=&ROUND(0,4,8,12);
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread3));
+               eval(shift(@thread1));  eval(shift(@thread3));
+               eval(shift(@thread2));  eval(shift(@thread3));
+       }
+
+       @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+       @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+       @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+       @thread3=&ROUND(0,5,10,15);
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread3));
+               eval(shift(@thread1));  eval(shift(@thread3));
+               eval(shift(@thread2));  eval(shift(@thread3));
+       }
+$code.=<<___;
+       cbnz    $ctr,.Loop_neon
+
+       add.32  @x[0],@x[0],@d[0]               // accumulate key block
+        add    $A0,$A0,@K[0]
+       add     @x[1],@x[1],@d[0],lsr#32
+        add    $A1,$A1,@K[0]
+       add.32  @x[2],@x[2],@d[1]
+        add    $A2,$A2,@K[0]
+       add     @x[3],@x[3],@d[1],lsr#32
+        add    $C0,$C0,@K[2]
+       add.32  @x[4],@x[4],@d[2]
+        add    $C1,$C1,@K[2]
+       add     @x[5],@x[5],@d[2],lsr#32
+        add    $C2,$C2,@K[2]
+       add.32  @x[6],@x[6],@d[3]
+        add    $D0,$D0,@K[3]
+       add     @x[7],@x[7],@d[3],lsr#32
+       add.32  @x[8],@x[8],@d[4]
+        add    $D1,$D1,@K[4]
+       add     @x[9],@x[9],@d[4],lsr#32
+       add.32  @x[10],@x[10],@d[5]
+        add    $D2,$D2,@K[5]
+       add     @x[11],@x[11],@d[5],lsr#32
+       add.32  @x[12],@x[12],@d[6]
+        add    $B0,$B0,@K[1]
+       add     @x[13],@x[13],@d[6],lsr#32
+       add.32  @x[14],@x[14],@d[7]
+        add    $B1,$B1,@K[1]
+       add     @x[15],@x[15],@d[7],lsr#32
+        add    $B2,$B2,@K[1]
+
+       b.lo    .Ltail_neon
+
+       add     @x[0],@x[0],@x[1],lsl#32        // pack
+       add     @x[2],@x[2],@x[3],lsl#32
+       ldp     @x[1],@x[3],[$inp,#0]           // load input
+       add     @x[4],@x[4],@x[5],lsl#32
+       add     @x[6],@x[6],@x[7],lsl#32
+       ldp     @x[5],@x[7],[$inp,#16]
+       add     @x[8],@x[8],@x[9],lsl#32
+       add     @x[10],@x[10],@x[11],lsl#32
+       ldp     @x[9],@x[11],[$inp,#32]
+       add     @x[12],@x[12],@x[13],lsl#32
+       add     @x[14],@x[14],@x[15],lsl#32
+       ldp     @x[13],@x[15],[$inp,#48]
+       add     $inp,$inp,#64
+#ifdef __ARMEB__
+       rev     @x[0],@x[0]
+       rev     @x[2],@x[2]
+       rev     @x[4],@x[4]
+       rev     @x[6],@x[6]
+       rev     @x[8],@x[8]
+       rev     @x[10],@x[10]
+       rev     @x[12],@x[12]
+       rev     @x[14],@x[14]
+#endif
+       ld1.8   {$T0-$T3},[$inp],#64
+       eor     @x[0],@x[0],@x[1]
+       eor     @x[2],@x[2],@x[3]
+       eor     @x[4],@x[4],@x[5]
+       eor     @x[6],@x[6],@x[7]
+       eor     @x[8],@x[8],@x[9]
+        eor    $A0,$A0,$T0
+       eor     @x[10],@x[10],@x[11]
+        eor    $B0,$B0,$T1
+       eor     @x[12],@x[12],@x[13]
+        eor    $C0,$C0,$T2
+       eor     @x[14],@x[14],@x[15]
+        eor    $D0,$D0,$T3
+        ld1.8  {$T0-$T3},[$inp],#64
+
+       stp     @x[0],@x[2],[$out,#0]           // store output
+        add    @d[6],@d[6],#4                  // increment counter
+       stp     @x[4],@x[6],[$out,#16]
+        add    @K[3],@K[3],$ONE                // += 4
+       stp     @x[8],@x[10],[$out,#32]
+        add    @K[4],@K[4],$ONE
+       stp     @x[12],@x[14],[$out,#48]
+        add    @K[5],@K[5],$ONE
+       add     $out,$out,#64
+
+       st1.8   {$A0-$D0},[$out],#64
+       ld1.8   {$A0-$D0},[$inp],#64
+
+       eor     $A1,$A1,$T0
+       eor     $B1,$B1,$T1
+       eor     $C1,$C1,$T2
+       eor     $D1,$D1,$T3
+       st1.8   {$A1-$D1},[$out],#64
+
+       eor     $A2,$A2,$A0
+       eor     $B2,$B2,$B0
+       eor     $C2,$C2,$C0
+       eor     $D2,$D2,$D0
+       st1.8   {$A2-$D2},[$out],#64
+
+       b.hi    .Loop_outer_neon
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#64
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#96
+       ret
+
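+// Tail: the length is restored (+256) and any remaining whole 64-byte
+// blocks are flushed first (scalar block, then A0-D0, then A1-D1); the
+// keystream for the final partial block is spilled to the stack and
+// XORed byte by byte in .Loop_tail_neon, after which the stack copy is
+// wiped.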
+.Ltail_neon:
+       add     $len,$len,#256
+       cmp     $len,#64
+       b.lo    .Less_than_64
+
+       add     @x[0],@x[0],@x[1],lsl#32        // pack
+       add     @x[2],@x[2],@x[3],lsl#32
+       ldp     @x[1],@x[3],[$inp,#0]           // load input
+       add     @x[4],@x[4],@x[5],lsl#32
+       add     @x[6],@x[6],@x[7],lsl#32
+       ldp     @x[5],@x[7],[$inp,#16]
+       add     @x[8],@x[8],@x[9],lsl#32
+       add     @x[10],@x[10],@x[11],lsl#32
+       ldp     @x[9],@x[11],[$inp,#32]
+       add     @x[12],@x[12],@x[13],lsl#32
+       add     @x[14],@x[14],@x[15],lsl#32
+       ldp     @x[13],@x[15],[$inp,#48]
+       add     $inp,$inp,#64
+#ifdef __ARMEB__
+       rev     @x[0],@x[0]
+       rev     @x[2],@x[2]
+       rev     @x[4],@x[4]
+       rev     @x[6],@x[6]
+       rev     @x[8],@x[8]
+       rev     @x[10],@x[10]
+       rev     @x[12],@x[12]
+       rev     @x[14],@x[14]
+#endif
+       eor     @x[0],@x[0],@x[1]
+       eor     @x[2],@x[2],@x[3]
+       eor     @x[4],@x[4],@x[5]
+       eor     @x[6],@x[6],@x[7]
+       eor     @x[8],@x[8],@x[9]
+       eor     @x[10],@x[10],@x[11]
+       eor     @x[12],@x[12],@x[13]
+       eor     @x[14],@x[14],@x[15]
+
+       stp     @x[0],@x[2],[$out,#0]           // store output
+        add    @d[6],@d[6],#4                  // increment counter
+       stp     @x[4],@x[6],[$out,#16]
+       stp     @x[8],@x[10],[$out,#32]
+       stp     @x[12],@x[14],[$out,#48]
+       add     $out,$out,#64
+       b.eq    .Ldone_neon
+       sub     $len,$len,#64
+       cmp     $len,#64
+       b.lo    .Less_than_128
+
+       ld1.8   {$T0-$T3},[$inp],#64
+       eor     $A0,$A0,$T0
+       eor     $B0,$B0,$T1
+       eor     $C0,$C0,$T2
+       eor     $D0,$D0,$T3
+       st1.8   {$A0-$D0},[$out],#64
+       b.eq    .Ldone_neon
+       sub     $len,$len,#64
+       cmp     $len,#64
+       b.lo    .Less_than_192
+
+       ld1.8   {$T0-$T3},[$inp],#64
+       eor     $A1,$A1,$T0
+       eor     $B1,$B1,$T1
+       eor     $C1,$C1,$T2
+       eor     $D1,$D1,$T3
+       st1.8   {$A1-$D1},[$out],#64
+       b.eq    .Ldone_neon
+       sub     $len,$len,#64
+
+       st1.8   {$A2-$D2},[sp]
+       b       .Last_neon
+
+.Less_than_128:
+       st1.8   {$A0-$D0},[sp]
+       b       .Last_neon
+.Less_than_192:
+       st1.8   {$A1-$D1},[sp]
+       b       .Last_neon
+
+.align 4
+.Last_neon:
+       sub     $out,$out,#1
+       add     $inp,$inp,$len
+       add     $out,$out,$len
+       add     $ctr,sp,$len
+       neg     $len,$len
+
+.Loop_tail_neon:
+       ldrb    w10,[$inp,$len]
+       ldrb    w11,[$ctr,$len]
+       add     $len,$len,#1
+       eor     w10,w10,w11
+       strb    w10,[$out,$len]
+       cbnz    $len,.Loop_tail_neon
+
+       stp     xzr,xzr,[sp,#0]
+       stp     xzr,xzr,[sp,#16]
+       stp     xzr,xzr,[sp,#32]
+       stp     xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#64
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#96
+       ret
+.size  ChaCha20_neon,.-ChaCha20_neon
+___
+{
+my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
+    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
+
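+# ChaCha20_512_neon keeps six vector block states (A0-D0 .. A5-D5) plus
+# the scalar one, i.e. eight 64-byte blocks (512 bytes) per outer
+# iteration. The scalar rounds are emitted at twice the rate of the
+# vector ones, so each half of the vector schedule (.Loop_upper_neon and
+# .Loop_lower_neon, 5 iterations each) completes one full scalar block.
+# The invariant part of the key block is parked on the stack and d8-d15
+# are saved to satisfy the AAPCS64 ABI.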
+$code.=<<___;
+.type  ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+       stp     x29,x30,[sp,#-96]!
+       add     x29,sp,#0
+
+       adr     @x[0],.Lsigma
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+       sub     sp,sp,#128+64
+
+       ldp     @d[0],@d[1],[@x[0]]             // load sigma
+       ld1     {@K[0]},[@x[0]],#16
+       ldp     @d[2],@d[3],[$key]              // load key
+       ldp     @d[4],@d[5],[$key,#16]
+       ld1     {@K[1],@K[2]},[$key]
+       ldp     @d[6],@d[7],[$ctr]              // load counter
+       ld1     {@K[3]},[$ctr]
+       ld1     {$ONE},[@x[0]]
+#ifdef __ARMEB__
+       rev64   @K[0],@K[0]
+       ror     @d[2],@d[2],#32
+       ror     @d[3],@d[3],#32
+       ror     @d[4],@d[4],#32
+       ror     @d[5],@d[5],#32
+       ror     @d[6],@d[6],#32
+       ror     @d[7],@d[7],#32
+#endif
+       add     @K[3],@K[3],$ONE                // += 1
+       stp     @K[0],@K[1],[sp,#0]             // off-load key block, invariant part
+       add     @K[3],@K[3],$ONE                // not typo
+       str     @K[2],[sp,#32]
+       add     @K[4],@K[3],$ONE
+       add     @K[5],@K[4],$ONE
+       add     @K[6],@K[5],$ONE
+       shl     $ONE,$ONE,#2                    // 1 -> 4
+
+       stp     d8,d9,[sp,#128+0]               // meet ABI requirements
+       stp     d10,d11,[sp,#128+16]
+       stp     d12,d13,[sp,#128+32]
+       stp     d14,d15,[sp,#128+48]
+
+       sub     $len,$len,#512                  // not typo
+
+.Loop_outer_512_neon:
+        mov    $A0,@K[0]
+        mov    $A1,@K[0]
+        mov    $A2,@K[0]
+        mov    $A3,@K[0]
+        mov    $A4,@K[0]
+        mov    $A5,@K[0]
+        mov    $B0,@K[1]
+       mov.32  @x[0],@d[0]                     // unpack key block
+        mov    $B1,@K[1]
+       lsr     @x[1],@d[0],#32
+        mov    $B2,@K[1]
+       mov.32  @x[2],@d[1]
+        mov    $B3,@K[1]
+       lsr     @x[3],@d[1],#32
+        mov    $B4,@K[1]
+       mov.32  @x[4],@d[2]
+        mov    $B5,@K[1]
+       lsr     @x[5],@d[2],#32
+        mov    $D0,@K[3]
+       mov.32  @x[6],@d[3]
+        mov    $D1,@K[4]
+       lsr     @x[7],@d[3],#32
+        mov    $D2,@K[5]
+       mov.32  @x[8],@d[4]
+        mov    $D3,@K[6]
+       lsr     @x[9],@d[4],#32
+        mov    $C0,@K[2]
+       mov.32  @x[10],@d[5]
+        mov    $C1,@K[2]
+       lsr     @x[11],@d[5],#32
+        add    $D4,$D0,$ONE                    // +4
+       mov.32  @x[12],@d[6]
+        add    $D5,$D1,$ONE                    // +4
+       lsr     @x[13],@d[6],#32
+        mov    $C2,@K[2]
+       mov.32  @x[14],@d[7]
+        mov    $C3,@K[2]
+       lsr     @x[15],@d[7],#32
+        mov    $C4,@K[2]
+        stp    @K[3],@K[4],[sp,#48]            // off-load key block, variable part
+        mov    $C5,@K[2]
+        str    @K[5],[sp,#80]
+
+       mov     $ctr,#5
+       subs    $len,$len,#512
+.Loop_upper_neon:
+       sub     $ctr,$ctr,#1
+___
+       my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+       my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+       my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+       my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
+       my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
+       my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
+       my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+       my $diff = ($#thread0+1)*6 - $#thread67 - 1;
+       my $i = 0;
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread67));
+               eval(shift(@thread1));  eval(shift(@thread67));
+               eval(shift(@thread2));  eval(shift(@thread67));
+               eval(shift(@thread3));  eval(shift(@thread67));
+               eval(shift(@thread4));  eval(shift(@thread67));
+               eval(shift(@thread5));  eval(shift(@thread67));
+       }
+
+       @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+       @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+       @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+       @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
+       @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
+       @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
+       @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread67));
+               eval(shift(@thread1));  eval(shift(@thread67));
+               eval(shift(@thread2));  eval(shift(@thread67));
+               eval(shift(@thread3));  eval(shift(@thread67));
+               eval(shift(@thread4));  eval(shift(@thread67));
+               eval(shift(@thread5));  eval(shift(@thread67));
+       }
+$code.=<<___;
+       cbnz    $ctr,.Loop_upper_neon
+
+       add.32  @x[0],@x[0],@d[0]               // accumulate key block
+       add     @x[1],@x[1],@d[0],lsr#32
+       add.32  @x[2],@x[2],@d[1]
+       add     @x[3],@x[3],@d[1],lsr#32
+       add.32  @x[4],@x[4],@d[2]
+       add     @x[5],@x[5],@d[2],lsr#32
+       add.32  @x[6],@x[6],@d[3]
+       add     @x[7],@x[7],@d[3],lsr#32
+       add.32  @x[8],@x[8],@d[4]
+       add     @x[9],@x[9],@d[4],lsr#32
+       add.32  @x[10],@x[10],@d[5]
+       add     @x[11],@x[11],@d[5],lsr#32
+       add.32  @x[12],@x[12],@d[6]
+       add     @x[13],@x[13],@d[6],lsr#32
+       add.32  @x[14],@x[14],@d[7]
+       add     @x[15],@x[15],@d[7],lsr#32
+
+       add     @x[0],@x[0],@x[1],lsl#32        // pack
+       add     @x[2],@x[2],@x[3],lsl#32
+       ldp     @x[1],@x[3],[$inp,#0]           // load input
+       add     @x[4],@x[4],@x[5],lsl#32
+       add     @x[6],@x[6],@x[7],lsl#32
+       ldp     @x[5],@x[7],[$inp,#16]
+       add     @x[8],@x[8],@x[9],lsl#32
+       add     @x[10],@x[10],@x[11],lsl#32
+       ldp     @x[9],@x[11],[$inp,#32]
+       add     @x[12],@x[12],@x[13],lsl#32
+       add     @x[14],@x[14],@x[15],lsl#32
+       ldp     @x[13],@x[15],[$inp,#48]
+       add     $inp,$inp,#64
+#ifdef __ARMEB__
+       rev     @x[0],@x[0]
+       rev     @x[2],@x[2]
+       rev     @x[4],@x[4]
+       rev     @x[6],@x[6]
+       rev     @x[8],@x[8]
+       rev     @x[10],@x[10]
+       rev     @x[12],@x[12]
+       rev     @x[14],@x[14]
+#endif
+       eor     @x[0],@x[0],@x[1]
+       eor     @x[2],@x[2],@x[3]
+       eor     @x[4],@x[4],@x[5]
+       eor     @x[6],@x[6],@x[7]
+       eor     @x[8],@x[8],@x[9]
+       eor     @x[10],@x[10],@x[11]
+       eor     @x[12],@x[12],@x[13]
+       eor     @x[14],@x[14],@x[15]
+
+        stp    @x[0],@x[2],[$out,#0]           // store output
+        add    @d[6],@d[6],#1                  // increment counter
+       mov.32  @x[0],@d[0]                     // unpack key block
+       lsr     @x[1],@d[0],#32
+        stp    @x[4],@x[6],[$out,#16]
+       mov.32  @x[2],@d[1]
+       lsr     @x[3],@d[1],#32
+        stp    @x[8],@x[10],[$out,#32]
+       mov.32  @x[4],@d[2]
+       lsr     @x[5],@d[2],#32
+        stp    @x[12],@x[14],[$out,#48]
+        add    $out,$out,#64
+       mov.32  @x[6],@d[3]
+       lsr     @x[7],@d[3],#32
+       mov.32  @x[8],@d[4]
+       lsr     @x[9],@d[4],#32
+       mov.32  @x[10],@d[5]
+       lsr     @x[11],@d[5],#32
+       mov.32  @x[12],@d[6]
+       lsr     @x[13],@d[6],#32
+       mov.32  @x[14],@d[7]
+       lsr     @x[15],@d[7],#32
+
+       mov     $ctr,#5
+.Loop_lower_neon:
+       sub     $ctr,$ctr,#1
+___
+       @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+       @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+       @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+       @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
+       @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
+       @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
+       @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread67));
+               eval(shift(@thread1));  eval(shift(@thread67));
+               eval(shift(@thread2));  eval(shift(@thread67));
+               eval(shift(@thread3));  eval(shift(@thread67));
+               eval(shift(@thread4));  eval(shift(@thread67));
+               eval(shift(@thread5));  eval(shift(@thread67));
+       }
+
+       @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+       @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+       @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+       @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
+       @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
+       @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
+       @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+       foreach (@thread0) {
+               eval;                   eval(shift(@thread67));
+               eval(shift(@thread1));  eval(shift(@thread67));
+               eval(shift(@thread2));  eval(shift(@thread67));
+               eval(shift(@thread3));  eval(shift(@thread67));
+               eval(shift(@thread4));  eval(shift(@thread67));
+               eval(shift(@thread5));  eval(shift(@thread67));
+       }
+$code.=<<___;
+       cbnz    $ctr,.Loop_lower_neon
+
+       add.32  @x[0],@x[0],@d[0]               // accumulate key block
+        ldp    @K[0],@K[1],[sp,#0]
+       add     @x[1],@x[1],@d[0],lsr#32
+        ldp    @K[2],@K[3],[sp,#32]
+       add.32  @x[2],@x[2],@d[1]
+        ldp    @K[4],@K[5],[sp,#64]
+       add     @x[3],@x[3],@d[1],lsr#32
+        add    $A0,$A0,@K[0]
+       add.32  @x[4],@x[4],@d[2]
+        add    $A1,$A1,@K[0]
+       add     @x[5],@x[5],@d[2],lsr#32
+        add    $A2,$A2,@K[0]
+       add.32  @x[6],@x[6],@d[3]
+        add    $A3,$A3,@K[0]
+       add     @x[7],@x[7],@d[3],lsr#32
+        add    $A4,$A4,@K[0]
+       add.32  @x[8],@x[8],@d[4]
+        add    $A5,$A5,@K[0]
+       add     @x[9],@x[9],@d[4],lsr#32
+        add    $C0,$C0,@K[2]
+       add.32  @x[10],@x[10],@d[5]
+        add    $C1,$C1,@K[2]
+       add     @x[11],@x[11],@d[5],lsr#32
+        add    $C2,$C2,@K[2]
+       add.32  @x[12],@x[12],@d[6]
+        add    $C3,$C3,@K[2]
+       add     @x[13],@x[13],@d[6],lsr#32
+        add    $C4,$C4,@K[2]
+       add.32  @x[14],@x[14],@d[7]
+        add    $C5,$C5,@K[2]
+       add     @x[15],@x[15],@d[7],lsr#32
+        add    $D4,$D4,$ONE                    // +4
+       add     @x[0],@x[0],@x[1],lsl#32        // pack
+        add    $D5,$D5,$ONE                    // +4
+       add     @x[2],@x[2],@x[3],lsl#32
+        add    $D0,$D0,@K[3]
+       ldp     @x[1],@x[3],[$inp,#0]           // load input
+        add    $D1,$D1,@K[4]
+       add     @x[4],@x[4],@x[5],lsl#32
+        add    $D2,$D2,@K[5]
+       add     @x[6],@x[6],@x[7],lsl#32
+        add    $D3,$D3,@K[6]
+       ldp     @x[5],@x[7],[$inp,#16]
+        add    $D4,$D4,@K[3]
+       add     @x[8],@x[8],@x[9],lsl#32
+        add    $D5,$D5,@K[4]
+       add     @x[10],@x[10],@x[11],lsl#32
+        add    $B0,$B0,@K[1]
+       ldp     @x[9],@x[11],[$inp,#32]
+        add    $B1,$B1,@K[1]
+       add     @x[12],@x[12],@x[13],lsl#32
+        add    $B2,$B2,@K[1]
+       add     @x[14],@x[14],@x[15],lsl#32
+        add    $B3,$B3,@K[1]
+       ldp     @x[13],@x[15],[$inp,#48]
+        add    $B4,$B4,@K[1]
+       add     $inp,$inp,#64
+        add    $B5,$B5,@K[1]
+
+#ifdef __ARMEB__
+       rev     @x[0],@x[0]
+       rev     @x[2],@x[2]
+       rev     @x[4],@x[4]
+       rev     @x[6],@x[6]
+       rev     @x[8],@x[8]
+       rev     @x[10],@x[10]
+       rev     @x[12],@x[12]
+       rev     @x[14],@x[14]
+#endif
+       ld1.8   {$T0-$T3},[$inp],#64
+       eor     @x[0],@x[0],@x[1]
+       eor     @x[2],@x[2],@x[3]
+       eor     @x[4],@x[4],@x[5]
+       eor     @x[6],@x[6],@x[7]
+       eor     @x[8],@x[8],@x[9]
+        eor    $A0,$A0,$T0
+       eor     @x[10],@x[10],@x[11]
+        eor    $B0,$B0,$T1
+       eor     @x[12],@x[12],@x[13]
+        eor    $C0,$C0,$T2
+       eor     @x[14],@x[14],@x[15]
+        eor    $D0,$D0,$T3
+        ld1.8  {$T0-$T3},[$inp],#64
+
+       stp     @x[0],@x[2],[$out,#0]           // store output
+        add    @d[6],@d[6],#7                  // increment counter
+       stp     @x[4],@x[6],[$out,#16]
+       stp     @x[8],@x[10],[$out,#32]
+       stp     @x[12],@x[14],[$out,#48]
+       add     $out,$out,#64
+       st1.8   {$A0-$D0},[$out],#64
+
+       ld1.8   {$A0-$D0},[$inp],#64
+       eor     $A1,$A1,$T0
+       eor     $B1,$B1,$T1
+       eor     $C1,$C1,$T2
+       eor     $D1,$D1,$T3
+       st1.8   {$A1-$D1},[$out],#64
+
+       ld1.8   {$A1-$D1},[$inp],#64
+       eor     $A2,$A2,$A0
+        ldp    @K[0],@K[1],[sp,#0]
+       eor     $B2,$B2,$B0
+        ldp    @K[2],@K[3],[sp,#32]
+       eor     $C2,$C2,$C0
+       eor     $D2,$D2,$D0
+       st1.8   {$A2-$D2},[$out],#64
+
+       ld1.8   {$A2-$D2},[$inp],#64
+       eor     $A3,$A3,$A1
+       eor     $B3,$B3,$B1
+       eor     $C3,$C3,$C1
+       eor     $D3,$D3,$D1
+       st1.8   {$A3-$D3},[$out],#64
+
+       ld1.8   {$A3-$D3},[$inp],#64
+       eor     $A4,$A4,$A2
+       eor     $B4,$B4,$B2
+       eor     $C4,$C4,$C2
+       eor     $D4,$D4,$D2
+       st1.8   {$A4-$D4},[$out],#64
+
+       shl     $A0,$ONE,#1                     // 4 -> 8
+       eor     $A5,$A5,$A3
+       eor     $B5,$B5,$B3
+       eor     $C5,$C5,$C3
+       eor     $D5,$D5,$D3
+       st1.8   {$A5-$D5},[$out],#64
+
+       add     @K[3],@K[3],$A0                 // += 8
+       add     @K[4],@K[4],$A0
+       add     @K[5],@K[5],$A0
+       add     @K[6],@K[6],$A0
+
+       b.hs    .Loop_outer_512_neon
+
+       adds    $len,$len,#512
+       ushr    $A0,$ONE,#2                     // 4 -> 1
+
+       ldp     d8,d9,[sp,#128+0]               // meet ABI requirements
+       ldp     d10,d11,[sp,#128+16]
+       ldp     d12,d13,[sp,#128+32]
+       ldp     d14,d15,[sp,#128+48]
+
+       stp     @K[0],$ONE,[sp,#0]              // wipe off-load area
+       stp     @K[0],$ONE,[sp,#32]
+       stp     @K[0],$ONE,[sp,#64]
+
+       b.eq    .Ldone_512_neon
+
+       cmp     $len,#192
+       sub     @K[3],@K[3],$A0                 // -= 1
+       sub     @K[4],@K[4],$A0
+       sub     @K[5],@K[5],$A0
+       add     sp,sp,#128
+       b.hs    .Loop_outer_neon
+
+       eor     @K[1],@K[1],@K[1]
+       eor     @K[2],@K[2],@K[2]
+       eor     @K[3],@K[3],@K[3]
+       eor     @K[4],@K[4],@K[4]
+       eor     @K[5],@K[5],@K[5]
+       eor     @K[6],@K[6],@K[6]
+       b       .Loop_outer
+
+.Ldone_512_neon:
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#128+64
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#96
+       ret
+.size  ChaCha20_512_neon,.-ChaCha20_512_neon
+___
+}
+}}}
+
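+# Post-process the unified source: instructions carrying a ".32" suffix
+# are rewritten to operate on w registers, byte-wise vector ops (eor,
+# ext, mov and the ld1.8/st1.8 forms) get a ".16b" arrangement instead
+# of ".4s", plain ldr/ldp/str/stp of vector registers switch to
+# q-register names, and rev32.16 becomes rev32 on ".8h" lanes.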
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))   or
+       (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))        or
+       (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))   or
+       (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))   or
+       (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
+
+       #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
+
+       print $_,"\n";
+}
index c8488431890a43d8690909499d92534275206b79..0984a5221c085c08c7b8fb0156df0b90d8695284 100644 (file)
@@ -45,6 +45,9 @@ poly1305-x86_64.s:    asm/poly1305-x86_64.pl
 
 poly1305-%.S:  asm/poly1305-%.pl;      $(PERL) $< $(PERLASM_SCHEME) $@
 
+poly1305-armv4.o:      poly1305-armv4.S
+poly1305-armv8.o:      poly1305-armv8.S
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
new file mode 100755 (executable)
index 0000000..2cce9df
--- /dev/null
@@ -0,0 +1,1216 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#                      IALU(*)/gcc-4.4         NEON
+#
+# ARM11xx(ARMv6)       7.78/+100%              -
+# Cortex-A5            6.30/+130%              2.96
+# Cortex-A8            6.25/+115%              2.36
+# Cortex-A9            5.10/+95%               2.55
+# Cortex-A15           3.79/+85%               1.25(**)
+# Snapdragon S4                5.70/+100%              1.48(**)
+#
+# (*)  this is for -march=armv6, i.e. with a bunch of ldrb instructions
+#      loading the data;
+# (**) these are trade-off results; they can be improved by ~8%, but at
+#      the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
+#      to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax        unified
+.thumb
+#else
+.code  32
+#endif
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type  poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+       stmdb   sp!,{r4-r11}
+
+       eor     r3,r3,r3
+       cmp     $inp,#0
+       str     r3,[$ctx,#0]            @ zero hash value
+       str     r3,[$ctx,#4]
+       str     r3,[$ctx,#8]
+       str     r3,[$ctx,#12]
+       str     r3,[$ctx,#16]
+       str     r3,[$ctx,#36]           @ is_base2_26
+       add     $ctx,$ctx,#20
+
+#ifdef __thumb2__
+       it      eq
+#endif
+       moveq   r0,#0
+       beq     .Lno_key
+
+#if    __ARM_MAX_ARCH__>=7
+       adr     r11,.Lpoly1305_init
+       ldr     r12,.LOPENSSL_armcap
+#endif
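+       @ load the 16-byte key r and clamp it as the Poly1305 spec demands:
+       @ the top 4 bits of bytes 3, 7, 11, 15 and the bottom 2 bits of
+       @ bytes 4, 8, 12 are cleared, i.e.
+       @ r &= 0x0ffffffc0ffffffc0ffffffc0fffffff (r10 and r3 hold the two
+       @ 32-bit masks).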
+       ldrb    r4,[$inp,#0]
+       mov     r10,#0x0fffffff
+       ldrb    r5,[$inp,#1]
+       and     r3,r10,#-4              @ 0x0ffffffc
+       ldrb    r6,[$inp,#2]
+       ldrb    r7,[$inp,#3]
+       orr     r4,r4,r5,lsl#8
+       ldrb    r5,[$inp,#4]
+       orr     r4,r4,r6,lsl#16
+       ldrb    r6,[$inp,#5]
+       orr     r4,r4,r7,lsl#24
+       ldrb    r7,[$inp,#6]
+       and     r4,r4,r10
+
+#if    __ARM_MAX_ARCH__>=7
+       ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
+# ifdef        __APPLE__
+       ldr     r12,[r12]
+# endif
+#endif
+       ldrb    r8,[$inp,#7]
+       orr     r5,r5,r6,lsl#8
+       ldrb    r6,[$inp,#8]
+       orr     r5,r5,r7,lsl#16
+       ldrb    r7,[$inp,#9]
+       orr     r5,r5,r8,lsl#24
+       ldrb    r8,[$inp,#10]
+       and     r5,r5,r3
+
+#if    __ARM_MAX_ARCH__>=7
+       tst     r12,#1                  @ check for NEON
+# ifdef        __APPLE__
+       adr     r9,poly1305_blocks_neon
+       adr     r11,poly1305_blocks
+#  ifdef __thumb2__
+       it      ne
+#  endif
+       movne   r11,r9
+       adr     r12,poly1305_emit
+       adr     r10,poly1305_emit_neon
+#  ifdef __thumb2__
+       it      ne
+#  endif
+       movne   r12,r10
+# else
+#  ifdef __thumb2__
+       itete   eq
+#  endif
+       addeq   r12,r11,#(poly1305_emit-.Lpoly1305_init)
+       addne   r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
+       addeq   r11,r11,#(poly1305_blocks-.Lpoly1305_init)
+       addne   r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
+# endif
+# ifdef        __thumb2__
+       orr     r12,r12,#1      @ thumb-ify address
+       orr     r11,r11,#1
+# endif
+#endif
+       ldrb    r9,[$inp,#11]
+       orr     r6,r6,r7,lsl#8
+       ldrb    r7,[$inp,#12]
+       orr     r6,r6,r8,lsl#16
+       ldrb    r8,[$inp,#13]
+       orr     r6,r6,r9,lsl#24
+       ldrb    r9,[$inp,#14]
+       and     r6,r6,r3
+
+       ldrb    r10,[$inp,#15]
+       orr     r7,r7,r8,lsl#8
+       str     r4,[$ctx,#0]
+       orr     r7,r7,r9,lsl#16
+       str     r5,[$ctx,#4]
+       orr     r7,r7,r10,lsl#24
+       str     r6,[$ctx,#8]
+       and     r7,r7,r3
+       str     r7,[$ctx,#12]
+#if    __ARM_MAX_ARCH__>=7
+       stmia   r2,{r11,r12}            @ fill functions table
+       mov     r0,#1
+#else
+       mov     r0,#0
+#endif
+.Lno_key:
+       ldmia   sp!,{r4-r11}
+#if    __ARM_ARCH__>=5
+       ret                             @ bx    lr
+#else
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type  poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+       stmdb   sp!,{r3-r11,lr}
+
+       ands    $len,$len,#-16
+       beq     .Lno_data
+
+       cmp     $padbit,#0
+       add     $len,$len,$inp          @ end pointer
+       sub     sp,sp,#32
+
+       ldmia   $ctx,{$h0-$r3}          @ load context
+
+       str     $ctx,[sp,#12]           @ offload stuff
+       mov     lr,$inp
+       str     $len,[sp,#16]
+       str     $r1,[sp,#20]
+       str     $r2,[sp,#24]
+       str     $r3,[sp,#28]
+       b       .Loop
+
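+       @ Each iteration absorbs one 16-byte block: the block is added into
+       @ the accumulator h0-h4 (with the 2^128 pad bit going into h4), then
+       @ h is multiplied by r modulo 2^130-5 using 32x32->64 umull/umlal.
+       @ The helpers s1-s3 are r1-r3 scaled by 5/4 (exact, since clamping
+       @ leaves r1-r3 divisible by 4); product limbs of weight 2^128 and up
+       @ are folded back through them, the extra /4 compensating for 2^128
+       @ being a quarter of 2^130, which is congruent to 5.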
+.Loop:
+#if __ARM_ARCH__<7
+       ldrb    r0,[lr],#16             @ load input
+# ifdef        __thumb2__
+       it      hi
+# endif
+       addhi   $h4,$h4,#1              @ 1<<128
+       ldrb    r1,[lr,#-15]
+       ldrb    r2,[lr,#-14]
+       ldrb    r3,[lr,#-13]
+       orr     r1,r0,r1,lsl#8
+       ldrb    r0,[lr,#-12]
+       orr     r2,r1,r2,lsl#16
+       ldrb    r1,[lr,#-11]
+       orr     r3,r2,r3,lsl#24
+       ldrb    r2,[lr,#-10]
+       adds    $h0,$h0,r3              @ accumulate input
+
+       ldrb    r3,[lr,#-9]
+       orr     r1,r0,r1,lsl#8
+       ldrb    r0,[lr,#-8]
+       orr     r2,r1,r2,lsl#16
+       ldrb    r1,[lr,#-7]
+       orr     r3,r2,r3,lsl#24
+       ldrb    r2,[lr,#-6]
+       adcs    $h1,$h1,r3
+
+       ldrb    r3,[lr,#-5]
+       orr     r1,r0,r1,lsl#8
+       ldrb    r0,[lr,#-4]
+       orr     r2,r1,r2,lsl#16
+       ldrb    r1,[lr,#-3]
+       orr     r3,r2,r3,lsl#24
+       ldrb    r2,[lr,#-2]
+       adcs    $h2,$h2,r3
+
+       ldrb    r3,[lr,#-1]
+       orr     r1,r0,r1,lsl#8
+       str     lr,[sp,#8]              @ offload input pointer
+       orr     r2,r1,r2,lsl#16
+       add     $s1,$r1,$r1,lsr#2
+       orr     r3,r2,r3,lsl#24
+#else
+       ldr     r0,[lr],#16             @ load input
+# ifdef        __thumb2__
+       it      hi
+# endif
+       addhi   $h4,$h4,#1              @ padbit
+       ldr     r1,[lr,#-12]
+       ldr     r2,[lr,#-8]
+       ldr     r3,[lr,#-4]
+# ifdef        __ARMEB__
+       rev     r0,r0
+       rev     r1,r1
+       rev     r2,r2
+       rev     r3,r3
+# endif
+       adds    $h0,$h0,r0              @ accumulate input
+       str     lr,[sp,#8]              @ offload input pointer
+       adcs    $h1,$h1,r1
+       add     $s1,$r1,$r1,lsr#2
+       adcs    $h2,$h2,r2
+#endif
+       add     $s2,$r2,$r2,lsr#2
+       adcs    $h3,$h3,r3
+       add     $s3,$r3,$r3,lsr#2
+
+       umull   r2,r3,$h1,$r0
+        adc    $h4,$h4,#0
+       umull   r0,r1,$h0,$r0
+       umlal   r2,r3,$h4,$s1
+       umlal   r0,r1,$h3,$s1
+       ldr     $r1,[sp,#20]            @ reload $r1
+       umlal   r2,r3,$h2,$s3
+       umlal   r0,r1,$h1,$s3
+       umlal   r2,r3,$h3,$s2
+       umlal   r0,r1,$h2,$s2
+       umlal   r2,r3,$h0,$r1
+       str     r0,[sp,#0]              @ future $h0
+        mul    r0,$s2,$h4
+       ldr     $r2,[sp,#24]            @ reload $r2
+       adds    r2,r2,r1                @ d1+=d0>>32
+        eor    r1,r1,r1
+       adc     lr,r3,#0                @ future $h2
+       str     r2,[sp,#4]              @ future $h1
+
+       mul     r2,$s3,$h4
+       eor     r3,r3,r3
+       umlal   r0,r1,$h3,$s3
+       ldr     $r3,[sp,#28]            @ reload $r3
+       umlal   r2,r3,$h3,$r0
+       umlal   r0,r1,$h2,$r0
+       umlal   r2,r3,$h2,$r1
+       umlal   r0,r1,$h1,$r1
+       umlal   r2,r3,$h1,$r2
+       umlal   r0,r1,$h0,$r2
+       umlal   r2,r3,$h0,$r3
+       ldr     $h0,[sp,#0]
+       mul     $h4,$r0,$h4
+       ldr     $h1,[sp,#4]
+
+       adds    $h2,lr,r0               @ d2+=d1>>32
+       ldr     lr,[sp,#8]              @ reload input pointer
+       adc     r1,r1,#0
+       adds    $h3,r2,r1               @ d3+=d2>>32
+       ldr     r0,[sp,#16]             @ reload end pointer
+       adc     r3,r3,#0
+       add     $h4,$h4,r3              @ h4+=d3>>32
+
+       and     r1,$h4,#-4
+       and     $h4,$h4,#3
+       add     r1,r1,r1,lsr#2          @ *=5
+       adds    $h0,$h0,r1
+       adcs    $h1,$h1,#0
+       adcs    $h2,$h2,#0
+       adc     $h3,$h3,#0
+
+       cmp     r0,lr                   @ done yet?
+       bhi     .Loop
+
+       ldr     $ctx,[sp,#12]
+       add     sp,sp,#32
+       stmia   $ctx,{$h0-$h4}          @ store the result
+
+.Lno_data:
+#if    __ARM_ARCH__>=5
+       ldmia   sp!,{r3-r11,pc}
+#else
+       ldmia   sp!,{r3-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$h4;
+
+$code.=<<___;
+.type  poly1305_emit,%function
+.align 5
+poly1305_emit:
+       stmdb   sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+
+       ldmia   $ctx,{$h0-$h4}
+       adds    $g0,$h0,#5              @ compare to modulus
+       adcs    $g1,$h1,#0
+       adcs    $g2,$h2,#0
+       adcs    $g3,$h3,#0
+       adc     $g4,$h4,#0
+       tst     $g4,#4                  @ did it carry/borrow?
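+       @ adding 5 makes bit 130 (bit 2 of g4) come out set exactly when
+       @ h >= 2^130-5, so that bit selects between h and h+5; dropping g4
+       @ then yields h mod 2^130-5 truncated to the 128 bits that are
+       @ carried into the tag.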
+
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h0,$g0
+       ldr     $g0,[$nonce,#0]
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h1,$g1
+       ldr     $g1,[$nonce,#4]
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h2,$g2
+       ldr     $g2,[$nonce,#8]
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h3,$g3
+       ldr     $g3,[$nonce,#12]
+
+       adds    $h0,$h0,$g0
+       adcs    $h1,$h1,$g1
+       adcs    $h2,$h2,$g2
+       adc     $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+       rev     $h0,$h0
+       rev     $h1,$h1
+       rev     $h2,$h2
+       rev     $h3,$h3
+# endif
+       str     $h0,[$mac,#0]
+       str     $h1,[$mac,#4]
+       str     $h2,[$mac,#8]
+       str     $h3,[$mac,#12]
+#else
+       strb    $h0,[$mac,#0]
+       mov     $h0,$h0,lsr#8
+       strb    $h1,[$mac,#4]
+       mov     $h1,$h1,lsr#8
+       strb    $h2,[$mac,#8]
+       mov     $h2,$h2,lsr#8
+       strb    $h3,[$mac,#12]
+       mov     $h3,$h3,lsr#8
+
+       strb    $h0,[$mac,#1]
+       mov     $h0,$h0,lsr#8
+       strb    $h1,[$mac,#5]
+       mov     $h1,$h1,lsr#8
+       strb    $h2,[$mac,#9]
+       mov     $h2,$h2,lsr#8
+       strb    $h3,[$mac,#13]
+       mov     $h3,$h3,lsr#8
+
+       strb    $h0,[$mac,#2]
+       mov     $h0,$h0,lsr#8
+       strb    $h1,[$mac,#6]
+       mov     $h1,$h1,lsr#8
+       strb    $h2,[$mac,#10]
+       mov     $h2,$h2,lsr#8
+       strb    $h3,[$mac,#14]
+       mov     $h3,$h3,lsr#8
+
+       strb    $h0,[$mac,#3]
+       strb    $h1,[$mac,#7]
+       strb    $h2,[$mac,#11]
+       strb    $h3,[$mac,#15]
+#endif
+       ldmia   sp!,{r4-r11}
+#if    __ARM_ARCH__>=5
+       ret                             @ bx    lr
+#else
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if    __ARM_MAX_ARCH__>=7
+.fpu   neon
+
+.type  poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+       ldr     r4,[$ctx,#20]           @ load key base 2^32
+       ldr     r5,[$ctx,#24]
+       ldr     r6,[$ctx,#28]
+       ldr     r7,[$ctx,#32]
+
+       and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
+       mov     r3,r4,lsr#26
+       mov     r4,r5,lsr#20
+       orr     r3,r3,r5,lsl#6
+       mov     r5,r6,lsr#14
+       orr     r4,r4,r6,lsl#12
+       mov     r6,r7,lsr#8
+       orr     r5,r5,r7,lsl#18
+       and     r3,r3,#0x03ffffff
+       and     r4,r4,#0x03ffffff
+       and     r5,r5,#0x03ffffff
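+       @ i.e. the 128-bit r is split into five 26-bit limbs: with k0-k3
+       @ the four 32-bit key words, r0 = k0 & 0x3ffffff,
+       @ r1 = (k0>>26 | k1<<6) & 0x3ffffff, r2 = (k1>>20 | k2<<12) & 0x3ffffff,
+       @ r3 = (k2>>14 | k3<<18) & 0x3ffffff and r4 = k3>>8 (at most 24 bits,
+       @ so no mask is needed).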
+
+       vdup.32 $R0,r2                  @ r^1 in both lanes
+       add     r2,r3,r3,lsl#2          @ *5
+       vdup.32 $R1,r3
+       add     r3,r4,r4,lsl#2
+       vdup.32 $S1,r2
+       vdup.32 $R2,r4
+       add     r4,r5,r5,lsl#2
+       vdup.32 $S2,r3
+       vdup.32 $R3,r5
+       add     r5,r6,r6,lsl#2
+       vdup.32 $S3,r4
+       vdup.32 $R4,r6
+       vdup.32 $S4,r5
+
+       mov     $zeros,#2               @ counter
+
+.Lsquare_neon:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+       @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
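+       @ (hN and rN are the 26-bit limbs, h = h0 + h1*2^26 + .. + h4*2^104;
+       @ the 5*r terms come from 2^130 being congruent to 5 mod 2^130-5,
+       @ and are precomputed below as sN = rN + 4*rN, stored interleaved
+       @ with rN)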
+
+       vmull.u32       $D0,$R0,${R0}[1]
+       vmull.u32       $D1,$R1,${R0}[1]
+       vmull.u32       $D2,$R2,${R0}[1]
+       vmull.u32       $D3,$R3,${R0}[1]
+       vmull.u32       $D4,$R4,${R0}[1]
+
+       vmlal.u32       $D0,$R4,${S1}[1]
+       vmlal.u32       $D1,$R0,${R1}[1]
+       vmlal.u32       $D2,$R1,${R1}[1]
+       vmlal.u32       $D3,$R2,${R1}[1]
+       vmlal.u32       $D4,$R3,${R1}[1]
+
+       vmlal.u32       $D0,$R3,${S2}[1]
+       vmlal.u32       $D1,$R4,${S2}[1]
+       vmlal.u32       $D3,$R1,${R2}[1]
+       vmlal.u32       $D2,$R0,${R2}[1]
+       vmlal.u32       $D4,$R2,${R2}[1]
+
+       vmlal.u32       $D0,$R2,${S3}[1]
+       vmlal.u32       $D3,$R0,${R3}[1]
+       vmlal.u32       $D1,$R3,${S3}[1]
+       vmlal.u32       $D2,$R4,${S3}[1]
+       vmlal.u32       $D4,$R1,${R3}[1]
+
+       vmlal.u32       $D3,$R4,${S4}[1]
+       vmlal.u32       $D0,$R1,${S4}[1]
+       vmlal.u32       $D1,$R2,${S4}[1]
+       vmlal.u32       $D2,$R3,${S4}[1]
+       vmlal.u32       $D4,$R0,${R4}[1]
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+       @ and P. Schwabe
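+       @ i.e. carries are only partially propagated: two interleaved
+       @ chains, h3->h4->h0->h1 and h0->h1->h2->h3->h4, bring each limb
+       @ back to roughly 26 bits, which is all the next batch of
+       @ vmull/vmlal accumulation needs to stay clear of 64-bit overflow;
+       @ the carry out of h4 is multiplied by 5 (add plus shift-by-2 add)
+       @ as it wraps around into h0.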
+
+       vshr.u64        $T0,$D3,#26
+       vmovn.i64       $D3#lo,$D3
+        vshr.u64       $T1,$D0,#26
+        vmovn.i64      $D0#lo,$D0
+       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
+       vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
+        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
+        vbic.i32       $D0#lo,#0xfc000000
+
+       vshrn.u64       $T0#lo,$D4,#26
+       vmovn.i64       $D4#lo,$D4
+        vshr.u64       $T1,$D1,#26
+        vmovn.i64      $D1#lo,$D1
+        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
+       vbic.i32        $D4#lo,#0xfc000000
+        vbic.i32       $D1#lo,#0xfc000000
+
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo
+       vshl.u32        $T0#lo,$T0#lo,#2
+        vshrn.u64      $T1#lo,$D2,#26
+        vmovn.i64      $D2#lo,$D2
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
+        vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
+        vbic.i32       $D2#lo,#0xfc000000
+
+       vshr.u32        $T0#lo,$D0#lo,#26
+       vbic.i32        $D0#lo,#0xfc000000
+        vshr.u32       $T1#lo,$D3#lo,#26
+        vbic.i32       $D3#lo,#0xfc000000
+       vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
+        vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
+
+       subs            $zeros,$zeros,#1
+       beq             .Lsquare_break_neon
+
+       add             $tbl0,$ctx,#(48+0*9*4)
+       add             $tbl1,$ctx,#(48+1*9*4)
+
+       vtrn.32         $R0,$D0#lo              @ r^2:r^1
+       vtrn.32         $R2,$D2#lo
+       vtrn.32         $R3,$D3#lo
+       vtrn.32         $R1,$D1#lo
+       vtrn.32         $R4,$D4#lo
+
+       vshl.u32        $S2,$R2,#2              @ *5
+       vshl.u32        $S3,$R3,#2
+       vshl.u32        $S1,$R1,#2
+       vshl.u32        $S4,$R4,#2
+       vadd.i32        $S2,$S2,$R2
+       vadd.i32        $S1,$S1,$R1
+       vadd.i32        $S3,$S3,$R3
+       vadd.i32        $S4,$S4,$R4
+
+       vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+       vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+       vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vst1.32         {${S4}[0]},[$tbl0,:32]
+       vst1.32         {${S4}[1]},[$tbl1,:32]
+
+       b               .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+       add             $tbl0,$ctx,#(48+2*4*9)
+       add             $tbl1,$ctx,#(48+3*4*9)
+
+       vmov            $R0,$D0#lo              @ r^4:r^3
+       vshl.u32        $S1,$D1#lo,#2           @ *5
+       vmov            $R1,$D1#lo
+       vshl.u32        $S2,$D2#lo,#2
+       vmov            $R2,$D2#lo
+       vshl.u32        $S3,$D3#lo,#2
+       vmov            $R3,$D3#lo
+       vshl.u32        $S4,$D4#lo,#2
+       vmov            $R4,$D4#lo
+       vadd.i32        $S1,$S1,$D1#lo
+       vadd.i32        $S2,$S2,$D2#lo
+       vadd.i32        $S3,$S3,$D3#lo
+       vadd.i32        $S4,$S4,$D4#lo
+
+       vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+       vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+       vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vst1.32         {${S4}[0]},[$tbl0]
+       vst1.32         {${S4}[1]},[$tbl1]
+
+       ret                             @ bx    lr
+.size  poly1305_init_neon,.-poly1305_init_neon
+
+.type  poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+       ldr     ip,[$ctx,#36]           @ is_base2_26
+       ands    $len,$len,#-16
+       beq     .Lno_data_neon
+
+       cmp     $len,#64
+       bhs     .Lenter_neon
+       tst     ip,ip                   @ is_base2_26?
+       beq     poly1305_blocks
+
+.Lenter_neon:
+       stmdb   sp!,{r4-r7}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+
+       tst     ip,ip                   @ is_base2_26?
+       bne     .Lbase2_26_neon
+
+       stmdb   sp!,{r1-r3,lr}
+       bl      poly1305_init_neon
+
+       ldr     r4,[$ctx,#0]            @ load hash value base 2^32
+       ldr     r5,[$ctx,#4]
+       ldr     r6,[$ctx,#8]
+       ldr     r7,[$ctx,#12]
+       ldr     ip,[$ctx,#16]
+
+       and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
+       mov     r3,r4,lsr#26
+        veor   $D0#lo,$D0#lo,$D0#lo
+       mov     r4,r5,lsr#20
+       orr     r3,r3,r5,lsl#6
+        veor   $D1#lo,$D1#lo,$D1#lo
+       mov     r5,r6,lsr#14
+       orr     r4,r4,r6,lsl#12
+        veor   $D2#lo,$D2#lo,$D2#lo
+       mov     r6,r7,lsr#8
+       orr     r5,r5,r7,lsl#18
+        veor   $D3#lo,$D3#lo,$D3#lo
+       and     r3,r3,#0x03ffffff
+       orr     r6,r6,ip,lsl#24
+        veor   $D4#lo,$D4#lo,$D4#lo
+       and     r4,r4,#0x03ffffff
+       mov     r1,#1
+       and     r5,r5,#0x03ffffff
+       str     r1,[$ctx,#36]           @ is_base2_26
+
+       vmov.32 $D0#lo[0],r2
+       vmov.32 $D1#lo[0],r3
+       vmov.32 $D2#lo[0],r4
+       vmov.32 $D3#lo[0],r5
+       vmov.32 $D4#lo[0],r6
+       adr     $zeros,.Lzeros
+
+       ldmia   sp!,{r1-r3,lr}
+       b       .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ load hash value
+
+       veor            $D0#lo,$D0#lo,$D0#lo
+       veor            $D1#lo,$D1#lo,$D1#lo
+       veor            $D2#lo,$D2#lo,$D2#lo
+       veor            $D3#lo,$D3#lo,$D3#lo
+       veor            $D4#lo,$D4#lo,$D4#lo
+       vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+       adr             $zeros,.Lzeros
+       vld1.32         {$D4#lo[0]},[$ctx]
+       sub             $ctx,$ctx,#16           @ rewind
+
+.Lbase2_32_neon:
+       add             $in2,$inp,#32
+       mov             $padbit,$padbit,lsl#24
+       tst             $len,#31
+       beq             .Leven
+
+       vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+       vmov.32         $H4#lo[0],$padbit
+       sub             $len,$len,#16
+       add             $in2,$inp,#32
+
+# ifdef        __ARMEB__
+       vrev32.8        $H0,$H0
+       vrev32.8        $H3,$H3
+       vrev32.8        $H1,$H1
+       vrev32.8        $H2,$H2
+# endif
+       vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
+       vshl.u32        $H3#lo,$H3#lo,#18
+
+       vsri.u32        $H3#lo,$H2#lo,#14
+       vshl.u32        $H2#lo,$H2#lo,#12
+       vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi
+
+       vbic.i32        $H3#lo,#0xfc000000
+       vsri.u32        $H2#lo,$H1#lo,#20
+       vshl.u32        $H1#lo,$H1#lo,#6
+
+       vbic.i32        $H2#lo,#0xfc000000
+       vsri.u32        $H1#lo,$H0#lo,#26
+       vadd.i32        $H3#hi,$H3#lo,$D3#lo
+
+       vbic.i32        $H0#lo,#0xfc000000
+       vbic.i32        $H1#lo,#0xfc000000
+       vadd.i32        $H2#hi,$H2#lo,$D2#lo
+
+       vadd.i32        $H0#hi,$H0#lo,$D0#lo
+       vadd.i32        $H1#hi,$H1#lo,$D1#lo
+
+       mov             $tbl1,$zeros
+       add             $tbl0,$ctx,#48
+
+       cmp             $len,$len
+       b               .Long_tail
+
+.align 4
+.Leven:
+       subs            $len,$len,#64
+# ifdef        __thumb2__
+       it              lo
+# endif
+       movlo           $in2,$zeros
+
+       vmov.i32        $H4,#1<<24              @ padbit, yes, always
+       vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
+       add             $inp,$inp,#64
+       vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
+       add             $in2,$in2,#64
+# ifdef        __thumb2__
+       itt             hi
+# endif
+       addhi           $tbl1,$ctx,#(48+1*9*4)
+       addhi           $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef        __ARMEB__
+       vrev32.8        $H0,$H0
+       vrev32.8        $H3,$H3
+       vrev32.8        $H1,$H1
+       vrev32.8        $H2,$H2
+# endif
+       vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
+       vshl.u32        $H3,$H3,#18
+
+       vsri.u32        $H3,$H2,#14
+       vshl.u32        $H2,$H2,#12
+
+       vbic.i32        $H3,#0xfc000000
+       vsri.u32        $H2,$H1,#20
+       vshl.u32        $H1,$H1,#6
+
+       vbic.i32        $H2,#0xfc000000
+       vsri.u32        $H1,$H0,#26
+
+       vbic.i32        $H0,#0xfc000000
+       vbic.i32        $H1,#0xfc000000
+
+       bls             .Lskip_loop
+
+       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
+       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
+       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       b               .Loop_neon
+
+.align 5
+.Loop_neon:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+       @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+       @   \___________________/
+       @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+       @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+       @   \___________________/ \____________________/
+       @
+       @ Note that we start with inp[2:3]*r^2, because it does not depend
+       @ on the reduction performed in the previous iteration.
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
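+       @ each Hn vector holds limb n of four consecutive blocks: hash plus
+       @ inp[0:1] sits in the low d-halves and is multiplied by the r^4
+       @ table lane, inp[2:3] sits in the high halves and is multiplied by
+       @ the r^2 lane, so 64 bytes of input are absorbed per pass.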
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ inp[2:3]*r^2
+
+       vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
+       vmull.u32       $D2,$H2#hi,${R0}[1]
+       vadd.i32        $H0#lo,$H0#lo,$D0#lo
+       vmull.u32       $D0,$H0#hi,${R0}[1]
+       vadd.i32        $H3#lo,$H3#lo,$D3#lo
+       vmull.u32       $D3,$H3#hi,${R0}[1]
+       vmlal.u32       $D2,$H1#hi,${R1}[1]
+       vadd.i32        $H1#lo,$H1#lo,$D1#lo
+       vmull.u32       $D1,$H1#hi,${R0}[1]
+
+       vadd.i32        $H4#lo,$H4#lo,$D4#lo
+       vmull.u32       $D4,$H4#hi,${R0}[1]
+       subs            $len,$len,#64
+       vmlal.u32       $D0,$H4#hi,${S1}[1]
+# ifdef        __thumb2__
+       it              lo
+# endif
+       movlo           $in2,$zeros
+       vmlal.u32       $D3,$H2#hi,${R1}[1]
+       vld1.32         ${S4}[1],[$tbl1,:32]
+       vmlal.u32       $D1,$H0#hi,${R1}[1]
+       vmlal.u32       $D4,$H3#hi,${R1}[1]
+
+       vmlal.u32       $D0,$H3#hi,${S2}[1]
+       vmlal.u32       $D3,$H1#hi,${R2}[1]
+       vmlal.u32       $D4,$H2#hi,${R2}[1]
+       vmlal.u32       $D1,$H4#hi,${S2}[1]
+       vmlal.u32       $D2,$H0#hi,${R2}[1]
+
+       vmlal.u32       $D3,$H0#hi,${R3}[1]
+       vmlal.u32       $D0,$H2#hi,${S3}[1]
+       vmlal.u32       $D4,$H1#hi,${R3}[1]
+       vmlal.u32       $D1,$H3#hi,${S3}[1]
+       vmlal.u32       $D2,$H4#hi,${S3}[1]
+
+       vmlal.u32       $D3,$H4#hi,${S4}[1]
+       vmlal.u32       $D0,$H1#hi,${S4}[1]
+       vmlal.u32       $D4,$H0#hi,${R4}[1]
+       vmlal.u32       $D1,$H2#hi,${S4}[1]
+       vmlal.u32       $D2,$H3#hi,${S4}[1]
+
+       vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
+       add             $in2,$in2,#64
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ (hash+inp[0:1])*r^4 and accumulate
+
+       vmlal.u32       $D3,$H3#lo,${R0}[0]
+       vmlal.u32       $D0,$H0#lo,${R0}[0]
+       vmlal.u32       $D4,$H4#lo,${R0}[0]
+       vmlal.u32       $D1,$H1#lo,${R0}[0]
+       vmlal.u32       $D2,$H2#lo,${R0}[0]
+       vld1.32         ${S4}[0],[$tbl0,:32]
+
+       vmlal.u32       $D3,$H2#lo,${R1}[0]
+       vmlal.u32       $D0,$H4#lo,${S1}[0]
+       vmlal.u32       $D4,$H3#lo,${R1}[0]
+       vmlal.u32       $D1,$H0#lo,${R1}[0]
+       vmlal.u32       $D2,$H1#lo,${R1}[0]
+
+       vmlal.u32       $D3,$H1#lo,${R2}[0]
+       vmlal.u32       $D0,$H3#lo,${S2}[0]
+       vmlal.u32       $D4,$H2#lo,${R2}[0]
+       vmlal.u32       $D1,$H4#lo,${S2}[0]
+       vmlal.u32       $D2,$H0#lo,${R2}[0]
+
+       vmlal.u32       $D3,$H0#lo,${R3}[0]
+       vmlal.u32       $D0,$H2#lo,${S3}[0]
+       vmlal.u32       $D4,$H1#lo,${R3}[0]
+       vmlal.u32       $D1,$H3#lo,${S3}[0]
+       vmlal.u32       $D3,$H4#lo,${S4}[0]
+
+       vmlal.u32       $D2,$H4#lo,${S3}[0]
+       vmlal.u32       $D0,$H1#lo,${S4}[0]
+       vmlal.u32       $D4,$H0#lo,${R4}[0]
+       vmov.i32        $H4,#1<<24              @ padbit, yes, always
+       vmlal.u32       $D1,$H2#lo,${S4}[0]
+       vmlal.u32       $D2,$H3#lo,${S4}[0]
+
+       vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
+       add             $inp,$inp,#64
+# ifdef        __ARMEB__
+       vrev32.8        $H0,$H0
+       vrev32.8        $H1,$H1
+       vrev32.8        $H2,$H2
+       vrev32.8        $H3,$H3
+# endif
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ lazy reduction interleaved with base 2^32 -> base 2^26
+
+       vshr.u64        $T0,$D3,#26
+       vmovn.i64       $D3#lo,$D3
+        vshr.u64       $T1,$D0,#26
+        vmovn.i64      $D0#lo,$D0
+       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
+       vbic.i32        $D3#lo,#0xfc000000
+         vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
+        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
+         vshl.u32      $H3,$H3,#18
+        vbic.i32       $D0#lo,#0xfc000000
+
+       vshrn.u64       $T0#lo,$D4,#26
+       vmovn.i64       $D4#lo,$D4
+        vshr.u64       $T1,$D1,#26
+        vmovn.i64      $D1#lo,$D1
+        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
+         vsri.u32      $H3,$H2,#14
+       vbic.i32        $D4#lo,#0xfc000000
+         vshl.u32      $H2,$H2,#12
+        vbic.i32       $D1#lo,#0xfc000000
+
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo
+       vshl.u32        $T0#lo,$T0#lo,#2
+         vbic.i32      $H3,#0xfc000000
+        vshrn.u64      $T1#lo,$D2,#26
+        vmovn.i64      $D2#lo,$D2
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
+         vsri.u32      $H2,$H1,#20
+        vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
+         vshl.u32      $H1,$H1,#6
+        vbic.i32       $D2#lo,#0xfc000000
+         vbic.i32      $H2,#0xfc000000
+
+       vshr.u32        $T0#lo,$D0#lo,#26
+       vbic.i32        $D0#lo,#0xfc000000
+         vsri.u32      $H1,$H0,#26
+         vbic.i32      $H0,#0xfc000000
+        vshr.u32       $T1#lo,$D3#lo,#26
+        vbic.i32       $D3#lo,#0xfc000000
+       vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
+        vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
+         vbic.i32      $H1,#0xfc000000
+
+       bhi             .Loop_neon
+
+.Lskip_loop:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+       add             $tbl1,$ctx,#(48+0*9*4)
+       add             $tbl0,$ctx,#(48+1*9*4)
+       adds            $len,$len,#32
+# ifdef        __thumb2__
+       it              ne
+# endif
+       movne           $len,#0
+       bne             .Long_tail
+
+       vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
+       vadd.i32        $H0#hi,$H0#lo,$D0#lo
+       vadd.i32        $H3#hi,$H3#lo,$D3#lo
+       vadd.i32        $H1#hi,$H1#lo,$D1#lo
+       vadd.i32        $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
+       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2
+
+       vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
+       vmull.u32       $D2,$H2#hi,$R0
+       vadd.i32        $H0#lo,$H0#lo,$D0#lo
+       vmull.u32       $D0,$H0#hi,$R0
+       vadd.i32        $H3#lo,$H3#lo,$D3#lo
+       vmull.u32       $D3,$H3#hi,$R0
+       vadd.i32        $H1#lo,$H1#lo,$D1#lo
+       vmull.u32       $D1,$H1#hi,$R0
+       vadd.i32        $H4#lo,$H4#lo,$D4#lo
+       vmull.u32       $D4,$H4#hi,$R0
+
+       vmlal.u32       $D0,$H4#hi,$S1
+       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vmlal.u32       $D3,$H2#hi,$R1
+       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vmlal.u32       $D1,$H0#hi,$R1
+       vmlal.u32       $D4,$H3#hi,$R1
+       vmlal.u32       $D2,$H1#hi,$R1
+
+       vmlal.u32       $D3,$H1#hi,$R2
+       vld1.32         ${S4}[1],[$tbl1,:32]
+       vmlal.u32       $D0,$H3#hi,$S2
+       vld1.32         ${S4}[0],[$tbl0,:32]
+       vmlal.u32       $D4,$H2#hi,$R2
+       vmlal.u32       $D1,$H4#hi,$S2
+       vmlal.u32       $D2,$H0#hi,$R2
+
+       vmlal.u32       $D3,$H0#hi,$R3
+# ifdef        __thumb2__
+       it              ne
+# endif
+        addne          $tbl1,$ctx,#(48+2*9*4)
+       vmlal.u32       $D0,$H2#hi,$S3
+# ifdef        __thumb2__
+       it              ne
+# endif
+        addne          $tbl0,$ctx,#(48+3*9*4)
+       vmlal.u32       $D4,$H1#hi,$R3
+       vmlal.u32       $D1,$H3#hi,$S3
+       vmlal.u32       $D2,$H4#hi,$S3
+
+       vmlal.u32       $D3,$H4#hi,$S4
+        vmov.u64       $MASK,#-1               @ can be redundant
+       vmlal.u32       $D0,$H1#hi,$S4
+        vshr.u64       $MASK,$MASK,#38
+       vmlal.u32       $D4,$H0#hi,$R4
+       vmlal.u32       $D1,$H2#hi,$S4
+       vmlal.u32       $D2,$H3#hi,$S4
+
+       beq             .Lshort_tail
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
+       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
+
+       vmlal.u32       $D2,$H2#lo,$R0
+       vmlal.u32       $D0,$H0#lo,$R0
+       vmlal.u32       $D3,$H3#lo,$R0
+       vmlal.u32       $D1,$H1#lo,$R0
+       vmlal.u32       $D4,$H4#lo,$R0
+
+       vmlal.u32       $D0,$H4#lo,$S1
+       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vmlal.u32       $D3,$H2#lo,$R1
+       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vmlal.u32       $D1,$H0#lo,$R1
+       vmlal.u32       $D4,$H3#lo,$R1
+       vmlal.u32       $D2,$H1#lo,$R1
+
+       vmlal.u32       $D3,$H1#lo,$R2
+       vld1.32         ${S4}[1],[$tbl1,:32]
+       vmlal.u32       $D0,$H3#lo,$S2
+       vld1.32         ${S4}[0],[$tbl0,:32]
+       vmlal.u32       $D4,$H2#lo,$R2
+       vmlal.u32       $D1,$H4#lo,$S2
+       vmlal.u32       $D2,$H0#lo,$R2
+
+       vmlal.u32       $D3,$H0#lo,$R3
+       vmlal.u32       $D0,$H2#lo,$S3
+       vmlal.u32       $D4,$H1#lo,$R3
+       vmlal.u32       $D1,$H3#lo,$S3
+       vmlal.u32       $D2,$H4#lo,$S3
+
+       vmlal.u32       $D3,$H4#lo,$S4
+        vmov.u64       $MASK,#-1
+       vmlal.u32       $D0,$H1#lo,$S4
+        vshr.u64       $MASK,$MASK,#38
+       vmlal.u32       $D4,$H0#lo,$R4
+       vmlal.u32       $D1,$H2#lo,$S4
+       vmlal.u32       $D2,$H3#lo,$S4
+
+.Lshort_tail:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ lazy reduction, but without narrowing
+
+       vshr.u64        $T0,$D3,#26
+       vand.i64        $D3,$D3,$MASK
+        vshr.u64       $T1,$D0,#26
+        vand.i64       $D0,$D0,$MASK
+       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
+        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
+
+       vshr.u64        $T0,$D4,#26
+       vand.i64        $D4,$D4,$MASK
+        vshr.u64       $T1,$D1,#26
+        vand.i64       $D1,$D1,$MASK
+        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
+
+       vadd.i64        $D0,$D0,$T0
+       vshl.u64        $T0,$T0,#2
+        vshr.u64       $T1,$D2,#26
+        vand.i64       $D2,$D2,$MASK
+       vadd.i64        $D0,$D0,$T0             @ h4 -> h0
+        vadd.i64       $D3,$D3,$T1             @ h2 -> h3
+
+       vshr.u64        $T0,$D0,#26
+       vand.i64        $D0,$D0,$MASK
+        vshr.u64       $T1,$D3,#26
+        vand.i64       $D3,$D3,$MASK
+       vadd.i64        $D1,$D1,$T0             @ h0 -> h1
+        vadd.i64       $D4,$D4,$T1             @ h3 -> h4
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ horizontal addition
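+	@ each 128-bit accumulator holds two partial sums, one per 64-bit
+	@ half; adding the halves folds the parallel streams back into a
+	@ single set of limbs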
+
+       vadd.i64        $D2#lo,$D2#lo,$D2#hi
+       vadd.i64        $D0#lo,$D0#lo,$D0#hi
+       vadd.i64        $D3#lo,$D3#lo,$D3#hi
+       vadd.i64        $D1#lo,$D1#lo,$D1#hi
+       vadd.i64        $D4#lo,$D4#lo,$D4#hi
+
+       cmp             $len,#0
+       bne             .Leven
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ store hash value
+
+       vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+       vst1.32         {$D4#lo[0]},[$ctx]
+
+       vldmia  sp!,{d8-d15}                    @ epilogue
+       ldmia   sp!,{r4-r7}
+.Lno_data_neon:
+       ret                                     @ bx    lr
+.size  poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type  poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+       ldr     ip,[$ctx,#36]           @ is_base2_26
+
+       stmdb   sp!,{r4-r11}
+
+       tst     ip,ip
+       beq     .Lpoly1305_emit_enter
+
+       ldmia   $ctx,{$h0-$h4}
+       eor     $g0,$g0,$g0
+
+       adds    $h0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
+       mov     $h1,$h1,lsr#6
+       adcs    $h1,$h1,$h2,lsl#20
+       mov     $h2,$h2,lsr#12
+       adcs    $h2,$h2,$h3,lsl#14
+       mov     $h3,$h3,lsr#18
+       adcs    $h3,$h3,$h4,lsl#8
+       adc     $h4,$g0,$h4,lsr#24      @ can be partially reduced ...
+
+       and     $g0,$h4,#-4             @ ... so reduce
+	and	$h4,$h4,#3
+       add     $g0,$g0,$g0,lsr#2       @ *= 5
+       adds    $h0,$h0,$g0
+       adcs    $h1,$h1,#0
+       adcs    $h2,$h2,#0
+       adc     $h3,$h3,#0
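+	@ the bits at 2^130 and above form a multiple of 2^130, which the
+	@ reduction maps to a multiple of 5; $g0 is a multiple of 4, so
+	@ $g0+($g0>>2) = 5*($g0/4) is what was just added back into h0..h3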
+
+       adds    $g0,$h0,#5              @ compare to modulus
+       adcs    $g1,$h1,#0
+       adcs    $g2,$h2,#0
+       adcs    $g3,$h3,#0
+       adc     $g4,$h4,#0
+       tst     $g4,#4                  @ did it carry/borrow?
+
+# ifdef        __thumb2__
+       it      ne
+# endif
+       movne   $h0,$g0
+       ldr     $g0,[$nonce,#0]
+# ifdef        __thumb2__
+       it      ne
+# endif
+       movne   $h1,$g1
+       ldr     $g1,[$nonce,#4]
+# ifdef        __thumb2__
+       it      ne
+# endif
+       movne   $h2,$g2
+       ldr     $g2,[$nonce,#8]
+# ifdef        __thumb2__
+       it      ne
+# endif
+       movne   $h3,$g3
+       ldr     $g3,[$nonce,#12]
+
+       adds    $h0,$h0,$g0             @ accumulate nonce
+       adcs    $h1,$h1,$g1
+       adcs    $h2,$h2,$g2
+       adc     $h3,$h3,$g3
+
+# ifdef __ARMEB__
+       rev     $h0,$h0
+       rev     $h1,$h1
+       rev     $h2,$h2
+       rev     $h3,$h3
+# endif
+       str     $h0,[$mac,#0]           @ store the result
+       str     $h1,[$mac,#4]
+       str     $h2,[$mac,#8]
+       str     $h3,[$mac,#12]
+
+       ldmia   sp!,{r4-r11}
+       ret                             @ bx    lr
+.size  poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-.Lpoly1305_init
+#endif
+___
+}      }
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if    __ARM_MAX_ARCH__>=7
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bret\b/bx    lr/go                                           or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
new file mode 100755 (executable)
index 0000000..79185d2
--- /dev/null
@@ -0,0 +1,925 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+#              IALU/gcc-4.9    NEON
+#
+# Apple A7     1.86/+5%        0.72
+# Cortex-A53   2.63/+58%       1.47
+# Cortex-A57   2.70/+7%        1.14
+# Denver       1.39/+50%       1.18(*)
+# X-Gene       2.00/+68%       2.19
+#
+# (*)	estimate based on resource availability is less than 1.0,
+#	i.e. the measured result is worse than expected, presumably
+#	because the binary translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+// forward "declarations" are required for Apple
+.extern        OPENSSL_armcap_P
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type  poly1305_init,%function
+.align 5
+poly1305_init:
+       cmp     $inp,xzr
+       stp     xzr,xzr,[$ctx]          // zero hash value
+       stp     xzr,xzr,[$ctx,#16]      // [along with is_base2_26]
+
+       csel    x0,xzr,x0,eq
+       b.eq    .Lno_key
+
+#ifdef __ILP32__
+       ldrsw   $t1,.LOPENSSL_armcap_P
+#else
+       ldr     $t1,.LOPENSSL_armcap_P
+#endif
+       adr     $t0,.LOPENSSL_armcap_P
+
+       ldp     $r0,$r1,[$inp]          // load key
+       mov     $s1,#0xfffffffc0fffffff
+       movk    $s1,#0x0fff,lsl#48
+       ldr     w17,[$t0,$t1]
+#ifdef __ARMEB__
+       rev     $r0,$r0                 // flip bytes
+       rev     $r1,$r1
+#endif
+       and     $r0,$r0,$s1             // &=0ffffffc0fffffff
+       and     $s1,$s1,#-4
+       and     $r1,$r1,$s1             // &=0ffffffc0ffffffc
+       stp     $r0,$r1,[$ctx,#32]      // save key value
+
+       tst     w17,#ARMV7_NEON
+
+       adr     $d0,poly1305_blocks
+       adr     $r0,poly1305_blocks_neon
+       adr     $d1,poly1305_emit
+       adr     $r1,poly1305_emit_neon
+
+       csel    $d0,$d0,$r0,eq
+       csel    $d1,$d1,$r1,eq
+
+       stp     $d0,$d1,[$len]
+
+       mov     x0,#1
+.Lno_key:
+       ret
+.size  poly1305_init,.-poly1305_init
+
+.type  poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+       ands    $len,$len,#-16
+       b.eq    .Lno_data
+
+       ldp     $h0,$h1,[$ctx]          // load hash value
+       ldp     $r0,$r1,[$ctx,#32]      // load key value
+       ldr     $h2,[$ctx,#16]
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
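+	// key clamping clears the low two bits of r1, so s1 = 5*r1/4
+	// exactly and r1*2^128 = s1 mod 2^130-5; this is what reduces
+	// the high-weight products on the fly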
+       b       .Loop
+
+.align 5
+.Loop:
+       ldp     $t0,$t1,[$inp],#16      // load input
+       sub     $len,$len,#16
+#ifdef __ARMEB__
+       rev     $t0,$t0
+       rev     $t1,$t1
+#endif
+       adds    $h0,$h0,$t0             // accumulate input
+       adcs    $h1,$h1,$t1
+
+       mul     $d0,$h0,$r0             // h0*r0
+       adc     $h2,$h2,$padbit
+       umulh   $d1,$h0,$r0
+
+       mul     $t0,$h1,$s1             // h1*5*r1
+       umulh   $t1,$h1,$s1
+
+       adds    $d0,$d0,$t0
+       mul     $t0,$h0,$r1             // h0*r1
+       adc     $d1,$d1,$t1
+       umulh   $d2,$h0,$r1
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h1,$r0             // h1*r0
+       adc     $d2,$d2,xzr
+       umulh   $t1,$h1,$r0
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h2,$s1             // h2*5*r1
+       adc     $d2,$d2,$t1
+       mul     $t1,$h2,$r0             // h2*r0
+
+       adds    $d1,$d1,$t0
+       adc     $d2,$d2,$t1
+
+       and     $t0,$d2,#-4             // final reduction
+       and     $h2,$d2,#3
+       add     $t0,$t0,$d2,lsr#2
+       adds    $h0,$d0,$t0
+       adc     $h1,$d1,xzr
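+	// $d2 is the limb at weight 2^128, so $d2&-4 represents a multiple
+	// of 2^130; it is folded back as 5*($d2>>2) = ($d2&-4)+($d2>>2),
+	// and only the low two bits are kept in $h2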
+
+       cbnz    $len,.Loop
+
+       stp     $h0,$h1,[$ctx]          // store hash value
+       str     $h2,[$ctx,#16]
+
+.Lno_data:
+       ret
+.size  poly1305_blocks,.-poly1305_blocks
+
+.type  poly1305_emit,%function
+.align 5
+poly1305_emit:
+       ldp     $h0,$h1,[$ctx]          // load hash base 2^64
+       ldr     $h2,[$ctx,#16]
+       ldp     $t0,$t1,[$nonce]        // load nonce
+
+       adds    $d0,$h0,#5              // compare to modulus
+       adcs    $d1,$h1,xzr
+       adc     $d2,$h2,xzr
+
+       tst     $d2,#-4                 // see if it's carried/borrowed
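+	// h+5 spills into bit 130 only if h >= 2^130-5, in which case the
+	// low 128 bits of h+5 are the reduced value; the csel below picks
+	// it branchlessly, otherwise h itself is kept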
+
+       csel    $h0,$h0,$d0,eq
+       csel    $h1,$h1,$d1,eq
+
+#ifdef __ARMEB__
+       ror     $t0,$t0,#32             // flip nonce words
+       ror     $t1,$t1,#32
+#endif
+       adds    $h0,$h0,$t0             // accumulate nonce
+       adc     $h1,$h1,$t1
+#ifdef __ARMEB__
+       rev     $h0,$h0                 // flip output bytes
+       rev     $h1,$h1
+#endif
+       stp     $h0,$h1,[$mac]          // write result
+
+       ret
+.size  poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros;              # borrow
+
+$code.=<<___;
+.type  poly1305_mult,%function
+.align 5
+poly1305_mult:
+       mul     $d0,$h0,$r0             // h0*r0
+       umulh   $d1,$h0,$r0
+
+       mul     $t0,$h1,$s1             // h1*5*r1
+       umulh   $t1,$h1,$s1
+
+       adds    $d0,$d0,$t0
+       mul     $t0,$h0,$r1             // h0*r1
+       adc     $d1,$d1,$t1
+       umulh   $d2,$h0,$r1
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h1,$r0             // h1*r0
+       adc     $d2,$d2,xzr
+       umulh   $t1,$h1,$r0
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h2,$s1             // h2*5*r1
+       adc     $d2,$d2,$t1
+       mul     $t1,$h2,$r0             // h2*r0
+
+       adds    $d1,$d1,$t0
+       adc     $d2,$d2,$t1
+
+       and     $t0,$d2,#-4             // final reduction
+       and     $h2,$d2,#3
+       add     $t0,$t0,$d2,lsr#2
+       adds    $h0,$d0,$t0
+       adc     $h1,$d1,xzr
+
+       ret
+.size  poly1305_mult,.-poly1305_mult
+
+.type  poly1305_splat,%function
+.align 5
+poly1305_splat:
+       and     x12,$h0,#0x03ffffff     // base 2^64 -> base 2^26
+       ubfx    x13,$h0,#26,#26
+       extr    x14,$h1,$h0,#52
+       and     x14,x14,#0x03ffffff
+       ubfx    x15,$h1,#14,#26
+       extr    x16,$h2,$h1,#40
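+	// the 130-bit value in $h0:$h1:$h2 is split into five 26-bit
+	// limbs: bits 0-25, 26-51, 52-77, 78-103 and 104-129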
+
+       str     w12,[$ctx,#16*0]        // r0
+       add     w12,w13,w13,lsl#2       // r1*5
+       str     w13,[$ctx,#16*1]        // r1
+       add     w13,w14,w14,lsl#2       // r2*5
+       str     w12,[$ctx,#16*2]        // s1
+       str     w14,[$ctx,#16*3]        // r2
+       add     w14,w15,w15,lsl#2       // r3*5
+       str     w13,[$ctx,#16*4]        // s2
+       str     w15,[$ctx,#16*5]        // r3
+       add     w15,w16,w16,lsl#2       // r4*5
+       str     w14,[$ctx,#16*6]        // s3
+       str     w16,[$ctx,#16*7]        // r4
+       str     w15,[$ctx,#16*8]        // s4
+
+       ret
+.size  poly1305_splat,.-poly1305_splat
+
+.type  poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+       ldr     $is_base2_26,[$ctx,#24]
+       cmp     $len,#128
+       b.hs    .Lblocks_neon
+       cbz     $is_base2_26,poly1305_blocks
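+	// inputs shorter than 128 bytes are left to the scalar code unless
+	// the hash is already in base 2^26 form, presumably because the
+	// NEON setup and conversion would not pay off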
+
+.Lblocks_neon:
+       stp     x29,x30,[sp,#-80]!
+       add     x29,sp,#0
+
+       ands    $len,$len,#-16
+       b.eq    .Lno_data_neon
+
+       cbz     $is_base2_26,.Lbase2_64_neon
+
+       ldp     w10,w11,[$ctx]          // load hash value base 2^26
+       ldp     w12,w13,[$ctx,#8]
+       ldr     w14,[$ctx,#16]
+
+       tst     $len,#31
+       b.eq    .Leven_neon
+
+       ldp     $r0,$r1,[$ctx,#32]      // load key value
+
+       add     $h0,x10,x11,lsl#26      // base 2^26 -> base 2^64
+       lsr     $h1,x12,#12
+       adds    $h0,$h0,x12,lsl#52
+       add     $h1,$h1,x13,lsl#14
+       adc     $h1,$h1,xzr
+       lsr     $h2,x14,#24
+       adds    $h1,$h1,x14,lsl#40
+       adc     $d2,$h2,xzr             // can be partially reduced...
+
+       ldp     $d0,$d1,[$inp],#16      // load input
+       sub     $len,$len,#16
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+
+       and     $t0,$d2,#-4             // ... so reduce
+       and     $h2,$d2,#3
+       add     $t0,$t0,$d2,lsr#2
+       adds    $h0,$h0,$t0
+       adc     $h1,$h1,xzr
+
+#ifdef __ARMEB__
+       rev     $d0,$d0
+       rev     $d1,$d1
+#endif
+       adds    $h0,$h0,$d0             // accumulate input
+       adcs    $h1,$h1,$d1
+       adc     $h2,$h2,$padbit
+
+       bl      poly1305_mult
+       ldr     x30,[sp,#8]
+
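+	// zero $padbit means this was the final, possibly partial block,
+	// so there is no point in converting the result to base 2^26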
+       cbz     $padbit,.Lstore_base2_64_neon
+
+       and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
+       ubfx    x11,$h0,#26,#26
+       extr    x12,$h1,$h0,#52
+       and     x12,x12,#0x03ffffff
+       ubfx    x13,$h1,#14,#26
+       extr    x14,$h2,$h1,#40
+
+       cbnz    $len,.Leven_neon
+
+       stp     w10,w11,[$ctx]          // store hash value base 2^26
+       stp     w12,w13,[$ctx,#8]
+       str     w14,[$ctx,#16]
+       b       .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+       stp     $h0,$h1,[$ctx]          // store hash value base 2^64
+       stp     $h2,xzr,[$ctx,#16]      // note that is_base2_26 is zeroed
+       b       .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+       ldp     $r0,$r1,[$ctx,#32]      // load key value
+
+       ldp     $h0,$h1,[$ctx]          // load hash value base 2^64
+       ldr     $h2,[$ctx,#16]
+
+       tst     $len,#31
+       b.eq    .Linit_neon
+
+       ldp     $d0,$d1,[$inp],#16      // load input
+       sub     $len,$len,#16
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+#ifdef __ARMEB__
+       rev     $d0,$d0
+       rev     $d1,$d1
+#endif
+       adds    $h0,$h0,$d0             // accumulate input
+       adcs    $h1,$h1,$d1
+       adc     $h2,$h2,$padbit
+
+       bl      poly1305_mult
+
+.Linit_neon:
+       and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
+       ubfx    x11,$h0,#26,#26
+       extr    x12,$h1,$h0,#52
+       and     x12,x12,#0x03ffffff
+       ubfx    x13,$h1,#14,#26
+       extr    x14,$h2,$h1,#40
+
+       stp     d8,d9,[sp,#16]          // meet ABI requirements
+       stp     d10,d11,[sp,#32]
+       stp     d12,d13,[sp,#48]
+       stp     d14,d15,[sp,#64]
+
+       fmov    ${H0},x10
+       fmov    ${H1},x11
+       fmov    ${H2},x12
+       fmov    ${H3},x13
+       fmov    ${H4},x14
+
+       ////////////////////////////////// initialize r^n table
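+	// r^1..r^4 are produced with the scalar poly1305_mult and stored
+	// in base 2^26 by poly1305_splat; stepping $ctx back by 4 between
+	// calls lands r^4,r^3,r^2,r^1 in lanes 0..3 of each 16-byte entry
+	// of the 9-entry r0,r1,s1,r2,s2,r3,s3,r4,s4 table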
+       mov     $h0,$r0                 // r^1
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+       mov     $h1,$r1
+       mov     $h2,xzr
+       add     $ctx,$ctx,#48+12
+       bl      poly1305_splat
+
+       bl      poly1305_mult           // r^2
+       sub     $ctx,$ctx,#4
+       bl      poly1305_splat
+
+       bl      poly1305_mult           // r^3
+       sub     $ctx,$ctx,#4
+       bl      poly1305_splat
+
+       bl      poly1305_mult           // r^4
+       sub     $ctx,$ctx,#4
+       bl      poly1305_splat
+       ldr     x30,[sp,#8]
+
+       add     $in2,$inp,#32
+       adr     $zeros,.Lzeros
+       subs    $len,$len,#64
+       csel    $in2,$zeros,$in2,lo
+
+       mov     x4,#1
+       str     x4,[$ctx,#-24]          // set is_base2_26
+       sub     $ctx,$ctx,#48           // restore original $ctx
+       b       .Ldo_neon
+
+.align 4
+.Leven_neon:
+       add     $in2,$inp,#32
+       adr     $zeros,.Lzeros
+       subs    $len,$len,#64
+       csel    $in2,$zeros,$in2,lo
+
+       stp     d8,d9,[sp,#16]          // meet ABI requirements
+       stp     d10,d11,[sp,#32]
+       stp     d12,d13,[sp,#48]
+       stp     d14,d15,[sp,#64]
+
+       fmov    ${H0},x10
+       fmov    ${H1},x11
+       fmov    ${H2},x12
+       fmov    ${H3},x13
+       fmov    ${H4},x14
+
+.Ldo_neon:
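+	// inp[2] and inp[3] (or the zero block once input is exhausted)
+	// are split into 26-bit limbs, with block 2 packed into the low
+	// and block 3 into the high 32-bit lane of each $IN23_* register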
+       ldp     x8,x12,[$in2],#16       // inp[2:3] (or zero)
+       ldp     x9,x13,[$in2],#48
+
+       lsl     $padbit,$padbit,#24
+       add     x15,$ctx,#48
+
+#ifdef __ARMEB__
+       rev     x8,x8
+       rev     x12,x12
+       rev     x9,x9
+       rev     x13,x13
+#endif
+       and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       and     x5,x9,#0x03ffffff
+       ubfx    x6,x8,#26,#26
+       ubfx    x7,x9,#26,#26
+       add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+       extr    x8,x12,x8,#52
+       extr    x9,x13,x9,#52
+       add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       fmov    $IN23_0,x4
+       and     x8,x8,#0x03ffffff
+       and     x9,x9,#0x03ffffff
+       ubfx    x10,x12,#14,#26
+       ubfx    x11,x13,#14,#26
+       add     x12,$padbit,x12,lsr#40
+       add     x13,$padbit,x13,lsr#40
+       add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       fmov    $IN23_1,x6
+       add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       fmov    $IN23_2,x8
+       fmov    $IN23_3,x10
+       fmov    $IN23_4,x12
+
+       ldp     x8,x12,[$inp],#16       // inp[0:1]
+       ldp     x9,x13,[$inp],#48
+
+       ld1     {$R0,$R1,$S1,$R2},[x15],#64
+       ld1     {$S2,$R3,$S3,$R4},[x15],#64
+       ld1     {$S4},[x15]
+
+#ifdef __ARMEB__
+       rev     x8,x8
+       rev     x12,x12
+       rev     x9,x9
+       rev     x13,x13
+#endif
+       and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       and     x5,x9,#0x03ffffff
+       ubfx    x6,x8,#26,#26
+       ubfx    x7,x9,#26,#26
+       add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+       extr    x8,x12,x8,#52
+       extr    x9,x13,x9,#52
+       add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       fmov    $IN01_0,x4
+       and     x8,x8,#0x03ffffff
+       and     x9,x9,#0x03ffffff
+       ubfx    x10,x12,#14,#26
+       ubfx    x11,x13,#14,#26
+       add     x12,$padbit,x12,lsr#40
+       add     x13,$padbit,x13,lsr#40
+       add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       fmov    $IN01_1,x6
+       add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       fmov    $IN01_2,x8
+       fmov    $IN01_3,x10
+       fmov    $IN01_4,x12
+
+       b.ls    .Lskip_loop
+
+.align 4
+.Loop_neon:
+       ////////////////////////////////////////////////////////////////
+       // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+       // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+       //   \___________________/
+       // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+       // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+       //   \___________________/ \____________________/
+       //
+	// Note that we start with inp[2:3]*r^2. This is because it
+	// doesn't depend on the reduction in the previous iteration.
+       ////////////////////////////////////////////////////////////////
+       // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
+       // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
+       // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
+       // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
+       // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
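+	// (sX stands for 5*rX; it appears wherever a product would land
+	// at 2^130 or above, since reduction mod 2^130-5 turns 2^130 into 5)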
+
+       subs    $len,$len,#64
+       umull   $ACC4,$IN23_0,${R4}[2]
+       csel    $in2,$zeros,$in2,lo
+       umull   $ACC3,$IN23_0,${R3}[2]
+       umull   $ACC2,$IN23_0,${R2}[2]
+        ldp    x8,x12,[$in2],#16       // inp[2:3] (or zero)
+       umull   $ACC1,$IN23_0,${R1}[2]
+        ldp    x9,x13,[$in2],#48
+       umull   $ACC0,$IN23_0,${R0}[2]
+#ifdef __ARMEB__
+        rev    x8,x8
+        rev    x12,x12
+        rev    x9,x9
+        rev    x13,x13
+#endif
+
+       umlal   $ACC4,$IN23_1,${R3}[2]
+        and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       umlal   $ACC3,$IN23_1,${R2}[2]
+        and    x5,x9,#0x03ffffff
+       umlal   $ACC2,$IN23_1,${R1}[2]
+        ubfx   x6,x8,#26,#26
+       umlal   $ACC1,$IN23_1,${R0}[2]
+        ubfx   x7,x9,#26,#26
+       umlal   $ACC0,$IN23_1,${S4}[2]
+        add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+
+       umlal   $ACC4,$IN23_2,${R2}[2]
+        extr   x8,x12,x8,#52
+       umlal   $ACC3,$IN23_2,${R1}[2]
+        extr   x9,x13,x9,#52
+       umlal   $ACC2,$IN23_2,${R0}[2]
+        add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       umlal   $ACC1,$IN23_2,${S4}[2]
+        fmov   $IN23_0,x4
+       umlal   $ACC0,$IN23_2,${S3}[2]
+        and    x8,x8,#0x03ffffff
+
+       umlal   $ACC4,$IN23_3,${R1}[2]
+        and    x9,x9,#0x03ffffff
+       umlal   $ACC3,$IN23_3,${R0}[2]
+        ubfx   x10,x12,#14,#26
+       umlal   $ACC2,$IN23_3,${S4}[2]
+        ubfx   x11,x13,#14,#26
+       umlal   $ACC1,$IN23_3,${S3}[2]
+        add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       umlal   $ACC0,$IN23_3,${S2}[2]
+        fmov   $IN23_1,x6
+
+       add     $IN01_2,$IN01_2,$H2
+        add    x12,$padbit,x12,lsr#40
+       umlal   $ACC4,$IN23_4,${R0}[2]
+        add    x13,$padbit,x13,lsr#40
+       umlal   $ACC3,$IN23_4,${S4}[2]
+        add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       umlal   $ACC2,$IN23_4,${S3}[2]
+        add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       umlal   $ACC1,$IN23_4,${S2}[2]
+        fmov   $IN23_2,x8
+       umlal   $ACC0,$IN23_4,${S1}[2]
+        fmov   $IN23_3,x10
+
+       ////////////////////////////////////////////////////////////////
+       // (hash+inp[0:1])*r^4 and accumulate
+
+       add     $IN01_0,$IN01_0,$H0
+        fmov   $IN23_4,x12
+       umlal   $ACC3,$IN01_2,${R1}[0]
+        ldp    x8,x12,[$inp],#16       // inp[0:1]
+       umlal   $ACC0,$IN01_2,${S3}[0]
+        ldp    x9,x13,[$inp],#48
+       umlal   $ACC4,$IN01_2,${R2}[0]
+       umlal   $ACC1,$IN01_2,${S4}[0]
+       umlal   $ACC2,$IN01_2,${R0}[0]
+#ifdef __ARMEB__
+        rev    x8,x8
+        rev    x12,x12
+        rev    x9,x9
+        rev    x13,x13
+#endif
+
+       add     $IN01_1,$IN01_1,$H1
+       umlal   $ACC3,$IN01_0,${R3}[0]
+       umlal   $ACC4,$IN01_0,${R4}[0]
+        and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       umlal   $ACC2,$IN01_0,${R2}[0]
+        and    x5,x9,#0x03ffffff
+       umlal   $ACC0,$IN01_0,${R0}[0]
+        ubfx   x6,x8,#26,#26
+       umlal   $ACC1,$IN01_0,${R1}[0]
+        ubfx   x7,x9,#26,#26
+
+       add     $IN01_3,$IN01_3,$H3
+        add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+       umlal   $ACC3,$IN01_1,${R2}[0]
+        extr   x8,x12,x8,#52
+       umlal   $ACC4,$IN01_1,${R3}[0]
+        extr   x9,x13,x9,#52
+       umlal   $ACC0,$IN01_1,${S4}[0]
+        add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       umlal   $ACC2,$IN01_1,${R1}[0]
+        fmov   $IN01_0,x4
+       umlal   $ACC1,$IN01_1,${R0}[0]
+        and    x8,x8,#0x03ffffff
+
+       add     $IN01_4,$IN01_4,$H4
+        and    x9,x9,#0x03ffffff
+       umlal   $ACC3,$IN01_3,${R0}[0]
+        ubfx   x10,x12,#14,#26
+       umlal   $ACC0,$IN01_3,${S2}[0]
+        ubfx   x11,x13,#14,#26
+       umlal   $ACC4,$IN01_3,${R1}[0]
+        add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       umlal   $ACC1,$IN01_3,${S3}[0]
+        fmov   $IN01_1,x6
+       umlal   $ACC2,$IN01_3,${S4}[0]
+        add    x12,$padbit,x12,lsr#40
+
+       umlal   $ACC3,$IN01_4,${S4}[0]
+        add    x13,$padbit,x13,lsr#40
+       umlal   $ACC0,$IN01_4,${S1}[0]
+        add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       umlal   $ACC4,$IN01_4,${R0}[0]
+        add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       umlal   $ACC1,$IN01_4,${S2}[0]
+        fmov   $IN01_2,x8
+       umlal   $ACC2,$IN01_4,${S3}[0]
+        fmov   $IN01_3,x10
+
+       /////////////////////////////////////////////////////////////////
+       // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+        // and P. Schwabe
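+	// the reduction is lazy in that each carry moves only one limb
+	// over, so limbs may briefly exceed 26 bits between iterations;
+	// the h4->h0 carry is multiplied by 5 via the shl #2 and two adds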
+
+       ushr    $T0.2d,$ACC3,#26
+        fmov   $IN01_4,x12
+       xtn     $H3,$ACC3
+        ushr   $T1.2d,$ACC0,#26
+        xtn    $H0,$ACC0
+       add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
+       bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
+        add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
+        bic    $H0,#0xfc,lsl#24
+
+       shrn    $T0.2s,$ACC4,#26
+       xtn     $H4,$ACC4
+        ushr   $T1.2d,$ACC1,#26
+        xtn    $H1,$ACC1
+        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
+       bic     $H4,#0xfc,lsl#24
+        bic    $H1,#0xfc,lsl#24
+
+       add     $H0,$H0,$T0.2s
+       shl     $T0.2s,$T0.2s,#2
+        shrn   $T1.2s,$ACC2,#26
+        xtn    $H2,$ACC2
+       add     $H0,$H0,$T0.2s          // h4 -> h0
+        add    $H3,$H3,$T1.2s          // h2 -> h3
+        bic    $H2,#0xfc,lsl#24
+
+       ushr    $T0.2s,$H0,#26
+       bic     $H0,#0xfc,lsl#24
+        ushr   $T1.2s,$H3,#26
+        bic    $H3,#0xfc,lsl#24
+       add     $H1,$H1,$T0.2s          // h0 -> h1
+        add    $H4,$H4,$T1.2s          // h3 -> h4
+
+       b.hi    .Loop_neon
+
+.Lskip_loop:
+       dup     $IN23_2,${IN23_2}[0]
+       movi    $MASK.2d,#-1
+       add     $IN01_2,$IN01_2,$H2
+       ushr    $MASK.2d,$MASK.2d,#38
+
+       ////////////////////////////////////////////////////////////////
+       // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
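+	// the upper 32-bit lanes of the R/S table vectors hold r^2 and
+	// r^1, which is what the umull2/umlal2 instructions below pick up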
+
+       adds    $len,$len,#32
+       b.ne    .Long_tail
+
+       dup     $IN23_2,${IN01_2}[0]
+       add     $IN23_0,$IN01_0,$H0
+       add     $IN23_3,$IN01_3,$H3
+       add     $IN23_1,$IN01_1,$H1
+       add     $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+       dup     $IN23_0,${IN23_0}[0]
+       umull2  $ACC0,$IN23_2,${S3}
+       umull2  $ACC3,$IN23_2,${R1}
+       umull2  $ACC4,$IN23_2,${R2}
+       umull2  $ACC2,$IN23_2,${R0}
+       umull2  $ACC1,$IN23_2,${S4}
+
+       dup     $IN23_1,${IN23_1}[0]
+       umlal2  $ACC0,$IN23_0,${R0}
+       umlal2  $ACC2,$IN23_0,${R2}
+       umlal2  $ACC3,$IN23_0,${R3}
+       umlal2  $ACC4,$IN23_0,${R4}
+       umlal2  $ACC1,$IN23_0,${R1}
+
+       dup     $IN23_3,${IN23_3}[0]
+       umlal2  $ACC0,$IN23_1,${S4}
+       umlal2  $ACC3,$IN23_1,${R2}
+       umlal2  $ACC2,$IN23_1,${R1}
+       umlal2  $ACC4,$IN23_1,${R3}
+       umlal2  $ACC1,$IN23_1,${R0}
+
+       dup     $IN23_4,${IN23_4}[0]
+       umlal2  $ACC3,$IN23_3,${R0}
+       umlal2  $ACC4,$IN23_3,${R1}
+       umlal2  $ACC0,$IN23_3,${S2}
+       umlal2  $ACC1,$IN23_3,${S3}
+       umlal2  $ACC2,$IN23_3,${S4}
+
+       umlal2  $ACC3,$IN23_4,${S4}
+       umlal2  $ACC0,$IN23_4,${S1}
+       umlal2  $ACC4,$IN23_4,${R0}
+       umlal2  $ACC1,$IN23_4,${S2}
+       umlal2  $ACC2,$IN23_4,${S3}
+
+       b.eq    .Lshort_tail
+
+       ////////////////////////////////////////////////////////////////
+       // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+       add     $IN01_0,$IN01_0,$H0
+       umlal   $ACC3,$IN01_2,${R1}
+       umlal   $ACC0,$IN01_2,${S3}
+       umlal   $ACC4,$IN01_2,${R2}
+       umlal   $ACC1,$IN01_2,${S4}
+       umlal   $ACC2,$IN01_2,${R0}
+
+       add     $IN01_1,$IN01_1,$H1
+       umlal   $ACC3,$IN01_0,${R3}
+       umlal   $ACC0,$IN01_0,${R0}
+       umlal   $ACC4,$IN01_0,${R4}
+       umlal   $ACC1,$IN01_0,${R1}
+       umlal   $ACC2,$IN01_0,${R2}
+
+       add     $IN01_3,$IN01_3,$H3
+       umlal   $ACC3,$IN01_1,${R2}
+       umlal   $ACC0,$IN01_1,${S4}
+       umlal   $ACC4,$IN01_1,${R3}
+       umlal   $ACC1,$IN01_1,${R0}
+       umlal   $ACC2,$IN01_1,${R1}
+
+       add     $IN01_4,$IN01_4,$H4
+       umlal   $ACC3,$IN01_3,${R0}
+       umlal   $ACC0,$IN01_3,${S2}
+       umlal   $ACC4,$IN01_3,${R1}
+       umlal   $ACC1,$IN01_3,${S3}
+       umlal   $ACC2,$IN01_3,${S4}
+
+       umlal   $ACC3,$IN01_4,${S4}
+       umlal   $ACC0,$IN01_4,${S1}
+       umlal   $ACC4,$IN01_4,${R0}
+       umlal   $ACC1,$IN01_4,${S2}
+       umlal   $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+       ////////////////////////////////////////////////////////////////
+       // lazy reduction, but without narrowing
+
+       ushr    $T0.2d,$ACC3,#26
+       and     $ACC3,$ACC3,$MASK.2d
+        ushr   $T1.2d,$ACC0,#26
+        and    $ACC0,$ACC0,$MASK.2d
+
+       add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
+        add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
+
+       ushr    $T0.2d,$ACC4,#26
+       and     $ACC4,$ACC4,$MASK.2d
+        ushr   $T1.2d,$ACC1,#26
+        and    $ACC1,$ACC1,$MASK.2d
+        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
+
+       add     $ACC0,$ACC0,$T0.2d
+       shl     $T0.2d,$T0.2d,#2
+        ushr   $T1.2d,$ACC2,#26
+        and    $ACC2,$ACC2,$MASK.2d
+       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
+        add    $ACC3,$ACC3,$T1.2d      // h2 -> h3
+
+       ushr    $T0.2d,$ACC0,#26
+       and     $ACC0,$ACC0,$MASK.2d
+        ushr   $T1.2d,$ACC3,#26
+        and    $ACC3,$ACC3,$MASK.2d
+       add     $ACC1,$ACC1,$T0.2d      // h0 -> h1
+        add    $ACC4,$ACC4,$T1.2d      // h3 -> h4
+
+       ////////////////////////////////////////////////////////////////
+       // horizontal add
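+	// addp sums the two 64-bit halves of each accumulator, collapsing
+	// the two parallel streams; d8-d15 are restored per the ABI along
+	// the way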
+
+       addp    $ACC2,$ACC2,$ACC2
+        ldp    d8,d9,[sp,#16]          // meet ABI requirements
+       addp    $ACC0,$ACC0,$ACC0
+        ldp    d10,d11,[sp,#32]
+       addp    $ACC1,$ACC1,$ACC1
+        ldp    d12,d13,[sp,#48]
+       addp    $ACC3,$ACC3,$ACC3
+        ldp    d14,d15,[sp,#64]
+       addp    $ACC4,$ACC4,$ACC4
+
+       ////////////////////////////////////////////////////////////////
+       // write the result, can be partially reduced
+
+       st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+       st1     {$ACC4}[0],[$ctx]
+
+.Lno_data_neon:
+       ldr     x29,[sp],#80
+       ret
+.size  poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type  poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+       ldr     $is_base2_26,[$ctx,#24]
+       cbz     $is_base2_26,poly1305_emit
+
+       ldp     w10,w11,[$ctx]          // load hash value base 2^26
+       ldp     w12,w13,[$ctx,#8]
+       ldr     w14,[$ctx,#16]
+
+       add     $h0,x10,x11,lsl#26      // base 2^26 -> base 2^64
+       lsr     $h1,x12,#12
+       adds    $h0,$h0,x12,lsl#52
+       add     $h1,$h1,x13,lsl#14
+       adc     $h1,$h1,xzr
+       lsr     $h2,x14,#24
+       adds    $h1,$h1,x14,lsl#40
+       adc     $h2,$h2,xzr             // can be partially reduced...
+
+       ldp     $t0,$t1,[$nonce]        // load nonce
+
+       and     $d0,$h2,#-4             // ... so reduce
+       add     $d0,$d0,$h2,lsr#2
+       and     $h2,$h2,#3
+       adds    $h0,$h0,$d0
+       adc     $h1,$h1,xzr
+
+       adds    $d0,$h0,#5              // compare to modulus
+       adcs    $d1,$h1,xzr
+       adc     $d2,$h2,xzr
+
+       tst     $d2,#-4                 // see if it's carried/borrowed
+
+       csel    $h0,$h0,$d0,eq
+       csel    $h1,$h1,$d1,eq
+
+#ifdef __ARMEB__
+       ror     $t0,$t0,#32             // flip nonce words
+       ror     $t1,$t1,#32
+#endif
+       adds    $h0,$h0,$t0             // accumulate nonce
+       adc     $h1,$h1,$t1
+#ifdef __ARMEB__
+       rev     $h0,$h0                 // flip output bytes
+       rev     $h1,$h1
+#endif
+       stp     $h0,$h1,[$mac]          // write result
+
+       ret
+.size  poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long  0,0,0,0,0,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long  OPENSSL_armcap_P-.
+#else
+.quad  OPENSSL_armcap_P-.
+#endif
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+       s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/                      or
+       s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/     or
+       (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))                 or
+       (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))       or
+       (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))             or
+       (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))            or
+       (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+       s/\.[124]([sd])\[/.$1\[/;
+
+       print $_,"\n";
+}
+close STDOUT;