9bbfc6b376cd91a24de1eb89269a0fde6df3b048
[openssl.git] / crypto / chacha / asm / chacha-armv4.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # December 2014
18 #
19 # ChaCha20 for ARMv4.
20 #
21 # Performance in cycles per byte out of large buffer.
22 #
23 #                       IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
24 #
25 # Cortex-A5             19.3(*)/+95%    21.8        14.1
26 # Cortex-A8             10.5(*)/+160%   13.9        6.35
27 # Cortex-A9             12.9(**)/+110%  14.3        6.50
28 # Cortex-A15            11.0/+40%       16.0        5.00
29 # Snapdragon S4         11.5/+125%      13.6        4.90
30 #
31 # (*)   most "favourable" result for aligned data on little-endian
32 #       processor, result for misaligned data is 10-15% lower;
33 # (**)  this result is a trade-off: it can be improved by 20%,
34 #       but then Snapdragon S4 and Cortex-A8 results get
35 #       20-25% worse;
36
37 $flavour = shift;
38 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
39 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
40
41 if ($flavour && $flavour ne "void") {
42     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
44     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
45     die "can't locate arm-xlate.pl";
46
47     open STDOUT,"| \"$^X\" $xlate $flavour $output";
48 } else {
49     open STDOUT,">$output";
50 }
51
52 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
53 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
54   my $arg = pop;
55     $arg = "#$arg" if ($arg*1 eq $arg);
56     $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
57 }
58
59 my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
60 my @t=map("r$_",(8..11));
61
62 sub ROUND {
63 my ($a0,$b0,$c0,$d0)=@_;
64 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
65 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
66 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
67 my $odd = $d0&1;
68 my ($xc,$xc_) = (@t[0..1]);
69 my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
70 my @ret;
71
72         # Consider order in which variables are addressed by their
73         # index:
74         #
75         #       a   b   c   d
76         #
77         #       0   4   8  12 < even round
78         #       1   5   9  13
79         #       2   6  10  14
80         #       3   7  11  15
81         #       0   5  10  15 < odd round
82         #       1   6  11  12
83         #       2   7   8  13
84         #       3   4   9  14
85         #
86         # 'a', 'b' are permanently allocated in registers, @x[0..7],
87         # while 'c's and pair of 'd's are maintained in memory. If
88         # you observe 'c' column, you'll notice that pair of 'c's is
89         # invariant between rounds. This means that we have to reload
90         # them once per round, in the middle. This is why you'll see
91         # bunch of 'c' stores and loads in the middle, but none in
92         # the beginning or end. If you observe 'd' column, you'll
93         # notice that 15 and 13 are reused in next pair of rounds.
94         # This is why these two are chosen for offloading to memory,
95         # to make loads count more.
96                                                         push @ret,(
97         "&add   (@x[$a0],@x[$a0],@x[$b0])",
98         "&mov   ($xd,$xd,'ror#16')",
99          "&add  (@x[$a1],@x[$a1],@x[$b1])",
100          "&mov  ($xd_,$xd_,'ror#16')",
101         "&eor   ($xd,$xd,@x[$a0],'ror#16')",
102          "&eor  ($xd_,$xd_,@x[$a1],'ror#16')",
103
104         "&add   ($xc,$xc,$xd)",
105         "&mov   (@x[$b0],@x[$b0],'ror#20')",
106          "&add  ($xc_,$xc_,$xd_)",
107          "&mov  (@x[$b1],@x[$b1],'ror#20')",
108         "&eor   (@x[$b0],@x[$b0],$xc,'ror#20')",
109          "&eor  (@x[$b1],@x[$b1],$xc_,'ror#20')",
110
111         "&add   (@x[$a0],@x[$a0],@x[$b0])",
112         "&mov   ($xd,$xd,'ror#24')",
113          "&add  (@x[$a1],@x[$a1],@x[$b1])",
114          "&mov  ($xd_,$xd_,'ror#24')",
115         "&eor   ($xd,$xd,@x[$a0],'ror#24')",
116          "&eor  ($xd_,$xd_,@x[$a1],'ror#24')",
117
118         "&add   ($xc,$xc,$xd)",
119         "&mov   (@x[$b0],@x[$b0],'ror#25')"             );
120                                                         push @ret,(
121         "&str   ($xd,'[sp,#4*(16+$d0)]')",
122         "&ldr   ($xd,'[sp,#4*(16+$d2)]')"               ) if ($odd);
123                                                         push @ret,(
124          "&add  ($xc_,$xc_,$xd_)",
125          "&mov  (@x[$b1],@x[$b1],'ror#25')"             );
126                                                         push @ret,(
127          "&str  ($xd_,'[sp,#4*(16+$d1)]')",
128          "&ldr  ($xd_,'[sp,#4*(16+$d3)]')"              ) if (!$odd);
129                                                         push @ret,(
130         "&eor   (@x[$b0],@x[$b0],$xc,'ror#25')",
131          "&eor  (@x[$b1],@x[$b1],$xc_,'ror#25')"        );
132
133         $xd=@x[$d2]                                     if (!$odd);
134         $xd_=@x[$d3]                                    if ($odd);
135                                                         push @ret,(
136         "&str   ($xc,'[sp,#4*(16+$c0)]')",
137         "&ldr   ($xc,'[sp,#4*(16+$c2)]')",
138         "&add   (@x[$a2],@x[$a2],@x[$b2])",
139         "&mov   ($xd,$xd,'ror#16')",
140          "&str  ($xc_,'[sp,#4*(16+$c1)]')",
141          "&ldr  ($xc_,'[sp,#4*(16+$c3)]')",
142          "&add  (@x[$a3],@x[$a3],@x[$b3])",
143          "&mov  ($xd_,$xd_,'ror#16')",
144         "&eor   ($xd,$xd,@x[$a2],'ror#16')",
145          "&eor  ($xd_,$xd_,@x[$a3],'ror#16')",
146
147         "&add   ($xc,$xc,$xd)",
148         "&mov   (@x[$b2],@x[$b2],'ror#20')",
149          "&add  ($xc_,$xc_,$xd_)",
150          "&mov  (@x[$b3],@x[$b3],'ror#20')",
151         "&eor   (@x[$b2],@x[$b2],$xc,'ror#20')",
152          "&eor  (@x[$b3],@x[$b3],$xc_,'ror#20')",
153
154         "&add   (@x[$a2],@x[$a2],@x[$b2])",
155         "&mov   ($xd,$xd,'ror#24')",
156          "&add  (@x[$a3],@x[$a3],@x[$b3])",
157          "&mov  ($xd_,$xd_,'ror#24')",
158         "&eor   ($xd,$xd,@x[$a2],'ror#24')",
159          "&eor  ($xd_,$xd_,@x[$a3],'ror#24')",
160
161         "&add   ($xc,$xc,$xd)",
162         "&mov   (@x[$b2],@x[$b2],'ror#25')",
163          "&add  ($xc_,$xc_,$xd_)",
164          "&mov  (@x[$b3],@x[$b3],'ror#25')",
165         "&eor   (@x[$b2],@x[$b2],$xc,'ror#25')",
166          "&eor  (@x[$b3],@x[$b3],$xc_,'ror#25')"        );
167
168         @ret;
169 }
170
171 $code.=<<___;
172 #include "arm_arch.h"
173
174 .text
175 #if defined(__thumb2__) || defined(__clang__)
176 .syntax unified
177 #endif
178 #if defined(__thumb2__)
179 .thumb
180 #else
181 .code   32
182 #endif
183
184 #if defined(__thumb2__) || defined(__clang__)
185 #define ldrhsb  ldrbhs
186 #endif
187
188 .align  5
189 .Lsigma:
190 .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     @ endian-neutral
191 .Lone:
192 .long   1,0,0,0
193 #if __ARM_MAX_ARCH__>=7
194 .LOPENSSL_armcap:
195 .word   OPENSSL_armcap_P-.LChaCha20_ctr32
196 #else
197 .word   -1
198 #endif
199
200 .globl  ChaCha20_ctr32
201 .type   ChaCha20_ctr32,%function
202 .align  5
203 ChaCha20_ctr32:
204 .LChaCha20_ctr32:
205         ldr     r12,[sp,#0]             @ pull pointer to counter and nonce
206         stmdb   sp!,{r0-r2,r4-r11,lr}
207 #if __ARM_ARCH__<7 && !defined(__thumb2__)
208         sub     r14,pc,#16              @ ChaCha20_ctr32
209 #else
210         adr     r14,.LChaCha20_ctr32
211 #endif
212         cmp     r2,#0                   @ len==0?
213 #ifdef  __thumb2__
214         itt     eq
215 #endif
216         addeq   sp,sp,#4*3
217         beq     .Lno_data
218 #if __ARM_MAX_ARCH__>=7
219         cmp     r2,#192                 @ test len
220         bls     .Lshort
221         ldr     r4,[r14,#-32]
222         ldr     r4,[r14,r4]
223 # ifdef __APPLE__
224         ldr     r4,[r4]
225 # endif
226         tst     r4,#ARMV7_NEON
227         bne     .LChaCha20_neon
228 .Lshort:
229 #endif
230         ldmia   r12,{r4-r7}             @ load counter and nonce
231         sub     sp,sp,#4*(16)           @ off-load area
232         sub     r14,r14,#64             @ .Lsigma
233         stmdb   sp!,{r4-r7}             @ copy counter and nonce
234         ldmia   r3,{r4-r11}             @ load key
235         ldmia   r14,{r0-r3}             @ load sigma
236         stmdb   sp!,{r4-r11}            @ copy key
237         stmdb   sp!,{r0-r3}             @ copy sigma
238         str     r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
239         str     r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
240         b       .Loop_outer_enter
241
242 .align  4
243 .Loop_outer:
244         ldmia   sp,{r0-r9}              @ load key material
245         str     @t[3],[sp,#4*(32+2)]    @ save len
246         str     r12,  [sp,#4*(32+1)]    @ save inp
247         str     r14,  [sp,#4*(32+0)]    @ save out
248 .Loop_outer_enter:
249         ldr     @t[3], [sp,#4*(15)]
250         ldr     @x[12],[sp,#4*(12)]     @ modulo-scheduled load
251         ldr     @t[2], [sp,#4*(13)]
252         ldr     @x[14],[sp,#4*(14)]
253         str     @t[3], [sp,#4*(16+15)]
254         mov     @t[3],#10
255         b       .Loop
256
257 .align  4
258 .Loop:
259         subs    @t[3],@t[3],#1
260 ___
261         foreach (&ROUND(0, 4, 8,12)) { eval; }
262         foreach (&ROUND(0, 5,10,15)) { eval; }
263 $code.=<<___;
264         bne     .Loop
265
266         ldr     @t[3],[sp,#4*(32+2)]    @ load len
267
268         str     @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
269         str     @t[1], [sp,#4*(16+9)]
270         str     @x[12],[sp,#4*(16+12)]
271         str     @t[2], [sp,#4*(16+13)]
272         str     @x[14],[sp,#4*(16+14)]
273
274         @ at this point we have first half of 512-bit result in
275         @ @x[0-7] and second half at sp+4*(16+8)
276
277         cmp     @t[3],#64               @ done yet?
278 #ifdef  __thumb2__
279         itete   lo
280 #endif
281         addlo   r12,sp,#4*(0)           @ shortcut or ...
282         ldrhs   r12,[sp,#4*(32+1)]      @ ... load inp
283         addlo   r14,sp,#4*(0)           @ shortcut or ...
284         ldrhs   r14,[sp,#4*(32+0)]      @ ... load out
285
286         ldr     @t[0],[sp,#4*(0)]       @ load key material
287         ldr     @t[1],[sp,#4*(1)]
288
289 #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
290 # if __ARM_ARCH__<7
291         orr     @t[2],r12,r14
292         tst     @t[2],#3                @ are input and output aligned?
293         ldr     @t[2],[sp,#4*(2)]
294         bne     .Lunaligned
295         cmp     @t[3],#64               @ restore flags
296 # else
297         ldr     @t[2],[sp,#4*(2)]
298 # endif
299         ldr     @t[3],[sp,#4*(3)]
300
301         add     @x[0],@x[0],@t[0]       @ accumulate key material
302         add     @x[1],@x[1],@t[1]
303 # ifdef __thumb2__
304         itt     hs
305 # endif
306         ldrhs   @t[0],[r12],#16         @ load input
307         ldrhs   @t[1],[r12,#-12]
308
309         add     @x[2],@x[2],@t[2]
310         add     @x[3],@x[3],@t[3]
311 # ifdef __thumb2__
312         itt     hs
313 # endif
314         ldrhs   @t[2],[r12,#-8]
315         ldrhs   @t[3],[r12,#-4]
316 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
317         rev     @x[0],@x[0]
318         rev     @x[1],@x[1]
319         rev     @x[2],@x[2]
320         rev     @x[3],@x[3]
321 # endif
322 # ifdef __thumb2__
323         itt     hs
324 # endif
325         eorhs   @x[0],@x[0],@t[0]       @ xor with input
326         eorhs   @x[1],@x[1],@t[1]
327          add    @t[0],sp,#4*(4)
328         str     @x[0],[r14],#16         @ store output
329 # ifdef __thumb2__
330         itt     hs
331 # endif
332         eorhs   @x[2],@x[2],@t[2]
333         eorhs   @x[3],@x[3],@t[3]
334          ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
335         str     @x[1],[r14,#-12]
336         str     @x[2],[r14,#-8]
337         str     @x[3],[r14,#-4]
338
339         add     @x[4],@x[4],@t[0]       @ accumulate key material
340         add     @x[5],@x[5],@t[1]
341 # ifdef __thumb2__
342         itt     hs
343 # endif
344         ldrhs   @t[0],[r12],#16         @ load input
345         ldrhs   @t[1],[r12,#-12]
346         add     @x[6],@x[6],@t[2]
347         add     @x[7],@x[7],@t[3]
348 # ifdef __thumb2__
349         itt     hs
350 # endif
351         ldrhs   @t[2],[r12,#-8]
352         ldrhs   @t[3],[r12,#-4]
353 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
354         rev     @x[4],@x[4]
355         rev     @x[5],@x[5]
356         rev     @x[6],@x[6]
357         rev     @x[7],@x[7]
358 # endif
359 # ifdef __thumb2__
360         itt     hs
361 # endif
362         eorhs   @x[4],@x[4],@t[0]
363         eorhs   @x[5],@x[5],@t[1]
364          add    @t[0],sp,#4*(8)
365         str     @x[4],[r14],#16         @ store output
366 # ifdef __thumb2__
367         itt     hs
368 # endif
369         eorhs   @x[6],@x[6],@t[2]
370         eorhs   @x[7],@x[7],@t[3]
371         str     @x[5],[r14,#-12]
372          ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
373         str     @x[6],[r14,#-8]
374          add    @x[0],sp,#4*(16+8)
375         str     @x[7],[r14,#-4]
376
377         ldmia   @x[0],{@x[0]-@x[7]}     @ load second half
378
379         add     @x[0],@x[0],@t[0]       @ accumulate key material
380         add     @x[1],@x[1],@t[1]
381 # ifdef __thumb2__
382         itt     hs
383 # endif
384         ldrhs   @t[0],[r12],#16         @ load input
385         ldrhs   @t[1],[r12,#-12]
386 # ifdef __thumb2__
387         itt     hi
388 # endif
389          strhi  @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
390          strhi  @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
391         add     @x[2],@x[2],@t[2]
392         add     @x[3],@x[3],@t[3]
393 # ifdef __thumb2__
394         itt     hs
395 # endif
396         ldrhs   @t[2],[r12,#-8]
397         ldrhs   @t[3],[r12,#-4]
398 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
399         rev     @x[0],@x[0]
400         rev     @x[1],@x[1]
401         rev     @x[2],@x[2]
402         rev     @x[3],@x[3]
403 # endif
404 # ifdef __thumb2__
405         itt     hs
406 # endif
407         eorhs   @x[0],@x[0],@t[0]
408         eorhs   @x[1],@x[1],@t[1]
409          add    @t[0],sp,#4*(12)
410         str     @x[0],[r14],#16         @ store output
411 # ifdef __thumb2__
412         itt     hs
413 # endif
414         eorhs   @x[2],@x[2],@t[2]
415         eorhs   @x[3],@x[3],@t[3]
416         str     @x[1],[r14,#-12]
417          ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
418         str     @x[2],[r14,#-8]
419         str     @x[3],[r14,#-4]
420
421         add     @x[4],@x[4],@t[0]       @ accumulate key material
422         add     @x[5],@x[5],@t[1]
423 # ifdef __thumb2__
424         itt     hi
425 # endif
426          addhi  @t[0],@t[0],#1          @ next counter value
427          strhi  @t[0],[sp,#4*(12)]      @ save next counter value
428 # ifdef __thumb2__
429         itt     hs
430 # endif
431         ldrhs   @t[0],[r12],#16         @ load input
432         ldrhs   @t[1],[r12,#-12]
433         add     @x[6],@x[6],@t[2]
434         add     @x[7],@x[7],@t[3]
435 # ifdef __thumb2__
436         itt     hs
437 # endif
438         ldrhs   @t[2],[r12,#-8]
439         ldrhs   @t[3],[r12,#-4]
440 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
441         rev     @x[4],@x[4]
442         rev     @x[5],@x[5]
443         rev     @x[6],@x[6]
444         rev     @x[7],@x[7]
445 # endif
446 # ifdef __thumb2__
447         itt     hs
448 # endif
449         eorhs   @x[4],@x[4],@t[0]
450         eorhs   @x[5],@x[5],@t[1]
451 # ifdef __thumb2__
452          it     ne
453 # endif
454          ldrne  @t[0],[sp,#4*(32+2)]    @ re-load len
455 # ifdef __thumb2__
456         itt     hs
457 # endif
458         eorhs   @x[6],@x[6],@t[2]
459         eorhs   @x[7],@x[7],@t[3]
460         str     @x[4],[r14],#16         @ store output
461         str     @x[5],[r14,#-12]
462 # ifdef __thumb2__
463         it      hs
464 # endif
465          subhs  @t[3],@t[0],#64         @ len-=64
466         str     @x[6],[r14,#-8]
467         str     @x[7],[r14,#-4]
468         bhi     .Loop_outer
469
470         beq     .Ldone
471 # if __ARM_ARCH__<7
472         b       .Ltail
473
474 .align  4
475 .Lunaligned:                            @ unaligned endian-neutral path
476         cmp     @t[3],#64               @ restore flags
477 # endif
478 #endif
479 #if __ARM_ARCH__<7
480         ldr     @t[3],[sp,#4*(3)]
481 ___
482 for ($i=0;$i<16;$i+=4) {
483 my $j=$i&0x7;
484
485 $code.=<<___    if ($i==4);
486         add     @x[0],sp,#4*(16+8)
487 ___
488 $code.=<<___    if ($i==8);
489         ldmia   @x[0],{@x[0]-@x[7]}             @ load second half
490 # ifdef __thumb2__
491         itt     hi
492 # endif
493         strhi   @t[2],[sp,#4*(16+10)]           @ copy "@x[10]"
494         strhi   @t[3],[sp,#4*(16+11)]           @ copy "@x[11]"
495 ___
496 $code.=<<___;
497         add     @x[$j+0],@x[$j+0],@t[0]         @ accumulate key material
498 ___
499 $code.=<<___    if ($i==12);
500 # ifdef __thumb2__
501         itt     hi
502 # endif
503         addhi   @t[0],@t[0],#1                  @ next counter value
504         strhi   @t[0],[sp,#4*(12)]              @ save next counter value
505 ___
506 $code.=<<___;
507         add     @x[$j+1],@x[$j+1],@t[1]
508         add     @x[$j+2],@x[$j+2],@t[2]
509 # ifdef __thumb2__
510         itete   lo
511 # endif
512         eorlo   @t[0],@t[0],@t[0]               @ zero or ...
513         ldrhsb  @t[0],[r12],#16                 @ ... load input
514         eorlo   @t[1],@t[1],@t[1]
515         ldrhsb  @t[1],[r12,#-12]
516
517         add     @x[$j+3],@x[$j+3],@t[3]
518 # ifdef __thumb2__
519         itete   lo
520 # endif
521         eorlo   @t[2],@t[2],@t[2]
522         ldrhsb  @t[2],[r12,#-8]
523         eorlo   @t[3],@t[3],@t[3]
524         ldrhsb  @t[3],[r12,#-4]
525
526         eor     @x[$j+0],@t[0],@x[$j+0]         @ xor with input (or zero)
527         eor     @x[$j+1],@t[1],@x[$j+1]
528 # ifdef __thumb2__
529         itt     hs
530 # endif
531         ldrhsb  @t[0],[r12,#-15]                @ load more input
532         ldrhsb  @t[1],[r12,#-11]
533         eor     @x[$j+2],@t[2],@x[$j+2]
534          strb   @x[$j+0],[r14],#16              @ store output
535         eor     @x[$j+3],@t[3],@x[$j+3]
536 # ifdef __thumb2__
537         itt     hs
538 # endif
539         ldrhsb  @t[2],[r12,#-7]
540         ldrhsb  @t[3],[r12,#-3]
541          strb   @x[$j+1],[r14,#-12]
542         eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
543          strb   @x[$j+2],[r14,#-8]
544         eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
545 # ifdef __thumb2__
546         itt     hs
547 # endif
548         ldrhsb  @t[0],[r12,#-14]                @ load more input
549         ldrhsb  @t[1],[r12,#-10]
550          strb   @x[$j+3],[r14,#-4]
551         eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
552          strb   @x[$j+0],[r14,#-15]
553         eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
554 # ifdef __thumb2__
555         itt     hs
556 # endif
557         ldrhsb  @t[2],[r12,#-6]
558         ldrhsb  @t[3],[r12,#-2]
559          strb   @x[$j+1],[r14,#-11]
560         eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
561          strb   @x[$j+2],[r14,#-7]
562         eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
563 # ifdef __thumb2__
564         itt     hs
565 # endif
566         ldrhsb  @t[0],[r12,#-13]                @ load more input
567         ldrhsb  @t[1],[r12,#-9]
568          strb   @x[$j+3],[r14,#-3]
569         eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
570          strb   @x[$j+0],[r14,#-14]
571         eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
572 # ifdef __thumb2__
573         itt     hs
574 # endif
575         ldrhsb  @t[2],[r12,#-5]
576         ldrhsb  @t[3],[r12,#-1]
577          strb   @x[$j+1],[r14,#-10]
578          strb   @x[$j+2],[r14,#-6]
579         eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
580          strb   @x[$j+3],[r14,#-2]
581         eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
582          strb   @x[$j+0],[r14,#-13]
583         eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
584          strb   @x[$j+1],[r14,#-9]
585         eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
586          strb   @x[$j+2],[r14,#-5]
587          strb   @x[$j+3],[r14,#-1]
588 ___
589 $code.=<<___    if ($i<12);
590         add     @t[0],sp,#4*(4+$i)
591         ldmia   @t[0],{@t[0]-@t[3]}             @ load key material
592 ___
593 }
594 $code.=<<___;
595 # ifdef __thumb2__
596         it      ne
597 # endif
598         ldrne   @t[0],[sp,#4*(32+2)]            @ re-load len
599 # ifdef __thumb2__
600         it      hs
601 # endif
602         subhs   @t[3],@t[0],#64                 @ len-=64
603         bhi     .Loop_outer
604
605         beq     .Ldone
606 #endif
607
608 .Ltail:
609         ldr     r12,[sp,#4*(32+1)]      @ load inp
610         add     @t[1],sp,#4*(0)
611         ldr     r14,[sp,#4*(32+0)]      @ load out
612
613 .Loop_tail:
614         ldrb    @t[2],[@t[1]],#1        @ read buffer on stack
615         ldrb    @t[3],[r12],#1          @ read input
616         subs    @t[0],@t[0],#1
617         eor     @t[3],@t[3],@t[2]
618         strb    @t[3],[r14],#1          @ store output
619         bne     .Loop_tail
620
621 .Ldone:
622         add     sp,sp,#4*(32+3)
623 .Lno_data:
624         ldmia   sp!,{r4-r11,pc}
625 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
626 ___
627
628 {{{
629 my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
630     map("q$_",(0..15));
631
632 sub NEONROUND {
633 my $odd = pop;
634 my ($a,$b,$c,$d,$t)=@_;
635
636         (
637         "&vadd_i32      ($a,$a,$b)",
638         "&veor          ($d,$d,$a)",
639         "&vrev32_16     ($d,$d)",       # vrot ($d,16)
640
641         "&vadd_i32      ($c,$c,$d)",
642         "&veor          ($t,$b,$c)",
643         "&vshr_u32      ($b,$t,20)",
644         "&vsli_32       ($b,$t,12)",
645
646         "&vadd_i32      ($a,$a,$b)",
647         "&veor          ($t,$d,$a)",
648         "&vshr_u32      ($d,$t,24)",
649         "&vsli_32       ($d,$t,8)",
650
651         "&vadd_i32      ($c,$c,$d)",
652         "&veor          ($t,$b,$c)",
653         "&vshr_u32      ($b,$t,25)",
654         "&vsli_32       ($b,$t,7)",
655
656         "&vext_8        ($c,$c,$c,8)",
657         "&vext_8        ($b,$b,$b,$odd?12:4)",
658         "&vext_8        ($d,$d,$d,$odd?4:12)"
659         );
660 }
661
662 $code.=<<___;
663 #if __ARM_MAX_ARCH__>=7
664 .arch   armv7-a
665 .fpu    neon
666
667 .type   ChaCha20_neon,%function
668 .align  5
669 ChaCha20_neon:
670         ldr             r12,[sp,#0]             @ pull pointer to counter and nonce
671         stmdb           sp!,{r0-r2,r4-r11,lr}
672 .LChaCha20_neon:
673         adr             r14,.Lsigma
674         vstmdb          sp!,{d8-d15}            @ ABI spec says so
675         stmdb           sp!,{r0-r3}
676
677         vld1.32         {$b0-$c0},[r3]          @ load key
678         ldmia           r3,{r4-r11}             @ load key
679
680         sub             sp,sp,#4*(16+16)
681         vld1.32         {$d0},[r12]             @ load counter and nonce
682         add             r12,sp,#4*8
683         ldmia           r14,{r0-r3}             @ load sigma
684         vld1.32         {$a0},[r14]!            @ load sigma
685         vld1.32         {$t0},[r14]             @ one
686         vst1.32         {$c0-$d0},[r12]         @ copy 1/2key|counter|nonce
687         vst1.32         {$a0-$b0},[sp]          @ copy sigma|1/2key
688
689         str             r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
690         str             r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
691         vshl.i32        $t1#lo,$t0#lo,#1        @ two
692         vstr            $t0#lo,[sp,#4*(16+0)]
693         vshl.i32        $t2#lo,$t0#lo,#2        @ four
694         vstr            $t1#lo,[sp,#4*(16+2)]
695         vmov            $a1,$a0
696         vstr            $t2#lo,[sp,#4*(16+4)]
697         vmov            $a2,$a0
698         vmov            $b1,$b0
699         vmov            $b2,$b0
700         b               .Loop_neon_enter
701
702 .align  4
703 .Loop_neon_outer:
704         ldmia           sp,{r0-r9}              @ load key material
705         cmp             @t[3],#64*2             @ if len<=64*2
706         bls             .Lbreak_neon            @ switch to integer-only
707         vmov            $a1,$a0
708         str             @t[3],[sp,#4*(32+2)]    @ save len
709         vmov            $a2,$a0
710         str             r12,  [sp,#4*(32+1)]    @ save inp
711         vmov            $b1,$b0
712         str             r14,  [sp,#4*(32+0)]    @ save out
713         vmov            $b2,$b0
714 .Loop_neon_enter:
715         ldr             @t[3], [sp,#4*(15)]
716         vadd.i32        $d1,$d0,$t0             @ counter+1
717         ldr             @x[12],[sp,#4*(12)]     @ modulo-scheduled load
718         vmov            $c1,$c0
719         ldr             @t[2], [sp,#4*(13)]
720         vmov            $c2,$c0
721         ldr             @x[14],[sp,#4*(14)]
722         vadd.i32        $d2,$d1,$t0             @ counter+2
723         str             @t[3], [sp,#4*(16+15)]
724         mov             @t[3],#10
725         add             @x[12],@x[12],#3        @ counter+3
726         b               .Loop_neon
727
728 .align  4
729 .Loop_neon:
730         subs            @t[3],@t[3],#1
731 ___
732         my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
733         my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
734         my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
735         my @thread3=&ROUND(0,4,8,12);
736
737         foreach (@thread0) {
738                 eval;                   eval(shift(@thread3));
739                 eval(shift(@thread1));  eval(shift(@thread3));
740                 eval(shift(@thread2));  eval(shift(@thread3));
741         }
742
743         @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
744         @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
745         @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
746         @thread3=&ROUND(0,5,10,15);
747
748         foreach (@thread0) {
749                 eval;                   eval(shift(@thread3));
750                 eval(shift(@thread1));  eval(shift(@thread3));
751                 eval(shift(@thread2));  eval(shift(@thread3));
752         }
753 $code.=<<___;
754         bne             .Loop_neon
755
756         add             @t[3],sp,#32
757         vld1.32         {$t0-$t1},[sp]          @ load key material
758         vld1.32         {$t2-$t3},[@t[3]]
759
760         ldr             @t[3],[sp,#4*(32+2)]    @ load len
761
762         str             @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
763         str             @t[1], [sp,#4*(16+9)]
764         str             @x[12],[sp,#4*(16+12)]
765         str             @t[2], [sp,#4*(16+13)]
766         str             @x[14],[sp,#4*(16+14)]
767
768         @ at this point we have first half of 512-bit result in
769         @ @x[0-7] and second half at sp+4*(16+8)
770
771         ldr             r12,[sp,#4*(32+1)]      @ load inp
772         ldr             r14,[sp,#4*(32+0)]      @ load out
773
774         vadd.i32        $a0,$a0,$t0             @ accumulate key material
775         vadd.i32        $a1,$a1,$t0
776         vadd.i32        $a2,$a2,$t0
777         vldr            $t0#lo,[sp,#4*(16+0)]   @ one
778
779         vadd.i32        $b0,$b0,$t1
780         vadd.i32        $b1,$b1,$t1
781         vadd.i32        $b2,$b2,$t1
782         vldr            $t1#lo,[sp,#4*(16+2)]   @ two
783
784         vadd.i32        $c0,$c0,$t2
785         vadd.i32        $c1,$c1,$t2
786         vadd.i32        $c2,$c2,$t2
787         vadd.i32        $d1#lo,$d1#lo,$t0#lo    @ counter+1
788         vadd.i32        $d2#lo,$d2#lo,$t1#lo    @ counter+2
789
790         vadd.i32        $d0,$d0,$t3
791         vadd.i32        $d1,$d1,$t3
792         vadd.i32        $d2,$d2,$t3
793
794         cmp             @t[3],#64*4
795         blo             .Ltail_neon
796
797         vld1.8          {$t0-$t1},[r12]!        @ load input
798          mov            @t[3],sp
799         vld1.8          {$t2-$t3},[r12]!
800         veor            $a0,$a0,$t0             @ xor with input
801         veor            $b0,$b0,$t1
802         vld1.8          {$t0-$t1},[r12]!
803         veor            $c0,$c0,$t2
804         veor            $d0,$d0,$t3
805         vld1.8          {$t2-$t3},[r12]!
806
807         veor            $a1,$a1,$t0
808          vst1.8         {$a0-$b0},[r14]!        @ store output
809         veor            $b1,$b1,$t1
810         vld1.8          {$t0-$t1},[r12]!
811         veor            $c1,$c1,$t2
812          vst1.8         {$c0-$d0},[r14]!
813         veor            $d1,$d1,$t3
814         vld1.8          {$t2-$t3},[r12]!
815
816         veor            $a2,$a2,$t0
817          vld1.32        {$a0-$b0},[@t[3]]!      @ load for next iteration
818          veor           $t0#hi,$t0#hi,$t0#hi
819          vldr           $t0#lo,[sp,#4*(16+4)]   @ four
820         veor            $b2,$b2,$t1
821          vld1.32        {$c0-$d0},[@t[3]]
822         veor            $c2,$c2,$t2
823          vst1.8         {$a1-$b1},[r14]!
824         veor            $d2,$d2,$t3
825          vst1.8         {$c1-$d1},[r14]!
826
827         vadd.i32        $d0#lo,$d0#lo,$t0#lo    @ next counter value
828         vldr            $t0#lo,[sp,#4*(16+0)]   @ one
829
830         ldmia           sp,{@t[0]-@t[3]}        @ load key material
831         add             @x[0],@x[0],@t[0]       @ accumulate key material
832         ldr             @t[0],[r12],#16         @ load input
833          vst1.8         {$a2-$b2},[r14]!
834         add             @x[1],@x[1],@t[1]
835         ldr             @t[1],[r12,#-12]
836          vst1.8         {$c2-$d2},[r14]!
837         add             @x[2],@x[2],@t[2]
838         ldr             @t[2],[r12,#-8]
839         add             @x[3],@x[3],@t[3]
840         ldr             @t[3],[r12,#-4]
841 # ifdef __ARMEB__
842         rev             @x[0],@x[0]
843         rev             @x[1],@x[1]
844         rev             @x[2],@x[2]
845         rev             @x[3],@x[3]
846 # endif
847         eor             @x[0],@x[0],@t[0]       @ xor with input
848          add            @t[0],sp,#4*(4)
849         eor             @x[1],@x[1],@t[1]
850         str             @x[0],[r14],#16         @ store output
851         eor             @x[2],@x[2],@t[2]
852         str             @x[1],[r14,#-12]
853         eor             @x[3],@x[3],@t[3]
854          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
855         str             @x[2],[r14,#-8]
856         str             @x[3],[r14,#-4]
857
858         add             @x[4],@x[4],@t[0]       @ accumulate key material
859         ldr             @t[0],[r12],#16         @ load input
860         add             @x[5],@x[5],@t[1]
861         ldr             @t[1],[r12,#-12]
862         add             @x[6],@x[6],@t[2]
863         ldr             @t[2],[r12,#-8]
864         add             @x[7],@x[7],@t[3]
865         ldr             @t[3],[r12,#-4]
866 # ifdef __ARMEB__
867         rev             @x[4],@x[4]
868         rev             @x[5],@x[5]
869         rev             @x[6],@x[6]
870         rev             @x[7],@x[7]
871 # endif
872         eor             @x[4],@x[4],@t[0]
873          add            @t[0],sp,#4*(8)
874         eor             @x[5],@x[5],@t[1]
875         str             @x[4],[r14],#16         @ store output
876         eor             @x[6],@x[6],@t[2]
877         str             @x[5],[r14,#-12]
878         eor             @x[7],@x[7],@t[3]
879          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
880         str             @x[6],[r14,#-8]
881          add            @x[0],sp,#4*(16+8)
882         str             @x[7],[r14,#-4]
883
884         ldmia           @x[0],{@x[0]-@x[7]}     @ load second half
885
886         add             @x[0],@x[0],@t[0]       @ accumulate key material
887         ldr             @t[0],[r12],#16         @ load input
888         add             @x[1],@x[1],@t[1]
889         ldr             @t[1],[r12,#-12]
890 # ifdef __thumb2__
891         it      hi
892 # endif
893          strhi          @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
894         add             @x[2],@x[2],@t[2]
895         ldr             @t[2],[r12,#-8]
896 # ifdef __thumb2__
897         it      hi
898 # endif
899          strhi          @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
900         add             @x[3],@x[3],@t[3]
901         ldr             @t[3],[r12,#-4]
902 # ifdef __ARMEB__
903         rev             @x[0],@x[0]
904         rev             @x[1],@x[1]
905         rev             @x[2],@x[2]
906         rev             @x[3],@x[3]
907 # endif
908         eor             @x[0],@x[0],@t[0]
909          add            @t[0],sp,#4*(12)
910         eor             @x[1],@x[1],@t[1]
911         str             @x[0],[r14],#16         @ store output
912         eor             @x[2],@x[2],@t[2]
913         str             @x[1],[r14,#-12]
914         eor             @x[3],@x[3],@t[3]
915          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
916         str             @x[2],[r14,#-8]
917         str             @x[3],[r14,#-4]
918
919         add             @x[4],@x[4],@t[0]       @ accumulate key material
920          add            @t[0],@t[0],#4          @ next counter value
921         add             @x[5],@x[5],@t[1]
922          str            @t[0],[sp,#4*(12)]      @ save next counter value
923         ldr             @t[0],[r12],#16         @ load input
924         add             @x[6],@x[6],@t[2]
925          add            @x[4],@x[4],#3          @ counter+3
926         ldr             @t[1],[r12,#-12]
927         add             @x[7],@x[7],@t[3]
928         ldr             @t[2],[r12,#-8]
929         ldr             @t[3],[r12,#-4]
930 # ifdef __ARMEB__
931         rev             @x[4],@x[4]
932         rev             @x[5],@x[5]
933         rev             @x[6],@x[6]
934         rev             @x[7],@x[7]
935 # endif
936         eor             @x[4],@x[4],@t[0]
937 # ifdef __thumb2__
938         it      hi
939 # endif
940          ldrhi          @t[0],[sp,#4*(32+2)]    @ re-load len
941         eor             @x[5],@x[5],@t[1]
942         eor             @x[6],@x[6],@t[2]
943         str             @x[4],[r14],#16         @ store output
944         eor             @x[7],@x[7],@t[3]
945         str             @x[5],[r14,#-12]
946          sub            @t[3],@t[0],#64*4       @ len-=64*4
947         str             @x[6],[r14,#-8]
948         str             @x[7],[r14,#-4]
949         bhi             .Loop_neon_outer
950
951         b               .Ldone_neon
952
953 .align  4
954 .Lbreak_neon:
955         @ harmonize NEON and integer-only stack frames: load data
956         @ from NEON frame, but save to integer-only one; distance
957         @ between the two is 4*(32+4+16-32)=4*(20).
958
959         str             @t[3], [sp,#4*(20+32+2)]        @ save len
960          add            @t[3],sp,#4*(32+4)
961         str             r12,   [sp,#4*(20+32+1)]        @ save inp
962         str             r14,   [sp,#4*(20+32+0)]        @ save out
963
964         ldr             @x[12],[sp,#4*(16+10)]
965         ldr             @x[14],[sp,#4*(16+11)]
966          vldmia         @t[3],{d8-d15}                  @ fulfill ABI requirement
967         str             @x[12],[sp,#4*(20+16+10)]       @ copy "@x[10]"
968         str             @x[14],[sp,#4*(20+16+11)]       @ copy "@x[11]"
969
970         ldr             @t[3], [sp,#4*(15)]
971         ldr             @x[12],[sp,#4*(12)]             @ modulo-scheduled load
972         ldr             @t[2], [sp,#4*(13)]
973         ldr             @x[14],[sp,#4*(14)]
974         str             @t[3], [sp,#4*(20+16+15)]
975         add             @t[3],sp,#4*(20)
976         vst1.32         {$a0-$b0},[@t[3]]!              @ copy key
977         add             sp,sp,#4*(20)                   @ switch frame
978         vst1.32         {$c0-$d0},[@t[3]]
979         mov             @t[3],#10
980         b               .Loop                           @ go integer-only
981
982 .align  4
983 .Ltail_neon:
984         cmp             @t[3],#64*3
985         bhs             .L192_or_more_neon
986         cmp             @t[3],#64*2
987         bhs             .L128_or_more_neon
988         cmp             @t[3],#64*1
989         bhs             .L64_or_more_neon
990
991         add             @t[0],sp,#4*(8)
992         vst1.8          {$a0-$b0},[sp]
993         add             @t[2],sp,#4*(0)
994         vst1.8          {$c0-$d0},[@t[0]]
995         b               .Loop_tail_neon
996
997 .align  4
998 .L64_or_more_neon:
999         vld1.8          {$t0-$t1},[r12]!
1000         vld1.8          {$t2-$t3},[r12]!
1001         veor            $a0,$a0,$t0
1002         veor            $b0,$b0,$t1
1003         veor            $c0,$c0,$t2
1004         veor            $d0,$d0,$t3
1005         vst1.8          {$a0-$b0},[r14]!
1006         vst1.8          {$c0-$d0},[r14]!
1007
1008         beq             .Ldone_neon
1009
1010         add             @t[0],sp,#4*(8)
1011         vst1.8          {$a1-$b1},[sp]
1012         add             @t[2],sp,#4*(0)
1013         vst1.8          {$c1-$d1},[@t[0]]
1014         sub             @t[3],@t[3],#64*1       @ len-=64*1
1015         b               .Loop_tail_neon
1016
1017 .align  4
1018 .L128_or_more_neon:
1019         vld1.8          {$t0-$t1},[r12]!
1020         vld1.8          {$t2-$t3},[r12]!
1021         veor            $a0,$a0,$t0
1022         veor            $b0,$b0,$t1
1023         vld1.8          {$t0-$t1},[r12]!
1024         veor            $c0,$c0,$t2
1025         veor            $d0,$d0,$t3
1026         vld1.8          {$t2-$t3},[r12]!
1027
1028         veor            $a1,$a1,$t0
1029         veor            $b1,$b1,$t1
1030          vst1.8         {$a0-$b0},[r14]!
1031         veor            $c1,$c1,$t2
1032          vst1.8         {$c0-$d0},[r14]!
1033         veor            $d1,$d1,$t3
1034         vst1.8          {$a1-$b1},[r14]!
1035         vst1.8          {$c1-$d1},[r14]!
1036
1037         beq             .Ldone_neon
1038
1039         add             @t[0],sp,#4*(8)
1040         vst1.8          {$a2-$b2},[sp]
1041         add             @t[2],sp,#4*(0)
1042         vst1.8          {$c2-$d2},[@t[0]]
1043         sub             @t[3],@t[3],#64*2       @ len-=64*2
1044         b               .Loop_tail_neon
1045
1046 .align  4
1047 .L192_or_more_neon:
1048         vld1.8          {$t0-$t1},[r12]!
1049         vld1.8          {$t2-$t3},[r12]!
1050         veor            $a0,$a0,$t0
1051         veor            $b0,$b0,$t1
1052         vld1.8          {$t0-$t1},[r12]!
1053         veor            $c0,$c0,$t2
1054         veor            $d0,$d0,$t3
1055         vld1.8          {$t2-$t3},[r12]!
1056
1057         veor            $a1,$a1,$t0
1058         veor            $b1,$b1,$t1
1059         vld1.8          {$t0-$t1},[r12]!
1060         veor            $c1,$c1,$t2
1061          vst1.8         {$a0-$b0},[r14]!
1062         veor            $d1,$d1,$t3
1063         vld1.8          {$t2-$t3},[r12]!
1064
1065         veor            $a2,$a2,$t0
1066          vst1.8         {$c0-$d0},[r14]!
1067         veor            $b2,$b2,$t1
1068          vst1.8         {$a1-$b1},[r14]!
1069         veor            $c2,$c2,$t2
1070          vst1.8         {$c1-$d1},[r14]!
1071         veor            $d2,$d2,$t3
1072         vst1.8          {$a2-$b2},[r14]!
1073         vst1.8          {$c2-$d2},[r14]!
1074
1075         beq             .Ldone_neon
1076
1077         ldmia           sp,{@t[0]-@t[3]}        @ load key material
1078         add             @x[0],@x[0],@t[0]       @ accumulate key material
1079          add            @t[0],sp,#4*(4)
1080         add             @x[1],@x[1],@t[1]
1081         add             @x[2],@x[2],@t[2]
1082         add             @x[3],@x[3],@t[3]
1083          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
1084
1085         add             @x[4],@x[4],@t[0]       @ accumulate key material
1086          add            @t[0],sp,#4*(8)
1087         add             @x[5],@x[5],@t[1]
1088         add             @x[6],@x[6],@t[2]
1089         add             @x[7],@x[7],@t[3]
1090          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
1091 # ifdef __ARMEB__
1092         rev             @x[0],@x[0]
1093         rev             @x[1],@x[1]
1094         rev             @x[2],@x[2]
1095         rev             @x[3],@x[3]
1096         rev             @x[4],@x[4]
1097         rev             @x[5],@x[5]
1098         rev             @x[6],@x[6]
1099         rev             @x[7],@x[7]
1100 # endif
1101         stmia           sp,{@x[0]-@x[7]}
1102          add            @x[0],sp,#4*(16+8)
1103
1104         ldmia           @x[0],{@x[0]-@x[7]}     @ load second half
1105
1106         add             @x[0],@x[0],@t[0]       @ accumulate key material
1107          add            @t[0],sp,#4*(12)
1108         add             @x[1],@x[1],@t[1]
1109         add             @x[2],@x[2],@t[2]
1110         add             @x[3],@x[3],@t[3]
1111          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
1112
1113         add             @x[4],@x[4],@t[0]       @ accumulate key material
1114          add            @t[0],sp,#4*(8)
1115         add             @x[5],@x[5],@t[1]
1116          add            @x[4],@x[4],#3          @ counter+3
1117         add             @x[6],@x[6],@t[2]
1118         add             @x[7],@x[7],@t[3]
1119          ldr            @t[3],[sp,#4*(32+2)]    @ re-load len
1120 # ifdef __ARMEB__
1121         rev             @x[0],@x[0]
1122         rev             @x[1],@x[1]
1123         rev             @x[2],@x[2]
1124         rev             @x[3],@x[3]
1125         rev             @x[4],@x[4]
1126         rev             @x[5],@x[5]
1127         rev             @x[6],@x[6]
1128         rev             @x[7],@x[7]
1129 # endif
1130         stmia           @t[0],{@x[0]-@x[7]}
1131          add            @t[2],sp,#4*(0)
1132          sub            @t[3],@t[3],#64*3       @ len-=64*3
1133
1134 .Loop_tail_neon:
1135         ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
1136         ldrb            @t[1],[r12],#1          @ read input
1137         subs            @t[3],@t[3],#1
1138         eor             @t[0],@t[0],@t[1]
1139         strb            @t[0],[r14],#1          @ store output
1140         bne             .Loop_tail_neon
1141
1142 .Ldone_neon:
1143         add             sp,sp,#4*(32+4)
1144         vldmia          sp,{d8-d15}
1145         add             sp,sp,#4*(16+3)
1146         ldmia           sp!,{r4-r11,pc}
1147 .size   ChaCha20_neon,.-ChaCha20_neon
1148 .comm   OPENSSL_armcap_P,4,4
1149 #endif
1150 ___
1151 }}}
1152
1153 foreach (split("\n",$code)) {
1154         s/\`([^\`]*)\`/eval $1/geo;
1155
1156         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1157
1158         print $_,"\n";
1159 }
1160 close STDOUT;