chacha/asm/chacha*: ensure that zero length is handled (without crash).
[openssl.git] / crypto / chacha / asm / chacha-armv4.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # December 2014
11
12 # ChaCha20 for ARMv4.
13 #
14 # Performance in cycles per byte out of large buffer.
15 #
16 #                       IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
17 #
18 # Cortex-A5             19.3(*)/+95%    21.8        14.1
19 # Cortex-A8             10.5(*)/+160%   13.9        6.35
20 # Cortex-A9             12.9(**)/+110%  14.3        6.50
21 # Cortex-A15            11.0/+40%       16.0        5.00
22 # Snapdragon S4         11.5/+125%      13.6        4.90
23 #
24 # (*)   most "favourable" result for aligned data on little-endian
25 #       processor, result for misaligned data is 10-15% lower;
26 # (**)  this result is a trade-off: it can be improved by 20%,
27 #       but then Snapdragon S4 and Cortex-A8 results get
28 #       20-25% worse;
29
# Command line: [flavour] output-file
#
# If the first argument already looks like a file name ("word.ext") it is
# taken as the output file and no flavour is in effect.  Otherwise the first
# argument is the flavour and the following arguments are scanned until one
# that looks like a file name is found (or the arguments run out).
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # A real flavour was requested: pipe everything printed to STDOUT through
    # arm-xlate.pl, looked up next to this script first and under
    # ../../perlasm/ second.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failed pipe open would otherwise silently discard the generated code.
    open(STDOUT,"| \"$^X\" $xlate $flavour $output")
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation: write the generated code straight to the output file.
    open(STDOUT,">$output")
        or die "can't open $output: $!";
}
# With neither a flavour nor an output file STDOUT is left untouched, so the
# code goes to the caller's standard output instead of being lost to a
# failed open(STDOUT,">").
44
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{
    # Every call to an otherwise undefined sub, e.g. &vadd_i32(...), ends up
    # here and is turned into one line of assembly appended to $code.
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;     # strip the package qualifier
    $mnemonic =~ s/_/\./;                       # first '_' becomes '.'

    # Only the last operand is inspected: if it round-trips through numeric
    # conversion unchanged it is emitted as an immediate ("#N").
    my $last = pop;
    $last = "#$last" if ($last*1 eq $last);

    $code .= "\t$mnemonic\t".join(',',@_,$last)."\n";
}
51
# Register allocation for the integer-only code path.
#
# @x maps the sixteen state words to ARM registers.  Words 0-7, 12 and 14
# have a register of their own; an "rx" entry marks a word that has no
# permanent register and is kept in the off-load area on the stack.
# @t names the four scratch registers r8-r11.
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

# ROUND(a0,b0,c0,d0)
#
# Takes the state-word indices of the first quarter-round of a round, e.g.
# (0,4,8,12) for an even round or (0,5,10,15) for an odd one, and derives the
# indices of the other three quarter-rounds by rotating each index within its
# group of four.  Returns a list of 54 strings, each a perlasm call such as
# "&add (r0,r0,r4)" that emits one instruction when eval-ed.  Nothing is
# emitted here; the caller decides when (and interleaved with what) each
# string is evaluated.
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
# 'c' words of the current pair of quarter-rounds live in scratch registers.
my ($xc,$xc_) = @t[0..1];
# One 'd' word of the pair has its own register, the other uses a scratch
# register; which is which depends on the round's parity.
my ($xd,$xd_) = $odd ? ($t[2],$x[$d1]) : ($x[$d0],$t[2]);
my @ret;

        # Consider order in which variables are addressed by their
        # index:
        #
        #       a   b   c   d
        #
        #       0   4   8  12 < even round
        #       1   5   9  13
        #       2   6  10  14
        #       3   7  11  15
        #       0   5  10  15 < odd round
        #       1   6  11  12
        #       2   7   8  13
        #       3   4   9  14
        #
        # 'a', 'b' are permanently allocated in registers, @x[0..7],
        # while 'c's and pair of 'd's are maintained in memory. If
        # you observe 'c' column, you'll notice that pair of 'c's is
        # invariant between rounds. This means that we have to reload
        # them once per round, in the middle. This is why you'll see
        # bunch of 'c' stores and loads in the middle, but none in
        # the beginning or end. If you observe 'd' column, you'll
        # notice that 15 and 13 are reused in next pair of rounds.
        # This is why these two are chosen for offloading to memory,
        # to make loads count more.

        # First pair of quarter-rounds (indices 0 and 1), interleaved.
        push @ret,(
        "&add   (@x[$a0],@x[$a0],@x[$b0])",
        "&mov   ($xd,$xd,'ror#16')",
         "&add  (@x[$a1],@x[$a1],@x[$b1])",
         "&mov  ($xd_,$xd_,'ror#16')",
        "&eor   ($xd,$xd,@x[$a0],'ror#16')",
         "&eor  ($xd_,$xd_,@x[$a1],'ror#16')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b0],@x[$b0],'ror#20')",
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b1],@x[$b1],'ror#20')",
        "&eor   (@x[$b0],@x[$b0],$xc,'ror#20')",
         "&eor  (@x[$b1],@x[$b1],$xc_,'ror#20')",

        "&add   (@x[$a0],@x[$a0],@x[$b0])",
        "&mov   ($xd,$xd,'ror#24')",
         "&add  (@x[$a1],@x[$a1],@x[$b1])",
         "&mov  ($xd_,$xd_,'ror#24')",
        "&eor   ($xd,$xd,@x[$a0],'ror#24')",
         "&eor  ($xd_,$xd_,@x[$a1],'ror#24')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b0],@x[$b0],'ror#25')"             );

        # The 'd' word held in the scratch register is swapped with the one
        # needed by the second pair; which of the two it is depends on parity.
        push @ret,(
        "&str   ($xd,'[sp,#4*(16+$d0)]')",
        "&ldr   ($xd,'[sp,#4*(16+$d2)]')"               ) if ($odd);
        push @ret,(
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b1],@x[$b1],'ror#25')"             );
        push @ret,(
         "&str  ($xd_,'[sp,#4*(16+$d1)]')",
         "&ldr  ($xd_,'[sp,#4*(16+$d3)]')"              ) if (!$odd);
        push @ret,(
        "&eor   (@x[$b0],@x[$b0],$xc,'ror#25')",
         "&eor  (@x[$b1],@x[$b1],$xc_,'ror#25')"        );

        # The other 'd' word of the second pair is the register-resident one.
        $xd=$x[$d2]                                     if (!$odd);
        $xd_=$x[$d3]                                    if ($odd);

        # Second pair of quarter-rounds (indices 2 and 3); the 'c' words are
        # exchanged with the off-load area first.
        push @ret,(
        "&str   ($xc,'[sp,#4*(16+$c0)]')",
        "&ldr   ($xc,'[sp,#4*(16+$c2)]')",
        "&add   (@x[$a2],@x[$a2],@x[$b2])",
        "&mov   ($xd,$xd,'ror#16')",
         "&str  ($xc_,'[sp,#4*(16+$c1)]')",
         "&ldr  ($xc_,'[sp,#4*(16+$c3)]')",
         "&add  (@x[$a3],@x[$a3],@x[$b3])",
         "&mov  ($xd_,$xd_,'ror#16')",
        "&eor   ($xd,$xd,@x[$a2],'ror#16')",
         "&eor  ($xd_,$xd_,@x[$a3],'ror#16')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b2],@x[$b2],'ror#20')",
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b3],@x[$b3],'ror#20')",
        "&eor   (@x[$b2],@x[$b2],$xc,'ror#20')",
         "&eor  (@x[$b3],@x[$b3],$xc_,'ror#20')",

        "&add   (@x[$a2],@x[$a2],@x[$b2])",
        "&mov   ($xd,$xd,'ror#24')",
         "&add  (@x[$a3],@x[$a3],@x[$b3])",
         "&mov  ($xd_,$xd_,'ror#24')",
        "&eor   ($xd,$xd,@x[$a2],'ror#24')",
         "&eor  ($xd_,$xd_,@x[$a3],'ror#24')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b2],@x[$b2],'ror#25')",
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b3],@x[$b3],'ror#25')",
        "&eor   (@x[$b2],@x[$b2],$xc,'ror#25')",
         "&eor  (@x[$b3],@x[$b3],$xc_,'ror#25')"        );

        @ret;
}
163
# ChaCha20_ctr32: integer-only code path and run-time dispatch.
#
# Register use on entry, as read off the code below: r0 = out, r1 = inp,
# r2 = len, r3 = key, and the pointer to counter and nonce is taken from
# [sp,#0].  r0-r2 are pushed and later addressed as
# [sp,#4*(32+0)] (out), [sp,#4*(32+1)] (inp) and [sp,#4*(32+2)] (len), above
# a 16-word copy of the key material at sp and a 16-word off-load area at
# sp+4*16.
#
# Flow: zero length returns at once; when __ARM_MAX_ARCH__>=7, inputs longer
# than 192 bytes branch to .LChaCha20_neon if bit 0 of OPENSSL_armcap_P is
# set; everything else is processed here, one 64-byte block per .Loop_outer
# iteration.
#
# Tail handling: a final block shorter than 64 bytes has its key stream
# written to the stack (inp/out are redirected to sp) and is then xor-ed
# byte by byte in .Loop_tail.  The remaining length has to be re-loaded for
# that loop because every scratch register, including the one the length was
# compared in, is overwritten with key material while the block is produced.
# The re-load is therefore predicated on "ne" (covering both "more than one
# block left" and "partial block left") rather than on "hi", and .Loop_tail
# counts down in the register that received it, @t[0].
$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb  ldrbhs
#endif

.align  5
.Lsigma:
.long   0x61707865,0x3320646e,0x79622d32,0x6b206574     @ endian-neutral
.Lone:
.long   1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word   -1
#endif

.globl  ChaCha20_ctr32
.type   ChaCha20_ctr32,%function
.align  5
ChaCha20_ctr32:
.LChaCha20_ctr32:
        ldr     r12,[sp,#0]             @ pull pointer to counter and nonce
        stmdb   sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r14,pc,#16              @ ChaCha20_ctr32
#else
        adr     r14,.LChaCha20_ctr32
#endif
        cmp     r2,#0                   @ len==0?
#ifdef  __thumb2__
        itt     eq
#endif
        addeq   sp,sp,#4*3
        beq     .Lno_data
#if __ARM_MAX_ARCH__>=7
        cmp     r2,#192                 @ test len
        bls     .Lshort
        ldr     r4,[r14,#-32]
        ldr     r4,[r14,r4]
# ifdef __APPLE__
        ldr     r4,[r4]
# endif
        tst     r4,#1
        bne     .LChaCha20_neon
.Lshort:
#endif
        ldmia   r12,{r4-r7}             @ load counter and nonce
        sub     sp,sp,#4*(16)           @ off-load area
        sub     r14,r14,#64             @ .Lsigma
        stmdb   sp!,{r4-r7}             @ copy counter and nonce
        ldmia   r3,{r4-r11}             @ load key
        ldmia   r14,{r0-r3}             @ load sigma
        stmdb   sp!,{r4-r11}            @ copy key
        stmdb   sp!,{r0-r3}             @ copy sigma
        str     r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
        str     r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
        b       .Loop_outer_enter

.align  4
.Loop_outer:
        ldmia   sp,{r0-r9}              @ load key material
        str     @t[3],[sp,#4*(32+2)]    @ save len
        str     r12,  [sp,#4*(32+1)]    @ save inp
        str     r14,  [sp,#4*(32+0)]    @ save out
.Loop_outer_enter:
        ldr     @t[3], [sp,#4*(15)]
        ldr     @x[12],[sp,#4*(12)]     @ modulo-scheduled load
        ldr     @t[2], [sp,#4*(13)]
        ldr     @x[14],[sp,#4*(14)]
        str     @t[3], [sp,#4*(16+15)]
        mov     @t[3],#10
        b       .Loop

.align  4
.Loop:
        subs    @t[3],@t[3],#1
___
        # Body of .Loop: one even and one odd round, executed 10 times.
        foreach (&ROUND(0, 4, 8,12)) { eval; }
        foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
        bne     .Loop

        ldr     @t[3],[sp,#4*(32+2)]    @ load len

        str     @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
        str     @t[1], [sp,#4*(16+9)]
        str     @x[12],[sp,#4*(16+12)]
        str     @t[2], [sp,#4*(16+13)]
        str     @x[14],[sp,#4*(16+14)]

        @ at this point we have first half of 512-bit result in
        @ @x[0-7] and second half at sp+4*(16+8)

        cmp     @t[3],#64               @ done yet?
#ifdef  __thumb2__
        itete   lo
#endif
        addlo   r12,sp,#4*(0)           @ shortcut or ...
        ldrhs   r12,[sp,#4*(32+1)]      @ ... load inp
        addlo   r14,sp,#4*(0)           @ shortcut or ...
        ldrhs   r14,[sp,#4*(32+0)]      @ ... load out

        ldr     @t[0],[sp,#4*(0)]       @ load key material
        ldr     @t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
        orr     @t[2],r12,r14
        tst     @t[2],#3                @ are input and output aligned?
        ldr     @t[2],[sp,#4*(2)]
        bne     .Lunaligned
        cmp     @t[3],#64               @ restore flags
# else
        ldr     @t[2],[sp,#4*(2)]
# endif
        ldr     @t[3],[sp,#4*(3)]

        add     @x[0],@x[0],@t[0]       @ accumulate key material
        add     @x[1],@x[1],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]

        add     @x[2],@x[2],@t[2]
        add     @x[3],@x[3],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[0],@x[0]
        rev     @x[1],@x[1]
        rev     @x[2],@x[2]
        rev     @x[3],@x[3]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[0],@x[0],@t[0]       @ xor with input
        eorhs   @x[1],@x[1],@t[1]
         add    @t[0],sp,#4*(4)
        str     @x[0],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[2],@x[2],@t[2]
        eorhs   @x[3],@x[3],@t[3]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[1],[r14,#-12]
        str     @x[2],[r14,#-8]
        str     @x[3],[r14,#-4]

        add     @x[4],@x[4],@t[0]       @ accumulate key material
        add     @x[5],@x[5],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
        add     @x[6],@x[6],@t[2]
        add     @x[7],@x[7],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[4],@x[4]
        rev     @x[5],@x[5]
        rev     @x[6],@x[6]
        rev     @x[7],@x[7]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
         add    @t[0],sp,#4*(8)
        str     @x[4],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[6],@x[6],@t[2]
        eorhs   @x[7],@x[7],@t[3]
        str     @x[5],[r14,#-12]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[6],[r14,#-8]
         add    @x[0],sp,#4*(16+8)
        str     @x[7],[r14,#-4]

        ldmia   @x[0],{@x[0]-@x[7]}     @ load second half

        add     @x[0],@x[0],@t[0]       @ accumulate key material
        add     @x[1],@x[1],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
# ifdef __thumb2__
        itt     hi
# endif
         strhi  @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
         strhi  @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
        add     @x[2],@x[2],@t[2]
        add     @x[3],@x[3],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[0],@x[0]
        rev     @x[1],@x[1]
        rev     @x[2],@x[2]
        rev     @x[3],@x[3]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[0],@x[0],@t[0]
        eorhs   @x[1],@x[1],@t[1]
         add    @t[0],sp,#4*(12)
        str     @x[0],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[2],@x[2],@t[2]
        eorhs   @x[3],@x[3],@t[3]
        str     @x[1],[r14,#-12]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[2],[r14,#-8]
        str     @x[3],[r14,#-4]

        add     @x[4],@x[4],@t[0]       @ accumulate key material
        add     @x[5],@x[5],@t[1]
# ifdef __thumb2__
        itt     hi
# endif
         addhi  @t[0],@t[0],#1          @ next counter value
         strhi  @t[0],[sp,#4*(12)]      @ save next counter value
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
        add     @x[6],@x[6],@t[2]
        add     @x[7],@x[7],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[4],@x[4]
        rev     @x[5],@x[5]
        rev     @x[6],@x[6]
        rev     @x[7],@x[7]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
# ifdef __thumb2__
        it      ne
# endif
         ldrne  @t[0],[sp,#4*(32+2)]    @ re-load len
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[6],@x[6],@t[2]
        eorhs   @x[7],@x[7],@t[3]
        str     @x[4],[r14],#16         @ store output
        str     @x[5],[r14,#-12]
# ifdef __thumb2__
        it      hs
# endif
         subhs  @t[3],@t[0],#64         @ len-=64
        str     @x[6],[r14,#-8]
        str     @x[7],[r14,#-4]
        bhi     .Loop_outer

        beq     .Ldone
# if __ARM_ARCH__<7
        b       .Ltail

.align  4
.Lunaligned:                            @ unaligned endian-neutral path
        cmp     @t[3],#64               @ restore flags
# endif
#endif
#if __ARM_ARCH__<7
        ldr     @t[3],[sp,#4*(3)]
___
# Unaligned, endian-neutral path: the block is handled as four groups of four
# words ($i = 0, 4, 8, 12), input and output being accessed one byte at a
# time.  $j selects the register holding the word: the second half of the
# state is loaded into the same @x[0..7] registers once the first half has
# been stored.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___    if ($i==4);
        add     @x[0],sp,#4*(16+8)
___
$code.=<<___    if ($i==8);
        ldmia   @x[0],{@x[0]-@x[7]}             @ load second half
# ifdef __thumb2__
        itt     hi
# endif
        strhi   @t[2],[sp,#4*(16+10)]           @ copy "@x[10]"
        strhi   @t[3],[sp,#4*(16+11)]           @ copy "@x[11]"
___
$code.=<<___;
        add     @x[$j+0],@x[$j+0],@t[0]         @ accumulate key material
___
$code.=<<___    if ($i==12);
# ifdef __thumb2__
        itt     hi
# endif
        addhi   @t[0],@t[0],#1                  @ next counter value
        strhi   @t[0],[sp,#4*(12)]              @ save next counter value
___
$code.=<<___;
        add     @x[$j+1],@x[$j+1],@t[1]
        add     @x[$j+2],@x[$j+2],@t[2]
# ifdef __thumb2__
        itete   lo
# endif
        eorlo   @t[0],@t[0],@t[0]               @ zero or ...
        ldrhsb  @t[0],[r12],#16                 @ ... load input
        eorlo   @t[1],@t[1],@t[1]
        ldrhsb  @t[1],[r12,#-12]

        add     @x[$j+3],@x[$j+3],@t[3]
# ifdef __thumb2__
        itete   lo
# endif
        eorlo   @t[2],@t[2],@t[2]
        ldrhsb  @t[2],[r12,#-8]
        eorlo   @t[3],@t[3],@t[3]
        ldrhsb  @t[3],[r12,#-4]

        eor     @x[$j+0],@t[0],@x[$j+0]         @ xor with input (or zero)
        eor     @x[$j+1],@t[1],@x[$j+1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-15]                @ load more input
        ldrhsb  @t[1],[r12,#-11]
        eor     @x[$j+2],@t[2],@x[$j+2]
         strb   @x[$j+0],[r14],#16              @ store output
        eor     @x[$j+3],@t[3],@x[$j+3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-7]
        ldrhsb  @t[3],[r12,#-3]
         strb   @x[$j+1],[r14,#-12]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+2],[r14,#-8]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-14]                @ load more input
        ldrhsb  @t[1],[r12,#-10]
         strb   @x[$j+3],[r14,#-4]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+0],[r14,#-15]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-6]
        ldrhsb  @t[3],[r12,#-2]
         strb   @x[$j+1],[r14,#-11]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+2],[r14,#-7]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-13]                @ load more input
        ldrhsb  @t[1],[r12,#-9]
         strb   @x[$j+3],[r14,#-3]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+0],[r14,#-14]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-5]
        ldrhsb  @t[3],[r12,#-1]
         strb   @x[$j+1],[r14,#-10]
         strb   @x[$j+2],[r14,#-6]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+3],[r14,#-2]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
         strb   @x[$j+0],[r14,#-13]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+1],[r14,#-9]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
         strb   @x[$j+2],[r14,#-5]
         strb   @x[$j+3],[r14,#-1]
___
$code.=<<___    if ($i<12);
        add     @t[0],sp,#4*(4+$i)
        ldmia   @t[0],{@t[0]-@t[3]}             @ load key material
___
}
# End of the unaligned path: same length bookkeeping as in the aligned path,
# then the byte-wise tail loop shared by both.
$code.=<<___;
# ifdef __thumb2__
        it      ne
# endif
        ldrne   @t[0],[sp,#4*(32+2)]            @ re-load len
# ifdef __thumb2__
        it      hs
# endif
        subhs   @t[3],@t[0],#64                 @ len-=64
        bhi     .Loop_outer

        beq     .Ldone
#endif

.Ltail:
        ldr     r12,[sp,#4*(32+1)]      @ load inp
        add     @t[1],sp,#4*(0)
        ldr     r14,[sp,#4*(32+0)]      @ load out

.Loop_tail:
        ldrb    @t[2],[@t[1]],#1        @ read buffer on stack
        ldrb    @t[3],[r12],#1          @ read input
        subs    @t[0],@t[0],#1
        eor     @t[3],@t[3],@t[2]
        strb    @t[3],[r14],#1          @ store output
        bne     .Loop_tail

.Ldone:
        add     sp,sp,#4*(32+3)
.Lno_data:
        ldmia   sp!,{r4-r11,pc}
.size   ChaCha20_ctr32,.-ChaCha20_ctr32
___
618
619 {{{
620 my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
621     map("q$_",(0..15));
622
# NEONROUND(a,b,c,d,t,odd)
#
# Returns, in list context, the 18 perlasm call strings for one round
# applied to the vector registers named by the first four arguments, with
# the fifth used as a temporary.  The last argument is interpolated into the
# lane-rotation amounts of the closing vext instructions: for a true value
# 'b' is rotated by 12 bytes and 'd' by 4, otherwise the other way round.
sub NEONROUND {
my @operands = @_;
my $is_odd = pop @operands;             # parity is always the last argument
my ($va,$vb,$vc,$vd,$vt) = @operands;
my @insns;

        # a += b; d ^= a; d <<<= 16
        push @insns,
        "&vadd_i32      ($va,$va,$vb)",
        "&veor          ($vd,$vd,$va)",
        "&vrev32_16     ($vd,$vd)";     # vrot ($vd,16)

        # c += d; b ^= c; b <<<= 12
        push @insns,
        "&vadd_i32      ($vc,$vc,$vd)",
        "&veor          ($vt,$vb,$vc)",
        "&vshr_u32      ($vb,$vt,20)",
        "&vsli_32       ($vb,$vt,12)";

        # a += b; d ^= a; d <<<= 8
        push @insns,
        "&vadd_i32      ($va,$va,$vb)",
        "&veor          ($vt,$vd,$va)",
        "&vshr_u32      ($vd,$vt,24)",
        "&vsli_32       ($vd,$vt,8)";

        # c += d; b ^= c; b <<<= 7
        push @insns,
        "&vadd_i32      ($vc,$vc,$vd)",
        "&veor          ($vt,$vb,$vc)",
        "&vshr_u32      ($vb,$vt,25)",
        "&vsli_32       ($vb,$vt,7)";

        # rotate the lanes of b, c and d for the next round
        push @insns,
        "&vext_8        ($vc,$vc,$vc,8)",
        "&vext_8        ($vb,$vb,$vb,$is_odd?12:4)",
        "&vext_8        ($vd,$vd,$vd,$is_odd?4:12)";

        return @insns;
}
652
653 $code.=<<___;
654 #if __ARM_MAX_ARCH__>=7
655 .arch   armv7-a
656 .fpu    neon
657
658 .type   ChaCha20_neon,%function
659 .align  5
660 ChaCha20_neon:
661         ldr             r12,[sp,#0]             @ pull pointer to counter and nonce
662         stmdb           sp!,{r0-r2,r4-r11,lr}
663 .LChaCha20_neon:
664         adr             r14,.Lsigma
665         vstmdb          sp!,{d8-d15}            @ ABI spec says so
666         stmdb           sp!,{r0-r3}
667
668         vld1.32         {$b0-$c0},[r3]          @ load key
669         ldmia           r3,{r4-r11}             @ load key
670
671         sub             sp,sp,#4*(16+16)
672         vld1.32         {$d0},[r12]             @ load counter and nonce
673         add             r12,sp,#4*8
674         ldmia           r14,{r0-r3}             @ load sigma
675         vld1.32         {$a0},[r14]!            @ load sigma
676         vld1.32         {$t0},[r14]             @ one
677         vst1.32         {$c0-$d0},[r12]         @ copy 1/2key|counter|nonce
678         vst1.32         {$a0-$b0},[sp]          @ copy sigma|1/2key
679
680         str             r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
681         str             r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
682         vshl.i32        $t1#lo,$t0#lo,#1        @ two
683         vstr            $t0#lo,[sp,#4*(16+0)]
684         vshl.i32        $t2#lo,$t0#lo,#2        @ four
685         vstr            $t1#lo,[sp,#4*(16+2)]
686         vmov            $a1,$a0
687         vstr            $t2#lo,[sp,#4*(16+4)]
688         vmov            $a2,$a0
689         vmov            $b1,$b0
690         vmov            $b2,$b0
691         b               .Loop_neon_enter
692
693 .align  4
694 .Loop_neon_outer:
695         ldmia           sp,{r0-r9}              @ load key material
696         cmp             @t[3],#64*2             @ if len<=64*2
697         bls             .Lbreak_neon            @ switch to integer-only
698         vmov            $a1,$a0
699         str             @t[3],[sp,#4*(32+2)]    @ save len
700         vmov            $a2,$a0
701         str             r12,  [sp,#4*(32+1)]    @ save inp
702         vmov            $b1,$b0
703         str             r14,  [sp,#4*(32+0)]    @ save out
704         vmov            $b2,$b0
705 .Loop_neon_enter:
706         ldr             @t[3], [sp,#4*(15)]
707         vadd.i32        $d1,$d0,$t0             @ counter+1
708         ldr             @x[12],[sp,#4*(12)]     @ modulo-scheduled load
709         vmov            $c1,$c0
710         ldr             @t[2], [sp,#4*(13)]
711         vmov            $c2,$c0
712         ldr             @x[14],[sp,#4*(14)]
713         vadd.i32        $d2,$d1,$t0             @ counter+2
714         str             @t[3], [sp,#4*(16+15)]
715         mov             @t[3],#10
716         add             @x[12],@x[12],#3        @ counter+3 
717         b               .Loop_neon
718
719 .align  4
720 .Loop_neon:
721         subs            @t[3],@t[3],#1
722 ___
723         my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
724         my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
725         my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
726         my @thread3=&ROUND(0,4,8,12);
727
728         foreach (@thread0) {
729                 eval;                   eval(shift(@thread3));
730                 eval(shift(@thread1));  eval(shift(@thread3));
731                 eval(shift(@thread2));  eval(shift(@thread3));
732         }
733
734         @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
735         @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
736         @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
737         @thread3=&ROUND(0,5,10,15);
738
739         foreach (@thread0) {
740                 eval;                   eval(shift(@thread3));
741                 eval(shift(@thread1));  eval(shift(@thread3));
742                 eval(shift(@thread2));  eval(shift(@thread3));
743         }
744 $code.=<<___;
745         bne             .Loop_neon
746
747         add             @t[3],sp,#32
748         vld1.32         {$t0-$t1},[sp]          @ load key material
749         vld1.32         {$t2-$t3},[@t[3]]
750
751         ldr             @t[3],[sp,#4*(32+2)]    @ load len
752
753         str             @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
754         str             @t[1], [sp,#4*(16+9)]
755         str             @x[12],[sp,#4*(16+12)]
756         str             @t[2], [sp,#4*(16+13)]
757         str             @x[14],[sp,#4*(16+14)]
758
759         @ at this point we have first half of 512-bit result in
760         @ @x[0-7] and second half at sp+4*(16+8)
761
762         ldr             r12,[sp,#4*(32+1)]      @ load inp
763         ldr             r14,[sp,#4*(32+0)]      @ load out
764
765         vadd.i32        $a0,$a0,$t0             @ accumulate key material
766         vadd.i32        $a1,$a1,$t0
767         vadd.i32        $a2,$a2,$t0
768         vldr            $t0#lo,[sp,#4*(16+0)]   @ one
769
770         vadd.i32        $b0,$b0,$t1
771         vadd.i32        $b1,$b1,$t1
772         vadd.i32        $b2,$b2,$t1
773         vldr            $t1#lo,[sp,#4*(16+2)]   @ two
774
775         vadd.i32        $c0,$c0,$t2
776         vadd.i32        $c1,$c1,$t2
777         vadd.i32        $c2,$c2,$t2
778         vadd.i32        $d1#lo,$d1#lo,$t0#lo    @ counter+1
779         vadd.i32        $d2#lo,$d2#lo,$t1#lo    @ counter+2
780
781         vadd.i32        $d0,$d0,$t3
782         vadd.i32        $d1,$d1,$t3
783         vadd.i32        $d2,$d2,$t3
784
785         cmp             @t[3],#64*4
786         blo             .Ltail_neon
787
788         vld1.8          {$t0-$t1},[r12]!        @ load input
789          mov            @t[3],sp
790         vld1.8          {$t2-$t3},[r12]!
791         veor            $a0,$a0,$t0             @ xor with input
792         veor            $b0,$b0,$t1
793         vld1.8          {$t0-$t1},[r12]!
794         veor            $c0,$c0,$t2
795         veor            $d0,$d0,$t3
796         vld1.8          {$t2-$t3},[r12]!
797
798         veor            $a1,$a1,$t0
799          vst1.8         {$a0-$b0},[r14]!        @ store output
800         veor            $b1,$b1,$t1
801         vld1.8          {$t0-$t1},[r12]!
802         veor            $c1,$c1,$t2
803          vst1.8         {$c0-$d0},[r14]!
804         veor            $d1,$d1,$t3
805         vld1.8          {$t2-$t3},[r12]!
806
807         veor            $a2,$a2,$t0
808          vld1.32        {$a0-$b0},[@t[3]]!      @ load for next iteration
809          veor           $t0#hi,$t0#hi,$t0#hi
810          vldr           $t0#lo,[sp,#4*(16+4)]   @ four
811         veor            $b2,$b2,$t1
812          vld1.32        {$c0-$d0},[@t[3]]
813         veor            $c2,$c2,$t2
814          vst1.8         {$a1-$b1},[r14]!
815         veor            $d2,$d2,$t3
816          vst1.8         {$c1-$d1},[r14]!
817
818         vadd.i32        $d0#lo,$d0#lo,$t0#lo    @ next counter value
819         vldr            $t0#lo,[sp,#4*(16+0)]   @ one
820
821         ldmia           sp,{@t[0]-@t[3]}        @ load key material
822         add             @x[0],@x[0],@t[0]       @ accumulate key material
823         ldr             @t[0],[r12],#16         @ load input
824          vst1.8         {$a2-$b2},[r14]!
825         add             @x[1],@x[1],@t[1]
826         ldr             @t[1],[r12,#-12]
827          vst1.8         {$c2-$d2},[r14]!
828         add             @x[2],@x[2],@t[2]
829         ldr             @t[2],[r12,#-8]
830         add             @x[3],@x[3],@t[3]
831         ldr             @t[3],[r12,#-4]
832 # ifdef __ARMEB__
833         rev             @x[0],@x[0]
834         rev             @x[1],@x[1]
835         rev             @x[2],@x[2]
836         rev             @x[3],@x[3]
837 # endif
838         eor             @x[0],@x[0],@t[0]       @ xor with input
839          add            @t[0],sp,#4*(4)
840         eor             @x[1],@x[1],@t[1]
841         str             @x[0],[r14],#16         @ store output
842         eor             @x[2],@x[2],@t[2]
843         str             @x[1],[r14,#-12]
844         eor             @x[3],@x[3],@t[3]
845          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
846         str             @x[2],[r14,#-8]
847         str             @x[3],[r14,#-4]
848
849         add             @x[4],@x[4],@t[0]       @ accumulate key material
850         ldr             @t[0],[r12],#16         @ load input
851         add             @x[5],@x[5],@t[1]
852         ldr             @t[1],[r12,#-12]
853         add             @x[6],@x[6],@t[2]
854         ldr             @t[2],[r12,#-8]
855         add             @x[7],@x[7],@t[3]
856         ldr             @t[3],[r12,#-4]
857 # ifdef __ARMEB__
858         rev             @x[4],@x[4]
859         rev             @x[5],@x[5]
860         rev             @x[6],@x[6]
861         rev             @x[7],@x[7]
862 # endif
863         eor             @x[4],@x[4],@t[0]
864          add            @t[0],sp,#4*(8)
865         eor             @x[5],@x[5],@t[1]
866         str             @x[4],[r14],#16         @ store output
867         eor             @x[6],@x[6],@t[2]
868         str             @x[5],[r14,#-12]
869         eor             @x[7],@x[7],@t[3]
870          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
871         str             @x[6],[r14,#-8]
872          add            @x[0],sp,#4*(16+8)
873         str             @x[7],[r14,#-4]
874
875         ldmia           @x[0],{@x[0]-@x[7]}     @ load second half
876
877         add             @x[0],@x[0],@t[0]       @ accumulate key material
878         ldr             @t[0],[r12],#16         @ load input
879         add             @x[1],@x[1],@t[1]
880         ldr             @t[1],[r12,#-12]
881 # ifdef __thumb2__
882         it      hi
883 # endif
884          strhi          @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
885         add             @x[2],@x[2],@t[2]
886         ldr             @t[2],[r12,#-8]
887 # ifdef __thumb2__
888         it      hi
889 # endif
890          strhi          @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
891         add             @x[3],@x[3],@t[3]
892         ldr             @t[3],[r12,#-4]
893 # ifdef __ARMEB__
894         rev             @x[0],@x[0]
895         rev             @x[1],@x[1]
896         rev             @x[2],@x[2]
897         rev             @x[3],@x[3]
898 # endif
899         eor             @x[0],@x[0],@t[0]
900          add            @t[0],sp,#4*(12)
901         eor             @x[1],@x[1],@t[1]
902         str             @x[0],[r14],#16         @ store output
903         eor             @x[2],@x[2],@t[2]
904         str             @x[1],[r14,#-12]
905         eor             @x[3],@x[3],@t[3]
906          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
907         str             @x[2],[r14,#-8]
908         str             @x[3],[r14,#-4]
909
910         add             @x[4],@x[4],@t[0]       @ accumulate key material
911          add            @t[0],@t[0],#4          @ next counter value
912         add             @x[5],@x[5],@t[1]
913          str            @t[0],[sp,#4*(12)]      @ save next counter value
914         ldr             @t[0],[r12],#16         @ load input
915         add             @x[6],@x[6],@t[2]
916          add            @x[4],@x[4],#3          @ counter+3
917         ldr             @t[1],[r12,#-12]
918         add             @x[7],@x[7],@t[3]
919         ldr             @t[2],[r12,#-8]
920         ldr             @t[3],[r12,#-4]
921 # ifdef __ARMEB__
922         rev             @x[4],@x[4]
923         rev             @x[5],@x[5]
924         rev             @x[6],@x[6]
925         rev             @x[7],@x[7]
926 # endif
927         eor             @x[4],@x[4],@t[0]
928 # ifdef __thumb2__
929         it      hi
930 # endif
931          ldrhi          @t[0],[sp,#4*(32+2)]    @ re-load len
932         eor             @x[5],@x[5],@t[1]
933         eor             @x[6],@x[6],@t[2]
934         str             @x[4],[r14],#16         @ store output
935         eor             @x[7],@x[7],@t[3]
936         str             @x[5],[r14,#-12]
937          sub            @t[3],@t[0],#64*4       @ len-=64*4
938         str             @x[6],[r14,#-8]
939         str             @x[7],[r14,#-4]
940         bhi             .Loop_neon_outer
941
942         b               .Ldone_neon
943
944 .align  4
945 .Lbreak_neon:
946         @ harmonize NEON and integer-only stack frames: load data
947         @ from NEON frame, but save to integer-only one; distance
948         @ between the two is 4*(32+4+16-32)=4*(20).
949
950         str             @t[3], [sp,#4*(20+32+2)]        @ save len
951          add            @t[3],sp,#4*(32+4)              @ address of saved d8-d15 in NEON frame
952         str             r12,   [sp,#4*(20+32+1)]        @ save inp
953         str             r14,   [sp,#4*(20+32+0)]        @ save out
954
955         ldr             @x[12],[sp,#4*(16+10)]          @ fetch the two words kept at
956         ldr             @x[14],[sp,#4*(16+11)]          @ 4*(16+10) and 4*(16+11)
957          vldmia         @t[3],{d8-d15}                  @ fulfill ABI requirement
958         str             @x[12],[sp,#4*(20+16+10)]       @ copy "@x[10]"
959         str             @x[14],[sp,#4*(20+16+11)]       @ copy "@x[11]"
960
961         ldr             @t[3], [sp,#4*(15)]
962         ldr             @x[12],[sp,#4*(12)]             @ modulo-scheduled load
963         ldr             @t[2], [sp,#4*(13)]
964         ldr             @x[14],[sp,#4*(14)]
965         str             @t[3], [sp,#4*(20+16+15)]
966         add             @t[3],sp,#4*(20)                @ base of the integer-only frame
967         vst1.32         {$a0-$b0},[@t[3]]!              @ copy key
968         add             sp,sp,#4*(20)                   @ switch frame
969         vst1.32         {$c0-$d0},[@t[3]]
970         mov             @t[3],#10                       @ presumably the round counter expected by .Loop (not visible here)
971         b               .Loop                           @ go integer-only
972
973 .align  4
974 .Ltail_neon:                                @ pick a tail path by remaining length in @t[3]
975         cmp             @t[3],#64*3
976         bhs             .L192_or_more_neon      @ 192 bytes or more
977         cmp             @t[3],#64*2
978         bhs             .L128_or_more_neon      @ 128 to 191 bytes
979         cmp             @t[3],#64*1
980         bhs             .L64_or_more_neon       @ 64 to 127 bytes
981
982         add             @t[0],sp,#4*(8)         @ fewer than 64 bytes: spill
983         vst1.8          {$a0-$b0},[sp]          @ $a0-$d0 into the 64-byte
984         add             @t[2],sp,#4*(0)         @ buffer at sp, which the
985         vst1.8          {$c0-$d0},[@t[0]]       @ byte loop reads via @t[2]
986         b               .Loop_tail_neon
987
988 .align  4
989 .L64_or_more_neon:                          @ 64 to 127 bytes left
990         vld1.8          {$t0-$t1},[r12]!        @ load 64 bytes of input
991         vld1.8          {$t2-$t3},[r12]!
992         veor            $a0,$a0,$t0             @ xor with input
993         veor            $b0,$b0,$t1
994         veor            $c0,$c0,$t2
995         veor            $d0,$d0,$t3
996         vst1.8          {$a0-$b0},[r14]!        @ store 64 bytes of output
997         vst1.8          {$c0-$d0},[r14]!
998
999         beq             .Ldone_neon             @ eq (from cmp with 64*1 at .Ltail_neon) means nothing left
1000
1001         add             @t[0],sp,#4*(8)
1002         vst1.8          {$a1-$b1},[sp]          @ spill $a1-$d1 to the buffer at sp
1003         add             @t[2],sp,#4*(0)         @ byte loop reads the buffer via @t[2]
1004         vst1.8          {$c1-$d1},[@t[0]]
1005         sub             @t[3],@t[3],#64*1       @ len-=64*1
1006         b               .Loop_tail_neon
1007
1008 .align  4
1009 .L128_or_more_neon:                         @ 128 to 191 bytes left
1010         vld1.8          {$t0-$t1},[r12]!        @ load input
1011         vld1.8          {$t2-$t3},[r12]!
1012         veor            $a0,$a0,$t0             @ xor with input
1013         veor            $b0,$b0,$t1
1014         vld1.8          {$t0-$t1},[r12]!
1015         veor            $c0,$c0,$t2
1016         veor            $d0,$d0,$t3
1017         vld1.8          {$t2-$t3},[r12]!
1018
1019         veor            $a1,$a1,$t0
1020         veor            $b1,$b1,$t1
1021          vst1.8         {$a0-$b0},[r14]!        @ store output
1022         veor            $c1,$c1,$t2
1023          vst1.8         {$c0-$d0},[r14]!
1024         veor            $d1,$d1,$t3
1025         vst1.8          {$a1-$b1},[r14]!
1026         vst1.8          {$c1-$d1},[r14]!
1027
1028         beq             .Ldone_neon             @ eq (from cmp with 64*2 at .Ltail_neon) means nothing left
1029
1030         add             @t[0],sp,#4*(8)
1031         vst1.8          {$a2-$b2},[sp]          @ spill $a2-$d2 to the buffer at sp
1032         add             @t[2],sp,#4*(0)         @ byte loop reads the buffer via @t[2]
1033         vst1.8          {$c2-$d2},[@t[0]]
1034         sub             @t[3],@t[3],#64*2       @ len-=64*2
1035         b               .Loop_tail_neon
1036
1037 .align  4
1038 .L192_or_more_neon:                         @ 192 bytes or more left
1039         vld1.8          {$t0-$t1},[r12]!        @ load input
1040         vld1.8          {$t2-$t3},[r12]!
1041         veor            $a0,$a0,$t0             @ xor with input
1042         veor            $b0,$b0,$t1
1043         vld1.8          {$t0-$t1},[r12]!
1044         veor            $c0,$c0,$t2
1045         veor            $d0,$d0,$t3
1046         vld1.8          {$t2-$t3},[r12]!
1047
1048         veor            $a1,$a1,$t0
1049         veor            $b1,$b1,$t1
1050         vld1.8          {$t0-$t1},[r12]!
1051         veor            $c1,$c1,$t2
1052          vst1.8         {$a0-$b0},[r14]!        @ store output
1053         veor            $d1,$d1,$t3
1054         vld1.8          {$t2-$t3},[r12]!
1055
1056         veor            $a2,$a2,$t0
1057          vst1.8         {$c0-$d0},[r14]!
1058         veor            $b2,$b2,$t1
1059          vst1.8         {$a1-$b1},[r14]!
1060         veor            $c2,$c2,$t2
1061          vst1.8         {$c1-$d1},[r14]!
1062         veor            $d2,$d2,$t3
1063         vst1.8          {$a2-$b2},[r14]!
1064         vst1.8          {$c2-$d2},[r14]!
1065
1066         beq             .Ldone_neon             @ eq (from cmp with 64*3 at .Ltail_neon) means nothing left
1067
1068         ldmia           sp,{@t[0]-@t[3]}        @ load key material
1069         add             @x[0],@x[0],@t[0]       @ accumulate key material
1070          add            @t[0],sp,#4*(4)
1071         add             @x[1],@x[1],@t[1]
1072         add             @x[2],@x[2],@t[2]
1073         add             @x[3],@x[3],@t[3]
1074          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
1075
1076         add             @x[4],@x[4],@t[0]       @ accumulate key material
1077          add            @t[0],sp,#4*(8)
1078         add             @x[5],@x[5],@t[1]
1079         add             @x[6],@x[6],@t[2]
1080         add             @x[7],@x[7],@t[3]
1081          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
1082 # ifdef __ARMEB__
1083         rev             @x[0],@x[0]
1084         rev             @x[1],@x[1]
1085         rev             @x[2],@x[2]
1086         rev             @x[3],@x[3]
1087         rev             @x[4],@x[4]
1088         rev             @x[5],@x[5]
1089         rev             @x[6],@x[6]
1090         rev             @x[7],@x[7]
1091 # endif
1092         stmia           sp,{@x[0]-@x[7]}        @ first 32 bytes of the tail buffer at sp
1093          add            @x[0],sp,#4*(16+8)
1094
1095         ldmia           @x[0],{@x[0]-@x[7]}     @ load second half
1096
1097         add             @x[0],@x[0],@t[0]       @ accumulate key material
1098          add            @t[0],sp,#4*(12)
1099         add             @x[1],@x[1],@t[1]
1100         add             @x[2],@x[2],@t[2]
1101         add             @x[3],@x[3],@t[3]
1102          ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
1103
1104         add             @x[4],@x[4],@t[0]       @ accumulate key material
1105          add            @t[0],sp,#4*(8)         @ address only, for the stmia below
1106         add             @x[5],@x[5],@t[1]
1107          add            @x[4],@x[4],#3          @ counter+3
1108         add             @x[6],@x[6],@t[2]
1109         add             @x[7],@x[7],@t[3]
1110          ldr            @t[3],[sp,#4*(32+2)]    @ re-load len
1111 # ifdef __ARMEB__
1112         rev             @x[0],@x[0]
1113         rev             @x[1],@x[1]
1114         rev             @x[2],@x[2]
1115         rev             @x[3],@x[3]
1116         rev             @x[4],@x[4]
1117         rev             @x[5],@x[5]
1118         rev             @x[6],@x[6]
1119         rev             @x[7],@x[7]
1120 # endif
1121         stmia           @t[0],{@x[0]-@x[7]}     @ next 32 bytes of the tail buffer, at sp+4*(8)
1122          add            @t[2],sp,#4*(0)         @ byte loop reads the buffer via @t[2]
1123          sub            @t[3],@t[3],#64*3       @ len-=64*3 (from the re-loaded len, not the address in @t[0])
1124
1125 .Loop_tail_neon:                            @ byte-wise tail, @t[3] bytes to go
1126         ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
1127         ldrb            @t[1],[r12],#1          @ read input
1128         subs            @t[3],@t[3],#1          @ count down, sets flags for the bne below
1129         eor             @t[0],@t[0],@t[1]
1130         strb            @t[0],[r14],#1          @ store output
1131         bne             .Loop_tail_neon
1132
1133 .Ldone_neon:                                @ common exit of the NEON path
1134         add             sp,sp,#4*(32+4)         @ step over the NEON frame to the saved d8-d15
1135         vldmia          sp,{d8-d15}             @ restore d8-d15
1136         add             sp,sp,#4*(16+3)         @ drop the d8-d15 area plus 3 more words (presumably saved arguments, pushed outside this view)
1137         ldmia           sp!,{r4-r11,pc}         @ restore r4-r11 and return
1138 .size   ChaCha20_neon,.-ChaCha20_neon
1139 .comm   OPENSSL_armcap_P,4,4
1140 #endif
1141 ___
1142 }}}
1143
1144 foreach (split("\n",$code)) {                   # emit the generated code line by line
1145         s/\`([^\`]*)\`/eval $1/geo;             # replace each `expr` with the value of that Perl expression
1146
1147         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;     # qN#lo -> d(2N), qN#hi -> d(2N+1)
1148
1149         print $_,"\n";
1150 }
1151 close STDOUT or die "error closing STDOUT: $!";  # STDOUT may be a pipe to arm-xlate.pl; do not ignore its failure