#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#                       IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5             19.3(*)/+95%    21.8        14.1
# Cortex-A8             10.5(*)/+160%   13.9        6.35
# Cortex-A9             12.9(**)/+110%  14.3        6.50
# Cortex-A15            11.0/+40%       16.0        5.00
# Snapdragon S4         11.5/+125%      13.6        4.90
#
# (*)   most "favourable" result for aligned data on little-endian
#       processor, result for misaligned data is 10-15% lower;
# (**)  this result is a trade-off: it can be improved by 20%,
#       but then Snapdragon S4 and Cortex-A8 results get
#       20-25% worse;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
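
# Editor's note: assuming the usual perlasm conventions, the script is
# invoked as e.g.
#       perl chacha-armv4.pl linux32 chacha-armv4.S
# where the first argument selects the assembler flavour handled by
# arm-xlate.pl and the second names the output file.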

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
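
# For example, a call like &vadd_i32($a,$a,$b) is caught by AUTOLOAD and
# emitted as "vadd.i32 $a,$a,$b", while a trailing numeric argument, as
# in &vshr_u32($b,$t,20), is prefixed with '#' to become an immediate.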

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

        # Consider the order in which the variables are addressed by
        # their index:
        #
        #       a   b   c   d
        #
        #       0   4   8  12 < even round
        #       1   5   9  13
        #       2   6  10  14
        #       3   7  11  15
        #       0   5  10  15 < odd round
        #       1   6  11  12
        #       2   7   8  13
        #       3   4   9  14
        #
        # 'a' and 'b' are permanently allocated in registers, @x[0..7],
        # while the 'c's and a pair of 'd's are maintained in memory. If
        # you observe the 'c' column, you'll notice that a pair of 'c's
        # is invariant between rounds. This means that we have to reload
        # them only once per round, in the middle. This is why you'll
        # see a bunch of 'c' stores and loads in the middle, but none at
        # the beginning or end. If you observe the 'd' column, you'll
        # notice that 15 and 13 are reused in the next pair of rounds.
        # This is why these two are chosen for offloading to memory,
        # so that the loads count for more.
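        #
        # For example, the even-round call ROUND(0,4,8,12) derives the
        # remaining columns (1,5,9,13), (2,6,10,14) and (3,7,11,15)
        # through the ($_&~3)+(($_+1)&3) index map above, while the
        # odd-round call ROUND(0,5,10,15) walks the diagonals.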
                                                        push @ret,(
        "&add   (@x[$a0],@x[$a0],@x[$b0])",
        "&mov   ($xd,$xd,'ror#16')",
         "&add  (@x[$a1],@x[$a1],@x[$b1])",
         "&mov  ($xd_,$xd_,'ror#16')",
        "&eor   ($xd,$xd,@x[$a0],'ror#16')",
         "&eor  ($xd_,$xd_,@x[$a1],'ror#16')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b0],@x[$b0],'ror#20')",
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b1],@x[$b1],'ror#20')",
        "&eor   (@x[$b0],@x[$b0],$xc,'ror#20')",
         "&eor  (@x[$b1],@x[$b1],$xc_,'ror#20')",

        "&add   (@x[$a0],@x[$a0],@x[$b0])",
        "&mov   ($xd,$xd,'ror#24')",
         "&add  (@x[$a1],@x[$a1],@x[$b1])",
         "&mov  ($xd_,$xd_,'ror#24')",
        "&eor   ($xd,$xd,@x[$a0],'ror#24')",
         "&eor  ($xd_,$xd_,@x[$a1],'ror#24')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b0],@x[$b0],'ror#25')"             );
                                                        push @ret,(
        "&str   ($xd,'[sp,#4*(16+$d0)]')",
        "&ldr   ($xd,'[sp,#4*(16+$d2)]')"               ) if ($odd);
                                                        push @ret,(
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b1],@x[$b1],'ror#25')"             );
                                                        push @ret,(
         "&str  ($xd_,'[sp,#4*(16+$d1)]')",
         "&ldr  ($xd_,'[sp,#4*(16+$d3)]')"              ) if (!$odd);
                                                        push @ret,(
        "&eor   (@x[$b0],@x[$b0],$xc,'ror#25')",
         "&eor  (@x[$b1],@x[$b1],$xc_,'ror#25')"        );

        $xd=@x[$d2]                                     if (!$odd);
        $xd_=@x[$d3]                                    if ($odd);
                                                        push @ret,(
        "&str   ($xc,'[sp,#4*(16+$c0)]')",
        "&ldr   ($xc,'[sp,#4*(16+$c2)]')",
        "&add   (@x[$a2],@x[$a2],@x[$b2])",
        "&mov   ($xd,$xd,'ror#16')",
         "&str  ($xc_,'[sp,#4*(16+$c1)]')",
         "&ldr  ($xc_,'[sp,#4*(16+$c3)]')",
         "&add  (@x[$a3],@x[$a3],@x[$b3])",
         "&mov  ($xd_,$xd_,'ror#16')",
        "&eor   ($xd,$xd,@x[$a2],'ror#16')",
         "&eor  ($xd_,$xd_,@x[$a3],'ror#16')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b2],@x[$b2],'ror#20')",
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b3],@x[$b3],'ror#20')",
        "&eor   (@x[$b2],@x[$b2],$xc,'ror#20')",
         "&eor  (@x[$b3],@x[$b3],$xc_,'ror#20')",

        "&add   (@x[$a2],@x[$a2],@x[$b2])",
        "&mov   ($xd,$xd,'ror#24')",
         "&add  (@x[$a3],@x[$a3],@x[$b3])",
         "&mov  ($xd_,$xd_,'ror#24')",
        "&eor   ($xd,$xd,@x[$a2],'ror#24')",
         "&eor  ($xd_,$xd_,@x[$a3],'ror#24')",

        "&add   ($xc,$xc,$xd)",
        "&mov   (@x[$b2],@x[$b2],'ror#25')",
         "&add  ($xc_,$xc_,$xd_)",
         "&mov  (@x[$b3],@x[$b3],'ror#25')",
        "&eor   (@x[$b2],@x[$b2],$xc,'ror#25')",
         "&eor  (@x[$b3],@x[$b3],$xc_,'ror#25')"        );

        @ret;
}
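
# Editor's note: a minimal scalar model of one ChaCha quarter round, kept
# as a reference sketch only (the generator never calls it). The
# ror#16/20/24/25 rotate-on-use operands emitted by ROUND above are the
# right-rotate equivalents of the left rotations by 16/12/8/7 below.
sub _chacha_quarter_round_ref {
my ($a,$b,$c,$d)=@_;
my $rotl = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n)))&0xffffffff; };
    $a=($a+$b)&0xffffffff;  $d=$rotl->($d^$a,16);
    $c=($c+$d)&0xffffffff;  $b=$rotl->($b^$c,12);
    $a=($a+$b)&0xffffffff;  $d=$rotl->($d^$a,8);
    $c=($c+$d)&0xffffffff;  $b=$rotl->($b^$c,7);
    ($a,$b,$c,$d);
}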

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code   32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb  ldrbhs
#endif

.text

.align  5
.Lsigma:
.long   0x61707865,0x3320646e,0x79622d32,0x6b206574     @ endian-neutral
.Lone:
.long   1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
# ifdef _WIN32
.word   OPENSSL_armcap_P
# else
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
# endif
#else
.word   -1
#endif

.globl  ChaCha20_ctr32
.type   ChaCha20_ctr32,%function
.align  5
ChaCha20_ctr32:
.LChaCha20_ctr32:
        ldr     r12,[sp,#0]             @ pull pointer to counter and nonce
        stmdb   sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r14,pc,#16              @ ChaCha20_ctr32
#else
        adr     r14,.LChaCha20_ctr32
#endif
        cmp     r2,#0                   @ len==0?
#ifdef  __thumb2__
        itt     eq
#endif
        addeq   sp,sp,#4*3
        beq     .Lno_data
#if __ARM_MAX_ARCH__>=7
        cmp     r2,#192                 @ test len
        bls     .Lshort
        ldr     r4,[r14,#-32]
# if !defined(_WIN32)
        ldr     r4,[r14,r4]
# endif
# if defined(__APPLE__) || defined(_WIN32)
        ldr     r4,[r4]
# endif
        tst     r4,#ARMV7_NEON
        bne     .LChaCha20_neon
.Lshort:
#endif
        ldmia   r12,{r4-r7}             @ load counter and nonce
        sub     sp,sp,#4*(16)           @ off-load area
        sub     r14,r14,#64             @ .Lsigma
        stmdb   sp!,{r4-r7}             @ copy counter and nonce
        ldmia   r3,{r4-r11}             @ load key
        ldmia   r14,{r0-r3}             @ load sigma
        stmdb   sp!,{r4-r11}            @ copy key
        stmdb   sp!,{r0-r3}             @ copy sigma
        str     r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
        str     r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
        b       .Loop_outer_enter

.align  4
.Loop_outer:
        ldmia   sp,{r0-r9}              @ load key material
        str     @t[3],[sp,#4*(32+2)]    @ save len
        str     r12,  [sp,#4*(32+1)]    @ save inp
        str     r14,  [sp,#4*(32+0)]    @ save out
.Loop_outer_enter:
        ldr     @t[3], [sp,#4*(15)]
        ldr     @x[12],[sp,#4*(12)]     @ modulo-scheduled load
        ldr     @t[2], [sp,#4*(13)]
        ldr     @x[14],[sp,#4*(14)]
        str     @t[3], [sp,#4*(16+15)]
        mov     @t[3],#10
        b       .Loop

.align  4
.Loop:
        subs    @t[3],@t[3],#1
___
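# Ten iterations of the loop body give the 20 rounds of ChaCha20: each
# pass emits one even (column) round and one odd (diagonal) round.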
        foreach (&ROUND(0, 4, 8,12)) { eval; }
        foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
        bne     .Loop

        ldr     @t[3],[sp,#4*(32+2)]    @ load len

        str     @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
        str     @t[1], [sp,#4*(16+9)]
        str     @x[12],[sp,#4*(16+12)]
        str     @t[2], [sp,#4*(16+13)]
        str     @x[14],[sp,#4*(16+14)]

        @ at this point we have first half of 512-bit result in
        @ @x[0-7] and second half at sp+4*(16+8)

        cmp     @t[3],#64               @ done yet?
#ifdef  __thumb2__
        itete   lo
#endif
        addlo   r12,sp,#4*(0)           @ shortcut or ...
        ldrhs   r12,[sp,#4*(32+1)]      @ ... load inp
        addlo   r14,sp,#4*(0)           @ shortcut or ...
        ldrhs   r14,[sp,#4*(32+0)]      @ ... load out

        ldr     @t[0],[sp,#4*(0)]       @ load key material
        ldr     @t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
        orr     @t[2],r12,r14
        tst     @t[2],#3                @ are input and output aligned?
        ldr     @t[2],[sp,#4*(2)]
        bne     .Lunaligned
        cmp     @t[3],#64               @ restore flags
# else
        ldr     @t[2],[sp,#4*(2)]
# endif
        ldr     @t[3],[sp,#4*(3)]

        add     @x[0],@x[0],@t[0]       @ accumulate key material
        add     @x[1],@x[1],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]

        add     @x[2],@x[2],@t[2]
        add     @x[3],@x[3],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[0],@x[0]
        rev     @x[1],@x[1]
        rev     @x[2],@x[2]
        rev     @x[3],@x[3]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[0],@x[0],@t[0]       @ xor with input
        eorhs   @x[1],@x[1],@t[1]
         add    @t[0],sp,#4*(4)
        str     @x[0],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[2],@x[2],@t[2]
        eorhs   @x[3],@x[3],@t[3]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[1],[r14,#-12]
        str     @x[2],[r14,#-8]
        str     @x[3],[r14,#-4]

        add     @x[4],@x[4],@t[0]       @ accumulate key material
        add     @x[5],@x[5],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
        add     @x[6],@x[6],@t[2]
        add     @x[7],@x[7],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[4],@x[4]
        rev     @x[5],@x[5]
        rev     @x[6],@x[6]
        rev     @x[7],@x[7]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
         add    @t[0],sp,#4*(8)
        str     @x[4],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[6],@x[6],@t[2]
        eorhs   @x[7],@x[7],@t[3]
        str     @x[5],[r14,#-12]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[6],[r14,#-8]
         add    @x[0],sp,#4*(16+8)
        str     @x[7],[r14,#-4]

        ldmia   @x[0],{@x[0]-@x[7]}     @ load second half

        add     @x[0],@x[0],@t[0]       @ accumulate key material
        add     @x[1],@x[1],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
# ifdef __thumb2__
        itt     hi
# endif
         strhi  @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
         strhi  @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
        add     @x[2],@x[2],@t[2]
        add     @x[3],@x[3],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[0],@x[0]
        rev     @x[1],@x[1]
        rev     @x[2],@x[2]
        rev     @x[3],@x[3]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[0],@x[0],@t[0]
        eorhs   @x[1],@x[1],@t[1]
         add    @t[0],sp,#4*(12)
        str     @x[0],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[2],@x[2],@t[2]
        eorhs   @x[3],@x[3],@t[3]
        str     @x[1],[r14,#-12]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[2],[r14,#-8]
        str     @x[3],[r14,#-4]

        add     @x[4],@x[4],@t[0]       @ accumulate key material
        add     @x[5],@x[5],@t[1]
# ifdef __thumb2__
        itt     hi
# endif
         addhi  @t[0],@t[0],#1          @ next counter value
         strhi  @t[0],[sp,#4*(12)]      @ save next counter value
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
        add     @x[6],@x[6],@t[2]
        add     @x[7],@x[7],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[4],@x[4]
        rev     @x[5],@x[5]
        rev     @x[6],@x[6]
        rev     @x[7],@x[7]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
# ifdef __thumb2__
         it     ne
# endif
         ldrne  @t[0],[sp,#4*(32+2)]    @ re-load len
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[6],@x[6],@t[2]
        eorhs   @x[7],@x[7],@t[3]
        str     @x[4],[r14],#16         @ store output
        str     @x[5],[r14,#-12]
# ifdef __thumb2__
        it      hs
# endif
         subhs  @t[3],@t[0],#64         @ len-=64
        str     @x[6],[r14,#-8]
        str     @x[7],[r14,#-4]
        bhi     .Loop_outer

        beq     .Ldone
# if __ARM_ARCH__<7
        b       .Ltail

.align  4
.Lunaligned:                            @ unaligned endian-neutral path
        cmp     @t[3],#64               @ restore flags
# endif
#endif
#if __ARM_ARCH__<7
        ldr     @t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___    if ($i==4);
        add     @x[0],sp,#4*(16+8)
___
$code.=<<___    if ($i==8);
        ldmia   @x[0],{@x[0]-@x[7]}             @ load second half
# ifdef __thumb2__
        itt     hi
# endif
        strhi   @t[2],[sp,#4*(16+10)]           @ copy "@x[10]"
        strhi   @t[3],[sp,#4*(16+11)]           @ copy "@x[11]"
___
$code.=<<___;
        add     @x[$j+0],@x[$j+0],@t[0]         @ accumulate key material
___
$code.=<<___    if ($i==12);
# ifdef __thumb2__
        itt     hi
# endif
        addhi   @t[0],@t[0],#1                  @ next counter value
        strhi   @t[0],[sp,#4*(12)]              @ save next counter value
___
$code.=<<___;
        add     @x[$j+1],@x[$j+1],@t[1]
        add     @x[$j+2],@x[$j+2],@t[2]
# ifdef __thumb2__
        itete   lo
# endif
        eorlo   @t[0],@t[0],@t[0]               @ zero or ...
        ldrhsb  @t[0],[r12],#16                 @ ... load input
        eorlo   @t[1],@t[1],@t[1]
        ldrhsb  @t[1],[r12,#-12]

        add     @x[$j+3],@x[$j+3],@t[3]
# ifdef __thumb2__
        itete   lo
# endif
        eorlo   @t[2],@t[2],@t[2]
        ldrhsb  @t[2],[r12,#-8]
        eorlo   @t[3],@t[3],@t[3]
        ldrhsb  @t[3],[r12,#-4]

        eor     @x[$j+0],@t[0],@x[$j+0]         @ xor with input (or zero)
        eor     @x[$j+1],@t[1],@x[$j+1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-15]                @ load more input
        ldrhsb  @t[1],[r12,#-11]
        eor     @x[$j+2],@t[2],@x[$j+2]
         strb   @x[$j+0],[r14],#16              @ store output
        eor     @x[$j+3],@t[3],@x[$j+3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-7]
        ldrhsb  @t[3],[r12,#-3]
         strb   @x[$j+1],[r14,#-12]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+2],[r14,#-8]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-14]                @ load more input
        ldrhsb  @t[1],[r12,#-10]
         strb   @x[$j+3],[r14,#-4]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+0],[r14,#-15]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-6]
        ldrhsb  @t[3],[r12,#-2]
         strb   @x[$j+1],[r14,#-11]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+2],[r14,#-7]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-13]                @ load more input
        ldrhsb  @t[1],[r12,#-9]
         strb   @x[$j+3],[r14,#-3]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+0],[r14,#-14]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-5]
        ldrhsb  @t[3],[r12,#-1]
         strb   @x[$j+1],[r14,#-10]
         strb   @x[$j+2],[r14,#-6]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+3],[r14,#-2]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
         strb   @x[$j+0],[r14,#-13]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+1],[r14,#-9]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
         strb   @x[$j+2],[r14,#-5]
         strb   @x[$j+3],[r14,#-1]
___
$code.=<<___    if ($i<12);
        add     @t[0],sp,#4*(4+$i)
        ldmia   @t[0],{@t[0]-@t[3]}             @ load key material
___
}
$code.=<<___;
# ifdef __thumb2__
        it      ne
# endif
        ldrne   @t[0],[sp,#4*(32+2)]            @ re-load len
# ifdef __thumb2__
        it      hs
# endif
        subhs   @t[3],@t[0],#64                 @ len-=64
        bhi     .Loop_outer

        beq     .Ldone
#endif

.Ltail:
        ldr     r12,[sp,#4*(32+1)]      @ load inp
        add     @t[1],sp,#4*(0)
        ldr     r14,[sp,#4*(32+0)]      @ load out

.Loop_tail:
        ldrb    @t[2],[@t[1]],#1        @ read buffer on stack
        ldrb    @t[3],[r12],#1          @ read input
        subs    @t[0],@t[0],#1
        eor     @t[3],@t[3],@t[2]
        strb    @t[3],[r14],#1          @ store output
        bne     .Loop_tail

.Ldone:
        add     sp,sp,#4*(32+3)
.Lno_data:
        ldmia   sp!,{r4-r11,pc}
.size   ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

        (
        "&vadd_i32      ($a,$a,$b)",
        "&veor          ($d,$d,$a)",
        "&vrev32_16     ($d,$d)",       # vrot ($d,16)

        "&vadd_i32      ($c,$c,$d)",
        "&veor          ($t,$b,$c)",
        "&vshr_u32      ($b,$t,20)",
        "&vsli_32       ($b,$t,12)",

        "&vadd_i32      ($a,$a,$b)",
        "&veor          ($t,$d,$a)",
        "&vshr_u32      ($d,$t,24)",
        "&vsli_32       ($d,$t,8)",

        "&vadd_i32      ($c,$c,$d)",
        "&veor          ($t,$b,$c)",
        "&vshr_u32      ($b,$t,25)",
        "&vsli_32       ($b,$t,7)",

        "&vext_8        ($c,$c,$c,8)",
        "&vext_8        ($b,$b,$b,$odd?12:4)",
        "&vext_8        ($d,$d,$d,$odd?4:12)"
        );
}
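
# Editor's note: each vshr.u32/vsli.32 pair above implements a left
# rotation of $t: shifting right by 32-n and then shift-left-inserting
# the remaining n bits yields a rotation by n (n=12 and n=7 here, with
# vrev32.16 providing the rotation by 16 and the vshr/vsli-by-24/8 pair
# the rotation by 8). The trailing vext.8 lines rotate the b, c and d
# rows across lanes so that the same code alternates between column and
# diagonal rounds.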

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.type   ChaCha20_neon,%function
.align  5
ChaCha20_neon:
        ldr             r12,[sp,#0]             @ pull pointer to counter and nonce
        stmdb           sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
        adr             r14,.Lsigma
        vstmdb          sp!,{d8-d15}            @ ABI spec says so
        stmdb           sp!,{r0-r3}

        vld1.32         {$b0-$c0},[r3]          @ load key
        ldmia           r3,{r4-r11}             @ load key

        sub             sp,sp,#4*(16+16)
        vld1.32         {$d0},[r12]             @ load counter and nonce
        add             r12,sp,#4*8
        ldmia           r14,{r0-r3}             @ load sigma
        vld1.32         {$a0},[r14]!            @ load sigma
        vld1.32         {$t0},[r14]             @ one
        vst1.32         {$c0-$d0},[r12]         @ copy 1/2key|counter|nonce
        vst1.32         {$a0-$b0},[sp]          @ copy sigma|1/2key

        str             r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
        str             r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
        vshl.i32        $t1#lo,$t0#lo,#1        @ two
        vstr            $t0#lo,[sp,#4*(16+0)]
        vshl.i32        $t2#lo,$t0#lo,#2        @ four
        vstr            $t1#lo,[sp,#4*(16+2)]
        vmov            $a1,$a0
        vstr            $t2#lo,[sp,#4*(16+4)]
        vmov            $a2,$a0
        vmov            $b1,$b0
        vmov            $b2,$b0
        b               .Loop_neon_enter

.align  4
.Loop_neon_outer:
        ldmia           sp,{r0-r9}              @ load key material
        cmp             @t[3],#64*2             @ if len<=64*2
        bls             .Lbreak_neon            @ switch to integer-only
        vmov            $a1,$a0
        str             @t[3],[sp,#4*(32+2)]    @ save len
        vmov            $a2,$a0
        str             r12,  [sp,#4*(32+1)]    @ save inp
        vmov            $b1,$b0
        str             r14,  [sp,#4*(32+0)]    @ save out
        vmov            $b2,$b0
.Loop_neon_enter:
        ldr             @t[3], [sp,#4*(15)]
        vadd.i32        $d1,$d0,$t0             @ counter+1
        ldr             @x[12],[sp,#4*(12)]     @ modulo-scheduled load
        vmov            $c1,$c0
        ldr             @t[2], [sp,#4*(13)]
        vmov            $c2,$c0
        ldr             @x[14],[sp,#4*(14)]
        vadd.i32        $d2,$d1,$t0             @ counter+2
        str             @t[3], [sp,#4*(16+15)]
        mov             @t[3],#10
        add             @x[12],@x[12],#3        @ counter+3
        b               .Loop_neon

.align  4
.Loop_neon:
        subs            @t[3],@t[3],#1
___
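        # Interleave three NEON quarter-round "threads", one 64-byte
        # block each, with one integer-only thread working on a fourth
        # block in the scalar registers; this is the 3xNEON+1xIALU
        # arrangement quoted in the performance table at the top of
        # the file.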
        my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
        my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
        my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
        my @thread3=&ROUND(0,4,8,12);

        foreach (@thread0) {
                eval;                   eval(shift(@thread3));
                eval(shift(@thread1));  eval(shift(@thread3));
                eval(shift(@thread2));  eval(shift(@thread3));
        }

        @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
        @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
        @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
        @thread3=&ROUND(0,5,10,15);

        foreach (@thread0) {
                eval;                   eval(shift(@thread3));
                eval(shift(@thread1));  eval(shift(@thread3));
                eval(shift(@thread2));  eval(shift(@thread3));
        }
$code.=<<___;
        bne             .Loop_neon

        add             @t[3],sp,#32
        vld1.32         {$t0-$t1},[sp]          @ load key material
        vld1.32         {$t2-$t3},[@t[3]]

        ldr             @t[3],[sp,#4*(32+2)]    @ load len

        str             @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
        str             @t[1], [sp,#4*(16+9)]
        str             @x[12],[sp,#4*(16+12)]
        str             @t[2], [sp,#4*(16+13)]
        str             @x[14],[sp,#4*(16+14)]

        @ at this point we have first half of 512-bit result in
        @ @x[0-7] and second half at sp+4*(16+8)

        ldr             r12,[sp,#4*(32+1)]      @ load inp
        ldr             r14,[sp,#4*(32+0)]      @ load out

        vadd.i32        $a0,$a0,$t0             @ accumulate key material
        vadd.i32        $a1,$a1,$t0
        vadd.i32        $a2,$a2,$t0
        vldr            $t0#lo,[sp,#4*(16+0)]   @ one

        vadd.i32        $b0,$b0,$t1
        vadd.i32        $b1,$b1,$t1
        vadd.i32        $b2,$b2,$t1
        vldr            $t1#lo,[sp,#4*(16+2)]   @ two

        vadd.i32        $c0,$c0,$t2
        vadd.i32        $c1,$c1,$t2
        vadd.i32        $c2,$c2,$t2
        vadd.i32        $d1#lo,$d1#lo,$t0#lo    @ counter+1
        vadd.i32        $d2#lo,$d2#lo,$t1#lo    @ counter+2

        vadd.i32        $d0,$d0,$t3
        vadd.i32        $d1,$d1,$t3
        vadd.i32        $d2,$d2,$t3

        cmp             @t[3],#64*4
        blo             .Ltail_neon

        vld1.8          {$t0-$t1},[r12]!        @ load input
         mov            @t[3],sp
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0             @ xor with input
        veor            $b0,$b0,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a1,$a1,$t0
         vst1.8         {$a0-$b0},[r14]!        @ store output
        veor            $b1,$b1,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c1,$c1,$t2
         vst1.8         {$c0-$d0},[r14]!
        veor            $d1,$d1,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a2,$a2,$t0
         vld1.32        {$a0-$b0},[@t[3]]!      @ load for next iteration
         veor           $t0#hi,$t0#hi,$t0#hi
         vldr           $t0#lo,[sp,#4*(16+4)]   @ four
        veor            $b2,$b2,$t1
         vld1.32        {$c0-$d0},[@t[3]]
        veor            $c2,$c2,$t2
         vst1.8         {$a1-$b1},[r14]!
        veor            $d2,$d2,$t3
         vst1.8         {$c1-$d1},[r14]!

        vadd.i32        $d0#lo,$d0#lo,$t0#lo    @ next counter value
        vldr            $t0#lo,[sp,#4*(16+0)]   @ one

        ldmia           sp,{@t[0]-@t[3]}        @ load key material
        add             @x[0],@x[0],@t[0]       @ accumulate key material
        ldr             @t[0],[r12],#16         @ load input
         vst1.8         {$a2-$b2},[r14]!
        add             @x[1],@x[1],@t[1]
        ldr             @t[1],[r12,#-12]
         vst1.8         {$c2-$d2},[r14]!
        add             @x[2],@x[2],@t[2]
        ldr             @t[2],[r12,#-8]
        add             @x[3],@x[3],@t[3]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
# endif
        eor             @x[0],@x[0],@t[0]       @ xor with input
         add            @t[0],sp,#4*(4)
        eor             @x[1],@x[1],@t[1]
        str             @x[0],[r14],#16         @ store output
        eor             @x[2],@x[2],@t[2]
        str             @x[1],[r14,#-12]
        eor             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
        str             @x[2],[r14,#-8]
        str             @x[3],[r14,#-4]

        add             @x[4],@x[4],@t[0]       @ accumulate key material
        ldr             @t[0],[r12],#16         @ load input
        add             @x[5],@x[5],@t[1]
        ldr             @t[1],[r12,#-12]
        add             @x[6],@x[6],@t[2]
        ldr             @t[2],[r12,#-8]
        add             @x[7],@x[7],@t[3]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        eor             @x[4],@x[4],@t[0]
         add            @t[0],sp,#4*(8)
        eor             @x[5],@x[5],@t[1]
        str             @x[4],[r14],#16         @ store output
        eor             @x[6],@x[6],@t[2]
        str             @x[5],[r14,#-12]
        eor             @x[7],@x[7],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
        str             @x[6],[r14,#-8]
         add            @x[0],sp,#4*(16+8)
        str             @x[7],[r14,#-4]

        ldmia           @x[0],{@x[0]-@x[7]}     @ load second half

        add             @x[0],@x[0],@t[0]       @ accumulate key material
        ldr             @t[0],[r12],#16         @ load input
        add             @x[1],@x[1],@t[1]
        ldr             @t[1],[r12,#-12]
# ifdef __thumb2__
        it      hi
# endif
         strhi          @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
        add             @x[2],@x[2],@t[2]
        ldr             @t[2],[r12,#-8]
# ifdef __thumb2__
        it      hi
# endif
         strhi          @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
        add             @x[3],@x[3],@t[3]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
# endif
        eor             @x[0],@x[0],@t[0]
         add            @t[0],sp,#4*(12)
        eor             @x[1],@x[1],@t[1]
        str             @x[0],[r14],#16         @ store output
        eor             @x[2],@x[2],@t[2]
        str             @x[1],[r14,#-12]
        eor             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
        str             @x[2],[r14,#-8]
        str             @x[3],[r14,#-4]

        add             @x[4],@x[4],@t[0]       @ accumulate key material
         add            @t[0],@t[0],#4          @ next counter value
        add             @x[5],@x[5],@t[1]
         str            @t[0],[sp,#4*(12)]      @ save next counter value
        ldr             @t[0],[r12],#16         @ load input
        add             @x[6],@x[6],@t[2]
         add            @x[4],@x[4],#3          @ counter+3
        ldr             @t[1],[r12,#-12]
        add             @x[7],@x[7],@t[3]
        ldr             @t[2],[r12,#-8]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        eor             @x[4],@x[4],@t[0]
# ifdef __thumb2__
        it      hi
# endif
         ldrhi          @t[0],[sp,#4*(32+2)]    @ re-load len
        eor             @x[5],@x[5],@t[1]
        eor             @x[6],@x[6],@t[2]
        str             @x[4],[r14],#16         @ store output
        eor             @x[7],@x[7],@t[3]
        str             @x[5],[r14,#-12]
         sub            @t[3],@t[0],#64*4       @ len-=64*4
        str             @x[6],[r14,#-8]
        str             @x[7],[r14,#-4]
        bhi             .Loop_neon_outer

        b               .Ldone_neon

.align  4
.Lbreak_neon:
        @ harmonize NEON and integer-only stack frames: load data
        @ from NEON frame, but save to integer-only one; distance
        @ between the two is 4*(32+4+16-32)=4*(20).

        str             @t[3], [sp,#4*(20+32+2)]        @ save len
         add            @t[3],sp,#4*(32+4)
        str             r12,   [sp,#4*(20+32+1)]        @ save inp
        str             r14,   [sp,#4*(20+32+0)]        @ save out

        ldr             @x[12],[sp,#4*(16+10)]
        ldr             @x[14],[sp,#4*(16+11)]
         vldmia         @t[3],{d8-d15}                  @ fulfill ABI requirement
        str             @x[12],[sp,#4*(20+16+10)]       @ copy "@x[10]"
        str             @x[14],[sp,#4*(20+16+11)]       @ copy "@x[11]"

        ldr             @t[3], [sp,#4*(15)]
        ldr             @x[12],[sp,#4*(12)]             @ modulo-scheduled load
        ldr             @t[2], [sp,#4*(13)]
        ldr             @x[14],[sp,#4*(14)]
        str             @t[3], [sp,#4*(20+16+15)]
        add             @t[3],sp,#4*(20)
        vst1.32         {$a0-$b0},[@t[3]]!              @ copy key
        add             sp,sp,#4*(20)                   @ switch frame
        vst1.32         {$c0-$d0},[@t[3]]
        mov             @t[3],#10
        b               .Loop                           @ go integer-only

.align  4
.Ltail_neon:
        cmp             @t[3],#64*3
        bhs             .L192_or_more_neon
        cmp             @t[3],#64*2
        bhs             .L128_or_more_neon
        cmp             @t[3],#64*1
        bhs             .L64_or_more_neon

        add             @t[0],sp,#4*(8)
        vst1.8          {$a0-$b0},[sp]
        add             @t[2],sp,#4*(0)
        vst1.8          {$c0-$d0},[@t[0]]
        b               .Loop_tail_neon

.align  4
.L64_or_more_neon:
        vld1.8          {$t0-$t1},[r12]!
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0
        veor            $b0,$b0,$t1
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vst1.8          {$a0-$b0},[r14]!
        vst1.8          {$c0-$d0},[r14]!

        beq             .Ldone_neon

        add             @t[0],sp,#4*(8)
        vst1.8          {$a1-$b1},[sp]
        add             @t[2],sp,#4*(0)
        vst1.8          {$c1-$d1},[@t[0]]
        sub             @t[3],@t[3],#64*1       @ len-=64*1
        b               .Loop_tail_neon

.align  4
.L128_or_more_neon:
        vld1.8          {$t0-$t1},[r12]!
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0
        veor            $b0,$b0,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a1,$a1,$t0
        veor            $b1,$b1,$t1
         vst1.8         {$a0-$b0},[r14]!
        veor            $c1,$c1,$t2
         vst1.8         {$c0-$d0},[r14]!
        veor            $d1,$d1,$t3
        vst1.8          {$a1-$b1},[r14]!
        vst1.8          {$c1-$d1},[r14]!

        beq             .Ldone_neon

        add             @t[0],sp,#4*(8)
        vst1.8          {$a2-$b2},[sp]
        add             @t[2],sp,#4*(0)
        vst1.8          {$c2-$d2},[@t[0]]
        sub             @t[3],@t[3],#64*2       @ len-=64*2
        b               .Loop_tail_neon

.align  4
.L192_or_more_neon:
        vld1.8          {$t0-$t1},[r12]!
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0
        veor            $b0,$b0,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a1,$a1,$t0
        veor            $b1,$b1,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c1,$c1,$t2
         vst1.8         {$a0-$b0},[r14]!
        veor            $d1,$d1,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a2,$a2,$t0
         vst1.8         {$c0-$d0},[r14]!
        veor            $b2,$b2,$t1
         vst1.8         {$a1-$b1},[r14]!
        veor            $c2,$c2,$t2
         vst1.8         {$c1-$d1},[r14]!
        veor            $d2,$d2,$t3
        vst1.8          {$a2-$b2},[r14]!
        vst1.8          {$c2-$d2},[r14]!

        beq             .Ldone_neon

        ldmia           sp,{@t[0]-@t[3]}        @ load key material
        add             @x[0],@x[0],@t[0]       @ accumulate key material
         add            @t[0],sp,#4*(4)
        add             @x[1],@x[1],@t[1]
        add             @x[2],@x[2],@t[2]
        add             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material

        add             @x[4],@x[4],@t[0]       @ accumulate key material
         add            @t[0],sp,#4*(8)
        add             @x[5],@x[5],@t[1]
        add             @x[6],@x[6],@t[2]
        add             @x[7],@x[7],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        stmia           sp,{@x[0]-@x[7]}
         add            @x[0],sp,#4*(16+8)

        ldmia           @x[0],{@x[0]-@x[7]}     @ load second half

        add             @x[0],@x[0],@t[0]       @ accumulate key material
         add            @t[0],sp,#4*(12)
        add             @x[1],@x[1],@t[1]
        add             @x[2],@x[2],@t[2]
        add             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material

        add             @x[4],@x[4],@t[0]       @ accumulate key material
         add            @t[0],sp,#4*(8)
        add             @x[5],@x[5],@t[1]
         add            @x[4],@x[4],#3          @ counter+3
        add             @x[6],@x[6],@t[2]
        add             @x[7],@x[7],@t[3]
         ldr            @t[3],[sp,#4*(32+2)]    @ re-load len
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        stmia           @t[0],{@x[0]-@x[7]}
         add            @t[2],sp,#4*(0)
         sub            @t[3],@t[3],#64*3       @ len-=64*3

.Loop_tail_neon:
        ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
        ldrb            @t[1],[r12],#1          @ read input
        subs            @t[3],@t[3],#1
        eor             @t[0],@t[0],@t[1]
        strb            @t[0],[r14],#1          @ store output
        bne             .Loop_tail_neon

.Ldone_neon:
        add             sp,sp,#4*(32+4)
        vldmia          sp,{d8-d15}
        add             sp,sp,#4*(16+3)
        ldmia           sp!,{r4-r11,pc}
.size   ChaCha20_neon,.-ChaCha20_neon
.comm   OPENSSL_armcap_P,4,4
#endif
___
}}}

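# Translate the q<N>#lo/q<N>#hi notation used above into the underlying
# d registers: qN#lo is d(2*N) and qN#hi is d(2*N+1), so that e.g.
# q12#lo becomes d24. Backquoted expressions are evaluated in place.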
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

        print $_,"\n";
}
close STDOUT;