# ARM assembly pack: ChaCha20 and Poly1305 modules.
# [openssl.git] crypto/poly1305/asm/poly1305-armv4.pl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.30/+130%		2.96
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.79/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
#	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;

24 $flavour = shift;
25 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
26 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
27
28 if ($flavour && $flavour ne "void") {
29     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
31     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
32     die "can't locate arm-xlate.pl";
33
34     open STDOUT,"| \"$^X\" $xlate $flavour $output";
35 } else {
36     open STDOUT,">$output";
37 }
38
39 ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
40
41 $code.=<<___;
42 #include "arm_arch.h"
43
44 .text
45 #if defined(__thumb2__)
46 .syntax unified
47 .thumb
48 #else
49 .code   32
50 #endif
51
52 .globl  poly1305_emit
53 .globl  poly1305_blocks
54 .globl  poly1305_init
55 .type   poly1305_init,%function
56 .align  5
57 poly1305_init:
58 .Lpoly1305_init:
59         stmdb   sp!,{r4-r11}
60
61         eor     r3,r3,r3
62         cmp     $inp,#0
63         str     r3,[$ctx,#0]            @ zero hash value
64         str     r3,[$ctx,#4]
65         str     r3,[$ctx,#8]
66         str     r3,[$ctx,#12]
67         str     r3,[$ctx,#16]
68         str     r3,[$ctx,#36]           @ is_base2_26
69         add     $ctx,$ctx,#20
70
71 #ifdef  __thumb2__
72         it      eq
73 #endif
74         moveq   r0,#0
75         beq     .Lno_key
76
77 #if     __ARM_MAX_ARCH__>=7
78         adr     r11,.Lpoly1305_init
79         ldr     r12,.LOPENSSL_armcap
80 #endif
81         ldrb    r4,[$inp,#0]
82         mov     r10,#0x0fffffff
83         ldrb    r5,[$inp,#1]
84         and     r3,r10,#-4              @ 0x0ffffffc
85         ldrb    r6,[$inp,#2]
86         ldrb    r7,[$inp,#3]
87         orr     r4,r4,r5,lsl#8
88         ldrb    r5,[$inp,#4]
89         orr     r4,r4,r6,lsl#16
90         ldrb    r6,[$inp,#5]
91         orr     r4,r4,r7,lsl#24
92         ldrb    r7,[$inp,#6]
93         and     r4,r4,r10
94
95 #if     __ARM_MAX_ARCH__>=7
96         ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
97 # ifdef __APPLE__
98         ldr     r12,[r12]
99 # endif
100 #endif
101         ldrb    r8,[$inp,#7]
102         orr     r5,r5,r6,lsl#8
103         ldrb    r6,[$inp,#8]
104         orr     r5,r5,r7,lsl#16
105         ldrb    r7,[$inp,#9]
106         orr     r5,r5,r8,lsl#24
107         ldrb    r8,[$inp,#10]
108         and     r5,r5,r3
109
110 #if     __ARM_MAX_ARCH__>=7
111         tst     r12,#1                  @ check for NEON
112 # ifdef __APPLE__
113         adr     r9,poly1305_blocks_neon
114         adr     r11,poly1305_blocks
115 #  ifdef __thumb2__
116         it      ne
117 #  endif
118         movne   r11,r9
119         adr     r12,poly1305_emit
120         adr     r10,poly1305_emit_neon
121 #  ifdef __thumb2__
122         it      ne
123 #  endif
124         movne   r12,r10
125 # else
126 #  ifdef __thumb2__
127         itete   eq
128 #  endif
129         addeq   r12,r11,#(poly1305_emit-.Lpoly1305_init)
130         addne   r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
131         addeq   r11,r11,#(poly1305_blocks-.Lpoly1305_init)
132         addne   r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
133 # endif
134 # ifdef __thumb2__
135         orr     r12,r12,#1      @ thumb-ify address
136         orr     r11,r11,#1
137 # endif
138 #endif
139         ldrb    r9,[$inp,#11]
140         orr     r6,r6,r7,lsl#8
141         ldrb    r7,[$inp,#12]
142         orr     r6,r6,r8,lsl#16
143         ldrb    r8,[$inp,#13]
144         orr     r6,r6,r9,lsl#24
145         ldrb    r9,[$inp,#14]
146         and     r6,r6,r3
147
148         ldrb    r10,[$inp,#15]
149         orr     r7,r7,r8,lsl#8
150         str     r4,[$ctx,#0]
151         orr     r7,r7,r9,lsl#16
152         str     r5,[$ctx,#4]
153         orr     r7,r7,r10,lsl#24
154         str     r6,[$ctx,#8]
155         and     r7,r7,r3
156         str     r7,[$ctx,#12]
157 #if     __ARM_MAX_ARCH__>=7
158         stmia   r2,{r11,r12}            @ fill functions table
159         mov     r0,#1
160 #else
161         mov     r0,#0
162 #endif
163 .Lno_key:
164         ldmia   sp!,{r4-r11}
165 #if     __ARM_ARCH__>=5
166         ret                             @ bx    lr
167 #else
168         tst     lr,#1
169         moveq   pc,lr                   @ be binary compatible with V4, yet
170         bx      lr                      @ interoperable with Thumb ISA:-)
171 #endif
172 .size   poly1305_init,.-poly1305_init
173 ___
174 {
175 my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
176 my ($s1,$s2,$s3)=($r1,$r2,$r3);
177
178 $code.=<<___;
179 .type   poly1305_blocks,%function
180 .align  5
181 poly1305_blocks:
182         stmdb   sp!,{r3-r11,lr}
183
184         ands    $len,$len,#-16
185         beq     .Lno_data
186
187         cmp     $padbit,#0
188         add     $len,$len,$inp          @ end pointer
189         sub     sp,sp,#32
190
191         ldmia   $ctx,{$h0-$r3}          @ load context
192
193         str     $ctx,[sp,#12]           @ offload stuff
194         mov     lr,$inp
195         str     $len,[sp,#16]
196         str     $r1,[sp,#20]
197         str     $r2,[sp,#24]
198         str     $r3,[sp,#28]
199         b       .Loop
200
201 .Loop:
202 #if __ARM_ARCH__<7
203         ldrb    r0,[lr],#16             @ load input
204 # ifdef __thumb2__
205         it      hi
206 # endif
207         addhi   $h4,$h4,#1              @ 1<<128
208         ldrb    r1,[lr,#-15]
209         ldrb    r2,[lr,#-14]
210         ldrb    r3,[lr,#-13]
211         orr     r1,r0,r1,lsl#8
212         ldrb    r0,[lr,#-12]
213         orr     r2,r1,r2,lsl#16
214         ldrb    r1,[lr,#-11]
215         orr     r3,r2,r3,lsl#24
216         ldrb    r2,[lr,#-10]
217         adds    $h0,$h0,r3              @ accumulate input
218
219         ldrb    r3,[lr,#-9]
220         orr     r1,r0,r1,lsl#8
221         ldrb    r0,[lr,#-8]
222         orr     r2,r1,r2,lsl#16
223         ldrb    r1,[lr,#-7]
224         orr     r3,r2,r3,lsl#24
225         ldrb    r2,[lr,#-6]
226         adcs    $h1,$h1,r3
227
228         ldrb    r3,[lr,#-5]
229         orr     r1,r0,r1,lsl#8
230         ldrb    r0,[lr,#-4]
231         orr     r2,r1,r2,lsl#16
232         ldrb    r1,[lr,#-3]
233         orr     r3,r2,r3,lsl#24
234         ldrb    r2,[lr,#-2]
235         adcs    $h2,$h2,r3
236
237         ldrb    r3,[lr,#-1]
238         orr     r1,r0,r1,lsl#8
239         str     lr,[sp,#8]              @ offload input pointer
240         orr     r2,r1,r2,lsl#16
241         add     $s1,$r1,$r1,lsr#2
242         orr     r3,r2,r3,lsl#24
243 #else
244         ldr     r0,[lr],#16             @ load input
245 # ifdef __thumb2__
246         it      hi
247 # endif
248         addhi   $h4,$h4,#1              @ padbit
249         ldr     r1,[lr,#-12]
250         ldr     r2,[lr,#-8]
251         ldr     r3,[lr,#-4]
252 # ifdef __ARMEB__
253         rev     r0,r0
254         rev     r1,r1
255         rev     r2,r2
256         rev     r3,r3
257 # endif
258         adds    $h0,$h0,r0              @ accumulate input
259         str     lr,[sp,#8]              @ offload input pointer
260         adcs    $h1,$h1,r1
261         add     $s1,$r1,$r1,lsr#2
262         adcs    $h2,$h2,r2
263 #endif
264         add     $s2,$r2,$r2,lsr#2
265         adcs    $h3,$h3,r3
266         add     $s3,$r3,$r3,lsr#2
267
268         umull   r2,r3,$h1,$r0
269          adc    $h4,$h4,#0
270         umull   r0,r1,$h0,$r0
271         umlal   r2,r3,$h4,$s1
272         umlal   r0,r1,$h3,$s1
273         ldr     $r1,[sp,#20]            @ reload $r1
274         umlal   r2,r3,$h2,$s3
275         umlal   r0,r1,$h1,$s3
276         umlal   r2,r3,$h3,$s2
277         umlal   r0,r1,$h2,$s2
278         umlal   r2,r3,$h0,$r1
279         str     r0,[sp,#0]              @ future $h0
280          mul    r0,$s2,$h4
281         ldr     $r2,[sp,#24]            @ reload $r2
282         adds    r2,r2,r1                @ d1+=d0>>32
283          eor    r1,r1,r1
284         adc     lr,r3,#0                @ future $h2
285         str     r2,[sp,#4]              @ future $h1
286
287         mul     r2,$s3,$h4
288         eor     r3,r3,r3
289         umlal   r0,r1,$h3,$s3
290         ldr     $r3,[sp,#28]            @ reload $r3
291         umlal   r2,r3,$h3,$r0
292         umlal   r0,r1,$h2,$r0
293         umlal   r2,r3,$h2,$r1
294         umlal   r0,r1,$h1,$r1
295         umlal   r2,r3,$h1,$r2
296         umlal   r0,r1,$h0,$r2
297         umlal   r2,r3,$h0,$r3
298         ldr     $h0,[sp,#0]
299         mul     $h4,$r0,$h4
300         ldr     $h1,[sp,#4]
301
302         adds    $h2,lr,r0               @ d2+=d1>>32
303         ldr     lr,[sp,#8]              @ reload input pointer
304         adc     r1,r1,#0
305         adds    $h3,r2,r1               @ d3+=d2>>32
306         ldr     r0,[sp,#16]             @ reload end pointer
307         adc     r3,r3,#0
308         add     $h4,$h4,r3              @ h4+=d3>>32
309
310         and     r1,$h4,#-4
311         and     $h4,$h4,#3
312         add     r1,r1,r1,lsr#2          @ *=5
313         adds    $h0,$h0,r1
314         adcs    $h1,$h1,#0
315         adcs    $h2,$h2,#0
316         adc     $h3,$h3,#0
317
318         cmp     r0,lr                   @ done yet?
319         bhi     .Loop
320
321         ldr     $ctx,[sp,#12]
322         add     sp,sp,#32
323         stmia   $ctx,{$h0-$h4}          @ store the result
324
325 .Lno_data:
326 #if     __ARM_ARCH__>=5
327         ldmia   sp!,{r3-r11,pc}
328 #else
329         ldmia   sp!,{r3-r11,lr}
330         tst     lr,#1
331         moveq   pc,lr                   @ be binary compatible with V4, yet
332         bx      lr                      @ interoperable with Thumb ISA:-)
333 #endif
334 .size   poly1305_blocks,.-poly1305_blocks
335 ___
336 }
337 {
338 my ($ctx,$mac,$nonce)=map("r$_",(0..2));
339 my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
340 my $g4=$h4;
341
342 $code.=<<___;
343 .type   poly1305_emit,%function
344 .align  5
345 poly1305_emit:
346         stmdb   sp!,{r4-r11}
347 .Lpoly1305_emit_enter:
348
349         ldmia   $ctx,{$h0-$h4}
350         adds    $g0,$h0,#5              @ compare to modulus
351         adcs    $g1,$h1,#0
352         adcs    $g2,$h2,#0
353         adcs    $g3,$h3,#0
354         adc     $g4,$h4,#0
355         tst     $g4,#4                  @ did it carry/borrow?
356
357 #ifdef  __thumb2__
358         it      ne
359 #endif
360         movne   $h0,$g0
361         ldr     $g0,[$nonce,#0]
362 #ifdef  __thumb2__
363         it      ne
364 #endif
365         movne   $h1,$g1
366         ldr     $g1,[$nonce,#4]
367 #ifdef  __thumb2__
368         it      ne
369 #endif
370         movne   $h2,$g2
371         ldr     $g2,[$nonce,#8]
372 #ifdef  __thumb2__
373         it      ne
374 #endif
375         movne   $h3,$g3
376         ldr     $g3,[$nonce,#12]
377
378         adds    $h0,$h0,$g0
379         adcs    $h1,$h1,$g1
380         adcs    $h2,$h2,$g2
381         adc     $h3,$h3,$g3
382
383 #if __ARM_ARCH__>=7
384 # ifdef __ARMEB__
385         rev     $h0,$h0
386         rev     $h1,$h1
387         rev     $h2,$h2
388         rev     $h3,$h3
389 # endif
390         str     $h0,[$mac,#0]
391         str     $h1,[$mac,#4]
392         str     $h2,[$mac,#8]
393         str     $h3,[$mac,#12]
394 #else
395         strb    $h0,[$mac,#0]
396         mov     $h0,$h0,lsr#8
397         strb    $h1,[$mac,#4]
398         mov     $h1,$h1,lsr#8
399         strb    $h2,[$mac,#8]
400         mov     $h2,$h2,lsr#8
401         strb    $h3,[$mac,#12]
402         mov     $h3,$h3,lsr#8
403
404         strb    $h0,[$mac,#1]
405         mov     $h0,$h0,lsr#8
406         strb    $h1,[$mac,#5]
407         mov     $h1,$h1,lsr#8
408         strb    $h2,[$mac,#9]
409         mov     $h2,$h2,lsr#8
410         strb    $h3,[$mac,#13]
411         mov     $h3,$h3,lsr#8
412
413         strb    $h0,[$mac,#2]
414         mov     $h0,$h0,lsr#8
415         strb    $h1,[$mac,#6]
416         mov     $h1,$h1,lsr#8
417         strb    $h2,[$mac,#10]
418         mov     $h2,$h2,lsr#8
419         strb    $h3,[$mac,#14]
420         mov     $h3,$h3,lsr#8
421
422         strb    $h0,[$mac,#3]
423         strb    $h1,[$mac,#7]
424         strb    $h2,[$mac,#11]
425         strb    $h3,[$mac,#15]
426 #endif
427         ldmia   sp!,{r4-r11}
428 #if     __ARM_ARCH__>=5
429         ret                             @ bx    lr
430 #else
431         tst     lr,#1
432         moveq   pc,lr                   @ be binary compatible with V4, yet
433         bx      lr                      @ interoperable with Thumb ISA:-)
434 #endif
435 .size   poly1305_emit,.-poly1305_emit
436 ___
437 {
438 my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
439 my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
440 my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
441
442 my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
443
444 $code.=<<___;
445 #if     __ARM_MAX_ARCH__>=7
446 .fpu    neon
447
448 .type   poly1305_init_neon,%function
449 .align  5
450 poly1305_init_neon:
451         ldr     r4,[$ctx,#20]           @ load key base 2^32
452         ldr     r5,[$ctx,#24]
453         ldr     r6,[$ctx,#28]
454         ldr     r7,[$ctx,#32]
455
456         and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
457         mov     r3,r4,lsr#26
458         mov     r4,r5,lsr#20
459         orr     r3,r3,r5,lsl#6
460         mov     r5,r6,lsr#14
461         orr     r4,r4,r6,lsl#12
462         mov     r6,r7,lsr#8
463         orr     r5,r5,r7,lsl#18
464         and     r3,r3,#0x03ffffff
465         and     r4,r4,#0x03ffffff
466         and     r5,r5,#0x03ffffff
467
468         vdup.32 $R0,r2                  @ r^1 in both lanes
469         add     r2,r3,r3,lsl#2          @ *5
470         vdup.32 $R1,r3
471         add     r3,r4,r4,lsl#2
472         vdup.32 $S1,r2
473         vdup.32 $R2,r4
474         add     r4,r5,r5,lsl#2
475         vdup.32 $S2,r3
476         vdup.32 $R3,r5
477         add     r5,r6,r6,lsl#2
478         vdup.32 $S3,r4
479         vdup.32 $R4,r6
480         vdup.32 $S4,r5
481
482         mov     $zeros,#2               @ counter
483
484 .Lsquare_neon:
485         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
486         @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
487         @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
488         @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
489         @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
490         @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
491
492         vmull.u32       $D0,$R0,${R0}[1]
493         vmull.u32       $D1,$R1,${R0}[1]
494         vmull.u32       $D2,$R2,${R0}[1]
495         vmull.u32       $D3,$R3,${R0}[1]
496         vmull.u32       $D4,$R4,${R0}[1]
497
498         vmlal.u32       $D0,$R4,${S1}[1]
499         vmlal.u32       $D1,$R0,${R1}[1]
500         vmlal.u32       $D2,$R1,${R1}[1]
501         vmlal.u32       $D3,$R2,${R1}[1]
502         vmlal.u32       $D4,$R3,${R1}[1]
503
504         vmlal.u32       $D0,$R3,${S2}[1]
505         vmlal.u32       $D1,$R4,${S2}[1]
506         vmlal.u32       $D3,$R1,${R2}[1]
507         vmlal.u32       $D2,$R0,${R2}[1]
508         vmlal.u32       $D4,$R2,${R2}[1]
509
510         vmlal.u32       $D0,$R2,${S3}[1]
511         vmlal.u32       $D3,$R0,${R3}[1]
512         vmlal.u32       $D1,$R3,${S3}[1]
513         vmlal.u32       $D2,$R4,${S3}[1]
514         vmlal.u32       $D4,$R1,${R3}[1]
515
516         vmlal.u32       $D3,$R4,${S4}[1]
517         vmlal.u32       $D0,$R1,${S4}[1]
518         vmlal.u32       $D1,$R2,${S4}[1]
519         vmlal.u32       $D2,$R3,${S4}[1]
520         vmlal.u32       $D4,$R0,${R4}[1]
521
522         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
523         @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
524         @ and P. Schwabe
525
526         vshr.u64        $T0,$D3,#26
527         vmovn.i64       $D3#lo,$D3
528          vshr.u64       $T1,$D0,#26
529          vmovn.i64      $D0#lo,$D0
530         vadd.i64        $D4,$D4,$T0             @ h3 -> h4
531         vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
532          vadd.i64       $D1,$D1,$T1             @ h0 -> h1
533          vbic.i32       $D0#lo,#0xfc000000
534
535         vshrn.u64       $T0#lo,$D4,#26
536         vmovn.i64       $D4#lo,$D4
537          vshr.u64       $T1,$D1,#26
538          vmovn.i64      $D1#lo,$D1
539          vadd.i64       $D2,$D2,$T1             @ h1 -> h2
540         vbic.i32        $D4#lo,#0xfc000000
541          vbic.i32       $D1#lo,#0xfc000000
542
543         vadd.i32        $D0#lo,$D0#lo,$T0#lo
544         vshl.u32        $T0#lo,$T0#lo,#2
545          vshrn.u64      $T1#lo,$D2,#26
546          vmovn.i64      $D2#lo,$D2
547         vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
548          vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
549          vbic.i32       $D2#lo,#0xfc000000
550
551         vshr.u32        $T0#lo,$D0#lo,#26
552         vbic.i32        $D0#lo,#0xfc000000
553          vshr.u32       $T1#lo,$D3#lo,#26
554          vbic.i32       $D3#lo,#0xfc000000
555         vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
556          vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
557
558         subs            $zeros,$zeros,#1
559         beq             .Lsquare_break_neon
560
561         add             $tbl0,$ctx,#(48+0*9*4)
562         add             $tbl1,$ctx,#(48+1*9*4)
563
564         vtrn.32         $R0,$D0#lo              @ r^2:r^1
565         vtrn.32         $R2,$D2#lo
566         vtrn.32         $R3,$D3#lo
567         vtrn.32         $R1,$D1#lo
568         vtrn.32         $R4,$D4#lo
569
570         vshl.u32        $S2,$R2,#2              @ *5
571         vshl.u32        $S3,$R3,#2
572         vshl.u32        $S1,$R1,#2
573         vshl.u32        $S4,$R4,#2
574         vadd.i32        $S2,$S2,$R2
575         vadd.i32        $S1,$S1,$R1
576         vadd.i32        $S3,$S3,$R3
577         vadd.i32        $S4,$S4,$R4
578
579         vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
580         vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
581         vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
582         vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
583         vst1.32         {${S4}[0]},[$tbl0,:32]
584         vst1.32         {${S4}[1]},[$tbl1,:32]
585
586         b               .Lsquare_neon
587
588 .align  4
589 .Lsquare_break_neon:
590         add             $tbl0,$ctx,#(48+2*4*9)
591         add             $tbl1,$ctx,#(48+3*4*9)
592
593         vmov            $R0,$D0#lo              @ r^4:r^3
594         vshl.u32        $S1,$D1#lo,#2           @ *5
595         vmov            $R1,$D1#lo
596         vshl.u32        $S2,$D2#lo,#2
597         vmov            $R2,$D2#lo
598         vshl.u32        $S3,$D3#lo,#2
599         vmov            $R3,$D3#lo
600         vshl.u32        $S4,$D4#lo,#2
601         vmov            $R4,$D4#lo
602         vadd.i32        $S1,$S1,$D1#lo
603         vadd.i32        $S2,$S2,$D2#lo
604         vadd.i32        $S3,$S3,$D3#lo
605         vadd.i32        $S4,$S4,$D4#lo
606
607         vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
608         vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
609         vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
610         vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
611         vst1.32         {${S4}[0]},[$tbl0]
612         vst1.32         {${S4}[1]},[$tbl1]
613
614         ret                             @ bx    lr
615 .size   poly1305_init_neon,.-poly1305_init_neon
616
617 .type   poly1305_blocks_neon,%function
618 .align  5
619 poly1305_blocks_neon:
620         ldr     ip,[$ctx,#36]           @ is_base2_26
621         ands    $len,$len,#-16
622         beq     .Lno_data_neon
623
624         cmp     $len,#64
625         bhs     .Lenter_neon
626         tst     ip,ip                   @ is_base2_26?
627         beq     poly1305_blocks
628
629 .Lenter_neon:
630         stmdb   sp!,{r4-r7}
631         vstmdb  sp!,{d8-d15}            @ ABI specification says so
632
633         tst     ip,ip                   @ is_base2_26?
634         bne     .Lbase2_26_neon
635
636         stmdb   sp!,{r1-r3,lr}
637         bl      poly1305_init_neon
638
639         ldr     r4,[$ctx,#0]            @ load hash value base 2^32
640         ldr     r5,[$ctx,#4]
641         ldr     r6,[$ctx,#8]
642         ldr     r7,[$ctx,#12]
643         ldr     ip,[$ctx,#16]
644
645         and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
646         mov     r3,r4,lsr#26
647          veor   $D0#lo,$D0#lo,$D0#lo
648         mov     r4,r5,lsr#20
649         orr     r3,r3,r5,lsl#6
650          veor   $D1#lo,$D1#lo,$D1#lo
651         mov     r5,r6,lsr#14
652         orr     r4,r4,r6,lsl#12
653          veor   $D2#lo,$D2#lo,$D2#lo
654         mov     r6,r7,lsr#8
655         orr     r5,r5,r7,lsl#18
656          veor   $D3#lo,$D3#lo,$D3#lo
657         and     r3,r3,#0x03ffffff
658         orr     r6,r6,ip,lsl#24
659          veor   $D4#lo,$D4#lo,$D4#lo
660         and     r4,r4,#0x03ffffff
661         mov     r1,#1
662         and     r5,r5,#0x03ffffff
663         str     r1,[$ctx,#36]           @ is_base2_26
664
665         vmov.32 $D0#lo[0],r2
666         vmov.32 $D1#lo[0],r3
667         vmov.32 $D2#lo[0],r4
668         vmov.32 $D3#lo[0],r5
669         vmov.32 $D4#lo[0],r6
670         adr     $zeros,.Lzeros
671
672         ldmia   sp!,{r1-r3,lr}
673         b       .Lbase2_32_neon
674
675 .align  4
676 .Lbase2_26_neon:
677         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
678         @ load hash value
679
680         veor            $D0#lo,$D0#lo,$D0#lo
681         veor            $D1#lo,$D1#lo,$D1#lo
682         veor            $D2#lo,$D2#lo,$D2#lo
683         veor            $D3#lo,$D3#lo,$D3#lo
684         veor            $D4#lo,$D4#lo,$D4#lo
685         vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
686         adr             $zeros,.Lzeros
687         vld1.32         {$D4#lo[0]},[$ctx]
688         sub             $ctx,$ctx,#16           @ rewind
689
690 .Lbase2_32_neon:
691         add             $in2,$inp,#32
692         mov             $padbit,$padbit,lsl#24
693         tst             $len,#31
694         beq             .Leven
695
696         vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
697         vmov.32         $H4#lo[0],$padbit
698         sub             $len,$len,#16
699         add             $in2,$inp,#32
700
701 # ifdef __ARMEB__
702         vrev32.8        $H0,$H0
703         vrev32.8        $H3,$H3
704         vrev32.8        $H1,$H1
705         vrev32.8        $H2,$H2
706 # endif
707         vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
708         vshl.u32        $H3#lo,$H3#lo,#18
709
710         vsri.u32        $H3#lo,$H2#lo,#14
711         vshl.u32        $H2#lo,$H2#lo,#12
712         vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi
713
714         vbic.i32        $H3#lo,#0xfc000000
715         vsri.u32        $H2#lo,$H1#lo,#20
716         vshl.u32        $H1#lo,$H1#lo,#6
717
718         vbic.i32        $H2#lo,#0xfc000000
719         vsri.u32        $H1#lo,$H0#lo,#26
720         vadd.i32        $H3#hi,$H3#lo,$D3#lo
721
722         vbic.i32        $H0#lo,#0xfc000000
723         vbic.i32        $H1#lo,#0xfc000000
724         vadd.i32        $H2#hi,$H2#lo,$D2#lo
725
726         vadd.i32        $H0#hi,$H0#lo,$D0#lo
727         vadd.i32        $H1#hi,$H1#lo,$D1#lo
728
729         mov             $tbl1,$zeros
730         add             $tbl0,$ctx,#48
731
732         cmp             $len,$len
733         b               .Long_tail
734
735 .align  4
736 .Leven:
737         subs            $len,$len,#64
738 # ifdef __thumb2__
739         it              lo
740 # endif
741         movlo           $in2,$zeros
742
743         vmov.i32        $H4,#1<<24              @ padbit, yes, always
744         vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
745         add             $inp,$inp,#64
746         vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
747         add             $in2,$in2,#64
748 # ifdef __thumb2__
749         itt             hi
750 # endif
751         addhi           $tbl1,$ctx,#(48+1*9*4)
752         addhi           $tbl0,$ctx,#(48+3*9*4)
753
754 # ifdef __ARMEB__
755         vrev32.8        $H0,$H0
756         vrev32.8        $H3,$H3
757         vrev32.8        $H1,$H1
758         vrev32.8        $H2,$H2
759 # endif
760         vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
761         vshl.u32        $H3,$H3,#18
762
763         vsri.u32        $H3,$H2,#14
764         vshl.u32        $H2,$H2,#12
765
766         vbic.i32        $H3,#0xfc000000
767         vsri.u32        $H2,$H1,#20
768         vshl.u32        $H1,$H1,#6
769
770         vbic.i32        $H2,#0xfc000000
771         vsri.u32        $H1,$H0,#26
772
773         vbic.i32        $H0,#0xfc000000
774         vbic.i32        $H1,#0xfc000000
775
776         bls             .Lskip_loop
777
778         vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
779         vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
780         vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
781         vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
782         b               .Loop_neon
783
784 .align  5
785 .Loop_neon:
786         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
787         @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
788         @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
789         @   \___________________/
790         @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
791         @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
792         @   \___________________/ \____________________/
793         @
794         @ Note that we start with inp[2:3]*r^2. This is because it
795         @ doesn't depend on reduction in previous iteration.
796         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797         @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
798         @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
799         @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
800         @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
801         @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
802
803         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
804         @ inp[2:3]*r^2
805
806         vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
807         vmull.u32       $D2,$H2#hi,${R0}[1]
808         vadd.i32        $H0#lo,$H0#lo,$D0#lo
809         vmull.u32       $D0,$H0#hi,${R0}[1]
810         vadd.i32        $H3#lo,$H3#lo,$D3#lo
811         vmull.u32       $D3,$H3#hi,${R0}[1]
812         vmlal.u32       $D2,$H1#hi,${R1}[1]
813         vadd.i32        $H1#lo,$H1#lo,$D1#lo
814         vmull.u32       $D1,$H1#hi,${R0}[1]
815
816         vadd.i32        $H4#lo,$H4#lo,$D4#lo
817         vmull.u32       $D4,$H4#hi,${R0}[1]
818         subs            $len,$len,#64
819         vmlal.u32       $D0,$H4#hi,${S1}[1]
820 # ifdef __thumb2__
821         it              lo
822 # endif
823         movlo           $in2,$zeros
824         vmlal.u32       $D3,$H2#hi,${R1}[1]
825         vld1.32         ${S4}[1],[$tbl1,:32]
826         vmlal.u32       $D1,$H0#hi,${R1}[1]
827         vmlal.u32       $D4,$H3#hi,${R1}[1]
828
829         vmlal.u32       $D0,$H3#hi,${S2}[1]
830         vmlal.u32       $D3,$H1#hi,${R2}[1]
831         vmlal.u32       $D4,$H2#hi,${R2}[1]
832         vmlal.u32       $D1,$H4#hi,${S2}[1]
833         vmlal.u32       $D2,$H0#hi,${R2}[1]
834
835         vmlal.u32       $D3,$H0#hi,${R3}[1]
836         vmlal.u32       $D0,$H2#hi,${S3}[1]
837         vmlal.u32       $D4,$H1#hi,${R3}[1]
838         vmlal.u32       $D1,$H3#hi,${S3}[1]
839         vmlal.u32       $D2,$H4#hi,${S3}[1]
840
841         vmlal.u32       $D3,$H4#hi,${S4}[1]
842         vmlal.u32       $D0,$H1#hi,${S4}[1]
843         vmlal.u32       $D4,$H0#hi,${R4}[1]
844         vmlal.u32       $D1,$H2#hi,${S4}[1]
845         vmlal.u32       $D2,$H3#hi,${S4}[1]
846
847         vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
848         add             $in2,$in2,#64
849
850         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
851         @ (hash+inp[0:1])*r^4 and accumulate
852
853         vmlal.u32       $D3,$H3#lo,${R0}[0]
854         vmlal.u32       $D0,$H0#lo,${R0}[0]
855         vmlal.u32       $D4,$H4#lo,${R0}[0]
856         vmlal.u32       $D1,$H1#lo,${R0}[0]
857         vmlal.u32       $D2,$H2#lo,${R0}[0]
858         vld1.32         ${S4}[0],[$tbl0,:32]
859
860         vmlal.u32       $D3,$H2#lo,${R1}[0]
861         vmlal.u32       $D0,$H4#lo,${S1}[0]
862         vmlal.u32       $D4,$H3#lo,${R1}[0]
863         vmlal.u32       $D1,$H0#lo,${R1}[0]
864         vmlal.u32       $D2,$H1#lo,${R1}[0]
865
866         vmlal.u32       $D3,$H1#lo,${R2}[0]
867         vmlal.u32       $D0,$H3#lo,${S2}[0]
868         vmlal.u32       $D4,$H2#lo,${R2}[0]
869         vmlal.u32       $D1,$H4#lo,${S2}[0]
870         vmlal.u32       $D2,$H0#lo,${R2}[0]
871
872         vmlal.u32       $D3,$H0#lo,${R3}[0]
873         vmlal.u32       $D0,$H2#lo,${S3}[0]
874         vmlal.u32       $D4,$H1#lo,${R3}[0]
875         vmlal.u32       $D1,$H3#lo,${S3}[0]
876         vmlal.u32       $D3,$H4#lo,${S4}[0]
877
878         vmlal.u32       $D2,$H4#lo,${S3}[0]
879         vmlal.u32       $D0,$H1#lo,${S4}[0]
880         vmlal.u32       $D4,$H0#lo,${R4}[0]
881         vmov.i32        $H4,#1<<24              @ padbit, yes, always
882         vmlal.u32       $D1,$H2#lo,${S4}[0]
883         vmlal.u32       $D2,$H3#lo,${S4}[0]
884
885         vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
886         add             $inp,$inp,#64
887 # ifdef __ARMEB__
888         vrev32.8        $H0,$H0
889         vrev32.8        $H1,$H1
890         vrev32.8        $H2,$H2
891         vrev32.8        $H3,$H3
892 # endif
893
894         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
895         @ lazy reduction interleaved with base 2^32 -> base 2^26
896
897         vshr.u64        $T0,$D3,#26
898         vmovn.i64       $D3#lo,$D3
899          vshr.u64       $T1,$D0,#26
900          vmovn.i64      $D0#lo,$D0
901         vadd.i64        $D4,$D4,$T0             @ h3 -> h4
902         vbic.i32        $D3#lo,#0xfc000000
903           vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
904          vadd.i64       $D1,$D1,$T1             @ h0 -> h1
905           vshl.u32      $H3,$H3,#18
906          vbic.i32       $D0#lo,#0xfc000000
907
908         vshrn.u64       $T0#lo,$D4,#26
909         vmovn.i64       $D4#lo,$D4
910          vshr.u64       $T1,$D1,#26
911          vmovn.i64      $D1#lo,$D1
912          vadd.i64       $D2,$D2,$T1             @ h1 -> h2
913           vsri.u32      $H3,$H2,#14
914         vbic.i32        $D4#lo,#0xfc000000
915           vshl.u32      $H2,$H2,#12
916          vbic.i32       $D1#lo,#0xfc000000
917
918         vadd.i32        $D0#lo,$D0#lo,$T0#lo
919         vshl.u32        $T0#lo,$T0#lo,#2
920           vbic.i32      $H3,#0xfc000000
921          vshrn.u64      $T1#lo,$D2,#26
922          vmovn.i64      $D2#lo,$D2
923         vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
924           vsri.u32      $H2,$H1,#20
925          vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
926           vshl.u32      $H1,$H1,#6
927          vbic.i32       $D2#lo,#0xfc000000
928           vbic.i32      $H2,#0xfc000000
929
930         vshr.u32        $T0#lo,$D0#lo,#26
931         vbic.i32        $D0#lo,#0xfc000000
932           vsri.u32      $H1,$H0,#26
933           vbic.i32      $H0,#0xfc000000
934          vshr.u32       $T1#lo,$D3#lo,#26
935          vbic.i32       $D3#lo,#0xfc000000
936         vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
937          vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
938           vbic.i32      $H1,#0xfc000000
939
940         bhi             .Loop_neon
941
942 .Lskip_loop:
943         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
944         @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
945
946         add             $tbl1,$ctx,#(48+0*9*4)
947         add             $tbl0,$ctx,#(48+1*9*4)
948         adds            $len,$len,#32
949 # ifdef __thumb2__
950         it              ne
951 # endif
952         movne           $len,#0
953         bne             .Long_tail
954
955         vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
956         vadd.i32        $H0#hi,$H0#lo,$D0#lo
957         vadd.i32        $H3#hi,$H3#lo,$D3#lo
958         vadd.i32        $H1#hi,$H1#lo,$D1#lo
959         vadd.i32        $H4#hi,$H4#lo,$D4#lo
960
961 .Long_tail:
962         vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
963         vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2
964
965         vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
966         vmull.u32       $D2,$H2#hi,$R0
967         vadd.i32        $H0#lo,$H0#lo,$D0#lo
968         vmull.u32       $D0,$H0#hi,$R0
969         vadd.i32        $H3#lo,$H3#lo,$D3#lo
970         vmull.u32       $D3,$H3#hi,$R0
971         vadd.i32        $H1#lo,$H1#lo,$D1#lo
972         vmull.u32       $D1,$H1#hi,$R0
973         vadd.i32        $H4#lo,$H4#lo,$D4#lo
974         vmull.u32       $D4,$H4#hi,$R0
975
976         vmlal.u32       $D0,$H4#hi,$S1
977         vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
978         vmlal.u32       $D3,$H2#hi,$R1
979         vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
980         vmlal.u32       $D1,$H0#hi,$R1
981         vmlal.u32       $D4,$H3#hi,$R1
982         vmlal.u32       $D2,$H1#hi,$R1
983
984         vmlal.u32       $D3,$H1#hi,$R2
985         vld1.32         ${S4}[1],[$tbl1,:32]
986         vmlal.u32       $D0,$H3#hi,$S2
987         vld1.32         ${S4}[0],[$tbl0,:32]
988         vmlal.u32       $D4,$H2#hi,$R2
989         vmlal.u32       $D1,$H4#hi,$S2
990         vmlal.u32       $D2,$H0#hi,$R2
991
992         vmlal.u32       $D3,$H0#hi,$R3
993 # ifdef __thumb2__
994         it              ne
995 # endif
996          addne          $tbl1,$ctx,#(48+2*9*4)
997         vmlal.u32       $D0,$H2#hi,$S3
998 # ifdef __thumb2__
999         it              ne
1000 # endif
1001          addne          $tbl0,$ctx,#(48+3*9*4)
1002         vmlal.u32       $D4,$H1#hi,$R3
1003         vmlal.u32       $D1,$H3#hi,$S3
1004         vmlal.u32       $D2,$H4#hi,$S3
1005
1006         vmlal.u32       $D3,$H4#hi,$S4
1007          vmov.u64       $MASK,#-1               @ can be redundant
1008         vmlal.u32       $D0,$H1#hi,$S4
1009          vshr.u64       $MASK,$MASK,#38
1010         vmlal.u32       $D4,$H0#hi,$R4
1011         vmlal.u32       $D1,$H2#hi,$S4
1012         vmlal.u32       $D2,$H3#hi,$S4
1013
1014         beq             .Lshort_tail
1015
1016         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1017         @ (hash+inp[0:1])*r^4:r^3 and accumulate
1018
1019         vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
1020         vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
1021
1022         vmlal.u32       $D2,$H2#lo,$R0
1023         vmlal.u32       $D0,$H0#lo,$R0
1024         vmlal.u32       $D3,$H3#lo,$R0
1025         vmlal.u32       $D1,$H1#lo,$R0
1026         vmlal.u32       $D4,$H4#lo,$R0
1027
1028         vmlal.u32       $D0,$H4#lo,$S1
1029         vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1030         vmlal.u32       $D3,$H2#lo,$R1
1031         vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1032         vmlal.u32       $D1,$H0#lo,$R1
1033         vmlal.u32       $D4,$H3#lo,$R1
1034         vmlal.u32       $D2,$H1#lo,$R1
1035
1036         vmlal.u32       $D3,$H1#lo,$R2
1037         vld1.32         ${S4}[1],[$tbl1,:32]
1038         vmlal.u32       $D0,$H3#lo,$S2
1039         vld1.32         ${S4}[0],[$tbl0,:32]
1040         vmlal.u32       $D4,$H2#lo,$R2
1041         vmlal.u32       $D1,$H4#lo,$S2
1042         vmlal.u32       $D2,$H0#lo,$R2
1043
1044         vmlal.u32       $D3,$H0#lo,$R3
1045         vmlal.u32       $D0,$H2#lo,$S3
1046         vmlal.u32       $D4,$H1#lo,$R3
1047         vmlal.u32       $D1,$H3#lo,$S3
1048         vmlal.u32       $D2,$H4#lo,$S3
1049
1050         vmlal.u32       $D3,$H4#lo,$S4
1051          vmov.u64       $MASK,#-1
1052         vmlal.u32       $D0,$H1#lo,$S4
1053          vshr.u64       $MASK,$MASK,#38
1054         vmlal.u32       $D4,$H0#lo,$R4
1055         vmlal.u32       $D1,$H2#lo,$S4
1056         vmlal.u32       $D2,$H3#lo,$S4
1057
1058 .Lshort_tail:
1059         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1060         @ lazy reduction, but without narrowing
1061
1062         vshr.u64        $T0,$D3,#26
1063         vand.i64        $D3,$D3,$MASK
1064          vshr.u64       $T1,$D0,#26
1065          vand.i64       $D0,$D0,$MASK
1066         vadd.i64        $D4,$D4,$T0             @ h3 -> h4
1067          vadd.i64       $D1,$D1,$T1             @ h0 -> h1
1068
1069         vshr.u64        $T0,$D4,#26
1070         vand.i64        $D4,$D4,$MASK
1071          vshr.u64       $T1,$D1,#26
1072          vand.i64       $D1,$D1,$MASK
1073          vadd.i64       $D2,$D2,$T1             @ h1 -> h2
1074
1075         vadd.i64        $D0,$D0,$T0
1076         vshl.u64        $T0,$T0,#2
1077          vshr.u64       $T1,$D2,#26
1078          vand.i64       $D2,$D2,$MASK
1079         vadd.i64        $D0,$D0,$T0             @ h4 -> h0
1080          vadd.i64       $D3,$D3,$T1             @ h2 -> h3
1081
1082         vshr.u64        $T0,$D0,#26
1083         vand.i64        $D0,$D0,$MASK
1084          vshr.u64       $T1,$D3,#26
1085          vand.i64       $D3,$D3,$MASK
1086         vadd.i64        $D1,$D1,$T0             @ h0 -> h1
1087          vadd.i64       $D4,$D4,$T1             @ h3 -> h4
1088
1089         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1090         @ horizontal addition
1091
1092         vadd.i64        $D2#lo,$D2#lo,$D2#hi
1093         vadd.i64        $D0#lo,$D0#lo,$D0#hi
1094         vadd.i64        $D3#lo,$D3#lo,$D3#hi
1095         vadd.i64        $D1#lo,$D1#lo,$D1#hi
1096         vadd.i64        $D4#lo,$D4#lo,$D4#hi
1097
1098         cmp             $len,#0
1099         bne             .Leven
1100
1101         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1102         @ store hash value
1103
1104         vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
1105         vst1.32         {$D4#lo[0]},[$ctx]
1106
1107         vldmia  sp!,{d8-d15}                    @ epilogue
1108         ldmia   sp!,{r4-r7}
1109 .Lno_data_neon:
1110         ret                                     @ bx    lr
1111 .size   poly1305_blocks_neon,.-poly1305_blocks_neon
1112
1113 .type   poly1305_emit_neon,%function
1114 .align  5
1115 poly1305_emit_neon:
1116         ldr     ip,[$ctx,#36]           @ is_base2_26
1117
1118         stmdb   sp!,{r4-r11}
1119
1120         tst     ip,ip
1121         beq     .Lpoly1305_emit_enter
1122
1123         ldmia   $ctx,{$h0-$h4}
1124         eor     $g0,$g0,$g0
1125
1126         adds    $h0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
1127         mov     $h1,$h1,lsr#6
1128         adcs    $h1,$h1,$h2,lsl#20
1129         mov     $h2,$h2,lsr#12
1130         adcs    $h2,$h2,$h3,lsl#14
1131         mov     $h3,$h3,lsr#18
1132         adcs    $h3,$h3,$h4,lsl#8
1133         adc     $h4,$g0,$h4,lsr#24      @ can be partially reduced ...
1134
1135         and     $g0,$h4,#-4             @ ... so reduce
1136         and     $h4,$h3,#3
1137         add     $g0,$g0,$g0,lsr#2       @ *= 5
1138         adds    $h0,$h0,$g0
1139         adcs    $h1,$h1,#0
1140         adcs    $h2,$h2,#0
1141         adc     $h3,$h3,#0
1142
1143         adds    $g0,$h0,#5              @ compare to modulus
1144         adcs    $g1,$h1,#0
1145         adcs    $g2,$h2,#0
1146         adcs    $g3,$h3,#0
1147         adc     $g4,$h4,#0
1148         tst     $g4,#4                  @ did it carry/borrow?
1149
1150 # ifdef __thumb2__
1151         it      ne
1152 # endif
1153         movne   $h0,$g0
1154         ldr     $g0,[$nonce,#0]
1155 # ifdef __thumb2__
1156         it      ne
1157 # endif
1158         movne   $h1,$g1
1159         ldr     $g1,[$nonce,#4]
1160 # ifdef __thumb2__
1161         it      ne
1162 # endif
1163         movne   $h2,$g2
1164         ldr     $g2,[$nonce,#8]
1165 # ifdef __thumb2__
1166         it      ne
1167 # endif
1168         movne   $h3,$g3
1169         ldr     $g3,[$nonce,#12]
1170
1171         adds    $h0,$h0,$g0             @ accumulate nonce
1172         adcs    $h1,$h1,$g1
1173         adcs    $h2,$h2,$g2
1174         adc     $h3,$h3,$g3
1175
1176 # ifdef __ARMEB__
1177         rev     $h0,$h0
1178         rev     $h1,$h1
1179         rev     $h2,$h2
1180         rev     $h3,$h3
1181 # endif
1182         str     $h0,[$mac,#0]           @ store the result
1183         str     $h1,[$mac,#4]
1184         str     $h2,[$mac,#8]
1185         str     $h3,[$mac,#12]
1186
1187         ldmia   sp!,{r4-r11}
1188         ret                             @ bx    lr
1189 .size   poly1305_emit_neon,.-poly1305_emit_neon
1190
1191 .align  5
1192 .Lzeros:
1193 .long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1194 .LOPENSSL_armcap:
1195 .word   OPENSSL_armcap_P-.Lpoly1305_init
1196 #endif
1197 ___
1198 }       }
# Append the trailing, flavour-independent part of the module: the
# CRYPTOGAMS identification string and, on ARMv7+ capable builds, the
# OPENSSL_armcap_P capability word used for run-time NEON detection.
# (Heredoc content is emitted verbatim; do not reformat.)
$code.=<<___;
.asciz  "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if     __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___
1206
# Post-process the accumulated assembly line by line before it reaches
# either arm-xlate.pl or the output file:
#   1. evaluate `...` arithmetic expressions embedded in the template;
#   2. rewrite qN#lo/qN#hi NEON register halves to their overlapping
#      D-register names (q<N> aliases d<2N> and d<2N+1>);
#   3. translate "ret" to "bx lr", and spell "bx lr" as a raw opcode
#      word so the result still assembles with -march=armv4.
# The three substitutions are chained with "or" so at most one of the
# mutually exclusive rewrites fires per line.
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/\bret\b/bx    lr/go                                           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}
# Check the close: STDOUT may be a pipe into arm-xlate.pl, and buffered
# write errors (broken pipe, full disk) only surface at close time.
close STDOUT or die "error closing STDOUT: $!";         # enforce flush