crypto/poly1305/asm: chase overflow bit on x86 and ARM platforms.
crypto/poly1305/asm/poly1305-armv4.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 #                       IALU(*)/gcc-4.4         NEON
11 #
12 # ARM11xx(ARMv6)        7.78/+100%              -
13 # Cortex-A5             6.35/+130%              3.00
14 # Cortex-A8             6.25/+115%              2.36
15 # Cortex-A9             5.10/+95%               2.55
16 # Cortex-A15            3.85/+85%               1.25(**)
17 # Snapdragon S4         5.70/+100%              1.48(**)
18 #
19 # (*)   this is for -march=armv6, i.e. with a bunch of ldrb loading data;
20 # (**)  these are trade-off results, they can be improved by ~8% but at
21 #       the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
22 #       to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
23
24 $flavour = shift;
25 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
26 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
27
28 if ($flavour && $flavour ne "void") {
29     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
31     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
32     die "can't locate arm-xlate.pl";
33
34     open STDOUT,"| \"$^X\" $xlate $flavour $output";
35 } else {
36     open STDOUT,">$output";
37 }
38
39 ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
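# AAPCS argument registers: $ctx=r0 is the Poly1305 context, $inp=r1 the key
# (poly1305_init) or input pointer (poly1305_blocks), $len=r2 the function-table
# pointer (init) or byte length (blocks), and $padbit=r3 the final-block pad bit.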
40
41 $code.=<<___;
42 #include "arm_arch.h"
43
44 .text
45 #if defined(__thumb2__)
46 .syntax unified
47 .thumb
48 #else
49 .code   32
50 #endif
51
52 .globl  poly1305_emit
53 .globl  poly1305_blocks
54 .globl  poly1305_init
55 .type   poly1305_init,%function
56 .align  5
57 poly1305_init:
58 .Lpoly1305_init:
59         stmdb   sp!,{r4-r11}
60
61         eor     r3,r3,r3
62         cmp     $inp,#0
63         str     r3,[$ctx,#0]            @ zero hash value
64         str     r3,[$ctx,#4]
65         str     r3,[$ctx,#8]
66         str     r3,[$ctx,#12]
67         str     r3,[$ctx,#16]
68         str     r3,[$ctx,#36]           @ is_base2_26
69         add     $ctx,$ctx,#20
70
71 #ifdef  __thumb2__
72         it      eq
73 #endif
74         moveq   r0,#0
75         beq     .Lno_key
76
77 #if     __ARM_MAX_ARCH__>=7
78         adr     r11,.Lpoly1305_init
79         ldr     r12,.LOPENSSL_armcap
80 #endif
81         ldrb    r4,[$inp,#0]
82         mov     r10,#0x0fffffff
83         ldrb    r5,[$inp,#1]
84         and     r3,r10,#-4              @ 0x0ffffffc
85         ldrb    r6,[$inp,#2]
86         ldrb    r7,[$inp,#3]
87         orr     r4,r4,r5,lsl#8
88         ldrb    r5,[$inp,#4]
89         orr     r4,r4,r6,lsl#16
90         ldrb    r6,[$inp,#5]
91         orr     r4,r4,r7,lsl#24
92         ldrb    r7,[$inp,#6]
93         and     r4,r4,r10
94
95 #if     __ARM_MAX_ARCH__>=7
96         ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
97 # ifdef __APPLE__
98         ldr     r12,[r12]
99 # endif
100 #endif
101         ldrb    r8,[$inp,#7]
102         orr     r5,r5,r6,lsl#8
103         ldrb    r6,[$inp,#8]
104         orr     r5,r5,r7,lsl#16
105         ldrb    r7,[$inp,#9]
106         orr     r5,r5,r8,lsl#24
107         ldrb    r8,[$inp,#10]
108         and     r5,r5,r3
109
110 #if     __ARM_MAX_ARCH__>=7
111         tst     r12,#ARMV7_NEON         @ check for NEON
112 # ifdef __APPLE__
113         adr     r9,poly1305_blocks_neon
114         adr     r11,poly1305_blocks
115 #  ifdef __thumb2__
116         it      ne
117 #  endif
118         movne   r11,r9
119         adr     r12,poly1305_emit
120         adr     r10,poly1305_emit_neon
121 #  ifdef __thumb2__
122         it      ne
123 #  endif
124         movne   r12,r10
125 # else
126 #  ifdef __thumb2__
127         itete   eq
128 #  endif
129         addeq   r12,r11,#(poly1305_emit-.Lpoly1305_init)
130         addne   r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
131         addeq   r11,r11,#(poly1305_blocks-.Lpoly1305_init)
132         addne   r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
133 # endif
134 # ifdef __thumb2__
135         orr     r12,r12,#1      @ thumb-ify address
136         orr     r11,r11,#1
137 # endif
138 #endif
139         ldrb    r9,[$inp,#11]
140         orr     r6,r6,r7,lsl#8
141         ldrb    r7,[$inp,#12]
142         orr     r6,r6,r8,lsl#16
143         ldrb    r8,[$inp,#13]
144         orr     r6,r6,r9,lsl#24
145         ldrb    r9,[$inp,#14]
146         and     r6,r6,r3
147
148         ldrb    r10,[$inp,#15]
149         orr     r7,r7,r8,lsl#8
150         str     r4,[$ctx,#0]
151         orr     r7,r7,r9,lsl#16
152         str     r5,[$ctx,#4]
153         orr     r7,r7,r10,lsl#24
154         str     r6,[$ctx,#8]
155         and     r7,r7,r3
156         str     r7,[$ctx,#12]
157 #if     __ARM_MAX_ARCH__>=7
158         stmia   r2,{r11,r12}            @ fill functions table
159         mov     r0,#1
160 #else
161         mov     r0,#0
162 #endif
163 .Lno_key:
164         ldmia   sp!,{r4-r11}
165 #if     __ARM_ARCH__>=5
166         ret                             @ bx    lr
167 #else
168         tst     lr,#1
169         moveq   pc,lr                   @ be binary compatible with V4, yet
170         bx      lr                      @ interoperable with Thumb ISA:-)
171 #endif
172 .size   poly1305_init,.-poly1305_init
173 ___
174 {
175 my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
176 my ($s1,$s2,$s3)=($r1,$r2,$r3);
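# poly1305_blocks register map: $h0-$h4 hold the accumulator in base 2^32,
# i.e. four 32-bit limbs plus the 130-bit value's top bits in $h4; $r0-$r3 hold
# the clamped key. $s1-$s3 alias $r1-$r3 and are overwritten in the loop with
# ri+(ri>>2) = 5*ri/4, the scaled key limbs used for reduction mod 2^130-5.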
177
178 $code.=<<___;
179 .type   poly1305_blocks,%function
180 .align  5
181 poly1305_blocks:
182         stmdb   sp!,{r3-r11,lr}
183
184         ands    $len,$len,#-16
185         beq     .Lno_data
186
187         cmp     $padbit,#0
188         add     $len,$len,$inp          @ end pointer
189         sub     sp,sp,#32
190
191         ldmia   $ctx,{$h0-$r3}          @ load context
192
193         str     $ctx,[sp,#12]           @ offload stuff
194         mov     lr,$inp
195         str     $len,[sp,#16]
196         str     $r1,[sp,#20]
197         str     $r2,[sp,#24]
198         str     $r3,[sp,#28]
199         b       .Loop
200
201 .Loop:
202 #if __ARM_ARCH__<7
203         ldrb    r0,[lr],#16             @ load input
204 # ifdef __thumb2__
205         it      hi
206 # endif
207         addhi   $h4,$h4,#1              @ 1<<128
208         ldrb    r1,[lr,#-15]
209         ldrb    r2,[lr,#-14]
210         ldrb    r3,[lr,#-13]
211         orr     r1,r0,r1,lsl#8
212         ldrb    r0,[lr,#-12]
213         orr     r2,r1,r2,lsl#16
214         ldrb    r1,[lr,#-11]
215         orr     r3,r2,r3,lsl#24
216         ldrb    r2,[lr,#-10]
217         adds    $h0,$h0,r3              @ accumulate input
218
219         ldrb    r3,[lr,#-9]
220         orr     r1,r0,r1,lsl#8
221         ldrb    r0,[lr,#-8]
222         orr     r2,r1,r2,lsl#16
223         ldrb    r1,[lr,#-7]
224         orr     r3,r2,r3,lsl#24
225         ldrb    r2,[lr,#-6]
226         adcs    $h1,$h1,r3
227
228         ldrb    r3,[lr,#-5]
229         orr     r1,r0,r1,lsl#8
230         ldrb    r0,[lr,#-4]
231         orr     r2,r1,r2,lsl#16
232         ldrb    r1,[lr,#-3]
233         orr     r3,r2,r3,lsl#24
234         ldrb    r2,[lr,#-2]
235         adcs    $h2,$h2,r3
236
237         ldrb    r3,[lr,#-1]
238         orr     r1,r0,r1,lsl#8
239         str     lr,[sp,#8]              @ offload input pointer
240         orr     r2,r1,r2,lsl#16
241         add     $s1,$r1,$r1,lsr#2
242         orr     r3,r2,r3,lsl#24
243 #else
244         ldr     r0,[lr],#16             @ load input
245 # ifdef __thumb2__
246         it      hi
247 # endif
248         addhi   $h4,$h4,#1              @ padbit
249         ldr     r1,[lr,#-12]
250         ldr     r2,[lr,#-8]
251         ldr     r3,[lr,#-4]
252 # ifdef __ARMEB__
253         rev     r0,r0
254         rev     r1,r1
255         rev     r2,r2
256         rev     r3,r3
257 # endif
258         adds    $h0,$h0,r0              @ accumulate input
259         str     lr,[sp,#8]              @ offload input pointer
260         adcs    $h1,$h1,r1
261         add     $s1,$r1,$r1,lsr#2
262         adcs    $h2,$h2,r2
263 #endif
264         add     $s2,$r2,$r2,lsr#2
265         adcs    $h3,$h3,r3
266         add     $s3,$r3,$r3,lsr#2
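        @ $sN = rN+(rN>>2) = 5*rN/4, exact because the clamped r1-r3 are
        @ multiples of 4; since 2^130 = 5 (mod 2^130-5), a partial product of
        @ weight 2^128 or above reduces to hi*(5*rj/4) at the wrapped-around
        @ position, which is why those products below use $s1-$s3, not $r1-$r3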
267
268         umull   r2,r3,$h1,$r0
269          adc    $h4,$h4,#0
270         umull   r0,r1,$h0,$r0
271         umlal   r2,r3,$h4,$s1
272         umlal   r0,r1,$h3,$s1
273         ldr     $r1,[sp,#20]            @ reload $r1
274         umlal   r2,r3,$h2,$s3
275         umlal   r0,r1,$h1,$s3
276         umlal   r2,r3,$h3,$s2
277         umlal   r0,r1,$h2,$s2
278         umlal   r2,r3,$h0,$r1
279         str     r0,[sp,#0]              @ future $h0
280          mul    r0,$s2,$h4
281         ldr     $r2,[sp,#24]            @ reload $r2
282         adds    r2,r2,r1                @ d1+=d0>>32
283          eor    r1,r1,r1
284         adc     lr,r3,#0                @ future $h2
285         str     r2,[sp,#4]              @ future $h1
286
287         mul     r2,$s3,$h4
288         eor     r3,r3,r3
289         umlal   r0,r1,$h3,$s3
290         ldr     $r3,[sp,#28]            @ reload $r3
291         umlal   r2,r3,$h3,$r0
292         umlal   r0,r1,$h2,$r0
293         umlal   r2,r3,$h2,$r1
294         umlal   r0,r1,$h1,$r1
295         umlal   r2,r3,$h1,$r2
296         umlal   r0,r1,$h0,$r2
297         umlal   r2,r3,$h0,$r3
298         ldr     $h0,[sp,#0]
299         mul     $h4,$r0,$h4
300         ldr     $h1,[sp,#4]
301
302         adds    $h2,lr,r0               @ d2+=d1>>32
303         ldr     lr,[sp,#8]              @ reload input pointer
304         adc     r1,r1,#0
305         adds    $h3,r2,r1               @ d3+=d2>>32
306         ldr     r0,[sp,#16]             @ reload end pointer
307         adc     r3,r3,#0
308         add     $h4,$h4,r3              @ h4+=d3>>32
309
310         and     r1,$h4,#-4
311         and     $h4,$h4,#3
312         add     r1,r1,r1,lsr#2          @ *=5
313         adds    $h0,$h0,r1
314         adcs    $h1,$h1,#0
315         adcs    $h2,$h2,#0
316         adcs    $h3,$h3,#0
317         adc     $h4,$h4,#0
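        @ the accumulator is left only partially reduced: anything at or above
        @ 2^130 has been folded back in multiplied by 5, and the final
        @ conditional subtraction of the modulus is deferred to poly1305_emit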
318
319         cmp     r0,lr                   @ done yet?
320         bhi     .Loop
321
322         ldr     $ctx,[sp,#12]
323         add     sp,sp,#32
324         stmia   $ctx,{$h0-$h4}          @ store the result
325
326 .Lno_data:
327 #if     __ARM_ARCH__>=5
328         ldmia   sp!,{r3-r11,pc}
329 #else
330         ldmia   sp!,{r3-r11,lr}
331         tst     lr,#1
332         moveq   pc,lr                   @ be binary compatible with V4, yet
333         bx      lr                      @ interoperable with Thumb ISA:-)
334 #endif
335 .size   poly1305_blocks,.-poly1305_blocks
336 ___
337 }
338 {
339 my ($ctx,$mac,$nonce)=map("r$_",(0..2));
340 my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
341 my $g4=$h4;
342
343 $code.=<<___;
344 .type   poly1305_emit,%function
345 .align  5
346 poly1305_emit:
347         stmdb   sp!,{r4-r11}
348 .Lpoly1305_emit_enter:
349
350         ldmia   $ctx,{$h0-$h4}
351         adds    $g0,$h0,#5              @ compare to modulus
352         adcs    $g1,$h1,#0
353         adcs    $g2,$h2,#0
354         adcs    $g3,$h3,#0
355         adc     $g4,$h4,#0
356         tst     $g4,#4                  @ did it carry/borrow?
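        @ h is below 2*(2^130-5), so h+5 carries into bit 130 (bit 2 of $g4)
        @ exactly when h >= 2^130-5; in that case the reduced value h-p, whose
        @ low 128 bits equal those of h+5, is selected below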
357
358 #ifdef  __thumb2__
359         it      ne
360 #endif
361         movne   $h0,$g0
362         ldr     $g0,[$nonce,#0]
363 #ifdef  __thumb2__
364         it      ne
365 #endif
366         movne   $h1,$g1
367         ldr     $g1,[$nonce,#4]
368 #ifdef  __thumb2__
369         it      ne
370 #endif
371         movne   $h2,$g2
372         ldr     $g2,[$nonce,#8]
373 #ifdef  __thumb2__
374         it      ne
375 #endif
376         movne   $h3,$g3
377         ldr     $g3,[$nonce,#12]
378
379         adds    $h0,$h0,$g0
380         adcs    $h1,$h1,$g1
381         adcs    $h2,$h2,$g2
382         adc     $h3,$h3,$g3
383
384 #if __ARM_ARCH__>=7
385 # ifdef __ARMEB__
386         rev     $h0,$h0
387         rev     $h1,$h1
388         rev     $h2,$h2
389         rev     $h3,$h3
390 # endif
391         str     $h0,[$mac,#0]
392         str     $h1,[$mac,#4]
393         str     $h2,[$mac,#8]
394         str     $h3,[$mac,#12]
395 #else
396         strb    $h0,[$mac,#0]
397         mov     $h0,$h0,lsr#8
398         strb    $h1,[$mac,#4]
399         mov     $h1,$h1,lsr#8
400         strb    $h2,[$mac,#8]
401         mov     $h2,$h2,lsr#8
402         strb    $h3,[$mac,#12]
403         mov     $h3,$h3,lsr#8
404
405         strb    $h0,[$mac,#1]
406         mov     $h0,$h0,lsr#8
407         strb    $h1,[$mac,#5]
408         mov     $h1,$h1,lsr#8
409         strb    $h2,[$mac,#9]
410         mov     $h2,$h2,lsr#8
411         strb    $h3,[$mac,#13]
412         mov     $h3,$h3,lsr#8
413
414         strb    $h0,[$mac,#2]
415         mov     $h0,$h0,lsr#8
416         strb    $h1,[$mac,#6]
417         mov     $h1,$h1,lsr#8
418         strb    $h2,[$mac,#10]
419         mov     $h2,$h2,lsr#8
420         strb    $h3,[$mac,#14]
421         mov     $h3,$h3,lsr#8
422
423         strb    $h0,[$mac,#3]
424         strb    $h1,[$mac,#7]
425         strb    $h2,[$mac,#11]
426         strb    $h3,[$mac,#15]
427 #endif
428         ldmia   sp!,{r4-r11}
429 #if     __ARM_ARCH__>=5
430         ret                             @ bx    lr
431 #else
432         tst     lr,#1
433         moveq   pc,lr                   @ be binary compatible with V4, yet
434         bx      lr                      @ interoperable with Thumb ISA:-)
435 #endif
436 .size   poly1305_emit,.-poly1305_emit
437 ___
438 {
439 my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
440 my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
441 my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
442
443 my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
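# NEON register map: $R0-$R4 and $S1-$S4 hold a power of r (and 5*r) in base 2^26,
# one power per 32-bit lane; $H0-$H4 carry the hash/input limbs, $D0-$D4 collect
# the 64-bit partial products, and $MASK provides the 2^26-1 limb mask.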
444
445 $code.=<<___;
446 #if     __ARM_MAX_ARCH__>=7
447 .fpu    neon
448
449 .type   poly1305_init_neon,%function
450 .align  5
451 poly1305_init_neon:
452         ldr     r4,[$ctx,#20]           @ load key base 2^32
453         ldr     r5,[$ctx,#24]
454         ldr     r6,[$ctx,#28]
455         ldr     r7,[$ctx,#32]
456
457         and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
458         mov     r3,r4,lsr#26
459         mov     r4,r5,lsr#20
460         orr     r3,r3,r5,lsl#6
461         mov     r5,r6,lsr#14
462         orr     r4,r4,r6,lsl#12
463         mov     r6,r7,lsr#8
464         orr     r5,r5,r7,lsl#18
465         and     r3,r3,#0x03ffffff
466         and     r4,r4,#0x03ffffff
467         and     r5,r5,#0x03ffffff
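        @ the key is now r2+r3*2^26+r4*2^52+r5*2^78+r6*2^104 with limbs of at
        @ most 26 bits (the top limb is even narrower thanks to key clamping)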
468
469         vdup.32 $R0,r2                  @ r^1 in both lanes
470         add     r2,r3,r3,lsl#2          @ *5
471         vdup.32 $R1,r3
472         add     r3,r4,r4,lsl#2
473         vdup.32 $S1,r2
474         vdup.32 $R2,r4
475         add     r4,r5,r5,lsl#2
476         vdup.32 $S2,r3
477         vdup.32 $R3,r5
478         add     r5,r6,r6,lsl#2
479         vdup.32 $S3,r4
480         vdup.32 $R4,r6
481         vdup.32 $S4,r5
482
483         mov     $zeros,#2               @ counter
484
485 .Lsquare_neon:
486         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
487         @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
488         @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
489         @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
490         @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
491         @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
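        @ the 5*r terms appear because limbs of weight 2^130 and above wrap
        @ around via 2^130 = 5 (mod 2^130-5)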
492
493         vmull.u32       $D0,$R0,${R0}[1]
494         vmull.u32       $D1,$R1,${R0}[1]
495         vmull.u32       $D2,$R2,${R0}[1]
496         vmull.u32       $D3,$R3,${R0}[1]
497         vmull.u32       $D4,$R4,${R0}[1]
498
499         vmlal.u32       $D0,$R4,${S1}[1]
500         vmlal.u32       $D1,$R0,${R1}[1]
501         vmlal.u32       $D2,$R1,${R1}[1]
502         vmlal.u32       $D3,$R2,${R1}[1]
503         vmlal.u32       $D4,$R3,${R1}[1]
504
505         vmlal.u32       $D0,$R3,${S2}[1]
506         vmlal.u32       $D1,$R4,${S2}[1]
507         vmlal.u32       $D3,$R1,${R2}[1]
508         vmlal.u32       $D2,$R0,${R2}[1]
509         vmlal.u32       $D4,$R2,${R2}[1]
510
511         vmlal.u32       $D0,$R2,${S3}[1]
512         vmlal.u32       $D3,$R0,${R3}[1]
513         vmlal.u32       $D1,$R3,${S3}[1]
514         vmlal.u32       $D2,$R4,${S3}[1]
515         vmlal.u32       $D4,$R1,${R3}[1]
516
517         vmlal.u32       $D3,$R4,${S4}[1]
518         vmlal.u32       $D0,$R1,${S4}[1]
519         vmlal.u32       $D1,$R2,${S4}[1]
520         vmlal.u32       $D2,$R3,${S4}[1]
521         vmlal.u32       $D4,$R0,${R4}[1]
522
523         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
524         @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
525         @ and P. Schwabe
526         @
527         @ H0>>+H1>>+H2>>+H3>>+H4
528         @ H3>>+H4>>*5+H0>>+H1
529         @
530         @ Trivia.
531         @
532         @ Result of multiplication of n-bit number by m-bit number is
533         @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
534         @ m-bit number multiplied by 2^n is still n+m bits wide.
535         @
536         @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
537         @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
538         @ one is n+1 bits wide.
539         @
540         @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
541         @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
542         @ can be 27. However! In cases when their width exceeds 26 bits
543         @ they are limited by 2^26+2^6. This in turn means that *sum*
544         @ of the products with these values can still be viewed as a sum
545         @ of 52-bit numbers as long as the number of addends is not a
546         @ power of 2. For example,
547         @
548         @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
549         @
550         @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
551         @ 5 * (2^52 + 2^33 + 2^12), which in turn is smaller than
552         @ 8 * (2^52) or 2^55. However, the value is then multiplied
553         @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
554         @ which is less than 32 * (2^52) or 2^57. And when processing
555         @ data we are looking at three times as many addends...
556         @
557         @ In the key setup procedure the pre-reduced H0 is limited by 5*4+1
558         @ and 5*H4 by 5*5 52-bit addends, or 57 bits. But when hashing the
559         @ input, H0 is limited by (5*4+1)*3 addends, or 58 bits, while
560         @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
561         @ instruction accepts 2x32-bit inputs and writes a 2x64-bit result.
562         @ This means that the result of reduction has to be compressed upon
563         @ loop wrap-around. This can be done in the process of reduction
564         @ to minimize the number of instructions [as well as the number of
565         @ 128-bit instructions, which benefits low-end processors], but
566         @ one has to watch for H2 (which is narrower than H0) and 5*H4
567         @ not being wider than 58 bits, so that the result of the right
568         @ shift by 26 bits fits in 32 bits. This is also useful on x86,
569         @ because it allows one to use paddd in place of paddq, which
570         @ benefits Atom, where paddq is ridiculously slow.
571
572         vshr.u64        $T0,$D3,#26
573         vmovn.i64       $D3#lo,$D3
574          vshr.u64       $T1,$D0,#26
575          vmovn.i64      $D0#lo,$D0
576         vadd.i64        $D4,$D4,$T0             @ h3 -> h4
577         vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
578          vadd.i64       $D1,$D1,$T1             @ h0 -> h1
579          vbic.i32       $D0#lo,#0xfc000000
580
581         vshrn.u64       $T0#lo,$D4,#26
582         vmovn.i64       $D4#lo,$D4
583          vshr.u64       $T1,$D1,#26
584          vmovn.i64      $D1#lo,$D1
585          vadd.i64       $D2,$D2,$T1             @ h1 -> h2
586         vbic.i32        $D4#lo,#0xfc000000
587          vbic.i32       $D1#lo,#0xfc000000
588
589         vadd.i32        $D0#lo,$D0#lo,$T0#lo
590         vshl.u32        $T0#lo,$T0#lo,#2
591          vshrn.u64      $T1#lo,$D2,#26
592          vmovn.i64      $D2#lo,$D2
593         vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
594          vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
595          vbic.i32       $D2#lo,#0xfc000000
596
597         vshr.u32        $T0#lo,$D0#lo,#26
598         vbic.i32        $D0#lo,#0xfc000000
599          vshr.u32       $T1#lo,$D3#lo,#26
600          vbic.i32       $D3#lo,#0xfc000000
601         vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
602          vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
603
604         subs            $zeros,$zeros,#1
605         beq             .Lsquare_break_neon
606
607         add             $tbl0,$ctx,#(48+0*9*4)
608         add             $tbl1,$ctx,#(48+1*9*4)
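        @ the power table at $ctx+48 holds four 9-word groups, one per power
        @ r^1..r^4, each laid out as r0,r1,5*r1,r2,5*r2,r3,5*r3,r4,5*r4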
609
610         vtrn.32         $R0,$D0#lo              @ r^2:r^1
611         vtrn.32         $R2,$D2#lo
612         vtrn.32         $R3,$D3#lo
613         vtrn.32         $R1,$D1#lo
614         vtrn.32         $R4,$D4#lo
615
616         vshl.u32        $S2,$R2,#2              @ *5
617         vshl.u32        $S3,$R3,#2
618         vshl.u32        $S1,$R1,#2
619         vshl.u32        $S4,$R4,#2
620         vadd.i32        $S2,$S2,$R2
621         vadd.i32        $S1,$S1,$R1
622         vadd.i32        $S3,$S3,$R3
623         vadd.i32        $S4,$S4,$R4
624
625         vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
626         vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
627         vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
628         vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
629         vst1.32         {${S4}[0]},[$tbl0,:32]
630         vst1.32         {${S4}[1]},[$tbl1,:32]
631
632         b               .Lsquare_neon
633
634 .align  4
635 .Lsquare_break_neon:
636         add             $tbl0,$ctx,#(48+2*4*9)
637         add             $tbl1,$ctx,#(48+3*4*9)
638
639         vmov            $R0,$D0#lo              @ r^4:r^3
640         vshl.u32        $S1,$D1#lo,#2           @ *5
641         vmov            $R1,$D1#lo
642         vshl.u32        $S2,$D2#lo,#2
643         vmov            $R2,$D2#lo
644         vshl.u32        $S3,$D3#lo,#2
645         vmov            $R3,$D3#lo
646         vshl.u32        $S4,$D4#lo,#2
647         vmov            $R4,$D4#lo
648         vadd.i32        $S1,$S1,$D1#lo
649         vadd.i32        $S2,$S2,$D2#lo
650         vadd.i32        $S3,$S3,$D3#lo
651         vadd.i32        $S4,$S4,$D4#lo
652
653         vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
654         vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
655         vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
656         vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
657         vst1.32         {${S4}[0]},[$tbl0]
658         vst1.32         {${S4}[1]},[$tbl1]
659
660         ret                             @ bx    lr
661 .size   poly1305_init_neon,.-poly1305_init_neon
662
663 .type   poly1305_blocks_neon,%function
664 .align  5
665 poly1305_blocks_neon:
666         ldr     ip,[$ctx,#36]           @ is_base2_26
667         ands    $len,$len,#-16
668         beq     .Lno_data_neon
669
670         cmp     $len,#64
671         bhs     .Lenter_neon
672         tst     ip,ip                   @ is_base2_26?
673         beq     poly1305_blocks
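        @ for inputs shorter than 64 bytes with the hash still in base 2^32,
        @ fall through to the scalar code (presumably not worth the cost of
        @ converting to base 2^26 and back)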
674
675 .Lenter_neon:
676         stmdb   sp!,{r4-r7}
677         vstmdb  sp!,{d8-d15}            @ ABI specification says so
678
679         tst     ip,ip                   @ is_base2_26?
680         bne     .Lbase2_26_neon
681
682         stmdb   sp!,{r1-r3,lr}
683         bl      poly1305_init_neon
684
685         ldr     r4,[$ctx,#0]            @ load hash value base 2^32
686         ldr     r5,[$ctx,#4]
687         ldr     r6,[$ctx,#8]
688         ldr     r7,[$ctx,#12]
689         ldr     ip,[$ctx,#16]
690
691         and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
692         mov     r3,r4,lsr#26
693          veor   $D0#lo,$D0#lo,$D0#lo
694         mov     r4,r5,lsr#20
695         orr     r3,r3,r5,lsl#6
696          veor   $D1#lo,$D1#lo,$D1#lo
697         mov     r5,r6,lsr#14
698         orr     r4,r4,r6,lsl#12
699          veor   $D2#lo,$D2#lo,$D2#lo
700         mov     r6,r7,lsr#8
701         orr     r5,r5,r7,lsl#18
702          veor   $D3#lo,$D3#lo,$D3#lo
703         and     r3,r3,#0x03ffffff
704         orr     r6,r6,ip,lsl#24
705          veor   $D4#lo,$D4#lo,$D4#lo
706         and     r4,r4,#0x03ffffff
707         mov     r1,#1
708         and     r5,r5,#0x03ffffff
709         str     r1,[$ctx,#36]           @ is_base2_26
710
711         vmov.32 $D0#lo[0],r2
712         vmov.32 $D1#lo[0],r3
713         vmov.32 $D2#lo[0],r4
714         vmov.32 $D3#lo[0],r5
715         vmov.32 $D4#lo[0],r6
716         adr     $zeros,.Lzeros
717
718         ldmia   sp!,{r1-r3,lr}
719         b       .Lbase2_32_neon
720
721 .align  4
722 .Lbase2_26_neon:
723         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
724         @ load hash value
725
726         veor            $D0#lo,$D0#lo,$D0#lo
727         veor            $D1#lo,$D1#lo,$D1#lo
728         veor            $D2#lo,$D2#lo,$D2#lo
729         veor            $D3#lo,$D3#lo,$D3#lo
730         veor            $D4#lo,$D4#lo,$D4#lo
731         vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
732         adr             $zeros,.Lzeros
733         vld1.32         {$D4#lo[0]},[$ctx]
734         sub             $ctx,$ctx,#16           @ rewind
735
736 .Lbase2_32_neon:
737         add             $in2,$inp,#32
738         mov             $padbit,$padbit,lsl#24
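        @ padbit sits at bit 24 so that it lands on bit 128 of the value once
        @ the base 2^26 limbs are assembled (limb 4 covers bits 104-129)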
739         tst             $len,#31
740         beq             .Leven
741
742         vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
743         vmov.32         $H4#lo[0],$padbit
744         sub             $len,$len,#16
745         add             $in2,$inp,#32
746
747 # ifdef __ARMEB__
748         vrev32.8        $H0,$H0
749         vrev32.8        $H3,$H3
750         vrev32.8        $H1,$H1
751         vrev32.8        $H2,$H2
752 # endif
753         vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
754         vshl.u32        $H3#lo,$H3#lo,#18
755
756         vsri.u32        $H3#lo,$H2#lo,#14
757         vshl.u32        $H2#lo,$H2#lo,#12
758         vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi
759
760         vbic.i32        $H3#lo,#0xfc000000
761         vsri.u32        $H2#lo,$H1#lo,#20
762         vshl.u32        $H1#lo,$H1#lo,#6
763
764         vbic.i32        $H2#lo,#0xfc000000
765         vsri.u32        $H1#lo,$H0#lo,#26
766         vadd.i32        $H3#hi,$H3#lo,$D3#lo
767
768         vbic.i32        $H0#lo,#0xfc000000
769         vbic.i32        $H1#lo,#0xfc000000
770         vadd.i32        $H2#hi,$H2#lo,$D2#lo
771
772         vadd.i32        $H0#hi,$H0#lo,$D0#lo
773         vadd.i32        $H1#hi,$H1#lo,$D1#lo
774
775         mov             $tbl1,$zeros
776         add             $tbl0,$ctx,#48
777
778         cmp             $len,$len
779         b               .Long_tail
780
781 .align  4
782 .Leven:
783         subs            $len,$len,#64
784         it              lo
785         movlo           $in2,$zeros
786
787         vmov.i32        $H4,#1<<24              @ padbit, yes, always
788         vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
789         add             $inp,$inp,#64
790         vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
791         add             $in2,$in2,#64
792         itt             hi
793         addhi           $tbl1,$ctx,#(48+1*9*4)
794         addhi           $tbl0,$ctx,#(48+3*9*4)
795
796 # ifdef __ARMEB__
797         vrev32.8        $H0,$H0
798         vrev32.8        $H3,$H3
799         vrev32.8        $H1,$H1
800         vrev32.8        $H2,$H2
801 # endif
802         vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
803         vshl.u32        $H3,$H3,#18
804
805         vsri.u32        $H3,$H2,#14
806         vshl.u32        $H2,$H2,#12
807
808         vbic.i32        $H3,#0xfc000000
809         vsri.u32        $H2,$H1,#20
810         vshl.u32        $H1,$H1,#6
811
812         vbic.i32        $H2,#0xfc000000
813         vsri.u32        $H1,$H0,#26
814
815         vbic.i32        $H0,#0xfc000000
816         vbic.i32        $H1,#0xfc000000
817
818         bls             .Lskip_loop
819
820         vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
821         vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
822         vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
823         vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
824         b               .Loop_neon
825
826 .align  5
827 .Loop_neon:
828         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
829         @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
830         @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
831         @   \___________________/
832         @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
833         @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
834         @   \___________________/ \____________________/
835         @
836         @ Note that we start with inp[2:3]*r^2. This is because it
837         @ doesn't depend on reduction in previous iteration.
838         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
839         @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
840         @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
841         @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
842         @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
843         @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
844
845         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
846         @ inp[2:3]*r^2
847
848         vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
849         vmull.u32       $D2,$H2#hi,${R0}[1]
850         vadd.i32        $H0#lo,$H0#lo,$D0#lo
851         vmull.u32       $D0,$H0#hi,${R0}[1]
852         vadd.i32        $H3#lo,$H3#lo,$D3#lo
853         vmull.u32       $D3,$H3#hi,${R0}[1]
854         vmlal.u32       $D2,$H1#hi,${R1}[1]
855         vadd.i32        $H1#lo,$H1#lo,$D1#lo
856         vmull.u32       $D1,$H1#hi,${R0}[1]
857
858         vadd.i32        $H4#lo,$H4#lo,$D4#lo
859         vmull.u32       $D4,$H4#hi,${R0}[1]
860         subs            $len,$len,#64
861         vmlal.u32       $D0,$H4#hi,${S1}[1]
862         it              lo
863         movlo           $in2,$zeros
864         vmlal.u32       $D3,$H2#hi,${R1}[1]
865         vld1.32         ${S4}[1],[$tbl1,:32]
866         vmlal.u32       $D1,$H0#hi,${R1}[1]
867         vmlal.u32       $D4,$H3#hi,${R1}[1]
868
869         vmlal.u32       $D0,$H3#hi,${S2}[1]
870         vmlal.u32       $D3,$H1#hi,${R2}[1]
871         vmlal.u32       $D4,$H2#hi,${R2}[1]
872         vmlal.u32       $D1,$H4#hi,${S2}[1]
873         vmlal.u32       $D2,$H0#hi,${R2}[1]
874
875         vmlal.u32       $D3,$H0#hi,${R3}[1]
876         vmlal.u32       $D0,$H2#hi,${S3}[1]
877         vmlal.u32       $D4,$H1#hi,${R3}[1]
878         vmlal.u32       $D1,$H3#hi,${S3}[1]
879         vmlal.u32       $D2,$H4#hi,${S3}[1]
880
881         vmlal.u32       $D3,$H4#hi,${S4}[1]
882         vmlal.u32       $D0,$H1#hi,${S4}[1]
883         vmlal.u32       $D4,$H0#hi,${R4}[1]
884         vmlal.u32       $D1,$H2#hi,${S4}[1]
885         vmlal.u32       $D2,$H3#hi,${S4}[1]
886
887         vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
888         add             $in2,$in2,#64
889
890         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
891         @ (hash+inp[0:1])*r^4 and accumulate
892
893         vmlal.u32       $D3,$H3#lo,${R0}[0]
894         vmlal.u32       $D0,$H0#lo,${R0}[0]
895         vmlal.u32       $D4,$H4#lo,${R0}[0]
896         vmlal.u32       $D1,$H1#lo,${R0}[0]
897         vmlal.u32       $D2,$H2#lo,${R0}[0]
898         vld1.32         ${S4}[0],[$tbl0,:32]
899
900         vmlal.u32       $D3,$H2#lo,${R1}[0]
901         vmlal.u32       $D0,$H4#lo,${S1}[0]
902         vmlal.u32       $D4,$H3#lo,${R1}[0]
903         vmlal.u32       $D1,$H0#lo,${R1}[0]
904         vmlal.u32       $D2,$H1#lo,${R1}[0]
905
906         vmlal.u32       $D3,$H1#lo,${R2}[0]
907         vmlal.u32       $D0,$H3#lo,${S2}[0]
908         vmlal.u32       $D4,$H2#lo,${R2}[0]
909         vmlal.u32       $D1,$H4#lo,${S2}[0]
910         vmlal.u32       $D2,$H0#lo,${R2}[0]
911
912         vmlal.u32       $D3,$H0#lo,${R3}[0]
913         vmlal.u32       $D0,$H2#lo,${S3}[0]
914         vmlal.u32       $D4,$H1#lo,${R3}[0]
915         vmlal.u32       $D1,$H3#lo,${S3}[0]
916         vmlal.u32       $D3,$H4#lo,${S4}[0]
917
918         vmlal.u32       $D2,$H4#lo,${S3}[0]
919         vmlal.u32       $D0,$H1#lo,${S4}[0]
920         vmlal.u32       $D4,$H0#lo,${R4}[0]
921         vmov.i32        $H4,#1<<24              @ padbit, yes, always
922         vmlal.u32       $D1,$H2#lo,${S4}[0]
923         vmlal.u32       $D2,$H3#lo,${S4}[0]
924
925         vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
926         add             $inp,$inp,#64
927 # ifdef __ARMEB__
928         vrev32.8        $H0,$H0
929         vrev32.8        $H1,$H1
930         vrev32.8        $H2,$H2
931         vrev32.8        $H3,$H3
932 # endif
933
934         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
935         @ lazy reduction interleaved with base 2^32 -> base 2^26 of
936         @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
937
938         vshr.u64        $T0,$D3,#26
939         vmovn.i64       $D3#lo,$D3
940          vshr.u64       $T1,$D0,#26
941          vmovn.i64      $D0#lo,$D0
942         vadd.i64        $D4,$D4,$T0             @ h3 -> h4
943         vbic.i32        $D3#lo,#0xfc000000
944           vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
945          vadd.i64       $D1,$D1,$T1             @ h0 -> h1
946           vshl.u32      $H3,$H3,#18
947          vbic.i32       $D0#lo,#0xfc000000
948
949         vshrn.u64       $T0#lo,$D4,#26
950         vmovn.i64       $D4#lo,$D4
951          vshr.u64       $T1,$D1,#26
952          vmovn.i64      $D1#lo,$D1
953          vadd.i64       $D2,$D2,$T1             @ h1 -> h2
954           vsri.u32      $H3,$H2,#14
955         vbic.i32        $D4#lo,#0xfc000000
956           vshl.u32      $H2,$H2,#12
957          vbic.i32       $D1#lo,#0xfc000000
958
959         vadd.i32        $D0#lo,$D0#lo,$T0#lo
960         vshl.u32        $T0#lo,$T0#lo,#2
961           vbic.i32      $H3,#0xfc000000
962          vshrn.u64      $T1#lo,$D2,#26
963          vmovn.i64      $D2#lo,$D2
964         vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
965           vsri.u32      $H2,$H1,#20
966          vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
967           vshl.u32      $H1,$H1,#6
968          vbic.i32       $D2#lo,#0xfc000000
969           vbic.i32      $H2,#0xfc000000
970
971         vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
972         vmovn.i64       $D0#lo,$D0
973           vsri.u32      $H1,$H0,#26
974           vbic.i32      $H0,#0xfc000000
975          vshr.u32       $T1#lo,$D3#lo,#26
976          vbic.i32       $D3#lo,#0xfc000000
977         vbic.i32        $D0#lo,#0xfc000000
978         vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
979          vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
980           vbic.i32      $H1,#0xfc000000
981
982         bhi             .Loop_neon
983
984 .Lskip_loop:
985         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
986         @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
987
988         add             $tbl1,$ctx,#(48+0*9*4)
989         add             $tbl0,$ctx,#(48+1*9*4)
990         adds            $len,$len,#32
991         it              ne
992         movne           $len,#0
993         bne             .Long_tail
994
995         vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
996         vadd.i32        $H0#hi,$H0#lo,$D0#lo
997         vadd.i32        $H3#hi,$H3#lo,$D3#lo
998         vadd.i32        $H1#hi,$H1#lo,$D1#lo
999         vadd.i32        $H4#hi,$H4#lo,$D4#lo
1000
1001 .Long_tail:
1002         vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
1003         vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2
1004
1005         vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
1006         vmull.u32       $D2,$H2#hi,$R0
1007         vadd.i32        $H0#lo,$H0#lo,$D0#lo
1008         vmull.u32       $D0,$H0#hi,$R0
1009         vadd.i32        $H3#lo,$H3#lo,$D3#lo
1010         vmull.u32       $D3,$H3#hi,$R0
1011         vadd.i32        $H1#lo,$H1#lo,$D1#lo
1012         vmull.u32       $D1,$H1#hi,$R0
1013         vadd.i32        $H4#lo,$H4#lo,$D4#lo
1014         vmull.u32       $D4,$H4#hi,$R0
1015
1016         vmlal.u32       $D0,$H4#hi,$S1
1017         vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1018         vmlal.u32       $D3,$H2#hi,$R1
1019         vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1020         vmlal.u32       $D1,$H0#hi,$R1
1021         vmlal.u32       $D4,$H3#hi,$R1
1022         vmlal.u32       $D2,$H1#hi,$R1
1023
1024         vmlal.u32       $D3,$H1#hi,$R2
1025         vld1.32         ${S4}[1],[$tbl1,:32]
1026         vmlal.u32       $D0,$H3#hi,$S2
1027         vld1.32         ${S4}[0],[$tbl0,:32]
1028         vmlal.u32       $D4,$H2#hi,$R2
1029         vmlal.u32       $D1,$H4#hi,$S2
1030         vmlal.u32       $D2,$H0#hi,$R2
1031
1032         vmlal.u32       $D3,$H0#hi,$R3
1033          it             ne
1034          addne          $tbl1,$ctx,#(48+2*9*4)
1035         vmlal.u32       $D0,$H2#hi,$S3
1036          it             ne
1037          addne          $tbl0,$ctx,#(48+3*9*4)
1038         vmlal.u32       $D4,$H1#hi,$R3
1039         vmlal.u32       $D1,$H3#hi,$S3
1040         vmlal.u32       $D2,$H4#hi,$S3
1041
1042         vmlal.u32       $D3,$H4#hi,$S4
1043          vorn           $MASK,$MASK,$MASK       @ all-ones, can be redundant
1044         vmlal.u32       $D0,$H1#hi,$S4
1045          vshr.u64       $MASK,$MASK,#38
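         @ $MASK now holds 0x03ffffff in each 64-bit lane (64-38 = 26 set bits),
         @ the base 2^26 limb mask used by the reduction below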
1046         vmlal.u32       $D4,$H0#hi,$R4
1047         vmlal.u32       $D1,$H2#hi,$S4
1048         vmlal.u32       $D2,$H3#hi,$S4
1049
1050         beq             .Lshort_tail
1051
1052         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1053         @ (hash+inp[0:1])*r^4:r^3 and accumulate
1054
1055         vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
1056         vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
1057
1058         vmlal.u32       $D2,$H2#lo,$R0
1059         vmlal.u32       $D0,$H0#lo,$R0
1060         vmlal.u32       $D3,$H3#lo,$R0
1061         vmlal.u32       $D1,$H1#lo,$R0
1062         vmlal.u32       $D4,$H4#lo,$R0
1063
1064         vmlal.u32       $D0,$H4#lo,$S1
1065         vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1066         vmlal.u32       $D3,$H2#lo,$R1
1067         vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1068         vmlal.u32       $D1,$H0#lo,$R1
1069         vmlal.u32       $D4,$H3#lo,$R1
1070         vmlal.u32       $D2,$H1#lo,$R1
1071
1072         vmlal.u32       $D3,$H1#lo,$R2
1073         vld1.32         ${S4}[1],[$tbl1,:32]
1074         vmlal.u32       $D0,$H3#lo,$S2
1075         vld1.32         ${S4}[0],[$tbl0,:32]
1076         vmlal.u32       $D4,$H2#lo,$R2
1077         vmlal.u32       $D1,$H4#lo,$S2
1078         vmlal.u32       $D2,$H0#lo,$R2
1079
1080         vmlal.u32       $D3,$H0#lo,$R3
1081         vmlal.u32       $D0,$H2#lo,$S3
1082         vmlal.u32       $D4,$H1#lo,$R3
1083         vmlal.u32       $D1,$H3#lo,$S3
1084         vmlal.u32       $D2,$H4#lo,$S3
1085
1086         vmlal.u32       $D3,$H4#lo,$S4
1087          vorn           $MASK,$MASK,$MASK       @ all-ones
1088         vmlal.u32       $D0,$H1#lo,$S4
1089          vshr.u64       $MASK,$MASK,#38
1090         vmlal.u32       $D4,$H0#lo,$R4
1091         vmlal.u32       $D1,$H2#lo,$S4
1092         vmlal.u32       $D2,$H3#lo,$S4
1093
1094 .Lshort_tail:
1095         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1096         @ horizontal addition
1097
1098         vadd.i64        $D3#lo,$D3#lo,$D3#hi
1099         vadd.i64        $D0#lo,$D0#lo,$D0#hi
1100         vadd.i64        $D4#lo,$D4#lo,$D4#hi
1101         vadd.i64        $D1#lo,$D1#lo,$D1#hi
1102         vadd.i64        $D2#lo,$D2#lo,$D2#hi
1103
1104         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1105         @ lazy reduction, but without narrowing
1106
1107         vshr.u64        $T0,$D3,#26
1108         vand.i64        $D3,$D3,$MASK
1109          vshr.u64       $T1,$D0,#26
1110          vand.i64       $D0,$D0,$MASK
1111         vadd.i64        $D4,$D4,$T0             @ h3 -> h4
1112          vadd.i64       $D1,$D1,$T1             @ h0 -> h1
1113
1114         vshr.u64        $T0,$D4,#26
1115         vand.i64        $D4,$D4,$MASK
1116          vshr.u64       $T1,$D1,#26
1117          vand.i64       $D1,$D1,$MASK
1118          vadd.i64       $D2,$D2,$T1             @ h1 -> h2
1119
1120         vadd.i64        $D0,$D0,$T0
1121         vshl.u64        $T0,$T0,#2
1122          vshr.u64       $T1,$D2,#26
1123          vand.i64       $D2,$D2,$MASK
1124         vadd.i64        $D0,$D0,$T0             @ h4 -> h0
1125          vadd.i64       $D3,$D3,$T1             @ h2 -> h3
1126
1127         vshr.u64        $T0,$D0,#26
1128         vand.i64        $D0,$D0,$MASK
1129          vshr.u64       $T1,$D3,#26
1130          vand.i64       $D3,$D3,$MASK
1131         vadd.i64        $D1,$D1,$T0             @ h0 -> h1
1132          vadd.i64       $D4,$D4,$T1             @ h3 -> h4
1133
1134         cmp             $len,#0
1135         bne             .Leven
1136
1137         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1138         @ store hash value
1139
1140         vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
1141         vst1.32         {$D4#lo[0]},[$ctx]
1142
1143         vldmia  sp!,{d8-d15}                    @ epilogue
1144         ldmia   sp!,{r4-r7}
1145 .Lno_data_neon:
1146         ret                                     @ bx    lr
1147 .size   poly1305_blocks_neon,.-poly1305_blocks_neon
1148
1149 .type   poly1305_emit_neon,%function
1150 .align  5
1151 poly1305_emit_neon:
1152         ldr     ip,[$ctx,#36]           @ is_base2_26
1153
1154         stmdb   sp!,{r4-r11}
1155
1156         tst     ip,ip
1157         beq     .Lpoly1305_emit_enter
1158
1159         ldmia   $ctx,{$h0-$h4}
1160         eor     $g0,$g0,$g0
1161
1162         adds    $h0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
1163         mov     $h1,$h1,lsr#6
1164         adcs    $h1,$h1,$h2,lsl#20
1165         mov     $h2,$h2,lsr#12
1166         adcs    $h2,$h2,$h3,lsl#14
1167         mov     $h3,$h3,lsr#18
1168         adcs    $h3,$h3,$h4,lsl#8
1169         adc     $h4,$g0,$h4,lsr#24      @ can be partially reduced ...
1170
1171         and     $g0,$h4,#-4             @ ... so reduce
1172         and     $h4,$h4,#3
1173         add     $g0,$g0,$g0,lsr#2       @ *= 5
1174         adds    $h0,$h0,$g0
1175         adcs    $h1,$h1,#0
1176         adcs    $h2,$h2,#0
1177         adcs    $h3,$h3,#0
1178         adc     $h4,$h4,#0
1179
1180         adds    $g0,$h0,#5              @ compare to modulus
1181         adcs    $g1,$h1,#0
1182         adcs    $g2,$h2,#0
1183         adcs    $g3,$h3,#0
1184         adc     $g4,$h4,#0
1185         tst     $g4,#4                  @ did it carry/borrow?
1186
1187         it      ne
1188         movne   $h0,$g0
1189         ldr     $g0,[$nonce,#0]
1190         it      ne
1191         movne   $h1,$g1
1192         ldr     $g1,[$nonce,#4]
1193         it      ne
1194         movne   $h2,$g2
1195         ldr     $g2,[$nonce,#8]
1196         it      ne
1197         movne   $h3,$g3
1198         ldr     $g3,[$nonce,#12]
1199
1200         adds    $h0,$h0,$g0             @ accumulate nonce
1201         adcs    $h1,$h1,$g1
1202         adcs    $h2,$h2,$g2
1203         adc     $h3,$h3,$g3
1204
1205 # ifdef __ARMEB__
1206         rev     $h0,$h0
1207         rev     $h1,$h1
1208         rev     $h2,$h2
1209         rev     $h3,$h3
1210 # endif
1211         str     $h0,[$mac,#0]           @ store the result
1212         str     $h1,[$mac,#4]
1213         str     $h2,[$mac,#8]
1214         str     $h3,[$mac,#12]
1215
1216         ldmia   sp!,{r4-r11}
1217         ret                             @ bx    lr
1218 .size   poly1305_emit_neon,.-poly1305_emit_neon
1219
1220 .align  5
1221 .Lzeros:
1222 .long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1223 .LOPENSSL_armcap:
1224 .word   OPENSSL_armcap_P-.Lpoly1305_init
1225 #endif
1226 ___
1227 }       }
1228 $code.=<<___;
1229 .asciz  "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1230 .align  2
1231 #if     __ARM_MAX_ARCH__>=7
1232 .comm   OPENSSL_armcap_P,4,4
1233 #endif
1234 ___
1235
1236 foreach (split("\n",$code)) {
1237         s/\`([^\`]*)\`/eval $1/geo;
1238
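        # q<N>#lo and q<N>#hi denote the D-register halves d(2*N) and d(2*N+1);
        # 'ret' is rewritten as 'bx lr', while a literal 'bx lr' in the source is
        # emitted as the raw opcode word so the file still assembles as ARMv4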
1239         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
1240         s/\bret\b/bx    lr/go                                           or
1241         s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
1242
1243         print $_,"\n";
1244 }
1245 close STDOUT; # enforce flush