#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#                       IALU(*)/gcc-4.4         NEON
#
# ARM11xx(ARMv6)        7.78/+100%              -
# Cortex-A5             6.35/+130%              3.00
# Cortex-A8             6.25/+115%              2.36
# Cortex-A9             5.10/+95%               2.55
# Cortex-A15            3.85/+85%               1.25(**)
# Snapdragon S4         5.70/+100%              1.48(**)
#
# (*)   this is for -march=armv6, i.e. with a bunch of ldrb loads;
# (**)  these are trade-off results; they can be improved by ~8%, but at
#       the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
#       to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
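
# The script is driven like the other perlasm modules: a flavour plus an
# output file name, e.g. "perl poly1305-armv4.pl linux32 poly1305-armv4.S"
# (an illustrative invocation, not one mandated by this file).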
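# The four arguments arrive in r0-r3 as per the AAPCS, i.e. the C-level
# signature is poly1305_blocks(ctx, inp, len, padbit).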
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.globl  poly1305_emit
.globl  poly1305_blocks
.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
.Lpoly1305_init:
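        @ In C terms (cf. crypto/poly1305/poly1305.c):
        @ poly1305_init(void *ctx, const unsigned char key[16], void *func[2]);
        @ the hash value is zeroed, the key is clamped and stored, and on
        @ __ARM_MAX_ARCH__>=7 builds func[] is filled with pointers to the
        @ scalar or NEON blocks/emit routines, selected at run time.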
        stmdb   sp!,{r4-r11}

        eor     r3,r3,r3
        cmp     $inp,#0
        str     r3,[$ctx,#0]            @ zero hash value
        str     r3,[$ctx,#4]
        str     r3,[$ctx,#8]
        str     r3,[$ctx,#12]
        str     r3,[$ctx,#16]
        str     r3,[$ctx,#36]           @ is_base2_26
        add     $ctx,$ctx,#20

#ifdef  __thumb2__
        it      eq
#endif
        moveq   r0,#0
        beq     .Lno_key

#if     __ARM_MAX_ARCH__>=7
        adr     r11,.Lpoly1305_init
        ldr     r12,.LOPENSSL_armcap
#endif
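        @ Clamp the key as Poly1305 prescribes: the top four bits of
        @ r[3],r[7],r[11],r[15] and the bottom two bits of r[4],r[8],r[12]
        @ must be zero, hence the 0x0fffffff and 0x0ffffffc masks.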
        ldrb    r4,[$inp,#0]
        mov     r10,#0x0fffffff
        ldrb    r5,[$inp,#1]
        and     r3,r10,#-4              @ 0x0ffffffc
        ldrb    r6,[$inp,#2]
        ldrb    r7,[$inp,#3]
        orr     r4,r4,r5,lsl#8
        ldrb    r5,[$inp,#4]
        orr     r4,r4,r6,lsl#16
        ldrb    r6,[$inp,#5]
        orr     r4,r4,r7,lsl#24
        ldrb    r7,[$inp,#6]
        and     r4,r4,r10

#if     __ARM_MAX_ARCH__>=7
        ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
# ifdef __APPLE__
        ldr     r12,[r12]
# endif
#endif
        ldrb    r8,[$inp,#7]
        orr     r5,r5,r6,lsl#8
        ldrb    r6,[$inp,#8]
        orr     r5,r5,r7,lsl#16
        ldrb    r7,[$inp,#9]
        orr     r5,r5,r8,lsl#24
        ldrb    r8,[$inp,#10]
        and     r5,r5,r3

#if     __ARM_MAX_ARCH__>=7
        tst     r12,#ARMV7_NEON         @ check for NEON
# ifdef __APPLE__
        adr     r9,poly1305_blocks_neon
        adr     r11,poly1305_blocks
#  ifdef __thumb2__
        it      ne
#  endif
        movne   r11,r9
        adr     r12,poly1305_emit
        adr     r10,poly1305_emit_neon
#  ifdef __thumb2__
        it      ne
#  endif
        movne   r12,r10
# else
#  ifdef __thumb2__
        itete   eq
#  endif
        addeq   r12,r11,#(poly1305_emit-.Lpoly1305_init)
        addne   r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
        addeq   r11,r11,#(poly1305_blocks-.Lpoly1305_init)
        addne   r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
        orr     r12,r12,#1      @ thumb-ify address
        orr     r11,r11,#1
# endif
#endif
        ldrb    r9,[$inp,#11]
        orr     r6,r6,r7,lsl#8
        ldrb    r7,[$inp,#12]
        orr     r6,r6,r8,lsl#16
        ldrb    r8,[$inp,#13]
        orr     r6,r6,r9,lsl#24
        ldrb    r9,[$inp,#14]
        and     r6,r6,r3

        ldrb    r10,[$inp,#15]
        orr     r7,r7,r8,lsl#8
        str     r4,[$ctx,#0]
        orr     r7,r7,r9,lsl#16
        str     r5,[$ctx,#4]
        orr     r7,r7,r10,lsl#24
        str     r6,[$ctx,#8]
        and     r7,r7,r3
        str     r7,[$ctx,#12]
#if     __ARM_MAX_ARCH__>=7
        stmia   r2,{r11,r12}            @ fill functions table
        mov     r0,#1
#else
        mov     r0,#0
#endif
.Lno_key:
        ldmia   sp!,{r4-r11}
#if     __ARM_ARCH__>=5
        ret                             @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
.Lpoly1305_blocks:
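        @ Context layout, as established by poly1305_init above: h0-h4 at
        @ offsets 0-16, the clamped key r0-r3 at 20-32, the is_base2_26
        @ flag at 36, and (in NEON builds) the table of key powers from
        @ offset 48 onwards.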
        stmdb   sp!,{r3-r11,lr}

        ands    $len,$len,#-16
        beq     .Lno_data

        cmp     $padbit,#0
        add     $len,$len,$inp          @ end pointer
        sub     sp,sp,#32

        ldmia   $ctx,{$h0-$r3}          @ load context

        str     $ctx,[sp,#12]           @ offload stuff
        mov     lr,$inp
        str     $len,[sp,#16]
        str     $r1,[sp,#20]
        str     $r2,[sp,#24]
        str     $r3,[sp,#28]
        b       .Loop

.Loop:
#if __ARM_ARCH__<7
        ldrb    r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   $h4,$h4,#1              @ 1<<128
        ldrb    r1,[lr,#-15]
        ldrb    r2,[lr,#-14]
        ldrb    r3,[lr,#-13]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-12]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-11]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-10]
        adds    $h0,$h0,r3              @ accumulate input

        ldrb    r3,[lr,#-9]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-8]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-7]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-6]
        adcs    $h1,$h1,r3

        ldrb    r3,[lr,#-5]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-4]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-3]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-2]
        adcs    $h2,$h2,r3

        ldrb    r3,[lr,#-1]
        orr     r1,r0,r1,lsl#8
        str     lr,[sp,#8]              @ offload input pointer
        orr     r2,r1,r2,lsl#16
        add     $s1,$r1,$r1,lsr#2
        orr     r3,r2,r3,lsl#24
#else
        ldr     r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   $h4,$h4,#1              @ padbit
        ldr     r1,[lr,#-12]
        ldr     r2,[lr,#-8]
        ldr     r3,[lr,#-4]
# ifdef __ARMEB__
        rev     r0,r0
        rev     r1,r1
        rev     r2,r2
        rev     r3,r3
# endif
        adds    $h0,$h0,r0              @ accumulate input
        str     lr,[sp,#8]              @ offload input pointer
        adcs    $h1,$h1,r1
        add     $s1,$r1,$r1,lsr#2
        adcs    $h2,$h2,r2
#endif
        add     $s2,$r2,$r2,lsr#2
        adcs    $h3,$h3,r3
        add     $s3,$r3,$r3,lsr#2

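        @ Multiply h by r modulo 2^130-5, schoolbook style on 32-bit limbs.
        @ Limb products that overflow bit 128 are folded back in via the
        @ precomputed si = ri + (ri>>2) = 5/4*ri: 2^130 is congruent to 5,
        @ and the division by 4 is exact because clamping cleared the low
        @ two bits of r1-r3.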
        umull   r2,r3,$h1,$r0
         adc    $h4,$h4,#0
        umull   r0,r1,$h0,$r0
        umlal   r2,r3,$h4,$s1
        umlal   r0,r1,$h3,$s1
        ldr     $r1,[sp,#20]            @ reload $r1
        umlal   r2,r3,$h2,$s3
        umlal   r0,r1,$h1,$s3
        umlal   r2,r3,$h3,$s2
        umlal   r0,r1,$h2,$s2
        umlal   r2,r3,$h0,$r1
        str     r0,[sp,#0]              @ future $h0
         mul    r0,$s2,$h4
        ldr     $r2,[sp,#24]            @ reload $r2
        adds    r2,r2,r1                @ d1+=d0>>32
         eor    r1,r1,r1
        adc     lr,r3,#0                @ future $h2
        str     r2,[sp,#4]              @ future $h1

        mul     r2,$s3,$h4
        eor     r3,r3,r3
        umlal   r0,r1,$h3,$s3
        ldr     $r3,[sp,#28]            @ reload $r3
        umlal   r2,r3,$h3,$r0
        umlal   r0,r1,$h2,$r0
        umlal   r2,r3,$h2,$r1
        umlal   r0,r1,$h1,$r1
        umlal   r2,r3,$h1,$r2
        umlal   r0,r1,$h0,$r2
        umlal   r2,r3,$h0,$r3
        ldr     $h0,[sp,#0]
        mul     $h4,$r0,$h4
        ldr     $h1,[sp,#4]

        adds    $h2,lr,r0               @ d2+=d1>>32
        ldr     lr,[sp,#8]              @ reload input pointer
        adc     r1,r1,#0
        adds    $h3,r2,r1               @ d3+=d2>>32
        ldr     r0,[sp,#16]             @ reload end pointer
        adc     r3,r3,#0
        add     $h4,$h4,r3              @ h4+=d3>>32

        and     r1,$h4,#-4
        and     $h4,$h4,#3
        add     r1,r1,r1,lsr#2          @ *=5
        adds    $h0,$h0,r1
        adcs    $h1,$h1,#0
        adcs    $h2,$h2,#0
        adcs    $h3,$h3,#0
        adc     $h4,$h4,#0

        cmp     r0,lr                   @ done yet?
        bhi     .Loop

        ldr     $ctx,[sp,#12]
        add     sp,sp,#32
        stmia   $ctx,{$h0-$h4}          @ store the result

.Lno_data:
#if     __ARM_ARCH__>=5
        ldmia   sp!,{r3-r11,pc}
#else
        ldmia   sp!,{r3-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;

$code.=<<___;
.type   poly1305_emit,%function
.align  5
poly1305_emit:
        stmdb   sp!,{r4-r11}
.Lpoly1305_emit_enter:

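        @ Compare h to the modulus p = 2^130-5: adding 5 carries into bit
        @ 130 (bit 2 of g4) precisely when h >= p, in which case the low
        @ 128 bits of g are those of h-p and are selected instead of h.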
        ldmia   $ctx,{$h0-$h4}
        adds    $g0,$h0,#5              @ compare to modulus
        adcs    $g1,$h1,#0
        adcs    $g2,$h2,#0
        adcs    $g3,$h3,#0
        adc     $g4,$h4,#0
        tst     $g4,#4                  @ did it carry/borrow?

#ifdef  __thumb2__
        it      ne
#endif
        movne   $h0,$g0
        ldr     $g0,[$nonce,#0]
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h1,$g1
        ldr     $g1,[$nonce,#4]
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h2,$g2
        ldr     $g2,[$nonce,#8]
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h3,$g3
        ldr     $g3,[$nonce,#12]

        adds    $h0,$h0,$g0
        adcs    $h1,$h1,$g1
        adcs    $h2,$h2,$g2
        adc     $h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
        rev     $h0,$h0
        rev     $h1,$h1
        rev     $h2,$h2
        rev     $h3,$h3
# endif
        str     $h0,[$mac,#0]
        str     $h1,[$mac,#4]
        str     $h2,[$mac,#8]
        str     $h3,[$mac,#12]
#else
        strb    $h0,[$mac,#0]
        mov     $h0,$h0,lsr#8
        strb    $h1,[$mac,#4]
        mov     $h1,$h1,lsr#8
        strb    $h2,[$mac,#8]
        mov     $h2,$h2,lsr#8
        strb    $h3,[$mac,#12]
        mov     $h3,$h3,lsr#8

        strb    $h0,[$mac,#1]
        mov     $h0,$h0,lsr#8
        strb    $h1,[$mac,#5]
        mov     $h1,$h1,lsr#8
        strb    $h2,[$mac,#9]
        mov     $h2,$h2,lsr#8
        strb    $h3,[$mac,#13]
        mov     $h3,$h3,lsr#8

        strb    $h0,[$mac,#2]
        mov     $h0,$h0,lsr#8
        strb    $h1,[$mac,#6]
        mov     $h1,$h1,lsr#8
        strb    $h2,[$mac,#10]
        mov     $h2,$h2,lsr#8
        strb    $h3,[$mac,#14]
        mov     $h3,$h3,lsr#8

        strb    $h0,[$mac,#3]
        strb    $h1,[$mac,#7]
        strb    $h2,[$mac,#11]
        strb    $h3,[$mac,#15]
#endif
        ldmia   sp!,{r4-r11}
#if     __ARM_ARCH__>=5
        ret                             @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if     __ARM_MAX_ARCH__>=7
.fpu    neon

.type   poly1305_init_neon,%function
.align  5
poly1305_init_neon:
        ldr     r4,[$ctx,#20]           @ load key base 2^32
        ldr     r5,[$ctx,#24]
        ldr     r6,[$ctx,#28]
        ldr     r7,[$ctx,#32]

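        @ Split the 128-bit key into five 26-bit limbs, so that
        @ r = r0 + r1*2^26 + r2*2^52 + r3*2^78 + r4*2^104.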
        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
        and     r3,r3,#0x03ffffff
        and     r4,r4,#0x03ffffff
        and     r5,r5,#0x03ffffff

        vdup.32 $R0,r2                  @ r^1 in both lanes
        add     r2,r3,r3,lsl#2          @ *5
        vdup.32 $R1,r3
        add     r3,r4,r4,lsl#2
        vdup.32 $S1,r2
        vdup.32 $R2,r4
        add     r4,r5,r5,lsl#2
        vdup.32 $S2,r3
        vdup.32 $R3,r5
        add     r5,r6,r6,lsl#2
        vdup.32 $S3,r4
        vdup.32 $R4,r6
        vdup.32 $S4,r5

        mov     $zeros,#2               @ counter

.Lsquare_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

        vmull.u32       $D0,$R0,${R0}[1]
        vmull.u32       $D1,$R1,${R0}[1]
        vmull.u32       $D2,$R2,${R0}[1]
        vmull.u32       $D3,$R3,${R0}[1]
        vmull.u32       $D4,$R4,${R0}[1]

        vmlal.u32       $D0,$R4,${S1}[1]
        vmlal.u32       $D1,$R0,${R1}[1]
        vmlal.u32       $D2,$R1,${R1}[1]
        vmlal.u32       $D3,$R2,${R1}[1]
        vmlal.u32       $D4,$R3,${R1}[1]

        vmlal.u32       $D0,$R3,${S2}[1]
        vmlal.u32       $D1,$R4,${S2}[1]
        vmlal.u32       $D3,$R1,${R2}[1]
        vmlal.u32       $D2,$R0,${R2}[1]
        vmlal.u32       $D4,$R2,${R2}[1]

        vmlal.u32       $D0,$R2,${S3}[1]
        vmlal.u32       $D3,$R0,${R3}[1]
        vmlal.u32       $D1,$R3,${S3}[1]
        vmlal.u32       $D2,$R4,${S3}[1]
        vmlal.u32       $D4,$R1,${R3}[1]

        vmlal.u32       $D3,$R4,${S4}[1]
        vmlal.u32       $D0,$R1,${S4}[1]
        vmlal.u32       $D1,$R2,${S4}[1]
        vmlal.u32       $D2,$R3,${S4}[1]
        vmlal.u32       $D4,$R0,${R4}[1]

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        @ and P. Schwabe
        @
        @ H0>>+H1>>+H2>>+H3>>+H4
        @ H3>>+H4>>*5+H0>>+H1
        @
        @ Trivia.
        @
        @ Result of multiplication of an n-bit number by an m-bit number
        @ is n+m bits wide. However! Even though 2^n is an n+1-bit
        @ number, an m-bit number multiplied by 2^n is still n+m bits
        @ wide.
        @
        @ Sum of two n-bit numbers is n+1 bits wide, sum of three is
        @ n+2, and so is sum of four. Sum of 2^m n-m-bit numbers and an
        @ n-bit one is n+1 bits wide.
        @
        @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
        @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
        @ can be 27. However! In cases when their width exceeds 26 bits
        @ they are limited by 2^26+2^6. This in turn means that the *sum*
        @ of the products with these values can still be viewed as a sum
        @ of 52-bit numbers as long as the number of addends is not a
        @ power of 2. For example,
        @
        @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
        @
        @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
        @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
        @ 8 * (2^52) or 2^55. However, the value is then multiplied
        @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
        @ which is less than 32 * (2^52) or 2^57. And when processing
        @ data we are looking at three times as many addends...
        @
        @ In the key setup procedure pre-reduced H0 is limited by 5*4+1,
        @ and 5*H4 by 5*5 52-bit addends, or 57 bits. But when hashing
        @ the input, H0 is limited by (5*4+1)*3 addends, or 58 bits,
        @ while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The
        @ vmlal.u32 instruction accepts 2x32-bit input and writes a
        @ 2x64-bit result. This means that the result of reduction has
        @ to be compressed upon loop wrap-around. This can be done in
        @ the process of reduction to minimize the number of
        @ instructions [as well as the number of 128-bit instructions,
        @ which benefits low-end processors], but one has to watch for
        @ H2 (which is narrower than H0) and 5*H4 not being wider than
        @ 58 bits, so that the result of the right shift by 26 bits
        @ fits in 32 bits. This is also useful on x86, because it
        @ allows using paddd in place of paddq, which benefits Atom,
        @ where paddq is ridiculously slow.

        vshr.u64        $T0,$D3,#26
        vmovn.i64       $D3#lo,$D3
         vshr.u64       $T1,$D0,#26
         vmovn.i64      $D0#lo,$D0
        vadd.i64        $D4,$D4,$T0             @ h3 -> h4
        vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
         vadd.i64       $D1,$D1,$T1             @ h0 -> h1
         vbic.i32       $D0#lo,#0xfc000000

        vshrn.u64       $T0#lo,$D4,#26
        vmovn.i64       $D4#lo,$D4
         vshr.u64       $T1,$D1,#26
         vmovn.i64      $D1#lo,$D1
         vadd.i64       $D2,$D2,$T1             @ h1 -> h2
        vbic.i32        $D4#lo,#0xfc000000
         vbic.i32       $D1#lo,#0xfc000000

        vadd.i32        $D0#lo,$D0#lo,$T0#lo
        vshl.u32        $T0#lo,$T0#lo,#2
         vshrn.u64      $T1#lo,$D2,#26
         vmovn.i64      $D2#lo,$D2
        vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
         vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
         vbic.i32       $D2#lo,#0xfc000000

        vshr.u32        $T0#lo,$D0#lo,#26
        vbic.i32        $D0#lo,#0xfc000000
         vshr.u32       $T1#lo,$D3#lo,#26
         vbic.i32       $D3#lo,#0xfc000000
        vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
         vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4

        subs            $zeros,$zeros,#1
        beq             .Lsquare_break_neon

        add             $tbl0,$ctx,#(48+0*9*4)
        add             $tbl1,$ctx,#(48+1*9*4)

        vtrn.32         $R0,$D0#lo              @ r^2:r^1
        vtrn.32         $R2,$D2#lo
        vtrn.32         $R3,$D3#lo
        vtrn.32         $R1,$D1#lo
        vtrn.32         $R4,$D4#lo

        vshl.u32        $S2,$R2,#2              @ *5
        vshl.u32        $S3,$R3,#2
        vshl.u32        $S1,$R1,#2
        vshl.u32        $S4,$R4,#2
        vadd.i32        $S2,$S2,$R2
        vadd.i32        $S1,$S1,$R1
        vadd.i32        $S3,$S3,$R3
        vadd.i32        $S4,$S4,$R4

        vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
        vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
        vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vst1.32         {${S4}[0]},[$tbl0,:32]
        vst1.32         {${S4}[1]},[$tbl1,:32]

        b               .Lsquare_neon

.align  4
.Lsquare_break_neon:
        add             $tbl0,$ctx,#(48+2*4*9)
        add             $tbl1,$ctx,#(48+3*4*9)

        vmov            $R0,$D0#lo              @ r^4:r^3
        vshl.u32        $S1,$D1#lo,#2           @ *5
        vmov            $R1,$D1#lo
        vshl.u32        $S2,$D2#lo,#2
        vmov            $R2,$D2#lo
        vshl.u32        $S3,$D3#lo,#2
        vmov            $R3,$D3#lo
        vshl.u32        $S4,$D4#lo,#2
        vmov            $R4,$D4#lo
        vadd.i32        $S1,$S1,$D1#lo
        vadd.i32        $S2,$S2,$D2#lo
        vadd.i32        $S3,$S3,$D3#lo
        vadd.i32        $S4,$S4,$D4#lo

        vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
        vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
        vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vst1.32         {${S4}[0]},[$tbl0]
        vst1.32         {${S4}[1]},[$tbl1]

        ret                             @ bx    lr
.size   poly1305_init_neon,.-poly1305_init_neon

.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
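        @ Inputs shorter than 64 bytes are punted to the scalar
        @ poly1305_blocks unless the hash value is already in base 2^26,
        @ presumably because converting the representation back and forth
        @ would not pay off for just a block or two.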
        ldr     ip,[$ctx,#36]           @ is_base2_26
        ands    $len,$len,#-16
        beq     .Lno_data_neon

        cmp     $len,#64
        bhs     .Lenter_neon
        tst     ip,ip                   @ is_base2_26?
        beq     .Lpoly1305_blocks

.Lenter_neon:
        stmdb   sp!,{r4-r7}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so

        tst     ip,ip                   @ is_base2_26?
        bne     .Lbase2_26_neon

        stmdb   sp!,{r1-r3,lr}
        bl      poly1305_init_neon

        ldr     r4,[$ctx,#0]            @ load hash value base 2^32
        ldr     r5,[$ctx,#4]
        ldr     r6,[$ctx,#8]
        ldr     r7,[$ctx,#12]
        ldr     ip,[$ctx,#16]

        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
         veor   $D0#lo,$D0#lo,$D0#lo
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
         veor   $D1#lo,$D1#lo,$D1#lo
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
         veor   $D2#lo,$D2#lo,$D2#lo
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
         veor   $D3#lo,$D3#lo,$D3#lo
        and     r3,r3,#0x03ffffff
        orr     r6,r6,ip,lsl#24
         veor   $D4#lo,$D4#lo,$D4#lo
        and     r4,r4,#0x03ffffff
        mov     r1,#1
        and     r5,r5,#0x03ffffff
        str     r1,[$ctx,#36]           @ is_base2_26

        vmov.32 $D0#lo[0],r2
        vmov.32 $D1#lo[0],r3
        vmov.32 $D2#lo[0],r4
        vmov.32 $D3#lo[0],r5
        vmov.32 $D4#lo[0],r6
        adr     $zeros,.Lzeros

        ldmia   sp!,{r1-r3,lr}
        b       .Lbase2_32_neon

.align  4
.Lbase2_26_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ load hash value

        veor            $D0#lo,$D0#lo,$D0#lo
        veor            $D1#lo,$D1#lo,$D1#lo
        veor            $D2#lo,$D2#lo,$D2#lo
        veor            $D3#lo,$D3#lo,$D3#lo
        veor            $D4#lo,$D4#lo,$D4#lo
        vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
        adr             $zeros,.Lzeros
        vld1.32         {$D4#lo[0]},[$ctx]
        sub             $ctx,$ctx,#16           @ rewind

.Lbase2_32_neon:
        add             $in2,$inp,#32
        mov             $padbit,$padbit,lsl#24
        tst             $len,#31
        beq             .Leven

        vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
        vmov.32         $H4#lo[0],$padbit
        sub             $len,$len,#16
        add             $in2,$inp,#32

# ifdef __ARMEB__
        vrev32.8        $H0,$H0
        vrev32.8        $H3,$H3
        vrev32.8        $H1,$H1
        vrev32.8        $H2,$H2
# endif
        vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
        vshl.u32        $H3#lo,$H3#lo,#18

        vsri.u32        $H3#lo,$H2#lo,#14
        vshl.u32        $H2#lo,$H2#lo,#12
        vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi

        vbic.i32        $H3#lo,#0xfc000000
        vsri.u32        $H2#lo,$H1#lo,#20
        vshl.u32        $H1#lo,$H1#lo,#6

        vbic.i32        $H2#lo,#0xfc000000
        vsri.u32        $H1#lo,$H0#lo,#26
        vadd.i32        $H3#hi,$H3#lo,$D3#lo

        vbic.i32        $H0#lo,#0xfc000000
        vbic.i32        $H1#lo,#0xfc000000
        vadd.i32        $H2#hi,$H2#lo,$D2#lo

        vadd.i32        $H0#hi,$H0#lo,$D0#lo
        vadd.i32        $H1#hi,$H1#lo,$D1#lo

        mov             $tbl1,$zeros
        add             $tbl0,$ctx,#48

        cmp             $len,$len
        b               .Long_tail

.align  4
.Leven:
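        @ Four blocks per iteration: inp[0:1] goes to the low halves of
        @ H0-H4 and inp[2:3] to the high halves; once fewer than four
        @ blocks remain, in2 is pointed at .Lzeros, so the second pair
        @ contributes nothing.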
        subs            $len,$len,#64
        it              lo
        movlo           $in2,$zeros

        vmov.i32        $H4,#1<<24              @ padbit, yes, always
        vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
        add             $inp,$inp,#64
        vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
        add             $in2,$in2,#64
        itt             hi
        addhi           $tbl1,$ctx,#(48+1*9*4)
        addhi           $tbl0,$ctx,#(48+3*9*4)

# ifdef __ARMEB__
        vrev32.8        $H0,$H0
        vrev32.8        $H3,$H3
        vrev32.8        $H1,$H1
        vrev32.8        $H2,$H2
# endif
        vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
        vshl.u32        $H3,$H3,#18

        vsri.u32        $H3,$H2,#14
        vshl.u32        $H2,$H2,#12

        vbic.i32        $H3,#0xfc000000
        vsri.u32        $H2,$H1,#20
        vshl.u32        $H1,$H1,#6

        vbic.i32        $H2,#0xfc000000
        vsri.u32        $H1,$H0,#26

        vbic.i32        $H0,#0xfc000000
        vbic.i32        $H1,#0xfc000000

        bls             .Lskip_loop

        vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
        vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
        vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        b               .Loop_neon

.align  5
.Loop_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        @   \___________________/
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        @   \___________________/ \____________________/
        @
        @ Note that we start with inp[2:3]*r^2, because it does not
        @ depend on the reduction from the previous iteration.
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ inp[2:3]*r^2

        vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
        vmull.u32       $D2,$H2#hi,${R0}[1]
        vadd.i32        $H0#lo,$H0#lo,$D0#lo
        vmull.u32       $D0,$H0#hi,${R0}[1]
        vadd.i32        $H3#lo,$H3#lo,$D3#lo
        vmull.u32       $D3,$H3#hi,${R0}[1]
        vmlal.u32       $D2,$H1#hi,${R1}[1]
        vadd.i32        $H1#lo,$H1#lo,$D1#lo
        vmull.u32       $D1,$H1#hi,${R0}[1]

        vadd.i32        $H4#lo,$H4#lo,$D4#lo
        vmull.u32       $D4,$H4#hi,${R0}[1]
        subs            $len,$len,#64
        vmlal.u32       $D0,$H4#hi,${S1}[1]
        it              lo
        movlo           $in2,$zeros
        vmlal.u32       $D3,$H2#hi,${R1}[1]
        vld1.32         ${S4}[1],[$tbl1,:32]
        vmlal.u32       $D1,$H0#hi,${R1}[1]
        vmlal.u32       $D4,$H3#hi,${R1}[1]

        vmlal.u32       $D0,$H3#hi,${S2}[1]
        vmlal.u32       $D3,$H1#hi,${R2}[1]
        vmlal.u32       $D4,$H2#hi,${R2}[1]
        vmlal.u32       $D1,$H4#hi,${S2}[1]
        vmlal.u32       $D2,$H0#hi,${R2}[1]

        vmlal.u32       $D3,$H0#hi,${R3}[1]
        vmlal.u32       $D0,$H2#hi,${S3}[1]
        vmlal.u32       $D4,$H1#hi,${R3}[1]
        vmlal.u32       $D1,$H3#hi,${S3}[1]
        vmlal.u32       $D2,$H4#hi,${S3}[1]

        vmlal.u32       $D3,$H4#hi,${S4}[1]
        vmlal.u32       $D0,$H1#hi,${S4}[1]
        vmlal.u32       $D4,$H0#hi,${R4}[1]
        vmlal.u32       $D1,$H2#hi,${S4}[1]
        vmlal.u32       $D2,$H3#hi,${S4}[1]

        vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
        add             $in2,$in2,#64

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4 and accumulate

        vmlal.u32       $D3,$H3#lo,${R0}[0]
        vmlal.u32       $D0,$H0#lo,${R0}[0]
        vmlal.u32       $D4,$H4#lo,${R0}[0]
        vmlal.u32       $D1,$H1#lo,${R0}[0]
        vmlal.u32       $D2,$H2#lo,${R0}[0]
        vld1.32         ${S4}[0],[$tbl0,:32]

        vmlal.u32       $D3,$H2#lo,${R1}[0]
        vmlal.u32       $D0,$H4#lo,${S1}[0]
        vmlal.u32       $D4,$H3#lo,${R1}[0]
        vmlal.u32       $D1,$H0#lo,${R1}[0]
        vmlal.u32       $D2,$H1#lo,${R1}[0]

        vmlal.u32       $D3,$H1#lo,${R2}[0]
        vmlal.u32       $D0,$H3#lo,${S2}[0]
        vmlal.u32       $D4,$H2#lo,${R2}[0]
        vmlal.u32       $D1,$H4#lo,${S2}[0]
        vmlal.u32       $D2,$H0#lo,${R2}[0]

        vmlal.u32       $D3,$H0#lo,${R3}[0]
        vmlal.u32       $D0,$H2#lo,${S3}[0]
        vmlal.u32       $D4,$H1#lo,${R3}[0]
        vmlal.u32       $D1,$H3#lo,${S3}[0]
        vmlal.u32       $D3,$H4#lo,${S4}[0]

        vmlal.u32       $D2,$H4#lo,${S3}[0]
        vmlal.u32       $D0,$H1#lo,${S4}[0]
        vmlal.u32       $D4,$H0#lo,${R4}[0]
        vmov.i32        $H4,#1<<24              @ padbit, yes, always
        vmlal.u32       $D1,$H2#lo,${S4}[0]
        vmlal.u32       $D2,$H3#lo,${S4}[0]

        vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
        add             $inp,$inp,#64
# ifdef __ARMEB__
        vrev32.8        $H0,$H0
        vrev32.8        $H1,$H1
        vrev32.8        $H2,$H2
        vrev32.8        $H3,$H3
# endif

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction interleaved with base 2^32 -> base 2^26 of
        @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

        vshr.u64        $T0,$D3,#26
        vmovn.i64       $D3#lo,$D3
         vshr.u64       $T1,$D0,#26
         vmovn.i64      $D0#lo,$D0
        vadd.i64        $D4,$D4,$T0             @ h3 -> h4
        vbic.i32        $D3#lo,#0xfc000000
          vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
         vadd.i64       $D1,$D1,$T1             @ h0 -> h1
          vshl.u32      $H3,$H3,#18
         vbic.i32       $D0#lo,#0xfc000000

        vshrn.u64       $T0#lo,$D4,#26
        vmovn.i64       $D4#lo,$D4
         vshr.u64       $T1,$D1,#26
         vmovn.i64      $D1#lo,$D1
         vadd.i64       $D2,$D2,$T1             @ h1 -> h2
          vsri.u32      $H3,$H2,#14
        vbic.i32        $D4#lo,#0xfc000000
          vshl.u32      $H2,$H2,#12
         vbic.i32       $D1#lo,#0xfc000000

        vadd.i32        $D0#lo,$D0#lo,$T0#lo
        vshl.u32        $T0#lo,$T0#lo,#2
          vbic.i32      $H3,#0xfc000000
         vshrn.u64      $T1#lo,$D2,#26
         vmovn.i64      $D2#lo,$D2
        vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
          vsri.u32      $H2,$H1,#20
         vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
          vshl.u32      $H1,$H1,#6
         vbic.i32       $D2#lo,#0xfc000000
          vbic.i32      $H2,#0xfc000000

        vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
        vmovn.i64       $D0#lo,$D0
          vsri.u32      $H1,$H0,#26
          vbic.i32      $H0,#0xfc000000
         vshr.u32       $T1#lo,$D3#lo,#26
         vbic.i32       $D3#lo,#0xfc000000
        vbic.i32        $D0#lo,#0xfc000000
        vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
         vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
          vbic.i32      $H1,#0xfc000000

        bhi             .Loop_neon

.Lskip_loop:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        add             $tbl1,$ctx,#(48+0*9*4)
        add             $tbl0,$ctx,#(48+1*9*4)
        adds            $len,$len,#32
        it              ne
        movne           $len,#0
        bne             .Long_tail

        vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
        vadd.i32        $H0#hi,$H0#lo,$D0#lo
        vadd.i32        $H3#hi,$H3#lo,$D3#lo
        vadd.i32        $H1#hi,$H1#lo,$D1#lo
        vadd.i32        $H4#hi,$H4#lo,$D4#lo

.Long_tail:
        vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
        vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2

        vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
        vmull.u32       $D2,$H2#hi,$R0
        vadd.i32        $H0#lo,$H0#lo,$D0#lo
        vmull.u32       $D0,$H0#hi,$R0
        vadd.i32        $H3#lo,$H3#lo,$D3#lo
        vmull.u32       $D3,$H3#hi,$R0
        vadd.i32        $H1#lo,$H1#lo,$D1#lo
        vmull.u32       $D1,$H1#hi,$R0
        vadd.i32        $H4#lo,$H4#lo,$D4#lo
        vmull.u32       $D4,$H4#hi,$R0

        vmlal.u32       $D0,$H4#hi,$S1
        vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vmlal.u32       $D3,$H2#hi,$R1
        vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vmlal.u32       $D1,$H0#hi,$R1
        vmlal.u32       $D4,$H3#hi,$R1
        vmlal.u32       $D2,$H1#hi,$R1

        vmlal.u32       $D3,$H1#hi,$R2
        vld1.32         ${S4}[1],[$tbl1,:32]
        vmlal.u32       $D0,$H3#hi,$S2
        vld1.32         ${S4}[0],[$tbl0,:32]
        vmlal.u32       $D4,$H2#hi,$R2
        vmlal.u32       $D1,$H4#hi,$S2
        vmlal.u32       $D2,$H0#hi,$R2

        vmlal.u32       $D3,$H0#hi,$R3
         it             ne
         addne          $tbl1,$ctx,#(48+2*9*4)
        vmlal.u32       $D0,$H2#hi,$S3
         it             ne
         addne          $tbl0,$ctx,#(48+3*9*4)
        vmlal.u32       $D4,$H1#hi,$R3
        vmlal.u32       $D1,$H3#hi,$S3
        vmlal.u32       $D2,$H4#hi,$S3

        vmlal.u32       $D3,$H4#hi,$S4
         vorn           $MASK,$MASK,$MASK       @ all-ones, can be redundant
        vmlal.u32       $D0,$H1#hi,$S4
         vshr.u64       $MASK,$MASK,#38
        vmlal.u32       $D4,$H0#hi,$R4
        vmlal.u32       $D1,$H2#hi,$S4
        vmlal.u32       $D2,$H3#hi,$S4

        beq             .Lshort_tail

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4:r^3 and accumulate

        vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
        vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4

        vmlal.u32       $D2,$H2#lo,$R0
        vmlal.u32       $D0,$H0#lo,$R0
        vmlal.u32       $D3,$H3#lo,$R0
        vmlal.u32       $D1,$H1#lo,$R0
        vmlal.u32       $D4,$H4#lo,$R0

        vmlal.u32       $D0,$H4#lo,$S1
        vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vmlal.u32       $D3,$H2#lo,$R1
        vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vmlal.u32       $D1,$H0#lo,$R1
        vmlal.u32       $D4,$H3#lo,$R1
        vmlal.u32       $D2,$H1#lo,$R1

        vmlal.u32       $D3,$H1#lo,$R2
        vld1.32         ${S4}[1],[$tbl1,:32]
        vmlal.u32       $D0,$H3#lo,$S2
        vld1.32         ${S4}[0],[$tbl0,:32]
        vmlal.u32       $D4,$H2#lo,$R2
        vmlal.u32       $D1,$H4#lo,$S2
        vmlal.u32       $D2,$H0#lo,$R2

        vmlal.u32       $D3,$H0#lo,$R3
        vmlal.u32       $D0,$H2#lo,$S3
        vmlal.u32       $D4,$H1#lo,$R3
        vmlal.u32       $D1,$H3#lo,$S3
        vmlal.u32       $D2,$H4#lo,$S3

        vmlal.u32       $D3,$H4#lo,$S4
         vorn           $MASK,$MASK,$MASK       @ all-ones
        vmlal.u32       $D0,$H1#lo,$S4
         vshr.u64       $MASK,$MASK,#38
        vmlal.u32       $D4,$H0#lo,$R4
        vmlal.u32       $D1,$H2#lo,$S4
        vmlal.u32       $D2,$H3#lo,$S4

.Lshort_tail:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ horizontal addition

        vadd.i64        $D3#lo,$D3#lo,$D3#hi
        vadd.i64        $D0#lo,$D0#lo,$D0#hi
        vadd.i64        $D4#lo,$D4#lo,$D4#hi
        vadd.i64        $D1#lo,$D1#lo,$D1#hi
        vadd.i64        $D2#lo,$D2#lo,$D2#hi

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction, but without narrowing

        vshr.u64        $T0,$D3,#26
        vand.i64        $D3,$D3,$MASK
         vshr.u64       $T1,$D0,#26
         vand.i64       $D0,$D0,$MASK
        vadd.i64        $D4,$D4,$T0             @ h3 -> h4
         vadd.i64       $D1,$D1,$T1             @ h0 -> h1

        vshr.u64        $T0,$D4,#26
        vand.i64        $D4,$D4,$MASK
         vshr.u64       $T1,$D1,#26
         vand.i64       $D1,$D1,$MASK
         vadd.i64       $D2,$D2,$T1             @ h1 -> h2

        vadd.i64        $D0,$D0,$T0
        vshl.u64        $T0,$T0,#2
         vshr.u64       $T1,$D2,#26
         vand.i64       $D2,$D2,$MASK
        vadd.i64        $D0,$D0,$T0             @ h4 -> h0
         vadd.i64       $D3,$D3,$T1             @ h2 -> h3

        vshr.u64        $T0,$D0,#26
        vand.i64        $D0,$D0,$MASK
         vshr.u64       $T1,$D3,#26
         vand.i64       $D3,$D3,$MASK
        vadd.i64        $D1,$D1,$T0             @ h0 -> h1
         vadd.i64       $D4,$D4,$T1             @ h3 -> h4

        cmp             $len,#0
        bne             .Leven

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ store hash value

        vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
        vst1.32         {$D4#lo[0]},[$ctx]

        vldmia  sp!,{d8-d15}                    @ epilogue
        ldmia   sp!,{r4-r7}
.Lno_data_neon:
        ret                                     @ bx    lr
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

.type   poly1305_emit_neon,%function
.align  5
poly1305_emit_neon:
        ldr     ip,[$ctx,#36]           @ is_base2_26

        stmdb   sp!,{r4-r11}

        tst     ip,ip
        beq     .Lpoly1305_emit_enter

        ldmia   $ctx,{$h0-$h4}
        eor     $g0,$g0,$g0

        adds    $h0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
        mov     $h1,$h1,lsr#6
        adcs    $h1,$h1,$h2,lsl#20
        mov     $h2,$h2,lsr#12
        adcs    $h2,$h2,$h3,lsl#14
        mov     $h3,$h3,lsr#18
        adcs    $h3,$h3,$h4,lsl#8
        adc     $h4,$g0,$h4,lsr#24      @ can be partially reduced ...

        and     $g0,$h4,#-4             @ ... so reduce
        and     $h4,$h4,#3
        add     $g0,$g0,$g0,lsr#2       @ *= 5
        adds    $h0,$h0,$g0
        adcs    $h1,$h1,#0
        adcs    $h2,$h2,#0
        adcs    $h3,$h3,#0
        adc     $h4,$h4,#0

        adds    $g0,$h0,#5              @ compare to modulus
        adcs    $g1,$h1,#0
        adcs    $g2,$h2,#0
        adcs    $g3,$h3,#0
        adc     $g4,$h4,#0
        tst     $g4,#4                  @ did it carry/borrow?

        it      ne
        movne   $h0,$g0
        ldr     $g0,[$nonce,#0]
        it      ne
        movne   $h1,$g1
        ldr     $g1,[$nonce,#4]
        it      ne
        movne   $h2,$g2
        ldr     $g2,[$nonce,#8]
        it      ne
        movne   $h3,$g3
        ldr     $g3,[$nonce,#12]

        adds    $h0,$h0,$g0             @ accumulate nonce
        adcs    $h1,$h1,$g1
        adcs    $h2,$h2,$g2
        adc     $h3,$h3,$g3

# ifdef __ARMEB__
        rev     $h0,$h0
        rev     $h1,$h1
        rev     $h2,$h2
        rev     $h3,$h3
# endif
        str     $h0,[$mac,#0]           @ store the result
        str     $h1,[$mac,#4]
        str     $h2,[$mac,#8]
        str     $h3,[$mac,#12]

        ldmia   sp!,{r4-r11}
        ret                             @ bx    lr
.size   poly1305_emit_neon,.-poly1305_emit_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lpoly1305_init
#endif
___
}       }
$code.=<<___;
.asciz  "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if     __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

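# Post-process the generated code: evaluate `...` expressions, map the
# synthetic q<N>#lo/q<N>#hi register names to the underlying d-registers
# (d2N and d2N+1, e.g. q5#lo -> d10, q5#hi -> d11), and rewrite ret/bx lr
# so that the module can still be assembled with -march=armv4.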
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/\bret\b/bx    lr/go                                           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}
close STDOUT; # enforce flush