ARM assembly pack: make it Windows-friendly.
openssl.git: crypto/poly1305/asm/poly1305-armv4.pl
#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#                       IALU(*)/gcc-4.4         NEON
#
# ARM11xx(ARMv6)        7.78/+100%              -
# Cortex-A5             6.35/+130%              3.00
# Cortex-A8             6.25/+115%              2.36
# Cortex-A9             5.10/+95%               2.55
# Cortex-A15            3.85/+85%               1.25(**)
# Snapdragon S4         5.70/+100%              1.48(**)
#
# (*)   this is for -march=armv6, i.e. with a bunch of ldrb loading data;
# (**)  these are trade-off results, they can be improved by ~8% but at
#       the cost of 15/12% regression on Cortex-A5/A7, it's even possible
#       to improve Cortex-A9 result, but then A5/A7 lose more than 20%;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.text

.globl  poly1305_emit
.globl  poly1305_blocks
.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
.Lpoly1305_init:
        stmdb   sp!,{r4-r11}

        eor     r3,r3,r3
        cmp     $inp,#0
        str     r3,[$ctx,#0]            @ zero hash value
        str     r3,[$ctx,#4]
        str     r3,[$ctx,#8]
        str     r3,[$ctx,#12]
        str     r3,[$ctx,#16]
        str     r3,[$ctx,#36]           @ is_base2_26
        add     $ctx,$ctx,#20

#ifdef  __thumb2__
        it      eq
#endif
        moveq   r0,#0
        beq     .Lno_key

#if     __ARM_MAX_ARCH__>=7
        adr     r11,.Lpoly1305_init
        ldr     r12,.LOPENSSL_armcap
#endif
        ldrb    r4,[$inp,#0]
        mov     r10,#0x0fffffff
        ldrb    r5,[$inp,#1]
        and     r3,r10,#-4              @ 0x0ffffffc
        ldrb    r6,[$inp,#2]
        ldrb    r7,[$inp,#3]
        orr     r4,r4,r5,lsl#8
        ldrb    r5,[$inp,#4]
        orr     r4,r4,r6,lsl#16
        ldrb    r6,[$inp,#5]
        orr     r4,r4,r7,lsl#24
        ldrb    r7,[$inp,#6]
        and     r4,r4,r10

#if     __ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
        ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
        ldr     r12,[r12]
# endif
#endif
        ldrb    r8,[$inp,#7]
        orr     r5,r5,r6,lsl#8
        ldrb    r6,[$inp,#8]
        orr     r5,r5,r7,lsl#16
        ldrb    r7,[$inp,#9]
        orr     r5,r5,r8,lsl#24
        ldrb    r8,[$inp,#10]
        and     r5,r5,r3

#if     __ARM_MAX_ARCH__>=7
        tst     r12,#ARMV7_NEON         @ check for NEON
# ifdef __thumb2__
        adr     r9,.Lpoly1305_blocks_neon
        adr     r11,.Lpoly1305_blocks
        adr     r12,.Lpoly1305_emit
        adr     r10,.Lpoly1305_emit_neon
        itt     ne
        movne   r11,r9
        movne   r12,r10
        orr     r11,r11,#1      @ thumb-ify address
        orr     r12,r12,#1
# else
        addeq   r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
        addne   r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
        addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
        addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
        ldrb    r9,[$inp,#11]
        orr     r6,r6,r7,lsl#8
        ldrb    r7,[$inp,#12]
        orr     r6,r6,r8,lsl#16
        ldrb    r8,[$inp,#13]
        orr     r6,r6,r9,lsl#24
        ldrb    r9,[$inp,#14]
        and     r6,r6,r3

        ldrb    r10,[$inp,#15]
        orr     r7,r7,r8,lsl#8
        str     r4,[$ctx,#0]
        orr     r7,r7,r9,lsl#16
        str     r5,[$ctx,#4]
        orr     r7,r7,r10,lsl#24
        str     r6,[$ctx,#8]
        and     r7,r7,r3
        str     r7,[$ctx,#12]
#if     __ARM_MAX_ARCH__>=7
        stmia   r2,{r11,r12}            @ fill functions table
        mov     r0,#1
#else
        mov     r0,#0
#endif
.Lno_key:
        ldmia   sp!,{r4-r11}
#if     __ARM_ARCH__>=5
        ret                             @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);
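# $s1-$s3 alias $r1-$r3 and are recomputed inside the loop below as
# r+(r>>2) = 5/4*r (exact, because key clamping forces r1, r2 and r3 to
# be multiples of 4); they absorb the 2^128 == 5/4 (mod 2^130-5) factor
# picked up by product terms that wrap around 2^128.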

$code.=<<___;
.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
.Lpoly1305_blocks:
        stmdb   sp!,{r3-r11,lr}

        ands    $len,$len,#-16
        beq     .Lno_data

        cmp     $padbit,#0
        add     $len,$len,$inp          @ end pointer
        sub     sp,sp,#32

        ldmia   $ctx,{$h0-$r3}          @ load context

        str     $ctx,[sp,#12]           @ offload stuff
        mov     lr,$inp
        str     $len,[sp,#16]
        str     $r1,[sp,#20]
        str     $r2,[sp,#24]
        str     $r3,[sp,#28]
        b       .Loop

.Loop:
#if __ARM_ARCH__<7
        ldrb    r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   $h4,$h4,#1              @ 1<<128
        ldrb    r1,[lr,#-15]
        ldrb    r2,[lr,#-14]
        ldrb    r3,[lr,#-13]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-12]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-11]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-10]
        adds    $h0,$h0,r3              @ accumulate input

        ldrb    r3,[lr,#-9]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-8]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-7]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-6]
        adcs    $h1,$h1,r3

        ldrb    r3,[lr,#-5]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-4]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-3]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-2]
        adcs    $h2,$h2,r3

        ldrb    r3,[lr,#-1]
        orr     r1,r0,r1,lsl#8
        str     lr,[sp,#8]              @ offload input pointer
        orr     r2,r1,r2,lsl#16
        add     $s1,$r1,$r1,lsr#2
        orr     r3,r2,r3,lsl#24
#else
        ldr     r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   $h4,$h4,#1              @ padbit
        ldr     r1,[lr,#-12]
        ldr     r2,[lr,#-8]
        ldr     r3,[lr,#-4]
# ifdef __ARMEB__
        rev     r0,r0
        rev     r1,r1
        rev     r2,r2
        rev     r3,r3
# endif
        adds    $h0,$h0,r0              @ accumulate input
        str     lr,[sp,#8]              @ offload input pointer
        adcs    $h1,$h1,r1
        add     $s1,$r1,$r1,lsr#2
        adcs    $h2,$h2,r2
#endif
        add     $s2,$r2,$r2,lsr#2
        adcs    $h3,$h3,r3
        add     $s3,$r3,$r3,lsr#2

        umull   r2,r3,$h1,$r0
         adc    $h4,$h4,#0
        umull   r0,r1,$h0,$r0
        umlal   r2,r3,$h4,$s1
        umlal   r0,r1,$h3,$s1
        ldr     $r1,[sp,#20]            @ reload $r1
        umlal   r2,r3,$h2,$s3
        umlal   r0,r1,$h1,$s3
        umlal   r2,r3,$h3,$s2
        umlal   r0,r1,$h2,$s2
        umlal   r2,r3,$h0,$r1
        str     r0,[sp,#0]              @ future $h0
         mul    r0,$s2,$h4
        ldr     $r2,[sp,#24]            @ reload $r2
        adds    r2,r2,r1                @ d1+=d0>>32
         eor    r1,r1,r1
        adc     lr,r3,#0                @ future $h2
        str     r2,[sp,#4]              @ future $h1

        mul     r2,$s3,$h4
        eor     r3,r3,r3
        umlal   r0,r1,$h3,$s3
        ldr     $r3,[sp,#28]            @ reload $r3
        umlal   r2,r3,$h3,$r0
        umlal   r0,r1,$h2,$r0
        umlal   r2,r3,$h2,$r1
        umlal   r0,r1,$h1,$r1
        umlal   r2,r3,$h1,$r2
        umlal   r0,r1,$h0,$r2
        umlal   r2,r3,$h0,$r3
        ldr     $h0,[sp,#0]
        mul     $h4,$r0,$h4
        ldr     $h1,[sp,#4]

        adds    $h2,lr,r0               @ d2+=d1>>32
        ldr     lr,[sp,#8]              @ reload input pointer
        adc     r1,r1,#0
        adds    $h3,r2,r1               @ d3+=d2>>32
        ldr     r0,[sp,#16]             @ reload end pointer
        adc     r3,r3,#0
        add     $h4,$h4,r3              @ h4+=d3>>32

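        @ final reduction: $h4 holds the bits at 2^128 and above; the
        @ part of $h4 that is >= 4 represents multiples of 2^130 and is
        @ folded back into $h0 as 5*($h4>>2), since 2^130 == 5 mod p,
        @ while $h4 keeps only its low two bits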
        and     r1,$h4,#-4
        and     $h4,$h4,#3
        add     r1,r1,r1,lsr#2          @ *=5
        adds    $h0,$h0,r1
        adcs    $h1,$h1,#0
        adcs    $h2,$h2,#0
        adcs    $h3,$h3,#0
        adc     $h4,$h4,#0

        cmp     r0,lr                   @ done yet?
        bhi     .Loop

        ldr     $ctx,[sp,#12]
        add     sp,sp,#32
        stmia   $ctx,{$h0-$h4}          @ store the result

.Lno_data:
#if     __ARM_ARCH__>=5
        ldmia   sp!,{r3-r11,pc}
#else
        ldmia   sp!,{r3-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;

$code.=<<___;
.type   poly1305_emit,%function
.align  5
poly1305_emit:
.Lpoly1305_emit:
        stmdb   sp!,{r4-r11}
.Lpoly1305_emit_enter:

        ldmia   $ctx,{$h0-$h4}
        adds    $g0,$h0,#5              @ compare to modulus
        adcs    $g1,$h1,#0
        adcs    $g2,$h2,#0
        adcs    $g3,$h3,#0
        adc     $g4,$h4,#0
        tst     $g4,#4                  @ did it carry/borrow?

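        @ if adding 5 carried into bit 130, h was >= 2^130-5, so the
        @ reduced value in $g0-$g3 (h+5 mod 2^130) is selected below;
        @ otherwise h is already fully reduced and is kept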
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h0,$g0
        ldr     $g0,[$nonce,#0]
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h1,$g1
        ldr     $g1,[$nonce,#4]
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h2,$g2
        ldr     $g2,[$nonce,#8]
#ifdef  __thumb2__
        it      ne
#endif
        movne   $h3,$g3
        ldr     $g3,[$nonce,#12]

        adds    $h0,$h0,$g0
        adcs    $h1,$h1,$g1
        adcs    $h2,$h2,$g2
        adc     $h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
        rev     $h0,$h0
        rev     $h1,$h1
        rev     $h2,$h2
        rev     $h3,$h3
# endif
        str     $h0,[$mac,#0]
        str     $h1,[$mac,#4]
        str     $h2,[$mac,#8]
        str     $h3,[$mac,#12]
#else
        strb    $h0,[$mac,#0]
        mov     $h0,$h0,lsr#8
        strb    $h1,[$mac,#4]
        mov     $h1,$h1,lsr#8
        strb    $h2,[$mac,#8]
        mov     $h2,$h2,lsr#8
        strb    $h3,[$mac,#12]
        mov     $h3,$h3,lsr#8

        strb    $h0,[$mac,#1]
        mov     $h0,$h0,lsr#8
        strb    $h1,[$mac,#5]
        mov     $h1,$h1,lsr#8
        strb    $h2,[$mac,#9]
        mov     $h2,$h2,lsr#8
        strb    $h3,[$mac,#13]
        mov     $h3,$h3,lsr#8

        strb    $h0,[$mac,#2]
        mov     $h0,$h0,lsr#8
        strb    $h1,[$mac,#6]
        mov     $h1,$h1,lsr#8
        strb    $h2,[$mac,#10]
        mov     $h2,$h2,lsr#8
        strb    $h3,[$mac,#14]
        mov     $h3,$h3,lsr#8

        strb    $h0,[$mac,#3]
        strb    $h1,[$mac,#7]
        strb    $h2,[$mac,#11]
        strb    $h3,[$mac,#15]
#endif
        ldmia   sp!,{r4-r11}
#if     __ARM_ARCH__>=5
        ret                             @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if     __ARM_MAX_ARCH__>=7
.fpu    neon

.type   poly1305_init_neon,%function
.align  5
poly1305_init_neon:
        ldr     r4,[$ctx,#20]           @ load key base 2^32
        ldr     r5,[$ctx,#24]
        ldr     r6,[$ctx,#28]
        ldr     r7,[$ctx,#32]

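        @ split the 128-bit key into five 26-bit limbs:
        @ r2 = k[25:0],   r3 = k[51:26],   r4 = k[77:52],
        @ r5 = k[103:78], r6 = k[127:104] (only 24 bits, no masking needed)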
        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
        and     r3,r3,#0x03ffffff
        and     r4,r4,#0x03ffffff
        and     r5,r5,#0x03ffffff

        vdup.32 $R0,r2                  @ r^1 in both lanes
        add     r2,r3,r3,lsl#2          @ *5
        vdup.32 $R1,r3
        add     r3,r4,r4,lsl#2
        vdup.32 $S1,r2
        vdup.32 $R2,r4
        add     r4,r5,r5,lsl#2
        vdup.32 $S2,r3
        vdup.32 $R3,r5
        add     r5,r6,r6,lsl#2
        vdup.32 $S3,r4
        vdup.32 $R4,r6
        vdup.32 $S4,r5

        mov     $zeros,#2               @ counter

.Lsquare_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

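        @ the 5*r_i factors come from the reduction: in base 2^26 a
        @ product term h_j*r_k with j+k >= 5 carries a weight of
        @ 2^130*2^(26*(j+k-5)), and 2^130 == 5 mod 2^130-5, so it folds
        @ into limb j+k-5 multiplied by 5 (the precomputed S_i = 5*R_i)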
        vmull.u32       $D0,$R0,${R0}[1]
        vmull.u32       $D1,$R1,${R0}[1]
        vmull.u32       $D2,$R2,${R0}[1]
        vmull.u32       $D3,$R3,${R0}[1]
        vmull.u32       $D4,$R4,${R0}[1]

        vmlal.u32       $D0,$R4,${S1}[1]
        vmlal.u32       $D1,$R0,${R1}[1]
        vmlal.u32       $D2,$R1,${R1}[1]
        vmlal.u32       $D3,$R2,${R1}[1]
        vmlal.u32       $D4,$R3,${R1}[1]

        vmlal.u32       $D0,$R3,${S2}[1]
        vmlal.u32       $D1,$R4,${S2}[1]
        vmlal.u32       $D3,$R1,${R2}[1]
        vmlal.u32       $D2,$R0,${R2}[1]
        vmlal.u32       $D4,$R2,${R2}[1]

        vmlal.u32       $D0,$R2,${S3}[1]
        vmlal.u32       $D3,$R0,${R3}[1]
        vmlal.u32       $D1,$R3,${S3}[1]
        vmlal.u32       $D2,$R4,${S3}[1]
        vmlal.u32       $D4,$R1,${R3}[1]

        vmlal.u32       $D3,$R4,${S4}[1]
        vmlal.u32       $D0,$R1,${S4}[1]
        vmlal.u32       $D1,$R2,${S4}[1]
        vmlal.u32       $D2,$R3,${S4}[1]
        vmlal.u32       $D4,$R0,${R4}[1]

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        @ and P. Schwabe
        @
        @ H0>>+H1>>+H2>>+H3>>+H4
        @ H3>>+H4>>*5+H0>>+H1
        @
        @ Trivia.
        @
        @ Result of multiplication of n-bit number by m-bit number is
        @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
        @ m-bit number multiplied by 2^n is still n+m bits wide.
        @
        @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
        @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
        @ one is n+1 bits wide.
        @
        @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
        @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
        @ can be 27. However! In cases when their width exceeds 26 bits
        @ they are limited by 2^26+2^6. This in turn means that *sum*
        @ of the products with these values can still be viewed as sum
        @ of 52-bit numbers as long as the amount of addends is not a
        @ power of 2. For example,
        @
        @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
        @
        @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
        @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
        @ 8 * (2^52) or 2^55. However, the value is then multiplied
        @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
        @ which is less than 32 * (2^52) or 2^57. And when processing
        @ data we are looking at three times as many addends...
        @
        @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
        @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
        @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
        @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
        @ instruction accepts 2x32-bit input and writes 2x64-bit result.
        @ This means that result of reduction has to be compressed upon
        @ loop wrap-around. This can be done in the process of reduction
        @ to minimize amount of instructions [as well as amount of
        @ 128-bit instructions, which benefits low-end processors], but
        @ one has to watch for H2 (which is narrower than H0) and 5*H4
        @ not being wider than 58 bits, so that result of right shift
        @ by 26 bits fits in 32 bits. This is also useful on x86,
        @ because it allows using paddd in place of paddq, which
        @ benefits Atom, where paddq is ridiculously slow.

        vshr.u64        $T0,$D3,#26
        vmovn.i64       $D3#lo,$D3
         vshr.u64       $T1,$D0,#26
         vmovn.i64      $D0#lo,$D0
        vadd.i64        $D4,$D4,$T0             @ h3 -> h4
        vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
         vadd.i64       $D1,$D1,$T1             @ h0 -> h1
         vbic.i32       $D0#lo,#0xfc000000

        vshrn.u64       $T0#lo,$D4,#26
        vmovn.i64       $D4#lo,$D4
         vshr.u64       $T1,$D1,#26
         vmovn.i64      $D1#lo,$D1
         vadd.i64       $D2,$D2,$T1             @ h1 -> h2
        vbic.i32        $D4#lo,#0xfc000000
         vbic.i32       $D1#lo,#0xfc000000

        vadd.i32        $D0#lo,$D0#lo,$T0#lo
        vshl.u32        $T0#lo,$T0#lo,#2
         vshrn.u64      $T1#lo,$D2,#26
         vmovn.i64      $D2#lo,$D2
        vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
         vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
         vbic.i32       $D2#lo,#0xfc000000

        vshr.u32        $T0#lo,$D0#lo,#26
        vbic.i32        $D0#lo,#0xfc000000
         vshr.u32       $T1#lo,$D3#lo,#26
         vbic.i32       $D3#lo,#0xfc000000
        vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
         vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4

        subs            $zeros,$zeros,#1
        beq             .Lsquare_break_neon

        add             $tbl0,$ctx,#(48+0*9*4)
        add             $tbl1,$ctx,#(48+1*9*4)

        vtrn.32         $R0,$D0#lo              @ r^2:r^1
        vtrn.32         $R2,$D2#lo
        vtrn.32         $R3,$D3#lo
        vtrn.32         $R1,$D1#lo
        vtrn.32         $R4,$D4#lo

        vshl.u32        $S2,$R2,#2              @ *5
        vshl.u32        $S3,$R3,#2
        vshl.u32        $S1,$R1,#2
        vshl.u32        $S4,$R4,#2
        vadd.i32        $S2,$S2,$R2
        vadd.i32        $S1,$S1,$R1
        vadd.i32        $S3,$S3,$R3
        vadd.i32        $S4,$S4,$R4

        vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
        vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
        vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vst1.32         {${S4}[0]},[$tbl0,:32]
        vst1.32         {${S4}[1]},[$tbl1,:32]

        b               .Lsquare_neon

.align  4
.Lsquare_break_neon:
        add             $tbl0,$ctx,#(48+2*4*9)
        add             $tbl1,$ctx,#(48+3*4*9)

        vmov            $R0,$D0#lo              @ r^4:r^3
        vshl.u32        $S1,$D1#lo,#2           @ *5
        vmov            $R1,$D1#lo
        vshl.u32        $S2,$D2#lo,#2
        vmov            $R2,$D2#lo
        vshl.u32        $S3,$D3#lo,#2
        vmov            $R3,$D3#lo
        vshl.u32        $S4,$D4#lo,#2
        vmov            $R4,$D4#lo
        vadd.i32        $S1,$S1,$D1#lo
        vadd.i32        $S2,$S2,$D2#lo
        vadd.i32        $S3,$S3,$D3#lo
        vadd.i32        $S4,$S4,$D4#lo

        vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
        vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
        vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vst1.32         {${S4}[0]},[$tbl0]
        vst1.32         {${S4}[1]},[$tbl1]

        ret                             @ bx    lr
.size   poly1305_init_neon,.-poly1305_init_neon

.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
        ldr     ip,[$ctx,#36]           @ is_base2_26
        ands    $len,$len,#-16
        beq     .Lno_data_neon

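        @ for inputs shorter than 64 bytes fall back to the scalar code
        @ unless the hash is already in base 2^26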
        cmp     $len,#64
        bhs     .Lenter_neon
        tst     ip,ip                   @ is_base2_26?
        beq     .Lpoly1305_blocks

.Lenter_neon:
        stmdb   sp!,{r4-r7}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so

        tst     ip,ip                   @ is_base2_26?
        bne     .Lbase2_26_neon

        stmdb   sp!,{r1-r3,lr}
        bl      poly1305_init_neon

        ldr     r4,[$ctx,#0]            @ load hash value base 2^32
        ldr     r5,[$ctx,#4]
        ldr     r6,[$ctx,#8]
        ldr     r7,[$ctx,#12]
        ldr     ip,[$ctx,#16]

        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
         veor   $D0#lo,$D0#lo,$D0#lo
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
         veor   $D1#lo,$D1#lo,$D1#lo
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
         veor   $D2#lo,$D2#lo,$D2#lo
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
         veor   $D3#lo,$D3#lo,$D3#lo
        and     r3,r3,#0x03ffffff
        orr     r6,r6,ip,lsl#24
         veor   $D4#lo,$D4#lo,$D4#lo
        and     r4,r4,#0x03ffffff
        mov     r1,#1
        and     r5,r5,#0x03ffffff
        str     r1,[$ctx,#36]           @ is_base2_26

        vmov.32 $D0#lo[0],r2
        vmov.32 $D1#lo[0],r3
        vmov.32 $D2#lo[0],r4
        vmov.32 $D3#lo[0],r5
        vmov.32 $D4#lo[0],r6
        adr     $zeros,.Lzeros

        ldmia   sp!,{r1-r3,lr}
        b       .Lbase2_32_neon

.align  4
.Lbase2_26_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ load hash value

        veor            $D0#lo,$D0#lo,$D0#lo
        veor            $D1#lo,$D1#lo,$D1#lo
        veor            $D2#lo,$D2#lo,$D2#lo
        veor            $D3#lo,$D3#lo,$D3#lo
        veor            $D4#lo,$D4#lo,$D4#lo
        vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
        adr             $zeros,.Lzeros
        vld1.32         {$D4#lo[0]},[$ctx]
        sub             $ctx,$ctx,#16           @ rewind

.Lbase2_32_neon:
        add             $in2,$inp,#32
        mov             $padbit,$padbit,lsl#24
        tst             $len,#31
        beq             .Leven

        vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
        vmov.32         $H4#lo[0],$padbit
        sub             $len,$len,#16
        add             $in2,$inp,#32

# ifdef __ARMEB__
        vrev32.8        $H0,$H0
        vrev32.8        $H3,$H3
        vrev32.8        $H1,$H1
        vrev32.8        $H2,$H2
# endif
        vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
        vshl.u32        $H3#lo,$H3#lo,#18

        vsri.u32        $H3#lo,$H2#lo,#14
        vshl.u32        $H2#lo,$H2#lo,#12
        vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi

        vbic.i32        $H3#lo,#0xfc000000
        vsri.u32        $H2#lo,$H1#lo,#20
        vshl.u32        $H1#lo,$H1#lo,#6

        vbic.i32        $H2#lo,#0xfc000000
        vsri.u32        $H1#lo,$H0#lo,#26
        vadd.i32        $H3#hi,$H3#lo,$D3#lo

        vbic.i32        $H0#lo,#0xfc000000
        vbic.i32        $H1#lo,#0xfc000000
        vadd.i32        $H2#hi,$H2#lo,$D2#lo

        vadd.i32        $H0#hi,$H0#lo,$D0#lo
        vadd.i32        $H1#hi,$H1#lo,$D1#lo

        mov             $tbl1,$zeros
        add             $tbl0,$ctx,#48

        cmp             $len,$len
        b               .Long_tail

.align  4
.Leven:
        subs            $len,$len,#64
        it              lo
        movlo           $in2,$zeros

        vmov.i32        $H4,#1<<24              @ padbit, yes, always
        vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
        add             $inp,$inp,#64
        vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
        add             $in2,$in2,#64
        itt             hi
        addhi           $tbl1,$ctx,#(48+1*9*4)
        addhi           $tbl0,$ctx,#(48+3*9*4)

# ifdef __ARMEB__
        vrev32.8        $H0,$H0
        vrev32.8        $H3,$H3
        vrev32.8        $H1,$H1
        vrev32.8        $H2,$H2
# endif
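        @ convert inp[0:3] from base 2^32 to base 2^26 on the fly: each
        @ vshl positions a word's low bits for the next limb, vsri
        @ shifts in the high bits of the word below it, and vbic masks
        @ every limb down to 26 bits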
        vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
        vshl.u32        $H3,$H3,#18

        vsri.u32        $H3,$H2,#14
        vshl.u32        $H2,$H2,#12

        vbic.i32        $H3,#0xfc000000
        vsri.u32        $H2,$H1,#20
        vshl.u32        $H1,$H1,#6

        vbic.i32        $H2,#0xfc000000
        vsri.u32        $H1,$H0,#26

        vbic.i32        $H0,#0xfc000000
        vbic.i32        $H1,#0xfc000000

        bls             .Lskip_loop

        vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
        vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
        vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        b               .Loop_neon

.align  5
.Loop_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        @   \___________________/
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        @   \___________________/ \____________________/
        @
        @ Note that we start with inp[2:3]*r^2. This is because it
        @ doesn't depend on reduction in previous iteration.
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ inp[2:3]*r^2

        vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
        vmull.u32       $D2,$H2#hi,${R0}[1]
        vadd.i32        $H0#lo,$H0#lo,$D0#lo
        vmull.u32       $D0,$H0#hi,${R0}[1]
        vadd.i32        $H3#lo,$H3#lo,$D3#lo
        vmull.u32       $D3,$H3#hi,${R0}[1]
        vmlal.u32       $D2,$H1#hi,${R1}[1]
        vadd.i32        $H1#lo,$H1#lo,$D1#lo
        vmull.u32       $D1,$H1#hi,${R0}[1]

        vadd.i32        $H4#lo,$H4#lo,$D4#lo
        vmull.u32       $D4,$H4#hi,${R0}[1]
        subs            $len,$len,#64
        vmlal.u32       $D0,$H4#hi,${S1}[1]
        it              lo
        movlo           $in2,$zeros
        vmlal.u32       $D3,$H2#hi,${R1}[1]
        vld1.32         ${S4}[1],[$tbl1,:32]
        vmlal.u32       $D1,$H0#hi,${R1}[1]
        vmlal.u32       $D4,$H3#hi,${R1}[1]

        vmlal.u32       $D0,$H3#hi,${S2}[1]
        vmlal.u32       $D3,$H1#hi,${R2}[1]
        vmlal.u32       $D4,$H2#hi,${R2}[1]
        vmlal.u32       $D1,$H4#hi,${S2}[1]
        vmlal.u32       $D2,$H0#hi,${R2}[1]

        vmlal.u32       $D3,$H0#hi,${R3}[1]
        vmlal.u32       $D0,$H2#hi,${S3}[1]
        vmlal.u32       $D4,$H1#hi,${R3}[1]
        vmlal.u32       $D1,$H3#hi,${S3}[1]
        vmlal.u32       $D2,$H4#hi,${S3}[1]

        vmlal.u32       $D3,$H4#hi,${S4}[1]
        vmlal.u32       $D0,$H1#hi,${S4}[1]
        vmlal.u32       $D4,$H0#hi,${R4}[1]
        vmlal.u32       $D1,$H2#hi,${S4}[1]
        vmlal.u32       $D2,$H3#hi,${S4}[1]

        vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
        add             $in2,$in2,#64

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4 and accumulate

        vmlal.u32       $D3,$H3#lo,${R0}[0]
        vmlal.u32       $D0,$H0#lo,${R0}[0]
        vmlal.u32       $D4,$H4#lo,${R0}[0]
        vmlal.u32       $D1,$H1#lo,${R0}[0]
        vmlal.u32       $D2,$H2#lo,${R0}[0]
        vld1.32         ${S4}[0],[$tbl0,:32]

        vmlal.u32       $D3,$H2#lo,${R1}[0]
        vmlal.u32       $D0,$H4#lo,${S1}[0]
        vmlal.u32       $D4,$H3#lo,${R1}[0]
        vmlal.u32       $D1,$H0#lo,${R1}[0]
        vmlal.u32       $D2,$H1#lo,${R1}[0]

        vmlal.u32       $D3,$H1#lo,${R2}[0]
        vmlal.u32       $D0,$H3#lo,${S2}[0]
        vmlal.u32       $D4,$H2#lo,${R2}[0]
        vmlal.u32       $D1,$H4#lo,${S2}[0]
        vmlal.u32       $D2,$H0#lo,${R2}[0]

        vmlal.u32       $D3,$H0#lo,${R3}[0]
        vmlal.u32       $D0,$H2#lo,${S3}[0]
        vmlal.u32       $D4,$H1#lo,${R3}[0]
        vmlal.u32       $D1,$H3#lo,${S3}[0]
        vmlal.u32       $D3,$H4#lo,${S4}[0]

        vmlal.u32       $D2,$H4#lo,${S3}[0]
        vmlal.u32       $D0,$H1#lo,${S4}[0]
        vmlal.u32       $D4,$H0#lo,${R4}[0]
        vmov.i32        $H4,#1<<24              @ padbit, yes, always
        vmlal.u32       $D1,$H2#lo,${S4}[0]
        vmlal.u32       $D2,$H3#lo,${S4}[0]

        vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
        add             $inp,$inp,#64
# ifdef __ARMEB__
        vrev32.8        $H0,$H0
        vrev32.8        $H1,$H1
        vrev32.8        $H2,$H2
        vrev32.8        $H3,$H3
# endif

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction interleaved with base 2^32 -> base 2^26 of
        @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

        vshr.u64        $T0,$D3,#26
        vmovn.i64       $D3#lo,$D3
         vshr.u64       $T1,$D0,#26
         vmovn.i64      $D0#lo,$D0
        vadd.i64        $D4,$D4,$T0             @ h3 -> h4
        vbic.i32        $D3#lo,#0xfc000000
          vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
         vadd.i64       $D1,$D1,$T1             @ h0 -> h1
          vshl.u32      $H3,$H3,#18
         vbic.i32       $D0#lo,#0xfc000000

        vshrn.u64       $T0#lo,$D4,#26
        vmovn.i64       $D4#lo,$D4
         vshr.u64       $T1,$D1,#26
         vmovn.i64      $D1#lo,$D1
         vadd.i64       $D2,$D2,$T1             @ h1 -> h2
          vsri.u32      $H3,$H2,#14
        vbic.i32        $D4#lo,#0xfc000000
          vshl.u32      $H2,$H2,#12
         vbic.i32       $D1#lo,#0xfc000000

        vadd.i32        $D0#lo,$D0#lo,$T0#lo
        vshl.u32        $T0#lo,$T0#lo,#2
          vbic.i32      $H3,#0xfc000000
         vshrn.u64      $T1#lo,$D2,#26
         vmovn.i64      $D2#lo,$D2
        vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
          vsri.u32      $H2,$H1,#20
         vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
          vshl.u32      $H1,$H1,#6
         vbic.i32       $D2#lo,#0xfc000000
          vbic.i32      $H2,#0xfc000000

        vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
        vmovn.i64       $D0#lo,$D0
          vsri.u32      $H1,$H0,#26
          vbic.i32      $H0,#0xfc000000
         vshr.u32       $T1#lo,$D3#lo,#26
         vbic.i32       $D3#lo,#0xfc000000
        vbic.i32        $D0#lo,#0xfc000000
        vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
         vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
          vbic.i32      $H1,#0xfc000000

        bhi             .Loop_neon

.Lskip_loop:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        add             $tbl1,$ctx,#(48+0*9*4)
        add             $tbl0,$ctx,#(48+1*9*4)
        adds            $len,$len,#32
        it              ne
        movne           $len,#0
        bne             .Long_tail

        vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
        vadd.i32        $H0#hi,$H0#lo,$D0#lo
        vadd.i32        $H3#hi,$H3#lo,$D3#lo
        vadd.i32        $H1#hi,$H1#lo,$D1#lo
        vadd.i32        $H4#hi,$H4#lo,$D4#lo

.Long_tail:
        vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
        vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2

        vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
        vmull.u32       $D2,$H2#hi,$R0
        vadd.i32        $H0#lo,$H0#lo,$D0#lo
        vmull.u32       $D0,$H0#hi,$R0
        vadd.i32        $H3#lo,$H3#lo,$D3#lo
        vmull.u32       $D3,$H3#hi,$R0
        vadd.i32        $H1#lo,$H1#lo,$D1#lo
        vmull.u32       $D1,$H1#hi,$R0
        vadd.i32        $H4#lo,$H4#lo,$D4#lo
        vmull.u32       $D4,$H4#hi,$R0

        vmlal.u32       $D0,$H4#hi,$S1
        vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vmlal.u32       $D3,$H2#hi,$R1
        vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vmlal.u32       $D1,$H0#hi,$R1
        vmlal.u32       $D4,$H3#hi,$R1
        vmlal.u32       $D2,$H1#hi,$R1

        vmlal.u32       $D3,$H1#hi,$R2
        vld1.32         ${S4}[1],[$tbl1,:32]
        vmlal.u32       $D0,$H3#hi,$S2
        vld1.32         ${S4}[0],[$tbl0,:32]
        vmlal.u32       $D4,$H2#hi,$R2
        vmlal.u32       $D1,$H4#hi,$S2
        vmlal.u32       $D2,$H0#hi,$R2

        vmlal.u32       $D3,$H0#hi,$R3
         it             ne
         addne          $tbl1,$ctx,#(48+2*9*4)
        vmlal.u32       $D0,$H2#hi,$S3
         it             ne
         addne          $tbl0,$ctx,#(48+3*9*4)
        vmlal.u32       $D4,$H1#hi,$R3
        vmlal.u32       $D1,$H3#hi,$S3
        vmlal.u32       $D2,$H4#hi,$S3

        vmlal.u32       $D3,$H4#hi,$S4
         vorn           $MASK,$MASK,$MASK       @ all-ones, can be redundant
        vmlal.u32       $D0,$H1#hi,$S4
         vshr.u64       $MASK,$MASK,#38
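         @ $MASK now holds 0x03ffffff in each 64-bit lane (all-ones >> 38)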
        vmlal.u32       $D4,$H0#hi,$R4
        vmlal.u32       $D1,$H2#hi,$S4
        vmlal.u32       $D2,$H3#hi,$S4

        beq             .Lshort_tail

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4:r^3 and accumulate

        vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
        vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4

        vmlal.u32       $D2,$H2#lo,$R0
        vmlal.u32       $D0,$H0#lo,$R0
        vmlal.u32       $D3,$H3#lo,$R0
        vmlal.u32       $D1,$H1#lo,$R0
        vmlal.u32       $D4,$H4#lo,$R0

        vmlal.u32       $D0,$H4#lo,$S1
        vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
        vmlal.u32       $D3,$H2#lo,$R1
        vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
        vmlal.u32       $D1,$H0#lo,$R1
        vmlal.u32       $D4,$H3#lo,$R1
        vmlal.u32       $D2,$H1#lo,$R1

        vmlal.u32       $D3,$H1#lo,$R2
        vld1.32         ${S4}[1],[$tbl1,:32]
        vmlal.u32       $D0,$H3#lo,$S2
        vld1.32         ${S4}[0],[$tbl0,:32]
        vmlal.u32       $D4,$H2#lo,$R2
        vmlal.u32       $D1,$H4#lo,$S2
        vmlal.u32       $D2,$H0#lo,$R2

        vmlal.u32       $D3,$H0#lo,$R3
        vmlal.u32       $D0,$H2#lo,$S3
        vmlal.u32       $D4,$H1#lo,$R3
        vmlal.u32       $D1,$H3#lo,$S3
        vmlal.u32       $D2,$H4#lo,$S3

        vmlal.u32       $D3,$H4#lo,$S4
         vorn           $MASK,$MASK,$MASK       @ all-ones
        vmlal.u32       $D0,$H1#lo,$S4
         vshr.u64       $MASK,$MASK,#38
        vmlal.u32       $D4,$H0#lo,$R4
        vmlal.u32       $D1,$H2#lo,$S4
        vmlal.u32       $D2,$H3#lo,$S4

.Lshort_tail:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ horizontal addition

        vadd.i64        $D3#lo,$D3#lo,$D3#hi
        vadd.i64        $D0#lo,$D0#lo,$D0#hi
        vadd.i64        $D4#lo,$D4#lo,$D4#hi
        vadd.i64        $D1#lo,$D1#lo,$D1#hi
        vadd.i64        $D2#lo,$D2#lo,$D2#hi

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction, but without narrowing

        vshr.u64        $T0,$D3,#26
        vand.i64        $D3,$D3,$MASK
         vshr.u64       $T1,$D0,#26
         vand.i64       $D0,$D0,$MASK
        vadd.i64        $D4,$D4,$T0             @ h3 -> h4
         vadd.i64       $D1,$D1,$T1             @ h0 -> h1

        vshr.u64        $T0,$D4,#26
        vand.i64        $D4,$D4,$MASK
         vshr.u64       $T1,$D1,#26
         vand.i64       $D1,$D1,$MASK
         vadd.i64       $D2,$D2,$T1             @ h1 -> h2

        vadd.i64        $D0,$D0,$T0
        vshl.u64        $T0,$T0,#2
         vshr.u64       $T1,$D2,#26
         vand.i64       $D2,$D2,$MASK
        vadd.i64        $D0,$D0,$T0             @ h4 -> h0
         vadd.i64       $D3,$D3,$T1             @ h2 -> h3

        vshr.u64        $T0,$D0,#26
        vand.i64        $D0,$D0,$MASK
         vshr.u64       $T1,$D3,#26
         vand.i64       $D3,$D3,$MASK
        vadd.i64        $D1,$D1,$T0             @ h0 -> h1
         vadd.i64       $D4,$D4,$T1             @ h3 -> h4

        cmp             $len,#0
        bne             .Leven

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ store hash value

        vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
        vst1.32         {$D4#lo[0]},[$ctx]

        vldmia  sp!,{d8-d15}                    @ epilogue
        ldmia   sp!,{r4-r7}
.Lno_data_neon:
        ret                                     @ bx    lr
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

.type   poly1305_emit_neon,%function
.align  5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
        ldr     ip,[$ctx,#36]           @ is_base2_26

        stmdb   sp!,{r4-r11}

        tst     ip,ip
        beq     .Lpoly1305_emit_enter

        ldmia   $ctx,{$h0-$h4}
        eor     $g0,$g0,$g0

        adds    $h0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
        mov     $h1,$h1,lsr#6
        adcs    $h1,$h1,$h2,lsl#20
        mov     $h2,$h2,lsr#12
        adcs    $h2,$h2,$h3,lsl#14
        mov     $h3,$h3,lsr#18
        adcs    $h3,$h3,$h4,lsl#8
        adc     $h4,$g0,$h4,lsr#24      @ can be partially reduced ...

        and     $g0,$h4,#-4             @ ... so reduce
        and     $h4,$h4,#3
        add     $g0,$g0,$g0,lsr#2       @ *= 5
        adds    $h0,$h0,$g0
        adcs    $h1,$h1,#0
        adcs    $h2,$h2,#0
        adcs    $h3,$h3,#0
        adc     $h4,$h4,#0

        adds    $g0,$h0,#5              @ compare to modulus
        adcs    $g1,$h1,#0
        adcs    $g2,$h2,#0
        adcs    $g3,$h3,#0
        adc     $g4,$h4,#0
        tst     $g4,#4                  @ did it carry/borrow?

        it      ne
        movne   $h0,$g0
        ldr     $g0,[$nonce,#0]
        it      ne
        movne   $h1,$g1
        ldr     $g1,[$nonce,#4]
        it      ne
        movne   $h2,$g2
        ldr     $g2,[$nonce,#8]
        it      ne
        movne   $h3,$g3
        ldr     $g3,[$nonce,#12]

        adds    $h0,$h0,$g0             @ accumulate nonce
        adcs    $h1,$h1,$g1
        adcs    $h2,$h2,$g2
        adc     $h3,$h3,$g3

# ifdef __ARMEB__
        rev     $h0,$h0
        rev     $h1,$h1
        rev     $h2,$h2
        rev     $h3,$h3
# endif
        str     $h0,[$mac,#0]           @ store the result
        str     $h1,[$mac,#4]
        str     $h2,[$mac,#8]
        str     $h3,[$mac,#12]

        ldmia   sp!,{r4-r11}
        ret                             @ bx    lr
.size   poly1305_emit_neon,.-poly1305_emit_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef _WIN32
.word   OPENSSL_armcap_P
# else
.word   OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
___
}       }
$code.=<<___;
.asciz  "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if     __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

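        # q<N>#lo and q<N>#hi refer to the low and high d-register
        # halves of a NEON q-register, i.e. d<2N> and d<2N+1>; the
        # first substitution below rewrites them accordingly.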
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/\bret\b/bx    lr/go                                           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}
close STDOUT; # enforce flush