Update the imported curve448 code to use OpenSSL copyright headers
[openssl.git] / crypto / ec / curve448 / arch_neon / f_impl.c
/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */

#include <arm_neon.h>           /* NEON vector types and intrinsics used below */
#include <assert.h>             /* for the range check in gf_mulw_unsigned() */

#include "f_field.h"

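/*
 * Field arithmetic for GF(2^448 - 2^224 - 1).  An element (gf) is held as
 * 16 limbs of 28 bits each, stored in 32-bit words; the radix shows up in
 * the "#28" carry shifts and the 0xf0000000 masks throughout this file.
 */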
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x) {
    /* x[1] += x[0]: fold the low 64-bit lane into the high lane */
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

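/* Swap the two 64-bit halves of a quad register. */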
static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

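/*
 * Scalar signed multiply-accumulate helpers: smlal/smull add or set a
 * 64-bit accumulator with a signed 32x32->64 product, and the *2 variants
 * add twice the product.  They are unused in this file (hence the unused
 * attribute); presumably they serve code included via f_field.h.
 */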
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

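/*
 * gf_mul computes cs = as * bs mod p.  The structure appears to be the
 * Karatsuba-style split tied to the shape of p = 2^448 - 2^224 - 1 from
 * Hamburg's Ed448-Goldilocks work: each operand is split into low and high
 * halves (al/ah, bl/bh), and the as/bs registers cache lane sums and
 * differences for the cross products.  Each of the four blocks in the asm
 * below produces four output limbs, interleaving the next block's multiplies
 * with the previous block's carry/mask/store operations to keep the NEON
 * pipeline busy.  The macros below just give the q/d registers mnemonic
 * names for the asm block.
 */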
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    #define _bl0 "q0"
    #define _bl0_0 "d0"
    #define _bl0_1 "d1"
    #define _bh0 "q1"
    #define _bh0_0 "d2"
    #define _bh0_1 "d3"
    #define _bs0 "q2"
    #define _bs0_0 "d4"
    #define _bs0_1 "d5"
    #define _bl2 "q3"
    #define _bl2_0 "d6"
    #define _bl2_1 "d7"
    #define _bh2 "q4"
    #define _bh2_0 "d8"
    #define _bh2_1 "d9"
    #define _bs2 "q5"
    #define _bs2_0 "d10"
    #define _bs2_1 "d11"

    #define _as0 "q6"
    #define _as0_0 "d12"
    #define _as0_1 "d13"
    #define _as2 "q7"
    #define _as2_0 "d14"
    #define _as2_1 "d15"
    #define _al0 "q8"
    #define _al0_0 "d16"
    #define _al0_1 "d17"
    #define _ah0 "q9"
    #define _ah0_0 "d18"
    #define _ah0_1 "d19"
    #define _al2 "q10"
    #define _al2_0 "d20"
    #define _al2_1 "d21"
    #define _ah2 "q11"
    #define _ah2_0 "d22"
    #define _ah2_1 "d23"

    #define _a0a "q12"
    #define _a0a_0 "d24"
    #define _a0a_1 "d25"
    #define _a0b "q13"
    #define _a0b_0 "d26"
    #define _a0b_1 "d27"
    #define _a1a "q14"
    #define _a1a_0 "d28"
    #define _a1a_1 "d29"
    #define _a1b "q15"
    #define _a1b_0 "d30"
    #define _a1b_1 "d31"
    #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
    #define VOP3(op,result,a,b)   #op" "result", "a", "b"\n\t"
    #define VOP2(op,result,a)     #op" "result", "a"\n\t"

    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(

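        /* Load both operands with de-interleaving (vld2) and precompute
         * the lane sums/differences used for the Karatsuba cross terms. */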
        "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as0,_al0,_ah0)

        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)

        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)

        "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as2,_al2,_ah2)

        VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)

        VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)

        VOP2(vmov,_a0a,_a0b)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)

            VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)

        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)

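            /* Final carry propagation: narrow the last column, fold the
             * top carries back onto the bottom limbs, and renormalize the
             * stored result to 28 bits per limb. */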
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a0a,_a0a,_a1b)

            VOP2(vmovn.i64,_a0b_1,_a0a)
            VOP3(vsra.u64,_a1a,_a0a,"#28")

            VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
            "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

            "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
            VOP2(vaddw.s32,_a1a,_a0a_0)
            VOP2(vmovn.i64,_a0a_0,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vaddw.s32,_a1a,_a0a_1)
            VOP2(vmovn.i64,_a0a_1,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vbic.i32,_a0a,"#0xf0000000")

            VOP2(vaddw.s32,_a1a,_a0b_0)
            VOP2(vmovn.i64,_a0b_0,_a1a)

            "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [a]"+r"(as)
        , [b]"+r"(bs)
        , [c]"+r"(vc)

        :: "q0","q1","q2","q3",
            "q4","q5","q6","q7",
            "q8","q9","q10","q11",
            "q12","q13","q14","q15",
            "memory"
    );
}

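/*
 * gf_sqr computes cs = bs^2 mod p.  It follows the same structure as
 * gf_mul, but loads only one operand and uses the doubling multiplies
 * (vqdmull/vqdmlal/vqdmlsl) to exploit the symmetric cross terms, so
 * fewer multiplies are needed.
 */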
void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
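        /* The range comments below appear to track accumulator magnitude
         * in units of 2^58 (e.g. "0 .. 14" means below 14 * 2^58); the
         * "danger for vqdmlal is 32" note marks the 2^63 saturation bound
         * of the doubling multiplies. */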
        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
        VOP3(vadd.i32,_as0,_bl0,_bh0)       /* 0 .. 2^30 */

        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)       /* 0 .. 2^30 */
        VOP2(vmov,_as2,_bs2)

        VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58.  danger for vqdmlal is 32 */
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)   /* 0 .. 12 */
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)   /* 0 .. 14 */

        VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)   /* 0 .. 14 */
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)   /* 0 .. 16 */

        VOP2(vmov,_a0a,_a0b)                   /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
        VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0)   /* 0 .. 17 */
        VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0)   /* 0 .. 18 */

        VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
        VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0)   /*-3 .. 14 */
        VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0)   /*-4 .. 15 */

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
        VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1)   /* 0 .. 19 */
        VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1)   /* 0 .. 20 */

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
        VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1)   /*-3 .. 16 */
        VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1)   /*-4 .. 17 */

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
            VOP2(vmovn.i64,_a0b_1,_a1b)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */

        VOP2(vmov,_a0b,_a0a)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */

        VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)

        VOP2(vmov,_a1a,_a1b)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)

        VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)

            VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)

        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)

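            /* Final carry propagation, mirroring the tail of gf_mul. */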
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a0a,_a0a,_a1b)

            VOP2(vmovn.i64,_a0b_1,_a0a)
            VOP3(vsra.u64,_a1a,_a0a,"#28")

            VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
            "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

            "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
            VOP2(vaddw.s32,_a1a,_a0a_0)
            VOP2(vmovn.i64,_a0a_0,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vaddw.s32,_a1a,_a0a_1)
            VOP2(vmovn.i64,_a0a_1,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vbic.i32,_a0a,"#0xf0000000")

            VOP2(vaddw.s32,_a1a,_a0b_0)
            VOP2(vmovn.i64,_a0b_0,_a1a)

            "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [b]"+r"(bs)
        , [c]"+r"(vc)

        :: "q0","q1","q2","q3",
            "q4","q5","q6","q7",
            "q12","q13","q14","q15",
            "memory"
    );
}

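/*
 * gf_mulw_unsigned multiplies an element by a single word b < 2^28,
 * propagating carries limb by limb; the top carries are then folded back
 * onto the bottom limbs as required by the shape of p.
 */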
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    uint32x2_t vmask = {(1 << 28) - 1, (1 << 28) - 1};
    uint64x2_t accum;
    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint32x2_t vc, vn;
    uint32x2_t vb = {b, 0};
    int i;

    /* b must already be reduced below the limb radix */
    assert(b < (1 << 28));

    vc = va[0];
    accum = vmull_lane_u32(vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum, 28);

    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     * vsraq round 0, 1
     * vmull + vmlal round 3
     * vsraq round 1, 2
     * ...
     */
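    /* That is, delay each carry shift until further rounds of multiplies
     * have been issued, hiding the vsraq latency behind independent work
     * instead of serializing the loop below. */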

    for (i = 1; i < 8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum, 28);
        vc = vn;
    }

    /* Fold the two accumulated carries back onto the bottom limbs. */
    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;

    accum = vshrq_n_u64(accum, 28);
    vo[1] += vmovn_u64(accum);
}