/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

#include "f_field.h"
#include <assert.h>     /* assert() is used in gf_mulw_unsigned() */

/* Fold the low 64-bit lane into the high lane: returns { x[0], x[0] + x[1] }. */
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x) {
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

/* Swap the two 64-bit lanes: returns { x[1], x[0] }. */
static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

/* Unsigned variant of vrev128_s64: swap the two 64-bit lanes. */
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

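/*
 * gf_mulw_unsigned() below combines these as xx_vaddup_u64(vrev128_u64(x)),
 * which yields { x[1], x[0] + x[1] }.  A rough intrinsics-only sketch of that
 * combination (illustrative, not part of the build; the helper name is made up):
 *
 *     static inline uint64x2_t vaddup_rev_sketch(uint64x2_t x)
 *     {
 *         uint64x2_t r = vextq_u64(x, x, 1);                    // { x[1], x[0] }
 *         uint64_t sum = vgetq_lane_u64(x, 0) + vgetq_lane_u64(x, 1);
 *         return vsetq_lane_u64(sum, r, 1);                     // { x[1], x[0] + x[1] }
 *     }
 */
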
/* Scalar signed 32x32 -> 64 multiply/multiply-accumulate helpers, mirroring
 * the NEON vmull/vmlal operations and their doubling variants. */

/* *acc += (int32_t)a * (int32_t)b */
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

/* *acc += 2 * (int32_t)a * (int32_t)b */
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

/* *acc = (int32_t)a * (int32_t)b */
static inline void __attribute__((gnu_inline,always_inline,unused))
smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

/* *acc = 2 * (int32_t)a * (int32_t)b */
static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

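/*
 * gf_mul() and gf_sqr() below multiply in the field of p = 2^448 - 2^224 - 1
 * with the coefficients held in radix 2^28.  The vld2.32 loads de-interleave
 * each operand into a "low" half (al, bl) and a "high" half (ah, bh), and the
 * as/bs vectors hold sums and differences of those halves.  This is the usual
 * Karatsuba-style rearrangement for this prime: writing a = al + 2^224*ah and
 * b = bl + 2^224*bh, and using 2^448 = 2^224 + 1 (mod p),
 *
 *     a*b = al*bl + 2^224*(al*bh + ah*bl) + 2^448*(ah*bh)
 *         = (al*bl + ah*bh) + 2^224*(al*bh + ah*bl + ah*bh)           (mod p)
 *         = (al*bl + ah*bh) + 2^224*((al + ah)*(bl + bh) - al*bl)     (mod p)
 *
 * The assembly evaluates the 28-bit columns of these products with
 * vmull/vmlal/vmlsl and propagates carries between columns as it goes; the
 * operand scheduling is hand-interleaved for the NEON pipeline, so the
 * identity above is only a reading of the code, not a specification of it.
 */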
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    #define _bl0 "q0"
    #define _bl0_0 "d0"
    #define _bl0_1 "d1"
    #define _bh0 "q1"
    #define _bh0_0 "d2"
    #define _bh0_1 "d3"
    #define _bs0 "q2"
    #define _bs0_0 "d4"
    #define _bs0_1 "d5"
    #define _bl2 "q3"
    #define _bl2_0 "d6"
    #define _bl2_1 "d7"
    #define _bh2 "q4"
    #define _bh2_0 "d8"
    #define _bh2_1 "d9"
    #define _bs2 "q5"
    #define _bs2_0 "d10"
    #define _bs2_1 "d11"

    #define _as0 "q6"
    #define _as0_0 "d12"
    #define _as0_1 "d13"
    #define _as2 "q7"
    #define _as2_0 "d14"
    #define _as2_1 "d15"
    #define _al0 "q8"
    #define _al0_0 "d16"
    #define _al0_1 "d17"
    #define _ah0 "q9"
    #define _ah0_0 "d18"
    #define _ah0_1 "d19"
    #define _al2 "q10"
    #define _al2_0 "d20"
    #define _al2_1 "d21"
    #define _ah2 "q11"
    #define _ah2_0 "d22"
    #define _ah2_1 "d23"

    #define _a0a "q12"
    #define _a0a_0 "d24"
    #define _a0a_1 "d25"
    #define _a0b "q13"
    #define _a0b_0 "d26"
    #define _a0b_1 "d27"
    #define _a1a "q14"
    #define _a1a_0 "d28"
    #define _a1a_1 "d29"
    #define _a1b "q15"
    #define _a1b_0 "d30"
    #define _a1b_1 "d31"
    #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
    #define VOP3(op,result,a,b)   #op" "result", "a", "b"\n\t"
    #define VOP2(op,result,a)     #op" "result", "a"\n\t"

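    /*
     * VMAC/VOP3/VOP2 paste together the instruction text for the asm blocks:
     * e.g. VOP3(vadd.i32,_as0,_al0,_ah0) expands to "vadd.i32 q6, q8, q9\n\t",
     * and VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0) to "vmull.s32 q13, d13, d11[0]\n\t"
     * (a long multiply by one 32-bit lane of d11).
     */
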
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(

        "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as0,_al0,_ah0)

        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)

        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)

        "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as2,_al2,_ah2)

        VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)

        VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)

        VOP2(vmov,_a0a,_a0b)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)


        VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)

            VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)

        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a0a,_a0a,_a1b)

            VOP2(vmovn.i64,_a0b_1,_a0a)
            VOP3(vsra.u64,_a1a,_a0a,"#28")

            VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
            "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

            "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
            VOP2(vaddw.s32,_a1a,_a0a_0)
            VOP2(vmovn.i64,_a0a_0,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vaddw.s32,_a1a,_a0a_1)
            VOP2(vmovn.i64,_a0a_1,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vbic.i32,_a0a,"#0xf0000000")

            VOP2(vaddw.s32,_a1a,_a0b_0)
            VOP2(vmovn.i64,_a0b_0,_a1a)

            "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [a]"+r"(as)
        , [b]"+r"(bs)
        , [c]"+r"(vc)

        :: "q0","q1","q2","q3",
            "q4","q5","q6","q7",
            "q8","q9","q10","q11",
            "q12","q13","q14","q15",
            "memory"
    );
}

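/*
 * Carry handling sketch (illustrative, not part of the build): in both
 * gf_mul() and gf_sqr(), each 64-bit column accumulator is split into a
 * 28-bit output limb and a carry that is folded into the next column.  The
 * vsra.u64 #28 / vmovn.i64 / vbic.i32 #0xf0000000 sequences do exactly this,
 * two columns at a time; in scalar C one such step would look like:
 *
 *     static void carry_step(uint64_t *next, uint64_t cur, uint32_t *limb)
 *     {
 *         *next += cur >> 28;                        // vsra.u64 ..., #28
 *         *limb = (uint32_t)cur & ((1u << 28) - 1);  // vmovn.i64 + vbic.i32
 *     }
 */
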
void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
        VOP3(vadd.i32,_as0,_bl0,_bh0)       /* 0 .. 2^30 */

        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)       /* 0 .. 2^30 */
        VOP2(vmov,_as2,_bs2)

        VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58.  danger for vqdmlal is 32 */
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)   /* 0 .. 12 */
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)   /* 0 .. 14 */

        VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)   /* 0 .. 14 */
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)   /* 0 .. 16 */

        VOP2(vmov,_a0a,_a0b)                   /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
        VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0)   /* 0 .. 17 */
        VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0)   /* 0 .. 18 */

        VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
        VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0)   /*-3 .. 14 */
        VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0)   /*-4 .. 15 */

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
        VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1)   /* 0 .. 19 */
        VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1)   /* 0 .. 20 */

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
        VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1)   /*-3 .. 16 */
        VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1)   /*-4 .. 17 */

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)


        VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
            VOP2(vmovn.i64,_a0b_1,_a1b)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */

        VOP2(vmov,_a0b,_a0a)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */

        VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)

        VOP2(vmov,_a1a,_a1b)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)

        VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)

            VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)

        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a0a,_a0a,_a1b)

            VOP2(vmovn.i64,_a0b_1,_a0a)
            VOP3(vsra.u64,_a1a,_a0a,"#28")

            VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
            "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

            "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
            VOP2(vaddw.s32,_a1a,_a0a_0)
            VOP2(vmovn.i64,_a0a_0,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vaddw.s32,_a1a,_a0a_1)
            VOP2(vmovn.i64,_a0a_1,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vbic.i32,_a0a,"#0xf0000000")

            VOP2(vaddw.s32,_a1a,_a0b_0)
            VOP2(vmovn.i64,_a0b_0,_a1a)

            "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [b]"+r"(bs)
        , [c]"+r"(vc)

        :: "q0","q1","q2","q3",
            "q4","q5","q6","q7",
            "q12","q13","q14","q15",
            "memory"
    );
}

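/*
 * Squaring note (illustrative): in a*a every cross term a_i*a_j with i != j
 * occurs twice, which is why gf_sqr uses vqdmull/vqdmlal/vqdmlsl (doubling
 * multiply-accumulate) where gf_mul needs two separate multiplies, e.g.
 *
 *     (x0 + R*x1)^2 = x0*x0 + R*(2*x0*x1) + R^2*(x1*x1)
 *
 * The "0 .. 8", "0 .. 12", ... annotations appear to bound each 64-bit
 * accumulator in units of 2^58; the "danger for vqdmlal is 32" remark matches
 * the point (32 * 2^58 = 2^63) at which the saturating doubling instructions
 * would clip.
 */
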
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    uint32x2_t vmask = {(1 << 28) - 1, (1 << 28) - 1};
    assert(b < (1 << 28));

    uint64x2_t accum;
    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint32x2_t vc, vn;
    uint32x2_t vb = {b, 0};

    vc = va[0];
    accum = vmull_lane_u32(vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);

    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     * vsraq round 0, 1
     * vmull + vmlal round 3
     * vsraq round 1, 2
     * ...
     */

    int i;
    for (i=1; i<8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum,28);
        vc = vn;
    }

    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;

    accum = vshrq_n_u64(accum,28);
    vo[1] += vmovn_u64(accum);
}
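
/*
 * Scalar sketch of gf_mulw_unsigned (illustrative only; limbs are written
 * here in plain radix-2^28 order c[0..15], whereas the NEON code above works
 * on the limbs two at a time in their in-memory order and keeps two carry
 * chains in the lanes of accum).  Assuming b < 2^28:
 *
 *     uint64_t carry = 0;
 *     unsigned i;
 *     for (i = 0; i < 16; i++) {
 *         carry += (uint64_t)a[i] * b;
 *         c[i] = (uint32_t)carry & ((1u << 28) - 1);
 *         carry >>= 28;
 *     }
 *     // 2^448 == 2^224 + 1 (mod p), so the carry out of the top limb
 *     // wraps into both halves of the number:
 *     c[0] += (uint32_t)carry;
 *     c[8] += (uint32_t)carry;
 *
 * The xx_vaddup_u64/vrev128_u64 fold and the vaddw_u32/vshrq_n_u64 tail above
 * implement the same wrap-around plus one further partial carry step.
 */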