Manual formatting tweaks to Curve448 code
crypto/ec/curve448/arch_neon/f_impl.c (openssl.git)
/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */

#include <assert.h>

#include "f_field.h"

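/*
 * Add the low 64-bit lane of x into the high lane (two-operand VADD,
 * i.e. Dd = Dd + Dm), leaving the cross-lane sum in the upper half.
 */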
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x)
{
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

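/* Swap the two 64-bit halves of a 128-bit vector (signed and unsigned). */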
static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x)
{
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x)
{
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

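/*
 * Scalar helpers mirroring the NEON widening multiplies: a signed
 * 32x32->64 multiply-accumulate (smlal) or plain multiply (smull);
 * the *2 variants double the product.
 */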
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

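/*
 * Hand-scheduled NEON multiplication of field elements mod
 * p = 2^448 - 2^224 - 1.  Limbs are 28 bits wide, hence the "#28" carry
 * shifts and the 0xf0000000 masks below.  Writing phi = 2^224, we have
 * phi^2 = phi + 1 (mod p), so the product of the low/high halves
 * (al, ah) and (bl, bh) reduces Karatsuba-style through the half sums
 * and differences (as, bs) computed up front.  The #defines pin each
 * operand to a fixed q/d register pair so the assembly below can be
 * written symbolically.
 */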
void gf_mul(gf_s *__restrict__ cs, const gf as, const gf bs)
{
    #define _bl0 "q0"
    #define _bl0_0 "d0"
    #define _bl0_1 "d1"
    #define _bh0 "q1"
    #define _bh0_0 "d2"
    #define _bh0_1 "d3"
    #define _bs0 "q2"
    #define _bs0_0 "d4"
    #define _bs0_1 "d5"
    #define _bl2 "q3"
    #define _bl2_0 "d6"
    #define _bl2_1 "d7"
    #define _bh2 "q4"
    #define _bh2_0 "d8"
    #define _bh2_1 "d9"
    #define _bs2 "q5"
    #define _bs2_0 "d10"
    #define _bs2_1 "d11"

    #define _as0 "q6"
    #define _as0_0 "d12"
    #define _as0_1 "d13"
    #define _as2 "q7"
    #define _as2_0 "d14"
    #define _as2_1 "d15"
    #define _al0 "q8"
    #define _al0_0 "d16"
    #define _al0_1 "d17"
    #define _ah0 "q9"
    #define _ah0_0 "d18"
    #define _ah0_1 "d19"
    #define _al2 "q10"
    #define _al2_0 "d20"
    #define _al2_1 "d21"
    #define _ah2 "q11"
    #define _ah2_0 "d22"
    #define _ah2_1 "d23"

    #define _a0a "q12"
    #define _a0a_0 "d24"
    #define _a0a_1 "d25"
    #define _a0b "q13"
    #define _a0b_0 "d26"
    #define _a0b_1 "d27"
    #define _a1a "q14"
    #define _a1a_0 "d28"
    #define _a1a_1 "d29"
    #define _a1b "q15"
    #define _a1b_0 "d30"
    #define _a1b_1 "d31"
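    /*
     * Instruction emitters: VMAC multiply-accumulates against lane n of
     * its last operand; VOP3/VOP2 emit generic three- and two-operand ops.
     */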
    #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
    #define VOP3(op,result,a,b)   #op" "result", "a", "b"\n\t"
    #define VOP2(op,result,a)     #op" "result", "a"\n\t"

    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(

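        /* Load a and b; vld2.32 de-interleaves the stored limbs into the
           low-half (l) and high-half (h) operand vectors. */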
120         "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
121         VOP3(vadd.i32,_as0,_al0,_ah0)
122         
123         "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
124         VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
125         VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
126             
127         "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
128         VOP3(vadd.i32,_bs2,_bl2,_bh2)
129             
130         "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
131         VOP3(vadd.i32,_as2,_al2,_ah2)
132         
133         VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
134         VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
135         VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
136         VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)
137             
138         VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
139         VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
140         VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
141         VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)
142             
143         VOP2(vmov,_a0a,_a0b)
144         VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
145         VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
146         VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
147         VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)
148             
149         VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
150         VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
151         VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
152         VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)
153             
154         VOP2(vmov,_a1a,_a1b)
155         VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
156         VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
157         VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
158         VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)
159             
160             VOP2(vswp,_a0b_1,_a0a_0)
161             
162         VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
163         VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
164         VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
165         VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)
166                 
167             VOP3(vsra.u64,_a0a,_a0b,"#28")
168             VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
169             VOP2(vmovn.i64,_a0b_0,_a0b)
170                 
171             VOP2(vswp,_a1b_1,_a1a_0)
172             VOP3(vadd.i64,_a1b,_a0a,_a1b)
173                     
174                     
175         VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
176             VOP2(vmovn.i64,_a0b_1,_a1b)
177         VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
178             VOP3(vsra.u64,_a1a,_a1b,"#28")
179         VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
180             VOP2(vbic.i32,_a0b,"#0xf0000000")
181         VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
182             "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
183                     
184         VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
185         VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
186         VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
187         VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)
188
189         VOP2(vmov,_a0b_1,_a0a_1)
190         VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
191         VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
192         VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
193         VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
194         VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
195         VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)
196
197         VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
198         VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
199         VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
200         VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)
201
202         VOP2(vmov,_a1a,_a1b)
203         VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
204         VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
205         VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
206         VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)
207
208             VOP2(vswp,_a0b_1,_a0a_0)
209
210         VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
211         VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
212         VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
213         VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)
214                                         
215             VOP3(vsra.u64,_a0a,_a0b,"#28")
216             VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
217             VOP2(vmovn.i64,_a0b_0,_a0b)
218                         
219             VOP2(vswp,_a1b_1,_a1a_0)
220             VOP3(vadd.i64,_a1b,_a0a,_a1b)
221
222         VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
223             VOP2(vmovn.i64,_a0b_1,_a1b)
224         VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
225             VOP3(vsra.u64,_a1a,_a1b,"#28")
226         VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
227             VOP2(vbic.i32,_a0b,"#0xf0000000")
228         VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
229             "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
230
231         VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
232         VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
233         VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
234         VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)
235
236         VOP2(vmov,_a0b_1,_a0a_1)
237         VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
238         VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
239         VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
240         VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
241         VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
242         VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)
243
244         VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
245         VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
246         VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
247         VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)
248
249         VOP2(vmov,_a1a,_a1b)
250         VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
251         VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
252         VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
253         VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)
254
255             VOP2(vswp,_a0b_1,_a0a_0)
256
257         VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
258         VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
259         VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
260         VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)
261                                                                 
262             VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
263             VOP3(vsra.u64,_a0a,_a0b,"#28")
264             VOP2(vmovn.i64,_a0b_0,_a0b)
265                         
266             VOP2(vswp,_a1b_1,_a1a_0)
267             VOP3(vadd.i64,_a1b,_a0a,_a1b)
268
269         VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
270             VOP2(vmovn.i64,_a0b_1,_a1b)
271         VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
272             VOP3(vsra.u64,_a1a,_a1b,"#28")
273         VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
274             VOP2(vbic.i32,_a0b,"#0xf0000000")
275         VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
276             "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
277
278         VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
279         VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
280         VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
281         VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)
282
283         VOP2(vmov,_a0b_1,_a0a_1)
284         VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
285         VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
286         VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
287         VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
288         VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
289         VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)
290
291         VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
292         VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
293         VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
294         VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)
295
296         VOP2(vmov,_a1a,_a1b)
297         VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
298         VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
299         VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
300         VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)
301
302             VOP2(vswp,_a0b_1,_a0a_0)
303
304         VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
305         VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
306         VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
307         VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)
308                         
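            /* Emit the last limbs, then wrap the carries out of the top
               back into the first limb pairs (2^448 = 2^224 + 1 mod p)
               and propagate once. */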
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a0a,_a0a,_a1b)

            VOP2(vmovn.i64,_a0b_1,_a0a)
            VOP3(vsra.u64,_a1a,_a0a,"#28")

            VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
            "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

            "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
            VOP2(vaddw.s32,_a1a,_a0a_0)
            VOP2(vmovn.i64,_a0a_0,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vaddw.s32,_a1a,_a0a_1)
            VOP2(vmovn.i64,_a0a_1,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vbic.i32,_a0a,"#0xf0000000")

            VOP2(vaddw.s32,_a1a,_a0b_0)
            VOP2(vmovn.i64,_a0b_0,_a1a)

            "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [a]"+r"(as)
        , [b]"+r"(bs)
        , [c]"+r"(vc)

        :: "q0","q1","q2","q3",
            "q4","q5","q6","q7",
            "q8","q9","q10","q11",
            "q12","q13","q14","q15",
            "memory"
    );
}

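/*
 * Hand-scheduled NEON squaring: the same structure as gf_mul with b = a,
 * but the vqdmull/vqdmlal doubling multiplies fold each pair of symmetric
 * cross terms into a single instruction.  The interleaved range comments
 * track accumulator bounds in multiples of 2^58.
 */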
void gf_sqr(gf_s *__restrict__ cs, const gf bs)
{
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
        VOP3(vadd.i32,_as0,_bl0,_bh0)       /* 0 .. 2^30 */

        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)       /* 0 .. 2^30 */
        VOP2(vmov,_as2,_bs2)

        VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58.  danger for vqdmlal is 32 */
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)   /* 0 .. 12 */
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)   /* 0 .. 14 */

        VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)   /* 0 .. 14 */
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)   /* 0 .. 16 */

        VOP2(vmov,_a0a,_a0b)                   /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
        VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0)   /* 0 .. 17 */
        VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0)   /* 0 .. 18 */

        VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
        VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0)   /*-3 .. 14 */
        VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0)   /*-4 .. 15 */

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
        VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1)   /* 0 .. 19 */
        VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1)   /* 0 .. 20 */

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
        VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1)   /*-3 .. 16 */
        VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1)   /*-4 .. 17 */

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
            VOP2(vmovn.i64,_a0b_1,_a1b)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */

        VOP2(vmov,_a0b,_a0a)               /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */

        VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)

        VOP2(vmov,_a1a,_a1b)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */

            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)

        VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)

            VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
            VOP2(vmovn.i64,_a0b_1,_a1b)
            VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
            VOP2(vbic.i32,_a0b,"#0xf0000000")
            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)

        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)

            VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)

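            /* Final carry propagation and wrap, exactly as in gf_mul. */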
            VOP3(vsra.u64,_a0a,_a0b,"#28")
            VOP2(vmovn.i64,_a0b_0,_a0b)

            VOP2(vswp,_a1b_1,_a1a_0)
            VOP3(vadd.i64,_a0a,_a0a,_a1b)

            VOP2(vmovn.i64,_a0b_1,_a0a)
            VOP3(vsra.u64,_a1a,_a0a,"#28")

            VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

            "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
            "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

            "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
            VOP2(vaddw.s32,_a1a,_a0a_0)
            VOP2(vmovn.i64,_a0a_0,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vaddw.s32,_a1a,_a0a_1)
            VOP2(vmovn.i64,_a0a_1,_a1a)
            VOP2(vshr.s64,_a1a,"#28")

            VOP2(vbic.i32,_a0a,"#0xf0000000")

            VOP2(vaddw.s32,_a1a,_a0b_0)
            VOP2(vmovn.i64,_a0b_0,_a1a)

            "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [b]"+r"(bs)
        , [c]"+r"(vc)

        :: "q0","q1","q2","q3",
            "q4","q5","q6","q7",
            "q12","q13","q14","q15",
            "memory"
    );
}

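/*
 * Multiply a field element by a 32-bit scalar b < 2^28.  Each vector holds
 * one limb from each half of the element, and carries are propagated limb
 * pair by limb pair.
 */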
void gf_mulw_unsigned(gf_s *__restrict__ cs, const gf as, uint32_t b)
{
    uint32x2_t vmask = {(1 << 28) - 1, (1 << 28) - 1};
    assert(b < (1 << 28));

    uint64x2_t accum;
    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint32x2_t vc, vn;
    uint32x2_t vb = {b, 0};

    vc = va[0];
    accum = vmull_lane_u32(vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum, 28);

    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     * vsraq round 0, 1
     * vmull + vmlal round 3
     * vsraq round 1, 2
     * ...
     */

    int i;
    for (i = 1; i < 8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum, 28);
        vc = vn;
    }

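    /* Wrap the top carries: after the swap and cross-lane add, the high
       half's carry feeds limb 0 of both halves (2^448 = 2^224 + 1 mod p)
       and the low half's carry feeds only the high half; one more shift
       propagates into limb pair 1. */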
    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;

    accum = vshrq_n_u64(accum, 28);
    vo[1] += vmovn_u64(accum);
}