/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */

#include "f_field.h"
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x) {
    /* Add the low 64-bit lane into the high lane: {lo, hi} -> {lo, hi+lo}. */
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x) {
    /* Swap the two 64-bit lanes: {lo, hi} -> {hi, lo}. */
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x) {
    /* Unsigned variant of vrev128_s64. */
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}
/* Scalar fallbacks mirroring the NEON widening multiply(-accumulate) ops. */
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (uint64_t *acc, const uint32_t a, const uint32_t b) {
    /* cf. vmlal.s32: signed 32x32->64 multiply-accumulate */
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (uint64_t *acc, const uint32_t a, const uint32_t b) {
    /* cf. vqdmlal.s32 (without saturation): accumulate twice the product */
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull (uint64_t *acc, const uint32_t a, const uint32_t b) {
    /* cf. vmull.s32: widening multiply, overwriting the accumulator */
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (uint64_t *acc, const uint32_t a, const uint32_t b) {
    /* cf. vqdmull.s32 (without saturation): twice the product */
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
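
/* Usage sketch (hypothetical values), to pin down the signed semantics on
 * the unsigned accumulator type:
 *
 *     uint64_t acc = 0;
 *     smlal(&acc, 3, (uint32_t)-5);   acc is now (uint64_t)-15
 *     smlal2(&acc, 2, 4);             acc is now (uint64_t)-15 + 16 == 1
 */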
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
    #define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
    #define VOP2(op,result,a) #op" "result", "a"\n\t"
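    /* Example expansion: with _a0b #defined as a Q-register string (say "q4")
     * and _as0_1, _bs2_1 as D-register strings (say "d1" and "d7"),
     * VMAC(vmlal.s32,_a0b,_as0_1,_bs2_1,0) pastes together to the source line
     *     "vmlal.s32 q4, d1, d7[0]\n\t"
     * so each macro emits one scheduled NEON instruction into the template. */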
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(
        "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as0,_al0,_ah0)

        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)

        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)

        "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as2,_al2,_ah2)

        VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)

        VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)

        VOP2(vmov,_a0a,_a0b)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)

        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)

        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a0a,_a0a,_a1b)

        VOP2(vmovn.i64,_a0b_1,_a0a)
        VOP3(vsra.u64,_a1a,_a0a,"#28")

        VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
        "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
        VOP2(vaddw.s32,_a1a,_a0a_0)
        VOP2(vmovn.i64,_a0a_0,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vaddw.s32,_a1a,_a0a_1)
        VOP2(vmovn.i64,_a0a_1,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vbic.i32,_a0a,"#0xf0000000")

        VOP2(vaddw.s32,_a1a,_a0b_0)
        VOP2(vmovn.i64,_a0b_0,_a1a)

        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
347 :: "q0","q1","q2","q3",
349 "q8","q9","q10","q11",
350 "q12","q13","q14","q15",
void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
359 "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
360 VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
361 VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
362 VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */
364 "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
365 VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */
368 VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */
369 VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */
370 VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */
372 VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
373 VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */
374 VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 16 */
376 VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */
377 VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
378 VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */
379 VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 18 */
381 VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
382 VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */
383 VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */

        VOP2(vmov,_a1a,_a1b) /* 0 .. 16 */
        VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
        VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */
        VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
        VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */
        VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */

        VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */

        VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)

        VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)

        VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)

        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)

        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a0a,_a0a,_a1b)

        VOP2(vmovn.i64,_a0b_1,_a0a)
        VOP3(vsra.u64,_a1a,_a0a,"#28")

        VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
        "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
        VOP2(vaddw.s32,_a1a,_a0a_0)
        VOP2(vmovn.i64,_a0a_0,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vaddw.s32,_a1a,_a0a_1)
        VOP2(vmovn.i64,_a0a_1,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vbic.i32,_a0a,"#0xf0000000")

        VOP2(vaddw.s32,_a1a,_a0b_0)
        VOP2(vmovn.i64,_a0b_0,_a1a)

        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
545 :: "q0","q1","q2","q3",
547 "q12","q13","q14","q15",
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
    assert(b < (1<<28));

    uint64x2_t accum;
    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint32x2_t vc, vn;
    int i;

    uint32x2_t vb = {b, 0};
    vc = va[0];
    accum = vmull_lane_u32(vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);
    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     *
     * vmull + vmlal round 3
     * ...
     */
    for (i=1; i<8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum,28);
    }
    /* Fold the lane carries back into the low limbs: since
     * 2^448 == 2^224 + 1 (mod p), the top carry re-enters at both
     * the 2^0 and 2^224 positions. */
    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;

    accum = vshrq_n_u64(accum,28);
    vo[1] += vmovn_u64(accum);
}
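
/* A minimal scalar sketch of the same computation, assuming the numeric limb
 * order (the NEON code above interleaves limbs across vector lanes, so its
 * memory layout differs): 16 limbs in radix 2^28, with the top carry folded
 * into limbs 0 and 8 because 2^448 == 2^224 + 1 (mod p). The name
 * scalar_mulw_sketch is illustrative only, not part of this library. */
static inline void __attribute__((unused))
scalar_mulw_sketch (uint32_t out[16], const uint32_t in[16], uint32_t b) {
    const uint32_t mask = (1u<<28) - 1;
    uint64_t accum = 0;
    int i;

    for (i=0; i<16; i++) {
        accum += (uint64_t)in[i] * b;
        out[i] = (uint32_t)accum & mask;
        accum >>= 28;
    }

    /* fold the top carry back in at the 2^0 and 2^224 positions,
     * propagating one step each, as in the vector code's final fold */
    uint64_t c0 = (uint64_t)out[0] + accum, c8 = (uint64_t)out[8] + accum;
    out[0] = (uint32_t)c0 & mask;  out[1] += (uint32_t)(c0 >> 28);
    out[8] = (uint32_t)c8 & mask;  out[9] += (uint32_t)(c8 >> 28);
}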