/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */
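
/*
 * Assumed includes: the NEON intrinsics and vector types used below come
 * from <arm_neon.h>, and gf/gf_s are taken to be the curve448 field-element
 * types (f_field.h in this source tree).
 */
#include <arm_neon.h>
#include "f_field.h"

/*
 * 448-bit field arithmetic using 32-bit ARM NEON.  Elements are held in
 * sixteen 28-bit limbs (hence the "#28" carry shifts and the 0x0fffffff /
 * #0xf0000000 masks below), processed two limbs at a time in 64-bit NEON
 * lane pairs.  The limb layout appears to pair limb i with limb i+8, so
 * that the reduction 2^448 == 2^224 + 1 for the Goldilocks prime
 * p = 2^448 - 2^224 - 1 can be applied lane-wise.
 */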
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x)
{
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x)
{
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x)
{
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
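
/*
 * The smlal/smlal2/smull/smull2 helpers above are scalar analogues of the
 * NEON signed multiply-accumulate instructions used below (vmlal.s32,
 * vqdmlal.s32, ...); they are marked unused and presumably serve reference
 * or non-vector fallback code.
 *
 * gf_mul appears to use the usual Karatsuba-style identity for this prime:
 * writing a = al + 2^224*ah and b = bl + 2^224*bh, and reducing with
 * 2^448 == 2^224 + 1 (mod p),
 *
 *     a*b == (al*bl + ah*bh) + 2^224*(al*bh + ah*bl + ah*bh)   (mod p)
 *
 * which is what the al/ah/bl/bh register groups and the as = al + ah,
 * bs = bl +/- bh terms formed at the top of the asm block implement.
 * The _al0_0, _bh2_1, ... tokens are string macros naming the NEON d/q
 * registers assigned to each group.
 */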
void gf_mul(gf_s *__restrict__ cs, const gf as, const gf bs)
{
#define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
#define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
#define VOP2(op,result,a) #op" "result", "a"\n\t"

    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(
120 "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
121 VOP3(vadd.i32,_as0,_al0,_ah0)
123 "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
124 VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
125 VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
127 "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
128 VOP3(vadd.i32,_bs2,_bl2,_bh2)
130 "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
131 VOP3(vadd.i32,_as2,_al2,_ah2)
133 VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
134 VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
135 VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
136 VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)
138 VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
139 VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
140 VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
141 VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)
144 VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
145 VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
146 VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
147 VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)
149 VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
150 VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
151 VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
152 VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)
155 VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
156 VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
157 VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
158 VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)
160 VOP2(vswp,_a0b_1,_a0a_0)
162 VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
163 VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
164 VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
165 VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)
167 VOP3(vsra.u64,_a0a,_a0b,"#28")
168 VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
169 VOP2(vmovn.i64,_a0b_0,_a0b)
171 VOP2(vswp,_a1b_1,_a1a_0)
172 VOP3(vadd.i64,_a1b,_a0a,_a1b)
175 VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
176 VOP2(vmovn.i64,_a0b_1,_a1b)
177 VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
178 VOP3(vsra.u64,_a1a,_a1b,"#28")
179 VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
180 VOP2(vbic.i32,_a0b,"#0xf0000000")
181 VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
182 "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
        VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)

        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)

        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)

        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)

        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)

        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)

        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a0a,_a0a,_a1b)

        VOP2(vmovn.i64,_a0b_1,_a0a)
        VOP3(vsra.u64,_a1a,_a0a,"#28")

        VOP2(vbic.i32,_a0b,"#0xf0000000")

        VOP2(vswp,_a1a_0,_a1a_1)

        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
        "sub %[c], #64" "\n\t"
        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
        VOP2(vaddw.s32,_a1a,_a0a_0)
        VOP2(vmovn.i64,_a0a_0,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vaddw.s32,_a1a,_a0a_1)
        VOP2(vmovn.i64,_a0a_1,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vbic.i32,_a0a,"#0xf0000000")

        VOP2(vaddw.s32,_a1a,_a0b_0)
        VOP2(vmovn.i64,_a0b_0,_a1a)

        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
347 :: "q0","q1","q2","q3",
349 "q8","q9","q10","q11",
350 "q12","q13","q14","q15",
void gf_sqr(gf_s *__restrict__ cs, const gf bs)
{
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
360 "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
361 VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
362 VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
363 VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */
365 "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
366 VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */
369 VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */
370 VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */
371 VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */
373 VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
374 VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */
375 VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 16 */
377 VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */
378 VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
379 VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */
380 VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 18 */
382 VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
383 VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */
384 VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */
387 VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
388 VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */
389 VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */
391 VOP2(vswp,_a0b_1,_a0a_0)
393 VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
394 VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */
395 VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */
397 VOP3(vsra.u64,_a0a,_a0b,"#28")
398 VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
399 VOP2(vmovn.i64,_a0b_0,_a0b)
401 VOP2(vswp,_a1b_1,_a1a_0)
402 VOP3(vadd.i64,_a1b,_a0a,_a1b)
405 VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
406 VOP2(vmovn.i64,_a0b_1,_a1b)
407 VOP3(vsra.u64,_a1a,_a1b,"#28")
408 VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
409 VOP2(vbic.i32,_a0b,"#0xf0000000")
410 "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
412 VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
413 VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */
415 VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */
416 VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
417 VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */
419 VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
420 VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
421 VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
422 VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)
424 VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */
425 VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
426 VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */
428 VOP2(vswp,_a0b_1,_a0a_0)
430 VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
431 VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */
433 VOP3(vsra.u64,_a0a,_a0b,"#28")
434 VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
435 VOP2(vmovn.i64,_a0b_0,_a0b)
437 VOP2(vswp,_a1b_1,_a1a_0)
438 VOP3(vadd.i64,_a1b,_a0a,_a1b)
440 VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
441 VOP2(vmovn.i64,_a0b_1,_a1b)
442 VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
443 VOP3(vsra.u64,_a1a,_a1b,"#28")
444 VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
445 VOP2(vbic.i32,_a0b,"#0xf0000000")
446 "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
448 VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
449 VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
450 VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
452 VOP2(vmov,_a0b_1,_a0a_1)
453 VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
454 VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
455 VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
456 VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
457 VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)
459 VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
460 VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
461 VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)
464 VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
465 VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
466 VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)
468 VOP2(vswp,_a0b_1,_a0a_0)
470 VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
471 VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
472 VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)
474 VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
475 VOP3(vsra.u64,_a0a,_a0b,"#28")
476 VOP2(vmovn.i64,_a0b_0,_a0b)
478 VOP2(vswp,_a1b_1,_a1a_0)
479 VOP3(vadd.i64,_a1b,_a0a,_a1b)
481 VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
482 VOP2(vmovn.i64,_a0b_1,_a1b)
483 VOP3(vsra.u64,_a1a,_a1b,"#28")
484 VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
485 VOP2(vbic.i32,_a0b,"#0xf0000000")
486 "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
488 VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
489 VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)
491 VOP2(vmov,_a0b_1,_a0a_1)
492 VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
493 VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
494 VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
495 VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)
497 VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
498 VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)
501 VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
502 VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)
504 VOP2(vswp,_a0b_1,_a0a_0)
506 VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
507 VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)
509 VOP3(vsra.u64,_a0a,_a0b,"#28")
510 VOP2(vmovn.i64,_a0b_0,_a0b)
512 VOP2(vswp,_a1b_1,_a1a_0)
513 VOP3(vadd.i64,_a0a,_a0a,_a1b)
515 VOP2(vmovn.i64,_a0b_1,_a0a)
516 VOP3(vsra.u64,_a1a,_a0a,"#28")
518 VOP2(vbic.i32,_a0b,"#0xf0000000")
520 VOP2(vswp,_a1a_0,_a1a_1)
522 "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
523 "sub %[c], #64" "\n\t"
525 VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
527 "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
528 VOP2(vaddw.s32,_a1a,_a0a_0)
529 VOP2(vmovn.i64,_a0a_0,_a1a)
530 VOP2(vshr.s64,_a1a,"#28")
532 VOP2(vaddw.s32,_a1a,_a0a_1)
533 VOP2(vmovn.i64,_a0a_1,_a1a)
534 VOP2(vshr.s64,_a1a,"#28")
536 VOP2(vbic.i32,_a0a,"#0xf0000000")
538 VOP2(vaddw.s32,_a1a,_a0b_0)
539 VOP2(vmovn.i64,_a0b_0,_a1a)
541 "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
546 :: "q0","q1","q2","q3",
548 "q12","q13","q14","q15",
void gf_mulw_unsigned(gf_s *__restrict__ cs, const gf as, uint32_t b)
{
    uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};

    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint64x2_t accum;
    uint32x2_t vc, vn;
    int i;

    uint32x2_t vb = {b, 0};

    vc = va[0];
    accum = vmull_lane_u32(vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);
    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     * vmull + vmlal round 3
     */
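
    /*
     * ("reduce behind" presumably means interleaving the carry shifts of
     * earlier limbs with the multiplies of later limbs, so the shift
     * latency is hidden instead of serializing every round as the loop
     * below does.)
     */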
    for (i=1; i<8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum,28);
    }
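
    /*
     * Wrap-around reduction: with limb i apparently sharing a vector with
     * limb i+8, the carry out of the low chain must go into limb 8, and the
     * carry out of the high chain wraps into both limb 0 and limb 8
     * (2^448 == 2^224 + 1).  Swapping and summing the carry lanes
     * (vrev128_u64 + xx_vaddup_u64) produces exactly those two values,
     * which vaddw_u32 then adds into vo[0].
     */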
    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;

    accum = vshrq_n_u64(accum,28);
    vo[1] += vmovn_u64(accum);
}