/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */
15 static __inline__ uint64x2_t __attribute__ ((gnu_inline, always_inline, unused))
16 xx_vaddup_u64(uint64x2_t x)
18 __asm__("vadd.s64 %f0, %e0":"+w"(x));
22 static __inline__ int64x2_t __attribute__ ((gnu_inline, always_inline, unused))
23 vrev128_s64(int64x2_t x)
25 __asm__("vswp.s64 %e0, %f0":"+w"(x));
29 static __inline__ uint64x2_t __attribute__ ((gnu_inline, always_inline))
30 vrev128_u64(uint64x2_t x)
32 __asm__("vswp.s64 %e0, %f0":"+w"(x));
36 static inline void __attribute__ ((gnu_inline, always_inline, unused))
37 smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
39 *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
42 static inline void __attribute__ ((gnu_inline, always_inline, unused))
43 smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
45 *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b *2;
48 static inline void __attribute__ ((gnu_inline, always_inline, unused))
49 smull(uint64_t *acc, const uint32_t a, const uint32_t b)
51 *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
54 static inline void __attribute__ ((gnu_inline, always_inline, unused))
55 smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
57 *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b *2;
/*
 * gf_mul: field multiplication c = a * b, ARMv7 NEON inline-assembly
 * implementation.  The "#28" shift amounts and the 0xf0000000 masks below
 * show the limbs are kept in a 28-bit radix.
 *
 * NOTE(review): this chunk of the file is truncated -- the function's
 * opening brace, several asm continuation lines, the operand list, and the
 * register-name macros (_al0_0, _bh2_1, ...) defined elsewhere are not
 * visible here.  Code is intentionally left byte-identical; only comments
 * were added.
 */
60 void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
/*
 * VMAC expands to a scalar-lane multiply-accumulate, "op result, a, b[n]";
 * VOP3/VOP2 expand to plain three-/two-operand NEON instructions.
 */
112 #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
113 #define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
114 #define VOP2(op,result,a) #op" "result", "a"\n\t"
/* Output limbs are written through vc (cs->limb) in 32-bit pairs. */
116 int32x2_t *vc = (int32x2_t *) cs->limb;
/*
 * Load the a/b limb halves (vld2.32 de-interleaves even/odd 32-bit parts,
 * post-incrementing the pointers) and form the _as*/_bs* sum/difference
 * vectors used by the cross products -- presumably a Karatsuba-style
 * decomposition; TODO confirm against the untruncated original.
 */
118 __asm__ __volatile__("vld2.32 {" _al0_0 "," _al0_1 "," _ah0_0 "," _ah0_1
119 "}, [%[a],:128]!" "\n\t" VOP3(vadd.i32, _as0, _al0,
121 "vld2.32 {" _bl0_0 "," _bl0_1 "," _bh0_0 "," _bh0_1
122 "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs0_1, _bl0_1,
123 _bh0_1) VOP3(vsub.i32,
127 "vld2.32 {" _bl2_0 "," _bl2_1 "," _bh2_0 "," _bh2_1
128 "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs2, _bl2,
130 "vld2.32 {" _al2_0 "," _al2_1 "," _ah2_0 "," _ah2_1
131 "}, [%[a],:128]!" "\n\t" VOP3(vadd.i32, _as2, _al2,
133 VMAC(vmull.s32, _a0b, _as0_1, _bs2_1,
134 0) VMAC(vmlal.s32, _a0b, _as2_0, _bs2_0,
135 0) VMAC(vmlal.s32, _a0b, _as2_1, _bs0_1,
136 0) VMAC(vmlal.s32, _a0b, _as0_0,
138 VMAC(vmull.s32, _a1b, _as0_1, _bs2_1,
139 1) VMAC(vmlal.s32, _a1b, _as2_0, _bs2_0,
140 1) VMAC(vmlal.s32, _a1b, _as2_1, _bs0_1,
141 1) VMAC(vmlal.s32, _a1b, _as0_0,
143 VOP2(vmov, _a0a, _a0b) VMAC(vmlal.s32, _a0a, _ah0_1,
144 _bh2_1, 0) VMAC(vmlal.s32,
149 VMAC(vmlal.s32, _a0a, _ah2_1, _bh0_1,
150 0) VMAC(vmlal.s32, _a0a, _ah0_0, _bl0_0, 0)
151 VMAC(vmlsl.s32, _a0b, _al0_1, _bl2_1,
152 0) VMAC(vmlsl.s32, _a0b, _al2_0, _bl2_0,
153 0) VMAC(vmlsl.s32, _a0b, _al2_1, _bl0_1,
154 0) VMAC(vmlal.s32, _a0b, _al0_0,
156 VOP2(vmov, _a1a, _a1b) VMAC(vmlal.s32, _a1a, _ah0_1,
157 _bh2_1, 1) VMAC(vmlal.s32,
162 VMAC(vmlal.s32, _a1a, _ah2_1, _bh0_1,
163 1) VMAC(vmlal.s32, _a1a, _ah0_0, _bl0_0, 1)
164 VOP2(vswp, _a0b_1, _a0a_0)
165 VMAC(vmlsl.s32, _a1b, _al0_1, _bl2_1, 1)
166 VMAC(vmlsl.s32, _a1b, _al2_0, _bl2_0, 1)
167 VMAC(vmlsl.s32, _a1b, _al2_1, _bl0_1, 1)
168 VMAC(vmlal.s32, _a1b, _al0_0, _bs0_0, 1)
/*
 * Carry chain: vsra.u64 #28 accumulates the high bits into the next
 * accumulator, vmovn.i64 narrows to 32-bit limbs, and
 * vbic.i32 #0xf0000000 masks each limb to 28 bits.
 */
169 VOP3(vsra.u64, _a0a, _a0b, "#28")
170 VOP3(vsub.i32, _bs0_1, _bl0_1, _bh0_1)
171 VOP2(vmovn.i64, _a0b_0, _a0b)
172 VOP2(vswp, _a1b_1, _a1a_0)
173 VOP3(vadd.i64, _a1b, _a0a, _a1b)
174 VMAC(vmull.s32, _a0a, _as2_0, _bs2_1, 0)
175 VOP2(vmovn.i64, _a0b_1, _a1b)
176 VMAC(vmlal.s32, _a0a, _as2_1, _bs2_0, 0)
177 VOP3(vsra.u64, _a1a, _a1b, "#28")
178 VMAC(vmlal.s32, _a0a, _as0_0, _bh0_1, 0)
179 VOP2(vbic.i32, _a0b, "#0xf0000000")
180 VMAC(vmlal.s32, _a0a, _as0_1, _bh0_0, 0)
/* Store a pair of reduced output limbs, post-incrementing %[c]. */
181 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
182 VMAC(vmull.s32, _a1b, _as2_0, _bs2_1, 1)
183 VMAC(vmlal.s32, _a1b, _as2_1, _bs2_0, 1)
184 VMAC(vmlal.s32, _a1b, _as0_0, _bh0_1, 1)
185 VMAC(vmlal.s32, _a1b, _as0_1, _bh0_0, 1)
186 VOP2(vmov, _a0b_1, _a0a_1)
187 VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
188 VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
189 VMAC(vmlal.s32, _a0a, _ah2_0, _bh2_1, 0)
190 VMAC(vmlal.s32, _a0a, _ah2_1, _bh2_0, 0)
191 VMAC(vmlal.s32, _a0a, _ah0_0, _bl0_1, 0)
192 VMAC(vmlal.s32, _a0a, _ah0_1, _bl0_0, 0)
193 VMAC(vmlsl.s32, _a0b, _al2_0, _bl2_1, 0)
194 VMAC(vmlsl.s32, _a0b, _al2_1, _bl2_0, 0)
195 VMAC(vmlal.s32, _a0b, _al0_0, _bs0_1, 0)
196 VMAC(vmlal.s32, _a0b, _al0_1, _bs0_0, 0)
197 VOP2(vmov, _a1a, _a1b)
198 VMAC(vmlal.s32, _a1a, _ah2_0, _bh2_1, 1)
199 VMAC(vmlal.s32, _a1a, _ah2_1, _bh2_0, 1)
200 VMAC(vmlal.s32, _a1a, _ah0_0, _bl0_1, 1)
201 VMAC(vmlal.s32, _a1a, _ah0_1, _bl0_0, 1)
202 VOP2(vswp, _a0b_1, _a0a_0)
203 VMAC(vmlsl.s32, _a1b, _al2_0, _bl2_1, 1)
204 VMAC(vmlsl.s32, _a1b, _al2_1, _bl2_0, 1)
205 VMAC(vmlal.s32, _a1b, _al0_0, _bs0_1, 1)
206 VMAC(vmlal.s32, _a1b, _al0_1, _bs0_0, 1)
207 VOP3(vsra.u64, _a0a, _a0b, "#28")
208 VOP3(vsub.i32, _bs2_0, _bl2_0, _bh2_0)
209 VOP2(vmovn.i64, _a0b_0, _a0b)
210 VOP2(vswp, _a1b_1, _a1a_0)
211 VOP3(vadd.i64, _a1b, _a0a, _a1b)
212 VMAC(vmull.s32, _a0a, _as2_1, _bs2_1, 0)
213 VOP2(vmovn.i64, _a0b_1, _a1b)
214 VMAC(vmlal.s32, _a0a, _as0_0, _bh2_0, 0)
215 VOP3(vsra.u64, _a1a, _a1b, "#28")
216 VMAC(vmlal.s32, _a0a, _as0_1, _bh0_1, 0)
217 VOP2(vbic.i32, _a0b, "#0xf0000000")
218 VMAC(vmlal.s32, _a0a, _as2_0, _bh0_0, 0)
219 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
220 VMAC(vmull.s32, _a1b, _as2_1, _bs2_1, 1)
221 VMAC(vmlal.s32, _a1b, _as0_0, _bh2_0, 1)
222 VMAC(vmlal.s32, _a1b, _as0_1, _bh0_1, 1)
223 VMAC(vmlal.s32, _a1b, _as2_0, _bh0_0, 1)
224 VOP2(vmov, _a0b_1, _a0a_1)
225 VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
226 VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
227 VMAC(vmlal.s32, _a0a, _ah2_1, _bh2_1, 0)
228 VMAC(vmlal.s32, _a0a, _ah0_0, _bl2_0, 0)
229 VMAC(vmlal.s32, _a0a, _ah0_1, _bl0_1, 0)
230 VMAC(vmlal.s32, _a0a, _ah2_0, _bl0_0, 0)
231 VMAC(vmlsl.s32, _a0b, _al2_1, _bl2_1, 0)
232 VMAC(vmlal.s32, _a0b, _al0_0, _bs2_0, 0)
233 VMAC(vmlal.s32, _a0b, _al0_1, _bs0_1, 0)
234 VMAC(vmlal.s32, _a0b, _al2_0, _bs0_0, 0)
235 VOP2(vmov, _a1a, _a1b)
236 VMAC(vmlal.s32, _a1a, _ah2_1, _bh2_1, 1)
237 VMAC(vmlal.s32, _a1a, _ah0_0, _bl2_0, 1)
238 VMAC(vmlal.s32, _a1a, _ah0_1, _bl0_1, 1)
239 VMAC(vmlal.s32, _a1a, _ah2_0, _bl0_0, 1)
240 VOP2(vswp, _a0b_1, _a0a_0)
241 VMAC(vmlsl.s32, _a1b, _al2_1, _bl2_1, 1)
242 VMAC(vmlal.s32, _a1b, _al0_0, _bs2_0, 1)
243 VMAC(vmlal.s32, _a1b, _al0_1, _bs0_1, 1)
244 VMAC(vmlal.s32, _a1b, _al2_0, _bs0_0, 1)
245 VOP3(vsub.i32, _bs2_1, _bl2_1, _bh2_1)
246 VOP3(vsra.u64, _a0a, _a0b, "#28")
247 VOP2(vmovn.i64, _a0b_0, _a0b)
248 VOP2(vswp, _a1b_1, _a1a_0)
249 VOP3(vadd.i64, _a1b, _a0a, _a1b)
250 VMAC(vmull.s32, _a0a, _as0_0, _bh2_1, 0)
251 VOP2(vmovn.i64, _a0b_1, _a1b)
252 VMAC(vmlal.s32, _a0a, _as0_1, _bh2_0, 0)
253 VOP3(vsra.u64, _a1a, _a1b, "#28")
254 VMAC(vmlal.s32, _a0a, _as2_0, _bh0_1, 0)
255 VOP2(vbic.i32, _a0b, "#0xf0000000")
256 VMAC(vmlal.s32, _a0a, _as2_1, _bh0_0, 0)
257 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
258 VMAC(vmull.s32, _a1b, _as0_0, _bh2_1, 1)
259 VMAC(vmlal.s32, _a1b, _as0_1, _bh2_0, 1)
260 VMAC(vmlal.s32, _a1b, _as2_0, _bh0_1, 1)
261 VMAC(vmlal.s32, _a1b, _as2_1, _bh0_0, 1)
262 VOP2(vmov, _a0b_1, _a0a_1)
263 VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
264 VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
265 VMAC(vmlal.s32, _a0a, _ah0_0, _bl2_1, 0)
266 VMAC(vmlal.s32, _a0a, _ah0_1, _bl2_0, 0)
267 VMAC(vmlal.s32, _a0a, _ah2_0, _bl0_1, 0)
268 VMAC(vmlal.s32, _a0a, _ah2_1, _bl0_0, 0)
269 VMAC(vmlal.s32, _a0b, _al0_0, _bs2_1, 0)
270 VMAC(vmlal.s32, _a0b, _al0_1, _bs2_0, 0)
271 VMAC(vmlal.s32, _a0b, _al2_0, _bs0_1, 0)
272 VMAC(vmlal.s32, _a0b, _al2_1, _bs0_0, 0)
273 VOP2(vmov, _a1a, _a1b)
274 VMAC(vmlal.s32, _a1a, _ah0_0, _bl2_1, 1)
275 VMAC(vmlal.s32, _a1a, _ah0_1, _bl2_0, 1)
276 VMAC(vmlal.s32, _a1a, _ah2_0, _bl0_1, 1)
277 VMAC(vmlal.s32, _a1a, _ah2_1, _bl0_0, 1)
278 VOP2(vswp, _a0b_1, _a0a_0)
279 VMAC(vmlal.s32, _a1b, _al0_0, _bs2_1, 1)
280 VMAC(vmlal.s32, _a1b, _al0_1, _bs2_0, 1)
281 VMAC(vmlal.s32, _a1b, _al2_0, _bs0_1, 1)
282 VMAC(vmlal.s32, _a1b, _al2_1, _bs0_0, 1)
283 VOP3(vsra.u64, _a0a, _a0b, "#28")
284 VOP2(vmovn.i64, _a0b_0, _a0b)
285 VOP2(vswp, _a1b_1, _a1a_0)
286 VOP3(vadd.i64, _a0a, _a0a, _a1b)
287 VOP2(vmovn.i64, _a0b_1, _a0a)
288 VOP3(vsra.u64, _a1a, _a0a, "#28")
289 VOP2(vbic.i32, _a0b, "#0xf0000000")
290 VOP2(vswp, _a1a_0, _a1a_1)
291 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
/*
 * All eight output-limb pairs have been stored; rewind %[c] by 64 bytes,
 * reload the low limbs, and fold the final carries back in (vaddw/vshr),
 * re-masking each limb to 28 bits before the final store.
 */
292 "sub %[c], #64" "\n\t"
293 VOP3(vadd.i64, _a1a_1, _a1a_1, _a1a_0)
294 "vldmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
295 "\n\t" VOP2(vaddw.s32, _a1a, _a0a_0) VOP2(vmovn.i64,
298 VOP2(vshr.s64, _a1a, "#28")
299 VOP2(vaddw.s32, _a1a, _a0a_1) VOP2(vmovn.i64, _a0a_1,
303 VOP2(vbic.i32, _a0a, "#0xf0000000")
304 VOP2(vaddw.s32, _a1a, _a0b_0)
305 VOP2(vmovn.i64, _a0b_0, _a1a)
306 "vstmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
/* Clobber list: every NEON q-register (q0-q15) plus memory. */
311 ::"q0", "q1", "q2", "q3",
312 "q4", "q5", "q6", "q7",
313 "q8", "q9", "q10", "q11",
314 "q12", "q13", "q14", "q15", "memory");
/*
 * gf_sqr: field squaring c = b^2, ARMv7 NEON inline-assembly
 * implementation.  Squaring-specific savings over gf_mul are visible in
 * the vqdmull/vqdmlal (doubling multiply) instructions, which fold the
 * symmetric cross terms, and in the smaller clobber list (q0-q7 and
 * q12-q15 only -- q8-q11 are untouched).
 *
 * NOTE(review): this chunk of the file is truncated -- the opening brace,
 * several asm continuation lines (some original comments below are left
 * visibly unterminated by the truncation), the operand list, and the
 * register-name macros (_bl0_0, _bh2_1, ...) are not visible here.  Code
 * is intentionally left byte-identical; only comments were added, and only
 * before the asm statement to avoid interacting with the truncated
 * comments inside it.
 */
317 void gf_sqr(gf_s * __restrict__ cs, const gf bs)
/* Output limbs are written through vc (cs->limb) in 32-bit pairs. */
319 int32x2_t *vc = (int32x2_t *) cs->limb;
321 __asm__ __volatile__("vld2.32 {" _bl0_0 "," _bl0_1 "," _bh0_0 "," _bh0_1 "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs0_1, _bl0_1, _bh0_1) /* 0
325 * 2^30 */ VOP3(vsub.i32, _bs0_0, _bl0_0, _bh0_0)
327 VOP3(vadd.i32, _as0, _bl0, _bh0)
331 "vld2.32 {" _bl2_0 "," _bl2_1 "," _bh2_0 "," _bh2_1 "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs2, _bl2, _bh2) /* 0
336 VOP2(vmov, _as2, _bs2)
338 VMAC(vqdmull.s32, _a0b, _as0_1, _bs2_1, 0) /* 0 .. 8 *
344 VMAC(vmlal.s32, _a0b, _as2_0, _bs2_0, 0) /* 0 .. 12 */
345 VMAC(vmlal.s32, _a0b, _as0_0, _bh0_0, 0)
349 VMAC(vqdmull.s32, _a1b, _as0_1, _bs2_1, 1) /* 0 .. 8 */
350 VMAC(vmlal.s32, _a1b, _as2_0, _bs2_0, 1) /* 0 .. 14 */
351 VMAC(vmlal.s32, _a1b, _as0_0, _bh0_0, 1)
355 VOP2(vmov, _a0a, _a0b) /* 0 .. 14 */
356 VMAC(vqdmlal.s32, _a0a, _bh0_1, _bh2_1, 0) /* 0 .. 16 */
357 VMAC(vmlal.s32, _a0a, _bh2_0, _bh2_0, 0) /* 0 .. 17 */
358 VMAC(vmlal.s32, _a0a, _bh0_0, _bl0_0, 0)
362 VMAC(vqdmlsl.s32, _a0b, _bl0_1, _bl2_1, 0)
366 VMAC(vmlsl.s32, _a0b, _bl2_0, _bl2_0, 0)
370 VMAC(vmlal.s32, _a0b, _bl0_0, _bs0_0, 0)
374 VOP2(vmov, _a1a, _a1b)
375 VMAC(vqdmlal.s32, _a1a, _bh0_1, _bh2_1, 1) /* 0 .. 18 */
376 VMAC(vmlal.s32, _a1a, _bh2_0, _bh2_0, 1) /* 0 .. 19 */
377 VMAC(vmlal.s32, _a1a, _bh0_0, _bl0_0, 1)
381 VOP2(vswp, _a0b_1, _a0a_0)
383 VMAC(vqdmlsl.s32, _a1b, _bl0_1, _bl2_1, 1)
387 VMAC(vmlsl.s32, _a1b, _bl2_0, _bl2_0, 1)
391 VMAC(vmlal.s32, _a1b, _bl0_0, _bs0_0, 1)
395 VOP3(vsra.u64, _a0a, _a0b, "#28")
396 VOP3(vsub.i32, _bs0_1, _bl0_1, _bh0_1)
397 VOP2(vmovn.i64, _a0b_0, _a0b)
399 VOP2(vswp, _a1b_1, _a1a_0)
400 VOP3(vadd.i64, _a1b, _a0a, _a1b)
402 VMAC(vqdmull.s32, _a0a, _as2_0, _bs2_1, 0) /* 0 .. 8 */
403 VOP2(vmovn.i64, _a0b_1, _a1b)
404 VOP3(vsra.u64, _a1a, _a1b, "#28")
405 VMAC(vqdmlal.s32, _a0a, _as0_0, _bh0_1, 0) /* 0 .. 12 */
406 VOP2(vbic.i32, _a0b, "#0xf0000000")
407 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t" VMAC(vqdmull.s32, _a1b, _as2_0, _bs2_1, 1) /* 0
411 VMAC(vqdmlal.s32, _a1b, _as0_0, _bh0_1, 1)
415 VOP2(vmov, _a0b, _a0a) /* 0 .. 12 */
416 VMAC(vqdmlal.s32, _a0a, _bh2_0, _bh2_1, 0) /* 0 .. 14 */
417 VMAC(vqdmlal.s32, _a0a, _bh0_0, _bl0_1, 0)
421 VMAC(vqdmlsl.s32, _a0b, _bl2_0, _bl2_1, 0)
425 VMAC(vqdmlal.s32, _a0b, _bl0_0, _bs0_1, 0)
429 VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
430 VOP3(vadd.i64, _a0b_0, _a0b_0, _a1a_0)
432 VOP2(vmov, _a1a, _a1b) /* 0 .. 12 */
433 VMAC(vqdmlal.s32, _a1a, _bh2_0, _bh2_1, 1) /* 0 .. 14 */
434 VMAC(vqdmlal.s32, _a1a, _bh0_0, _bl0_1, 1)
438 VOP2(vswp, _a0b_1, _a0a_0)
440 VMAC(vqdmlsl.s32, _a1b, _bl2_0, _bl2_1, 1)
444 VMAC(vqdmlal.s32, _a1b, _bl0_0, _bs0_1, 1)
448 VOP3(vsra.u64, _a0a, _a0b, "#28")
449 VOP3(vsub.i32, _bs2_0, _bl2_0, _bh2_0)
450 VOP2(vmovn.i64, _a0b_0, _a0b)
452 VOP2(vswp, _a1b_1, _a1a_0)
453 VOP3(vadd.i64, _a1b, _a0a, _a1b)
455 VMAC(vmull.s32, _a0a, _as2_1, _bs2_1, 0)
456 VOP2(vmovn.i64, _a0b_1, _a1b)
457 VMAC(vqdmlal.s32, _a0a, _as0_0, _bh2_0, 0)
458 VOP3(vsra.u64, _a1a, _a1b, "#28")
459 VMAC(vmlal.s32, _a0a, _as0_1, _bh0_1, 0)
460 VOP2(vbic.i32, _a0b, "#0xf0000000")
461 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
462 VMAC(vmull.s32, _a1b, _as2_1, _bs2_1, 1)
463 VMAC(vqdmlal.s32, _a1b, _as0_0, _bh2_0, 1)
464 VMAC(vmlal.s32, _a1b, _as0_1, _bh0_1, 1)
466 VOP2(vmov, _a0b_1, _a0a_1)
467 VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
468 VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
469 VMAC(vmlal.s32, _a0a, _bh2_1, _bh2_1, 0)
470 VMAC(vqdmlal.s32, _a0a, _bh0_0, _bl2_0, 0)
471 VMAC(vmlal.s32, _a0a, _bh0_1, _bl0_1, 0)
473 VMAC(vmlsl.s32, _a0b, _bl2_1, _bl2_1, 0)
474 VMAC(vqdmlal.s32, _a0b, _bl0_0, _bs2_0, 0)
475 VMAC(vmlal.s32, _a0b, _bl0_1, _bs0_1, 0)
477 VOP2(vmov, _a1a, _a1b)
478 VMAC(vmlal.s32, _a1a, _bh2_1, _bh2_1, 1)
479 VMAC(vqdmlal.s32, _a1a, _bh0_0, _bl2_0, 1)
480 VMAC(vmlal.s32, _a1a, _bh0_1, _bl0_1, 1)
482 VOP2(vswp, _a0b_1, _a0a_0)
484 VMAC(vmlsl.s32, _a1b, _bl2_1, _bl2_1, 1)
485 VMAC(vqdmlal.s32, _a1b, _bl0_0, _bs2_0, 1)
486 VMAC(vmlal.s32, _a1b, _bl0_1, _bs0_1, 1)
488 VOP3(vsub.i32, _bs2_1, _bl2_1, _bh2_1)
489 VOP3(vsra.u64, _a0a, _a0b, "#28")
490 VOP2(vmovn.i64, _a0b_0, _a0b)
492 VOP2(vswp, _a1b_1, _a1a_0)
493 VOP3(vadd.i64, _a1b, _a0a, _a1b)
495 VMAC(vqdmull.s32, _a0a, _as0_0, _bh2_1, 0)
496 VOP2(vmovn.i64, _a0b_1, _a1b)
497 VOP3(vsra.u64, _a1a, _a1b, "#28")
498 VMAC(vqdmlal.s32, _a0a, _as2_0, _bh0_1, 0)
499 VOP2(vbic.i32, _a0b, "#0xf0000000")
500 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
501 VMAC(vqdmull.s32, _a1b, _as0_0, _bh2_1, 1)
502 VMAC(vqdmlal.s32, _a1b, _as2_0, _bh0_1, 1)
504 VOP2(vmov, _a0b_1, _a0a_1)
505 VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
506 VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
507 VMAC(vqdmlal.s32, _a0a, _bh0_0, _bl2_1, 0)
508 VMAC(vqdmlal.s32, _a0a, _bh2_0, _bl0_1, 0)
510 VMAC(vqdmlal.s32, _a0b, _bl0_0, _bs2_1, 0)
511 VMAC(vqdmlal.s32, _a0b, _bl2_0, _bs0_1, 0)
513 VOP2(vmov, _a1a, _a1b)
514 VMAC(vqdmlal.s32, _a1a, _bh0_0, _bl2_1, 1)
515 VMAC(vqdmlal.s32, _a1a, _bh2_0, _bl0_1, 1)
517 VOP2(vswp, _a0b_1, _a0a_0)
519 VMAC(vqdmlal.s32, _a1b, _bl0_0, _bs2_1, 1)
520 VMAC(vqdmlal.s32, _a1b, _bl2_0, _bs0_1, 1)
522 VOP3(vsra.u64, _a0a, _a0b, "#28")
523 VOP2(vmovn.i64, _a0b_0, _a0b)
525 VOP2(vswp, _a1b_1, _a1a_0)
526 VOP3(vadd.i64, _a0a, _a0a, _a1b)
528 VOP2(vmovn.i64, _a0b_1, _a0a)
529 VOP3(vsra.u64, _a1a, _a0a, "#28")
531 VOP2(vbic.i32, _a0b, "#0xf0000000")
533 VOP2(vswp, _a1a_0, _a1a_1)
535 "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
536 "sub %[c], #64" "\n\t"
537 VOP3(vadd.i64, _a1a_1, _a1a_1, _a1a_0)
539 "vldmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
540 "\n\t" VOP2(vaddw.s32, _a1a, _a0a_0)
541 VOP2(vmovn.i64, _a0a_0, _a1a)
542 VOP2(vshr.s64, _a1a, "#28")
544 VOP2(vaddw.s32, _a1a, _a0a_1)
545 VOP2(vmovn.i64, _a0a_1, _a1a)
546 VOP2(vshr.s64, _a1a, "#28")
548 VOP2(vbic.i32, _a0a, "#0xf0000000")
550 VOP2(vaddw.s32, _a1a, _a0b_0)
551 VOP2(vmovn.i64, _a0b_0, _a1a)
553 "vstmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
557 ::"q0", "q1", "q2", "q3",
558 "q4", "q5", "q6", "q7",
559 "q12", "q13", "q14", "q15", "memory");
/*
 * gf_mulw_unsigned: multiply a field element by a small scalar,
 * c = a * b, using NEON intrinsics.  b must fit in one 28-bit limb
 * (asserted below); every output limb is re-masked to 28 bits with vmask.
 *
 * NOTE(review): this chunk of the file is truncated -- the opening brace,
 * the declarations of accum/vc/vn/i, and the loop's load of vn are in
 * missing lines.  Code is intentionally left byte-identical; only
 * comments were added.
 */
562 void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
564 uint32x2_t vmask = { (1 << 28) - 1, (1 << 28) - 1 };
565 assert(b < (1 << 28));
568 const uint32x2_t *va = (const uint32x2_t *)as->limb;
569 uint32x2_t *vo = (uint32x2_t *) cs->limb;
/* b is broadcast via lane 0 of vb in the vmull/vmlal_lane calls below. */
571 uint32x2_t vb = { b, 0 };
/* Limb pair 0: widening multiply, write the masked low 28 bits, keep the carry. */
574 accum = vmull_lane_u32(vc, vb, 0);
575 vo[0] = vmovn_u64(accum) & vmask;
576 accum = vshrq_n_u64(accum, 28);
579 * PERF: the right way to do this is to reduce behind, i.e. vmull + vmlal
580 * round 0 vmull + vmlal round 1 vmull + vmlal round 2 vsraq round 0, 1
581 * vmull + vmlal round 3 vsraq round 1, 2 ...
/* Limb pairs 1..7: multiply-accumulate onto the running carry, same mask/shift. */
585 for (i = 1; i < 8; i++) {
587 accum = vmlal_lane_u32(accum, vn, vb, 0);
588 vo[i] = vmovn_u64(accum) & vmask;
589 accum = vshrq_n_u64(accum, 28);
/*
 * Final carry: swap the lanes and fold them together (vrev128_u64 +
 * xx_vaddup_u64), then propagate the result back into limbs 0 and 1.
 */
593 accum = xx_vaddup_u64(vrev128_u64(accum));
594 accum = vaddw_u32(accum, vo[0]);
595 vo[0] = vmovn_u64(accum) & vmask;
597 accum = vshrq_n_u64(accum, 28);
598 vo[1] += vmovn_u64(accum);