fe21aee1281f24c6379010c2d7bfbe56fd8047ce
[openssl.git] / crypto / ec / curve448 / arch_neon / f_impl.c
1 /*
2  * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
3  * Copyright 2014 Cryptography Research, Inc.
4  *
5  * Licensed under the OpenSSL license (the "License").  You may not use
6  * this file except in compliance with the License.  You can obtain a copy
7  * in the file LICENSE in the source distribution or at
8  * https://www.openssl.org/source/license.html
9  *
10  * Originally written by Mike Hamburg
11  */
12
13 #include "f_field.h"
14
/*
 * Fold the two 64-bit lanes of x together.  "%e0"/"%f0" are the GCC ARM
 * template modifiers naming the low/high D halves of the Q register
 * holding x, so the two-operand vadd.s64 (Dd += Dm) adds the low lane
 * into the high lane; the low lane is returned unchanged.
 */
static __inline__ uint64x2_t __attribute__ ((gnu_inline, always_inline, unused))
    xx_vaddup_u64(uint64x2_t x)
{
    __asm__("vadd.s64 %f0, %e0":"+w"(x));
    return x;
}
21
/*
 * Swap the two 64-bit lanes of a 128-bit vector (signed variant).
 * vswp exchanges the low ("%e0") and high ("%f0") D halves of the
 * Q register in place.
 */
static __inline__ int64x2_t __attribute__ ((gnu_inline, always_inline, unused))
    vrev128_s64(int64x2_t x)
{
    __asm__("vswp.s64 %e0, %f0":"+w"(x));
    return x;
}
28
/*
 * Swap the two 64-bit lanes of a 128-bit vector (unsigned variant).
 * Unlike its siblings this one is not tagged "unused": it is called by
 * gf_mulw_unsigned() below for the final carry fold.
 */
static __inline__ uint64x2_t __attribute__ ((gnu_inline, always_inline))
    vrev128_u64(uint64x2_t x)
{
    __asm__("vswp.s64 %e0, %f0":"+w"(x));
    return x;
}
35
/*
 * Signed multiply-accumulate: *acc += (int32_t)a * (int32_t)b.
 * The operands arrive as uint32_t but are reinterpreted as signed
 * 32-bit values; the signed 64-bit product is added into the wrapping
 * 64-bit accumulator.
 */
static inline void __attribute__ ((gnu_inline, always_inline, unused))
    smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    const int64_t sa = (int32_t)a;
    const int64_t sb = (int32_t)b;

    *acc += (uint64_t)(sa * sb);
}
41
/*
 * Signed multiply-accumulate, doubled:
 * *acc += 2 * (int32_t)a * (int32_t)b.
 */
static inline void __attribute__ ((gnu_inline, always_inline, unused))
    smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    int64_t product = (int64_t)(int32_t)a * (int32_t)b;

    *acc += (uint64_t)(product * 2);
}
47
/*
 * Signed multiply: *acc = (int32_t)a * (int32_t)b (overwrites the
 * accumulator rather than adding to it).
 */
static inline void __attribute__ ((gnu_inline, always_inline, unused))
    smull(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    const int32_t sa = (int32_t)a;
    const int32_t sb = (int32_t)b;

    *acc = (uint64_t)((int64_t)sa * sb);
}
53
/*
 * Signed multiply, doubled: *acc = 2 * (int32_t)a * (int32_t)b
 * (overwrites the accumulator).
 */
static inline void __attribute__ ((gnu_inline, always_inline, unused))
    smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
    int64_t product = (int64_t)(int32_t)a * (int32_t)b;

    *acc = (uint64_t)(2 * product);
}
59
/*
 * cs = as * bs: field multiplication, ARM32 NEON implementation.
 *
 * Limbs are kept reduced to 28 bits (see the "#28" carry shifts and the
 * 0xf0000000 masks below).  Each operand is split into a low half
 * (al/bl), a high half (ah/bh) and combined sum/difference registers
 * (as/bs), with cross products accumulated via vmull/vmlal/vmlsl in a
 * Karatsuba-like schedule.  NOTE(review): the instruction schedule is
 * order-sensitive and taken on trust from the original author; do not
 * rearrange the asm.
 */
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
/*
 * Fixed NEON register assignment for the asm strings below: operand b
 * occupies q0-q5 (low half, high half, sum/difference), operand a
 * occupies q6-q11, and q12-q15 hold the four 64-bit accumulators.
 * _xN_0/_xN_1 name the low/high D halves of the matching Q register.
 * These macros deliberately remain defined past this function:
 * gf_sqr() below reuses them.
 */
#define _bl0 "q0"
#define _bl0_0 "d0"
#define _bl0_1 "d1"
#define _bh0 "q1"
#define _bh0_0 "d2"
#define _bh0_1 "d3"
#define _bs0 "q2"
#define _bs0_0 "d4"
#define _bs0_1 "d5"
#define _bl2 "q3"
#define _bl2_0 "d6"
#define _bl2_1 "d7"
#define _bh2 "q4"
#define _bh2_0 "d8"
#define _bh2_1 "d9"
#define _bs2 "q5"
#define _bs2_0 "d10"
#define _bs2_1 "d11"

#define _as0 "q6"
#define _as0_0 "d12"
#define _as0_1 "d13"
#define _as2 "q7"
#define _as2_0 "d14"
#define _as2_1 "d15"
#define _al0 "q8"
#define _al0_0 "d16"
#define _al0_1 "d17"
#define _ah0 "q9"
#define _ah0_0 "d18"
#define _ah0_1 "d19"
#define _al2 "q10"
#define _al2_0 "d20"
#define _al2_1 "d21"
#define _ah2 "q11"
#define _ah2_0 "d22"
#define _ah2_1 "d23"

#define _a0a "q12"
#define _a0a_0 "d24"
#define _a0a_1 "d25"
#define _a0b "q13"
#define _a0b_0 "d26"
#define _a0b_1 "d27"
#define _a1a "q14"
#define _a1a_0 "d28"
#define _a1a_1 "d29"
#define _a1b "q15"
#define _a1b_0 "d30"
#define _a1b_1 "d31"
/* Helpers to paste one UAL instruction into the asm string. */
#define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
#define VOP3(op,result,a,b)   #op" "result", "a", "b"\n\t"
#define VOP2(op,result,a)     #op" "result", "a"\n\t"

    /* Output limbs, written with vstmia; the "memory" clobber covers them. */
    int32x2_t *vc = (int32x2_t *) cs->limb;

    __asm__ __volatile__("vld2.32 {" _al0_0 "," _al0_1 "," _ah0_0 "," _ah0_1
                         "}, [%[a],:128]!" "\n\t" VOP3(vadd.i32, _as0, _al0,
                                                       _ah0)
                         "vld2.32 {" _bl0_0 "," _bl0_1 "," _bh0_0 "," _bh0_1
                         "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs0_1, _bl0_1,
                                                       _bh0_1) VOP3(vsub.i32,
                                                                    _bs0_0,
                                                                    _bl0_0,
                                                                    _bh0_0)
                         "vld2.32 {" _bl2_0 "," _bl2_1 "," _bh2_0 "," _bh2_1
                         "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs2, _bl2,
                                                       _bh2)
                         "vld2.32 {" _al2_0 "," _al2_1 "," _ah2_0 "," _ah2_1
                         "}, [%[a],:128]!" "\n\t" VOP3(vadd.i32, _as2, _al2,
                                                       _ah2)
                         VMAC(vmull.s32, _a0b, _as0_1, _bs2_1,
                              0) VMAC(vmlal.s32, _a0b, _as2_0, _bs2_0,
                                      0) VMAC(vmlal.s32, _a0b, _as2_1, _bs0_1,
                                              0) VMAC(vmlal.s32, _a0b, _as0_0,
                                                      _bh0_0, 0)
                         VMAC(vmull.s32, _a1b, _as0_1, _bs2_1,
                              1) VMAC(vmlal.s32, _a1b, _as2_0, _bs2_0,
                                      1) VMAC(vmlal.s32, _a1b, _as2_1, _bs0_1,
                                              1) VMAC(vmlal.s32, _a1b, _as0_0,
                                                      _bh0_0, 1)
                         VOP2(vmov, _a0a, _a0b) VMAC(vmlal.s32, _a0a, _ah0_1,
                                                     _bh2_1, 0) VMAC(vmlal.s32,
                                                                     _a0a,
                                                                     _ah2_0,
                                                                     _bh2_0,
                                                                     0)
                         VMAC(vmlal.s32, _a0a, _ah2_1, _bh0_1,
                              0) VMAC(vmlal.s32, _a0a, _ah0_0, _bl0_0, 0)
                         VMAC(vmlsl.s32, _a0b, _al0_1, _bl2_1,
                              0) VMAC(vmlsl.s32, _a0b, _al2_0, _bl2_0,
                                      0) VMAC(vmlsl.s32, _a0b, _al2_1, _bl0_1,
                                              0) VMAC(vmlal.s32, _a0b, _al0_0,
                                                      _bs0_0, 0)
                         VOP2(vmov, _a1a, _a1b) VMAC(vmlal.s32, _a1a, _ah0_1,
                                                     _bh2_1, 1) VMAC(vmlal.s32,
                                                                     _a1a,
                                                                     _ah2_0,
                                                                     _bh2_0,
                                                                     1)
                         VMAC(vmlal.s32, _a1a, _ah2_1, _bh0_1,
                              1) VMAC(vmlal.s32, _a1a, _ah0_0, _bl0_0, 1)
                         VOP2(vswp, _a0b_1, _a0a_0)
                         VMAC(vmlsl.s32, _a1b, _al0_1, _bl2_1, 1)
                         VMAC(vmlsl.s32, _a1b, _al2_0, _bl2_0, 1)
                         VMAC(vmlsl.s32, _a1b, _al2_1, _bl0_1, 1)
                         VMAC(vmlal.s32, _a1b, _al0_0, _bs0_0, 1)
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP3(vsub.i32, _bs0_1, _bl0_1, _bh0_1)
                         VOP2(vmovn.i64, _a0b_0, _a0b)
                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a1b, _a0a, _a1b)
                         VMAC(vmull.s32, _a0a, _as2_0, _bs2_1, 0)
                         VOP2(vmovn.i64, _a0b_1, _a1b)
                         VMAC(vmlal.s32, _a0a, _as2_1, _bs2_0, 0)
                         VOP3(vsra.u64, _a1a, _a1b, "#28")
                         VMAC(vmlal.s32, _a0a, _as0_0, _bh0_1, 0)
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         VMAC(vmlal.s32, _a0a, _as0_1, _bh0_0, 0)
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         VMAC(vmull.s32, _a1b, _as2_0, _bs2_1, 1)
                         VMAC(vmlal.s32, _a1b, _as2_1, _bs2_0, 1)
                         VMAC(vmlal.s32, _a1b, _as0_0, _bh0_1, 1)
                         VMAC(vmlal.s32, _a1b, _as0_1, _bh0_0, 1)
                         VOP2(vmov, _a0b_1, _a0a_1)
                         VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
                         VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
                         VMAC(vmlal.s32, _a0a, _ah2_0, _bh2_1, 0)
                         VMAC(vmlal.s32, _a0a, _ah2_1, _bh2_0, 0)
                         VMAC(vmlal.s32, _a0a, _ah0_0, _bl0_1, 0)
                         VMAC(vmlal.s32, _a0a, _ah0_1, _bl0_0, 0)
                         VMAC(vmlsl.s32, _a0b, _al2_0, _bl2_1, 0)
                         VMAC(vmlsl.s32, _a0b, _al2_1, _bl2_0, 0)
                         VMAC(vmlal.s32, _a0b, _al0_0, _bs0_1, 0)
                         VMAC(vmlal.s32, _a0b, _al0_1, _bs0_0, 0)
                         VOP2(vmov, _a1a, _a1b)
                         VMAC(vmlal.s32, _a1a, _ah2_0, _bh2_1, 1)
                         VMAC(vmlal.s32, _a1a, _ah2_1, _bh2_0, 1)
                         VMAC(vmlal.s32, _a1a, _ah0_0, _bl0_1, 1)
                         VMAC(vmlal.s32, _a1a, _ah0_1, _bl0_0, 1)
                         VOP2(vswp, _a0b_1, _a0a_0)
                         VMAC(vmlsl.s32, _a1b, _al2_0, _bl2_1, 1)
                         VMAC(vmlsl.s32, _a1b, _al2_1, _bl2_0, 1)
                         VMAC(vmlal.s32, _a1b, _al0_0, _bs0_1, 1)
                         VMAC(vmlal.s32, _a1b, _al0_1, _bs0_0, 1)
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP3(vsub.i32, _bs2_0, _bl2_0, _bh2_0)
                         VOP2(vmovn.i64, _a0b_0, _a0b)
                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a1b, _a0a, _a1b)
                         VMAC(vmull.s32, _a0a, _as2_1, _bs2_1, 0)
                         VOP2(vmovn.i64, _a0b_1, _a1b)
                         VMAC(vmlal.s32, _a0a, _as0_0, _bh2_0, 0)
                         VOP3(vsra.u64, _a1a, _a1b, "#28")
                         VMAC(vmlal.s32, _a0a, _as0_1, _bh0_1, 0)
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         VMAC(vmlal.s32, _a0a, _as2_0, _bh0_0, 0)
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         VMAC(vmull.s32, _a1b, _as2_1, _bs2_1, 1)
                         VMAC(vmlal.s32, _a1b, _as0_0, _bh2_0, 1)
                         VMAC(vmlal.s32, _a1b, _as0_1, _bh0_1, 1)
                         VMAC(vmlal.s32, _a1b, _as2_0, _bh0_0, 1)
                         VOP2(vmov, _a0b_1, _a0a_1)
                         VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
                         VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
                         VMAC(vmlal.s32, _a0a, _ah2_1, _bh2_1, 0)
                         VMAC(vmlal.s32, _a0a, _ah0_0, _bl2_0, 0)
                         VMAC(vmlal.s32, _a0a, _ah0_1, _bl0_1, 0)
                         VMAC(vmlal.s32, _a0a, _ah2_0, _bl0_0, 0)
                         VMAC(vmlsl.s32, _a0b, _al2_1, _bl2_1, 0)
                         VMAC(vmlal.s32, _a0b, _al0_0, _bs2_0, 0)
                         VMAC(vmlal.s32, _a0b, _al0_1, _bs0_1, 0)
                         VMAC(vmlal.s32, _a0b, _al2_0, _bs0_0, 0)
                         VOP2(vmov, _a1a, _a1b)
                         VMAC(vmlal.s32, _a1a, _ah2_1, _bh2_1, 1)
                         VMAC(vmlal.s32, _a1a, _ah0_0, _bl2_0, 1)
                         VMAC(vmlal.s32, _a1a, _ah0_1, _bl0_1, 1)
                         VMAC(vmlal.s32, _a1a, _ah2_0, _bl0_0, 1)
                         VOP2(vswp, _a0b_1, _a0a_0)
                         VMAC(vmlsl.s32, _a1b, _al2_1, _bl2_1, 1)
                         VMAC(vmlal.s32, _a1b, _al0_0, _bs2_0, 1)
                         VMAC(vmlal.s32, _a1b, _al0_1, _bs0_1, 1)
                         VMAC(vmlal.s32, _a1b, _al2_0, _bs0_0, 1)
                         VOP3(vsub.i32, _bs2_1, _bl2_1, _bh2_1)
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP2(vmovn.i64, _a0b_0, _a0b)
                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a1b, _a0a, _a1b)
                         VMAC(vmull.s32, _a0a, _as0_0, _bh2_1, 0)
                         VOP2(vmovn.i64, _a0b_1, _a1b)
                         VMAC(vmlal.s32, _a0a, _as0_1, _bh2_0, 0)
                         VOP3(vsra.u64, _a1a, _a1b, "#28")
                         VMAC(vmlal.s32, _a0a, _as2_0, _bh0_1, 0)
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         VMAC(vmlal.s32, _a0a, _as2_1, _bh0_0, 0)
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         VMAC(vmull.s32, _a1b, _as0_0, _bh2_1, 1)
                         VMAC(vmlal.s32, _a1b, _as0_1, _bh2_0, 1)
                         VMAC(vmlal.s32, _a1b, _as2_0, _bh0_1, 1)
                         VMAC(vmlal.s32, _a1b, _as2_1, _bh0_0, 1)
                         VOP2(vmov, _a0b_1, _a0a_1)
                         VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
                         VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
                         VMAC(vmlal.s32, _a0a, _ah0_0, _bl2_1, 0)
                         VMAC(vmlal.s32, _a0a, _ah0_1, _bl2_0, 0)
                         VMAC(vmlal.s32, _a0a, _ah2_0, _bl0_1, 0)
                         VMAC(vmlal.s32, _a0a, _ah2_1, _bl0_0, 0)
                         VMAC(vmlal.s32, _a0b, _al0_0, _bs2_1, 0)
                         VMAC(vmlal.s32, _a0b, _al0_1, _bs2_0, 0)
                         VMAC(vmlal.s32, _a0b, _al2_0, _bs0_1, 0)
                         VMAC(vmlal.s32, _a0b, _al2_1, _bs0_0, 0)
                         VOP2(vmov, _a1a, _a1b)
                         VMAC(vmlal.s32, _a1a, _ah0_0, _bl2_1, 1)
                         VMAC(vmlal.s32, _a1a, _ah0_1, _bl2_0, 1)
                         VMAC(vmlal.s32, _a1a, _ah2_0, _bl0_1, 1)
                         VMAC(vmlal.s32, _a1a, _ah2_1, _bl0_0, 1)
                         VOP2(vswp, _a0b_1, _a0a_0)
                         VMAC(vmlal.s32, _a1b, _al0_0, _bs2_1, 1)
                         VMAC(vmlal.s32, _a1b, _al0_1, _bs2_0, 1)
                         VMAC(vmlal.s32, _a1b, _al2_0, _bs0_1, 1)
                         VMAC(vmlal.s32, _a1b, _al2_1, _bs0_0, 1)
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP2(vmovn.i64, _a0b_0, _a0b)
                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a0a, _a0a, _a1b)
                         VOP2(vmovn.i64, _a0b_1, _a0a)
                         VOP3(vsra.u64, _a1a, _a0a, "#28")
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         VOP2(vswp, _a1a_0, _a1a_1)
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         "sub %[c], #64" "\n\t"
                         VOP3(vadd.i64, _a1a_1, _a1a_1, _a1a_0)
                         "vldmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
                         "\n\t" VOP2(vaddw.s32, _a1a, _a0a_0) VOP2(vmovn.i64,
                                                                   _a0a_0,
                                                                   _a1a)
                         VOP2(vshr.s64, _a1a, "#28")
                         VOP2(vaddw.s32, _a1a, _a0a_1) VOP2(vmovn.i64, _a0a_1,
                                                            _a1a) VOP2(vshr.s64,
                                                                       _a1a,
                                                                       "#28")
                         VOP2(vbic.i32, _a0a, "#0xf0000000")
                         VOP2(vaddw.s32, _a1a, _a0b_0)
                         VOP2(vmovn.i64, _a0b_0, _a1a)
                         "vstmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
                         "\n\t":[a] "+r"(as)
                         ,[b] "+r"(bs)
                         ,[c] "+r"(vc)

                         ::"q0", "q1", "q2", "q3",
                         "q4", "q5", "q6", "q7",
                         "q8", "q9", "q10", "q11",
                         "q12", "q13", "q14", "q15", "memory");
}
316
/*
 * cs = bs * bs: field squaring, ARM32 NEON implementation.
 *
 * Relies on the register-name macros #defined inside gf_mul() above
 * (they are intentionally not #undef'd there).  Squaring folds the
 * symmetric cross terms using the doubling multiply-accumulate forms
 * (vqdmull/vqdmlal/vqdmlsl), so only the b-side registers and the
 * accumulators are live -- hence q8-q11 are absent from the clobber
 * list.  The inline "0 .. N" comments are the original author's
 * accumulator magnitude annotations; NOTE(review): those bounds and
 * the order-sensitive instruction schedule are taken on trust.
 */
void gf_sqr(gf_s * __restrict__ cs, const gf bs)
{
    /* Output limbs, written with vstmia; the "memory" clobber covers them. */
    int32x2_t *vc = (int32x2_t *) cs->limb;

    __asm__ __volatile__("vld2.32 {" _bl0_0 "," _bl0_1 "," _bh0_0 "," _bh0_1 "}, [%[b],:128]!" "\n\t"
                         VOP3(vadd.i32, _bs0_1, _bl0_1, _bh0_1) /* 0 .. 2^30 */
                         VOP3(vsub.i32, _bs0_0, _bl0_0, _bh0_0)
                         /* +- 2^29 */
                         VOP3(vadd.i32, _as0, _bl0, _bh0)
                         /* 0 .. 2^30 */
                         "vld2.32 {" _bl2_0 "," _bl2_1 "," _bh2_0 "," _bh2_1 "}, [%[b],:128]!" "\n\t"
                         VOP3(vadd.i32, _bs2, _bl2, _bh2) /* 0 .. 2^30 */
                         VOP2(vmov, _as2, _bs2)

                         VMAC(vqdmull.s32, _a0b, _as0_1, _bs2_1, 0)
                         /* 0 .. 8 * 2^58.  danger for vqdmlal is 32 */
                         VMAC(vmlal.s32, _a0b, _as2_0, _bs2_0, 0) /* 0 .. 12 */
                         VMAC(vmlal.s32, _a0b, _as0_0, _bh0_0, 0)
                         /* 0 .. 14 */
                         VMAC(vqdmull.s32, _a1b, _as0_1, _bs2_1, 1) /* 0 .. 8 */
                         VMAC(vmlal.s32, _a1b, _as2_0, _bs2_0, 1) /* 0 .. 14 */
                         VMAC(vmlal.s32, _a1b, _as0_0, _bh0_0, 1)
                         /* 0 .. 16 */
                         VOP2(vmov, _a0a, _a0b) /* 0 .. 14 */
                         VMAC(vqdmlal.s32, _a0a, _bh0_1, _bh2_1, 0) /* 0 .. 16 */
                         VMAC(vmlal.s32, _a0a, _bh2_0, _bh2_0, 0) /* 0 .. 17 */
                         VMAC(vmlal.s32, _a0a, _bh0_0, _bl0_0, 0)
                         /* 0 .. 18 */
                         VMAC(vqdmlsl.s32, _a0b, _bl0_1, _bl2_1, 0)
                         /* 2 .. 14 */
                         VMAC(vmlsl.s32, _a0b, _bl2_0, _bl2_0, 0)
                         /* 3 .. 14 */
                         VMAC(vmlal.s32, _a0b, _bl0_0, _bs0_0, 0)
                         /* 4 .. 15 */
                         VOP2(vmov, _a1a, _a1b)
                         VMAC(vqdmlal.s32, _a1a, _bh0_1, _bh2_1, 1) /* 0 .. 18 */
                         VMAC(vmlal.s32, _a1a, _bh2_0, _bh2_0, 1) /* 0 .. 19 */
                         VMAC(vmlal.s32, _a1a, _bh0_0, _bl0_0, 1)
                         /* 0 .. 20 */
                         VOP2(vswp, _a0b_1, _a0a_0)

                         VMAC(vqdmlsl.s32, _a1b, _bl0_1, _bl2_1, 1)
                         /* 2 .. 16 */
                         VMAC(vmlsl.s32, _a1b, _bl2_0, _bl2_0, 1)
                         /* 3 .. 16 */
                         VMAC(vmlal.s32, _a1b, _bl0_0, _bs0_0, 1)
                         /* 4 .. 17 */
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP3(vsub.i32, _bs0_1, _bl0_1, _bh0_1)
                         VOP2(vmovn.i64, _a0b_0, _a0b)

                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a1b, _a0a, _a1b)

                         VMAC(vqdmull.s32, _a0a, _as2_0, _bs2_1, 0) /* 0 .. 8 */
                         VOP2(vmovn.i64, _a0b_1, _a1b)
                         VOP3(vsra.u64, _a1a, _a1b, "#28")
                         VMAC(vqdmlal.s32, _a0a, _as0_0, _bh0_1, 0) /* 0 .. 12 */
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         VMAC(vqdmull.s32, _a1b, _as2_0, _bs2_1, 1) /* 0 .. 8 */
                         VMAC(vqdmlal.s32, _a1b, _as0_0, _bh0_1, 1)
                         /* 0 .. 12 */
                         VOP2(vmov, _a0b, _a0a) /* 0 .. 12 */
                         VMAC(vqdmlal.s32, _a0a, _bh2_0, _bh2_1, 0) /* 0 .. 14 */
                         VMAC(vqdmlal.s32, _a0a, _bh0_0, _bl0_1, 0)
                         /* 0 .. 16 */
                         VMAC(vqdmlsl.s32, _a0b, _bl2_0, _bl2_1, 0)
                         /* 2 .. 12 */
                         VMAC(vqdmlal.s32, _a0b, _bl0_0, _bs0_1, 0)
                         /* 4 .. 14 */
                         VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
                         VOP3(vadd.i64, _a0b_0, _a0b_0, _a1a_0)

                         VOP2(vmov, _a1a, _a1b) /* 0 .. 12 */
                         VMAC(vqdmlal.s32, _a1a, _bh2_0, _bh2_1, 1) /* 0 .. 14 */
                         VMAC(vqdmlal.s32, _a1a, _bh0_0, _bl0_1, 1)
                         /* 0 .. 16 */
                         VOP2(vswp, _a0b_1, _a0a_0)

                         VMAC(vqdmlsl.s32, _a1b, _bl2_0, _bl2_1, 1)
                         /* 2 .. 12 */
                         VMAC(vqdmlal.s32, _a1b, _bl0_0, _bs0_1, 1)
                         /* 4 .. 14 */
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP3(vsub.i32, _bs2_0, _bl2_0, _bh2_0)
                         VOP2(vmovn.i64, _a0b_0, _a0b)

                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a1b, _a0a, _a1b)

                         VMAC(vmull.s32, _a0a, _as2_1, _bs2_1, 0)
                         VOP2(vmovn.i64, _a0b_1, _a1b)
                         VMAC(vqdmlal.s32, _a0a, _as0_0, _bh2_0, 0)
                         VOP3(vsra.u64, _a1a, _a1b, "#28")
                         VMAC(vmlal.s32, _a0a, _as0_1, _bh0_1, 0)
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         VMAC(vmull.s32, _a1b, _as2_1, _bs2_1, 1)
                         VMAC(vqdmlal.s32, _a1b, _as0_0, _bh2_0, 1)
                         VMAC(vmlal.s32, _a1b, _as0_1, _bh0_1, 1)

                         VOP2(vmov, _a0b_1, _a0a_1)
                         VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
                         VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
                         VMAC(vmlal.s32, _a0a, _bh2_1, _bh2_1, 0)
                         VMAC(vqdmlal.s32, _a0a, _bh0_0, _bl2_0, 0)
                         VMAC(vmlal.s32, _a0a, _bh0_1, _bl0_1, 0)

                         VMAC(vmlsl.s32, _a0b, _bl2_1, _bl2_1, 0)
                         VMAC(vqdmlal.s32, _a0b, _bl0_0, _bs2_0, 0)
                         VMAC(vmlal.s32, _a0b, _bl0_1, _bs0_1, 0)

                         VOP2(vmov, _a1a, _a1b)
                         VMAC(vmlal.s32, _a1a, _bh2_1, _bh2_1, 1)
                         VMAC(vqdmlal.s32, _a1a, _bh0_0, _bl2_0, 1)
                         VMAC(vmlal.s32, _a1a, _bh0_1, _bl0_1, 1)

                         VOP2(vswp, _a0b_1, _a0a_0)

                         VMAC(vmlsl.s32, _a1b, _bl2_1, _bl2_1, 1)
                         VMAC(vqdmlal.s32, _a1b, _bl0_0, _bs2_0, 1)
                         VMAC(vmlal.s32, _a1b, _bl0_1, _bs0_1, 1)

                         VOP3(vsub.i32, _bs2_1, _bl2_1, _bh2_1)
                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP2(vmovn.i64, _a0b_0, _a0b)

                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a1b, _a0a, _a1b)

                         VMAC(vqdmull.s32, _a0a, _as0_0, _bh2_1, 0)
                         VOP2(vmovn.i64, _a0b_1, _a1b)
                         VOP3(vsra.u64, _a1a, _a1b, "#28")
                         VMAC(vqdmlal.s32, _a0a, _as2_0, _bh0_1, 0)
                         VOP2(vbic.i32, _a0b, "#0xf0000000")
                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         VMAC(vqdmull.s32, _a1b, _as0_0, _bh2_1, 1)
                         VMAC(vqdmlal.s32, _a1b, _as2_0, _bh0_1, 1)

                         VOP2(vmov, _a0b_1, _a0a_1)
                         VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
                         VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
                         VMAC(vqdmlal.s32, _a0a, _bh0_0, _bl2_1, 0)
                         VMAC(vqdmlal.s32, _a0a, _bh2_0, _bl0_1, 0)

                         VMAC(vqdmlal.s32, _a0b, _bl0_0, _bs2_1, 0)
                         VMAC(vqdmlal.s32, _a0b, _bl2_0, _bs0_1, 0)

                         VOP2(vmov, _a1a, _a1b)
                         VMAC(vqdmlal.s32, _a1a, _bh0_0, _bl2_1, 1)
                         VMAC(vqdmlal.s32, _a1a, _bh2_0, _bl0_1, 1)

                         VOP2(vswp, _a0b_1, _a0a_0)

                         VMAC(vqdmlal.s32, _a1b, _bl0_0, _bs2_1, 1)
                         VMAC(vqdmlal.s32, _a1b, _bl2_0, _bs0_1, 1)

                         VOP3(vsra.u64, _a0a, _a0b, "#28")
                         VOP2(vmovn.i64, _a0b_0, _a0b)

                         VOP2(vswp, _a1b_1, _a1a_0)
                         VOP3(vadd.i64, _a0a, _a0a, _a1b)

                         VOP2(vmovn.i64, _a0b_1, _a0a)
                         VOP3(vsra.u64, _a1a, _a0a, "#28")

                         VOP2(vbic.i32, _a0b, "#0xf0000000")

                         VOP2(vswp, _a1a_0, _a1a_1)

                         "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
                         "sub %[c], #64" "\n\t"
                         VOP3(vadd.i64, _a1a_1, _a1a_1, _a1a_0)

                         "vldmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
                         "\n\t" VOP2(vaddw.s32, _a1a, _a0a_0)
                         VOP2(vmovn.i64, _a0a_0, _a1a)
                         VOP2(vshr.s64, _a1a, "#28")

                         VOP2(vaddw.s32, _a1a, _a0a_1)
                         VOP2(vmovn.i64, _a0a_1, _a1a)
                         VOP2(vshr.s64, _a1a, "#28")

                         VOP2(vbic.i32, _a0a, "#0xf0000000")

                         VOP2(vaddw.s32, _a1a, _a0b_0)
                         VOP2(vmovn.i64, _a0b_0, _a1a)

                         "vstmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
                         "\n\t":[b] "+r"(bs)
                         ,[c] "+r"(vc)

                         ::"q0", "q1", "q2", "q3",
                         "q4", "q5", "q6", "q7",
                         "q12", "q13", "q14", "q15", "memory");
}
561
562 void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
563 {
564     uint32x2_t vmask = { (1 << 28) - 1, (1 << 28) - 1 };
565     assert(b < (1 << 28));
566
567     uint64x2_t accum;
568     const uint32x2_t *va = (const uint32x2_t *)as->limb;
569     uint32x2_t *vo = (uint32x2_t *) cs->limb;
570     uint32x2_t vc, vn;
571     uint32x2_t vb = { b, 0 };
572
573     vc = va[0];
574     accum = vmull_lane_u32(vc, vb, 0);
575     vo[0] = vmovn_u64(accum) & vmask;
576     accum = vshrq_n_u64(accum, 28);
577
578     /*
579      * PERF: the right way to do this is to reduce behind, i.e. vmull + vmlal
580      * round 0 vmull + vmlal round 1 vmull + vmlal round 2 vsraq round 0, 1
581      * vmull + vmlal round 3 vsraq round 1, 2 ...
582      */
583
584     int i;
585     for (i = 1; i < 8; i++) {
586         vn = va[i];
587         accum = vmlal_lane_u32(accum, vn, vb, 0);
588         vo[i] = vmovn_u64(accum) & vmask;
589         accum = vshrq_n_u64(accum, 28);
590         vc = vn;
591     }
592
593     accum = xx_vaddup_u64(vrev128_u64(accum));
594     accum = vaddw_u32(accum, vo[0]);
595     vo[0] = vmovn_u64(accum) & vmask;
596
597     accum = vshrq_n_u64(accum, 28);
598     vo[1] += vmovn_u64(accum);
599 }