2 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
3 * Copyright 2014 Cryptography Research, Inc.
5 * Licensed under the OpenSSL license (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
10 * Originally written by Mike Hamburg
/*
 * gf_mul: multiply the field elements `as` and `bs`, writing the result
 * limbs through `cs` (declared __restrict__).
 *
 * Both inputs are read as eight uint64_t limbs (indices 0..7).  Stores of
 * the form `c[k] = ... & mask` keep only the low 56 bits of the accumulator;
 * the last two statements add into c[4] and c[0] without masking.
 *
 * Partial products are gathered in three 128-bit accumulators through the
 * helpers widemul(), mac() and msb(), which are defined outside this
 * function (presumably wide multiply, multiply-accumulate and
 * multiply-subtract -- confirm against their definitions).
 *
 * NOTE(review): using products of the summed halves aa/bb/bbb next to
 * products of the plain halves looks like a Karatsuba-style split of the
 * 8-limb operands into two 4-limb halves -- confirm.
 */
15 void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
17 const uint64_t *a = as->limb, *b = bs->limb;
18 uint64_t *c = cs->limb;
19 __uint128_t accum0 = 0, accum1 = 0, accum2;
/* Low 56 bits set: the mask applied to every limb store below. */
20 uint64_t mask = (1ull << 56) - 1;
21 uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
23 /* For some reason clang doesn't vectorize this without prompting? */
/*
 * Build the half-sums one uint64xn_t-sized chunk at a time:
 *   aa = a[0..3] + a[4..7], bb = b[0..3] + b[4..7], bbb = bb + b[4..7].
 * The limb arrays are accessed through uint64xn_t pointers here
 * (assumes they are suitably aligned for that type -- TODO confirm).
 */
25 for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
26 ((uint64xn_t *) aa)[i] =
27 ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
28 ((uint64xn_t *) bb)[i] =
29 ((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
30 ((uint64xn_t *) bbb)[i] =
31 ((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
34 * for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
/*
 * Index-sum-3 column, taken three ways:
 *   accum2: a[i] * b[3-i]        (low halves)
 *   accum0: aa[i] * bb[3-i]      (summed halves)
 *   accum1: a[4+i] * b[7-i]      (high halves)
 */
38 accum2 = widemul(&a[0], &b[3]);
39 accum0 = widemul(&aa[0], &bb[3]);
40 accum1 = widemul(&a[4], &b[7]);
42 mac(&accum2, &a[1], &b[2]);
43 mac(&accum0, &aa[1], &bb[2]);
44 mac(&accum1, &a[5], &b[6]);
46 mac(&accum2, &a[2], &b[1]);
47 mac(&accum0, &aa[2], &bb[1]);
48 mac(&accum1, &a[6], &b[5]);
50 mac(&accum2, &a[3], &b[0]);
51 mac(&accum0, &aa[3], &bb[0]);
52 mac(&accum1, &a[7], &b[4]);
/* First store to c[3] and c[7]; both are written again further down. */
57 c[3] = ((uint64_t)(accum1)) & mask;
58 c[7] = ((uint64_t)(accum0)) & mask;
/*
 * Terms for limbs 0 and 4: the summed-half and high-half products whose
 * indices add up to 4, the low-half products with the same index pairs
 * applied through msb(), and the index-0 products.
 */
63 mac(&accum0, &aa[1], &bb[3]);
64 mac(&accum1, &a[5], &b[7]);
65 mac(&accum0, &aa[2], &bb[2]);
66 mac(&accum1, &a[6], &b[6]);
67 mac(&accum0, &aa[3], &bb[1]);
70 accum2 = widemul(&a[0], &b[0]);
74 msb(&accum0, &a[1], &b[3]);
75 msb(&accum0, &a[2], &b[2]);
76 mac(&accum1, &a[7], &b[5]);
77 msb(&accum0, &a[3], &b[1]);
78 mac(&accum1, &aa[0], &bb[0]);
79 mac(&accum0, &a[4], &b[4]);
81 c[0] = ((uint64_t)(accum0)) & mask;
82 c[4] = ((uint64_t)(accum1)) & mask;
/*
 * Terms for limbs 1 and 5.  Note the mixed operands: the high limbs of a
 * are paired with bb, and aa is paired with bbb, for the index pairs
 * (2,3) and (3,2); the index-sum-1 pairs use plain and summed halves.
 */
87 accum2 = widemul(&a[2], &b[7]);
88 mac(&accum0, &a[6], &bb[3]);
89 mac(&accum1, &aa[2], &bbb[3]);
91 mac(&accum2, &a[3], &b[6]);
92 mac(&accum0, &a[7], &bb[2]);
93 mac(&accum1, &aa[3], &bbb[2]);
95 mac(&accum2, &a[0], &b[1]);
96 mac(&accum1, &aa[0], &bb[1]);
97 mac(&accum0, &a[4], &b[5]);
99 mac(&accum2, &a[1], &b[0]);
100 mac(&accum1, &aa[1], &bb[0]);
101 mac(&accum0, &a[5], &b[4]);
106 c[1] = ((uint64_t)(accum0)) & mask;
107 c[5] = ((uint64_t)(accum1)) & mask;
/*
 * Terms for limbs 2 and 6: the (3,3) pair with the same mixed operands as
 * above, then the index-sum-2 pairs.
 */
112 accum2 = widemul(&a[3], &b[7]);
113 mac(&accum0, &a[7], &bb[3]);
114 mac(&accum1, &aa[3], &bbb[3]);
116 mac(&accum2, &a[0], &b[2]);
117 mac(&accum1, &aa[0], &bb[2]);
118 mac(&accum0, &a[4], &b[6]);
120 mac(&accum2, &a[1], &b[1]);
121 mac(&accum1, &aa[1], &bb[1]);
122 mac(&accum0, &a[5], &b[5]);
124 mac(&accum2, &a[2], &b[0]);
125 mac(&accum1, &aa[2], &bb[0]);
126 mac(&accum0, &a[6], &b[4]);
131 c[2] = ((uint64_t)(accum0)) & mask;
132 c[6] = ((uint64_t)(accum1)) & mask;
/*
 * Second store to c[3] and c[7].  The accumulator roles are swapped
 * relative to the first store: c[3] now comes from accum0, c[7] from accum1.
 */
139 c[3] = ((uint64_t)(accum0)) & mask;
140 c[7] = ((uint64_t)(accum1)) & mask;
142 /* we could almost stop here, but it wouldn't be stable, so... */
/*
 * Final adjustment: the low 64 bits of accum1 are added into both c[4] and
 * c[0]; accum0 is added into c[4] only.  No mask is applied here.
 */
146 c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
147 c[0] += ((uint64_t)(accum1));
/*
 * gf_mulw_unsigned: multiply the field element `as` by the 32-bit unsigned
 * scalar `b`, writing the result limbs through `cs` (declared __restrict__).
 *
 * Two 128-bit accumulator chains run side by side: accum0 takes limbs 0..3
 * of `as`, accum4 takes limbs 4..7.  After each product the low 56 bits of
 * the accumulator are stored as the corresponding output limb.
 *
 * widemul_rm() and mac_rm() are defined outside this function (presumably
 * a wide multiply and a multiply-accumulate taking the scalar by value and
 * the limb by pointer -- confirm against their definitions).
 */
150 void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
152 const uint64_t *a = as->limb;
153 uint64_t *c = cs->limb;
155 __uint128_t accum0, accum4;
/* Low 56 bits set: the mask applied to every limb store below. */
156 uint64_t mask = (1ull << 56) - 1;
/* Limbs 0 and 4. */
158 accum0 = widemul_rm(b, &a[0]);
159 accum4 = widemul_rm(b, &a[4]);
161 c[0] = accum0 & mask;
163 c[4] = accum4 & mask;
/* Limbs 1 and 5. */
166 mac_rm(&accum0, b, &a[1]);
167 mac_rm(&accum4, b, &a[5]);
169 c[1] = accum0 & mask;
171 c[5] = accum4 & mask;
/* Limbs 2 and 6. */
174 mac_rm(&accum0, b, &a[2]);
175 mac_rm(&accum4, b, &a[6]);
177 c[2] = accum0 & mask;
179 c[6] = accum4 & mask;
/* Limbs 3 and 7. */
182 mac_rm(&accum0, b, &a[3]);
183 mac_rm(&accum4, b, &a[7]);
185 c[3] = accum0 & mask;
187 c[7] = accum4 & mask;
/*
 * Fold what remains in both chains into limb 4, and push anything above
 * 56 bits on into limb 5.
 */
190 accum0 += accum4 + c[4];
191 c[4] = accum0 & mask;
192 c[5] += accum0 >> 56;
/*
 * The high chain also feeds the bottom of the number: c[0] is rewritten
 * from accum4 and the bits above 56 are added into c[1].
 * NOTE(review): accum4 landing in both limb 4 and limb 0 presumably
 * reflects reduction modulo 2^448 - 2^224 - 1 -- confirm against the
 * field definition.
 */
195 c[0] = accum4 & mask;
196 c[1] += accum4 >> 56;
/*
 * gf_sqr: square the field element `as`, writing the result limbs through
 * `cs` (declared __restrict__).
 *
 * Same layout as the general multiply in this file: eight uint64_t input
 * limbs, output limbs stored as the low 56 bits of a 128-bit accumulator,
 * and a precomputed half-sum aa[i] = a[i] + a[i+4].  Because both factors
 * are the same element, only one sum array is needed and symmetric cross
 * products are taken once.
 *
 * widemul(), widemul2(), mac(), mac2(), msb() and msb2() are defined
 * outside this function (presumably the "2" variants use twice the
 * product -- confirm against their definitions).
 */
199 void gf_sqr(gf_s * __restrict__ cs, const gf as)
201 const uint64_t *a = as->limb;
202 uint64_t *c = cs->limb;
203 __uint128_t accum0 = 0, accum1 = 0, accum2;
/* Low 56 bits set: the mask applied to every limb store below. */
204 uint64_t mask = (1ull << 56) - 1;
205 uint64_t aa[4] VECTOR_ALIGNED;
207 /* For some reason clang doesn't vectorize this without prompting? */
/*
 * aa = a[0..3] + a[4..7], added one uint64xn_t-sized chunk at a time
 * (assumes the limb array is suitably aligned for that type -- TODO
 * confirm).
 */
209 for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
210 ((uint64xn_t *) aa)[i] =
211 ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
/*
 * Index-sum-3 column: only the pairs (0,3) and (1,2) are multiplied, for
 * the low halves (accum2), the summed halves (accum0) and the high halves
 * (accum1).
 */
214 accum2 = widemul(&a[0], &a[3]);
215 accum0 = widemul(&aa[0], &aa[3]);
216 accum1 = widemul(&a[4], &a[7]);
218 mac(&accum2, &a[1], &a[2]);
219 mac(&accum0, &aa[1], &aa[2]);
220 mac(&accum1, &a[5], &a[6]);
/*
 * `<<` binds tighter than `&`, so these are (accum << 1) & mask: the
 * accumulator is doubled before masking (presumably to account for the
 * mirrored pairs (3,0) and (2,1) that were not multiplied -- confirm).
 * First store to c[3] and c[7]; both are written again further down.
 */
225 c[3] = ((uint64_t)(accum1)) << 1 & mask;
226 c[7] = ((uint64_t)(accum0)) << 1 & mask;
/*
 * Terms for limbs 0 and 4: the (1,3) cross products via the "2" helpers,
 * the (2,2) and (0,0) squares via the plain helpers.
 */
231 mac2(&accum0, &aa[1], &aa[3]);
232 mac2(&accum1, &a[5], &a[7]);
233 mac(&accum0, &aa[2], &aa[2]);
236 msb2(&accum0, &a[1], &a[3]);
237 mac(&accum1, &a[6], &a[6]);
239 accum2 = widemul(&a[0], &a[0]);
243 msb(&accum0, &a[2], &a[2]);
244 mac(&accum1, &aa[0], &aa[0]);
245 mac(&accum0, &a[4], &a[4]);
247 c[0] = ((uint64_t)(accum0)) & mask;
248 c[4] = ((uint64_t)(accum1)) & mask;
/* Terms for limbs 1 and 5: the (2,3) cross products, then the (0,1) ones. */
253 accum2 = widemul2(&aa[2], &aa[3]);
254 msb2(&accum0, &a[2], &a[3]);
255 mac2(&accum1, &a[6], &a[7]);
260 accum2 = widemul2(&a[0], &a[1]);
261 mac2(&accum1, &aa[0], &aa[1]);
262 mac2(&accum0, &a[4], &a[5]);
267 c[1] = ((uint64_t)(accum0)) & mask;
268 c[5] = ((uint64_t)(accum1)) & mask;
/*
 * Terms for limbs 2 and 6: the (3,3) squares, the (0,2) cross products and
 * the (1,1) squares.
 */
273 accum2 = widemul(&aa[3], &aa[3]);
274 msb(&accum0, &a[3], &a[3]);
275 mac(&accum1, &a[7], &a[7]);
280 accum2 = widemul2(&a[0], &a[2]);
281 mac2(&accum1, &aa[0], &aa[2]);
282 mac2(&accum0, &a[4], &a[6]);
284 mac(&accum2, &a[1], &a[1]);
285 mac(&accum1, &aa[1], &aa[1]);
286 mac(&accum0, &a[5], &a[5]);
291 c[2] = ((uint64_t)(accum0)) & mask;
292 c[6] = ((uint64_t)(accum1)) & mask;
/*
 * Second store to c[3] and c[7].  The accumulator roles are swapped
 * relative to the first store: c[3] now comes from accum0, c[7] from accum1.
 */
299 c[3] = ((uint64_t)(accum0)) & mask;
300 c[7] = ((uint64_t)(accum1)) & mask;
302 /* we could almost stop here, but it wouldn't be stable, so... */
/*
 * Final adjustment: the low 64 bits of accum1 are added into both c[4] and
 * c[0]; accum0 is added into c[4] only.  No mask is applied here.
 */
306 c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
307 c[0] += ((uint64_t)(accum1));