/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
7 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
8 const uint64_t *a = as->limb, *b = bs->limb;
9 uint64_t *c = cs->limb;
11 __uint128_t accum0 = 0, accum1 = 0, accum2;
12 uint64_t mask = (1ull<<56) - 1;
14 uint64_t aa[4], bb[4], bbb[4];
18 aa[i] = a[i] + a[i+4];
19 bb[i] = b[i] + b[i+4];
20 bbb[i] = bb[i] + b[i+4];
23 int I_HATE_UNROLLED_LOOPS = 0;
25 if (I_HATE_UNROLLED_LOOPS) {
26 /* The compiler probably won't unroll this,
27 * so it's like 80% slower.
33 for (j=0; j<=i; j++) {
34 accum2 += widemul(a[j], b[i-j]);
35 accum1 += widemul(aa[j], bb[i-j]);
36 accum0 += widemul(a[j+4], b[i-j+4]);
39 accum2 += widemul(a[j], b[i-j+8]);
40 accum1 += widemul(aa[j], bbb[i-j+4]);
41 accum0 += widemul(a[j+4], bb[i-j+4]);
47 c[i] = ((uint64_t)(accum0)) & mask;
48 c[i+4] = ((uint64_t)(accum1)) & mask;
54 accum2 = widemul(a[0], b[0]);
55 accum1 += widemul(aa[0], bb[0]);
56 accum0 += widemul(a[4], b[4]);
58 accum2 += widemul(a[1], b[7]);
59 accum1 += widemul(aa[1], bbb[3]);
60 accum0 += widemul(a[5], bb[3]);
62 accum2 += widemul(a[2], b[6]);
63 accum1 += widemul(aa[2], bbb[2]);
64 accum0 += widemul(a[6], bb[2]);
66 accum2 += widemul(a[3], b[5]);
67 accum1 += widemul(aa[3], bbb[1]);
68 accum0 += widemul(a[7], bb[1]);
73 c[0] = ((uint64_t)(accum0)) & mask;
74 c[4] = ((uint64_t)(accum1)) & mask;
79 accum2 = widemul(a[0], b[1]);
80 accum1 += widemul(aa[0], bb[1]);
81 accum0 += widemul(a[4], b[5]);
83 accum2 += widemul(a[1], b[0]);
84 accum1 += widemul(aa[1], bb[0]);
85 accum0 += widemul(a[5], b[4]);
87 accum2 += widemul(a[2], b[7]);
88 accum1 += widemul(aa[2], bbb[3]);
89 accum0 += widemul(a[6], bb[3]);
91 accum2 += widemul(a[3], b[6]);
92 accum1 += widemul(aa[3], bbb[2]);
93 accum0 += widemul(a[7], bb[2]);
98 c[1] = ((uint64_t)(accum0)) & mask;
99 c[5] = ((uint64_t)(accum1)) & mask;
104 accum2 = widemul(a[0], b[2]);
105 accum1 += widemul(aa[0], bb[2]);
106 accum0 += widemul(a[4], b[6]);
108 accum2 += widemul(a[1], b[1]);
109 accum1 += widemul(aa[1], bb[1]);
110 accum0 += widemul(a[5], b[5]);
112 accum2 += widemul(a[2], b[0]);
113 accum1 += widemul(aa[2], bb[0]);
114 accum0 += widemul(a[6], b[4]);
116 accum2 += widemul(a[3], b[7]);
117 accum1 += widemul(aa[3], bbb[3]);
118 accum0 += widemul(a[7], bb[3]);
123 c[2] = ((uint64_t)(accum0)) & mask;
124 c[6] = ((uint64_t)(accum1)) & mask;
129 accum2 = widemul(a[0], b[3]);
130 accum1 += widemul(aa[0], bb[3]);
131 accum0 += widemul(a[4], b[7]);
133 accum2 += widemul(a[1], b[2]);
134 accum1 += widemul(aa[1], bb[2]);
135 accum0 += widemul(a[5], b[6]);
137 accum2 += widemul(a[2], b[1]);
138 accum1 += widemul(aa[2], bb[1]);
139 accum0 += widemul(a[6], b[5]);
141 accum2 += widemul(a[3], b[0]);
142 accum1 += widemul(aa[3], bb[0]);
143 accum0 += widemul(a[7], b[4]);
148 c[3] = ((uint64_t)(accum0)) & mask;
149 c[7] = ((uint64_t)(accum1)) & mask;
153 } /* !I_HATE_UNROLLED_LOOPS */
158 c[4] = ((uint64_t)(accum0)) & mask;
159 c[0] = ((uint64_t)(accum1)) & mask;
164 c[5] += ((uint64_t)(accum0));
165 c[1] += ((uint64_t)(accum1));
168 void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
169 const uint64_t *a = as->limb;
170 uint64_t *c = cs->limb;
172 __uint128_t accum0 = 0, accum4 = 0;
173 uint64_t mask = (1ull<<56) - 1;
176 for (i=0; i<4; i++) {
177 accum0 += widemul(b, a[i]);
178 accum4 += widemul(b, a[i+4]);
179 c[i] = accum0 & mask; accum0 >>= 56;
180 c[i+4] = accum4 & mask; accum4 >>= 56;
183 accum0 += accum4 + c[4];
184 c[4] = accum0 & mask;
185 c[5] += accum0 >> 56;
188 c[0] = accum4 & mask;
189 c[1] += accum4 >> 56;
192 void gf_sqr (gf_s *__restrict__ cs, const gf as) {
193 const uint64_t *a = as->limb;
194 uint64_t *c = cs->limb;
196 __uint128_t accum0 = 0, accum1 = 0, accum2;
197 uint64_t mask = (1ull<<56) - 1;
201 /* For some reason clang doesn't vectorize this without prompting? */
203 for (i=0; i<4; i++) {
204 aa[i] = a[i] + a[i+4];
207 accum2 = widemul(a[0],a[3]);
208 accum0 = widemul(aa[0],aa[3]);
209 accum1 = widemul(a[4],a[7]);
211 accum2 += widemul(a[1], a[2]);
212 accum0 += widemul(aa[1], aa[2]);
213 accum1 += widemul(a[5], a[6]);
218 c[3] = ((uint64_t)(accum1))<<1 & mask;
219 c[7] = ((uint64_t)(accum0))<<1 & mask;
224 accum0 += widemul(2*aa[1],aa[3]);
225 accum1 += widemul(2*a[5], a[7]);
226 accum0 += widemul(aa[2], aa[2]);
229 accum0 -= widemul(2*a[1], a[3]);
230 accum1 += widemul(a[6], a[6]);
232 accum2 = widemul(a[0],a[0]);
236 accum0 -= widemul(a[2], a[2]);
237 accum1 += widemul(aa[0], aa[0]);
238 accum0 += widemul(a[4], a[4]);
240 c[0] = ((uint64_t)(accum0)) & mask;
241 c[4] = ((uint64_t)(accum1)) & mask;
246 accum2 = widemul(2*aa[2],aa[3]);
247 accum0 -= widemul(2*a[2], a[3]);
248 accum1 += widemul(2*a[6], a[7]);
253 accum2 = widemul(2*a[0],a[1]);
254 accum1 += widemul(2*aa[0], aa[1]);
255 accum0 += widemul(2*a[4], a[5]);
260 c[1] = ((uint64_t)(accum0)) & mask;
261 c[5] = ((uint64_t)(accum1)) & mask;
266 accum2 = widemul(aa[3],aa[3]);
267 accum0 -= widemul(a[3], a[3]);
268 accum1 += widemul(a[7], a[7]);
273 accum2 = widemul(2*a[0],a[2]);
274 accum1 += widemul(2*aa[0], aa[2]);
275 accum0 += widemul(2*a[4], a[6]);
277 accum2 += widemul(a[1], a[1]);
278 accum1 += widemul(aa[1], aa[1]);
279 accum0 += widemul(a[5], a[5]);
284 c[2] = ((uint64_t)(accum0)) & mask;
285 c[6] = ((uint64_t)(accum1)) & mask;
292 c[3] = ((uint64_t)(accum0)) & mask;
293 c[7] = ((uint64_t)(accum1)) & mask;
295 /* we could almost stop here, but it wouldn't be stable, so... */
299 c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
300 c[0] += ((uint64_t)(accum1));