2 * Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
3 * Copyright 2014 Cryptography Research, Inc.
5 * Licensed under the Apache License 2.0 (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
10 * Originally written by Mike Hamburg
13 #include "internal/e_os.h"
14 #include <openssl/macros.h>
15 #include "internal/numbers.h"
18 /* No support for 128 bit ints, so do nothing here */
19 NON_EMPTY_TRANSLATION_UNIT
22 # include "../field.h"
/*
 * gf_mul: fills cs->limb from sums of products of the limbs of as and bs.
 *
 * cs - output element; its limb array is written through c.
 * as - first input element; its limbs are read through a.
 * bs - second input element; its limbs are read through b.
 *
 * NOTE(review): the numbering embedded in this excerpt skips, so some
 * statements of this function (braces, the declarations of i and j, and
 * whatever updates the accumulators between the visible lines) are not
 * shown here. The comments below describe only the visible statements.
 * Presumably this is a Karatsuba-style multiply over low/high limb halves
 * with a modular reduction folded in - confirm against the field header.
 */
24 void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
26 const uint64_t *a = as->limb, *b = bs->limb;
27 uint64_t *c = cs->limb;
/*
 * accum0 and accum1 start at zero; accum2 has no initializer on this line,
 * yet it is only ever updated with += below. Presumably it is reset by a
 * statement not visible here - TODO confirm.
 */
28 uint128_t accum0 = 0, accum1 = 0, accum2;
/* mask keeps the low 56 bits of a value. */
29 uint64_t mask = (1ULL << 56) - 1;
/* Scratch arrays of four limb sums each, filled by the first loop. */
30 uint64_t aa[4], bb[4], bbb[4];
/*
 * For i in 0..3: aa[i] = a[i] + a[i + 4], bb[i] = b[i] + b[i + 4], and
 * bbb[i] = bb[i] + b[i + 4], i.e. b[i] plus twice b[i + 4].
 */
33 for (i = 0; i < 4; i++) {
34 aa[i] = a[i] + a[i + 4];
35 bb[i] = b[i] + b[i + 4];
36 bbb[i] = bb[i] + b[i + 4];
/* One pass per i in 0..3; each pass stores c[i] and c[i + 4] below. */
39 for (i = 0; i < 4; i++) {
/*
 * Terms with j <= i, using index pairs (j, i - j):
 *   accum2 takes low-half products  a[j]     * b[i - j],
 *   accum1 takes summed-half products aa[j]  * bb[i - j],
 *   accum0 takes high-half products a[j + 4] * b[i - j + 4].
 */
42 for (j = 0; j <= i; j++) {
43 accum2 += widemul(a[j], b[i - j]);
44 accum1 += widemul(aa[j], bb[i - j]);
45 accum0 += widemul(a[j + 4], b[i - j + 4]);
/*
 * NOTE(review): the loop header governing the next three statements is not
 * visible; presumably they cover the remaining j values (j > i) - confirm.
 * Here the b-side indices are i + 8 - j and i + 4 - j, accum1 multiplies
 * by bbb instead of bb, and accum0 multiplies by bb instead of b.
 */
48 accum2 += widemul(a[j], b[i + 8 - j]);
49 accum1 += widemul(aa[j], bbb[i + 4 - j]);
50 accum0 += widemul(a[j + 4], bb[i + 4 - j]);
/* Store the low 56 bits of accum0 and accum1 as limbs i and i + 4. */
56 c[i] = ((uint64_t)(accum0)) & mask;
57 c[i + 4] = ((uint64_t)(accum1)) & mask;
/*
 * Limbs 4 and 0 are rewritten from the low 56 bits of accum0 and accum1.
 * Presumably the accumulators were combined with carries and the previous
 * limb values by lines not shown - confirm.
 */
66 c[4] = ((uint64_t)(accum0)) & mask;
67 c[0] = ((uint64_t)(accum1)) & mask;
/*
 * Add the accumulators, truncated to 64 bits, into limbs 5 and 1. No mask
 * is applied here, so these two limbs are not clamped to 56 bits by this
 * statement.
 */
72 c[5] += ((uint64_t)(accum0));
73 c[1] += ((uint64_t)(accum1));
/*
 * gf_mulw_unsigned: multiplies the limbs of as by the 32-bit scalar b and
 * writes results into cs->limb.
 *
 * cs - output element; its limb array is written through c.
 * as - input element; its limbs are read through a.
 * b  - unsigned 32-bit multiplier applied to every limb.
 *
 * NOTE(review): the numbering embedded in this excerpt skips, so several
 * statements (braces, the declaration of i, the store to c[i], any
 * accumulator shifts, and everything after the last visible line) are not
 * shown. Only the visible statements are described below.
 */
76 void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
78 const uint64_t *a = as->limb;
79 uint64_t *c = cs->limb;
/* Two running sums: accum0 for limbs 0..3, accum4 for limbs 4..7. */
80 uint128_t accum0 = 0, accum4 = 0;
/* mask keeps the low 56 bits of a value. */
81 uint64_t mask = (1ULL << 56) - 1;
/* For i in 0..3, add b * a[i] to accum0 and b * a[i + 4] to accum4. */
84 for (i = 0; i < 4; i++) {
85 accum0 += widemul(b, a[i]);
86 accum4 += widemul(b, a[i + 4]);
/* Store the low 56 bits of accum4 as limb i + 4. */
89 c[i + 4] = accum4 & mask;
/*
 * Fold accum4 and the current value of limb 4 into accum0. How the result
 * is written back to c is not visible in this excerpt.
 */
93 accum0 += accum4 + c[4];
/*
 * gf_sqr: fills cs->limb from sums of products of the limbs of as with
 * themselves.
 *
 * cs - output element; its limb array is written through c.
 * as - input element; its limbs are read through a.
 *
 * NOTE(review): the numbering embedded in this excerpt skips, so several
 * statements are not shown: braces, the declarations of i and aa, how
 * accum2 is combined into accum0/accum1 after each assignment, and any
 * accumulator shifts between limb stores. Only the visible statements are
 * described below. Presumably this mirrors gf_mul with the symmetric
 * cross terms doubled - confirm against the full file.
 */
102 void gf_sqr(gf_s * RESTRICT cs, const gf as)
104 const uint64_t *a = as->limb;
105 uint64_t *c = cs->limb;
/* accum0 and accum1 start at zero; accum2 is first assigned further down. */
106 uint128_t accum0 = 0, accum1 = 0, accum2;
/* mask keeps the low 56 bits of a value. */
107 uint64_t mask = (1ULL << 56) - 1;
/* aa[i] = a[i] + a[i + 4] for i in 0..3 (aa is declared on a line not shown). */
111 /* For some reason clang doesn't vectorize this without prompting? */
112 for (i = 0; i < 4; i++)
113 aa[i] = a[i] + a[i + 4];
/*
 * Cross products whose indices sum to 3 within each group:
 *   accum2 = a[0]*a[3]   + a[1]*a[2]
 *   accum0 = aa[0]*aa[3] + aa[1]*aa[2]
 *   accum1 = a[4]*a[7]   + a[5]*a[6]
 * These assignments overwrite the zero initializers of accum0 and accum1.
 */
115 accum2 = widemul(a[0], a[3]);
116 accum0 = widemul(aa[0], aa[3]);
117 accum1 = widemul(a[4], a[7]);
119 accum2 += widemul(a[1], a[2]);
120 accum0 += widemul(aa[1], aa[2]);
121 accum1 += widemul(a[5], a[6]);
/*
 * '<<' binds tighter than '&', so each value is doubled first and then
 * masked to 56 bits. Note the pairing: c[3] is taken from accum1 and c[7]
 * from accum0. Both limbs are assigned again near the end of the function.
 */
126 c[3] = ((uint64_t)(accum1)) << 1 & mask;
127 c[7] = ((uint64_t)(accum0)) << 1 & mask;
/*
 * Products whose indices sum to 4 within each group; the factor 2 is
 * applied to one operand before the widening multiply.
 * NOTE(review): accum0 is decremented here and below. uint128_t is declared
 * elsewhere; if it is unsigned these subtractions wrap modulo 2^128 -
 * confirm that this is intended or that the value cannot go below zero.
 */
132 accum0 += widemul(2 * aa[1], aa[3]);
133 accum1 += widemul(2 * a[5], a[7]);
134 accum0 += widemul(aa[2], aa[2]);
137 accum0 -= widemul(2 * a[1], a[3]);
138 accum1 += widemul(a[6], a[6]);
/* accum2 is replaced by a[0]^2; where it is consumed is not visible here. */
140 accum2 = widemul(a[0], a[0]);
144 accum0 -= widemul(a[2], a[2]);
145 accum1 += widemul(aa[0], aa[0]);
146 accum0 += widemul(a[4], a[4]);
/* Store the low 56 bits of accum0 and accum1 as limbs 0 and 4. */
148 c[0] = ((uint64_t)(accum0)) & mask;
149 c[4] = ((uint64_t)(accum1)) & mask;
/* Products whose indices sum to 5 within each group (2 and 3, 6 and 7). */
154 accum2 = widemul(2 * aa[2], aa[3]);
155 accum0 -= widemul(2 * a[2], a[3]);
156 accum1 += widemul(2 * a[6], a[7]);
/* Products whose indices sum to 1 within each group (0 and 1, 4 and 5). */
161 accum2 = widemul(2 * a[0], a[1]);
162 accum1 += widemul(2 * aa[0], aa[1]);
163 accum0 += widemul(2 * a[4], a[5]);
/* Store the low 56 bits of accum0 and accum1 as limbs 1 and 5. */
168 c[1] = ((uint64_t)(accum0)) & mask;
169 c[5] = ((uint64_t)(accum1)) & mask;
/* Squares of the top entry of each group: aa[3], a[3] and a[7]. */
174 accum2 = widemul(aa[3], aa[3]);
175 accum0 -= widemul(a[3], a[3]);
176 accum1 += widemul(a[7], a[7]);
/* Products whose indices sum to 2 within each group, doubled cross term first. */
181 accum2 = widemul(2 * a[0], a[2]);
182 accum1 += widemul(2 * aa[0], aa[2]);
183 accum0 += widemul(2 * a[4], a[6]);
/* ...followed by the squares of entry 1 of each group. */
185 accum2 += widemul(a[1], a[1]);
186 accum1 += widemul(aa[1], aa[1]);
187 accum0 += widemul(a[5], a[5]);
/* Store the low 56 bits of accum0 and accum1 as limbs 2 and 6. */
192 c[2] = ((uint64_t)(accum0)) & mask;
193 c[6] = ((uint64_t)(accum1)) & mask;
/*
 * Limbs 3 and 7 are overwritten with the low 56 bits of accum0 and accum1.
 * Presumably the values stored earlier were folded into the accumulators
 * by lines not shown - confirm.
 */
200 c[3] = ((uint64_t)(accum0)) & mask;
201 c[7] = ((uint64_t)(accum1)) & mask;
203 /* we could almost stop here, but it wouldn't be stable, so... */
/*
 * Final additions into limbs 4 and 0: limb 4 receives the 64-bit
 * truncations of both accumulators, limb 0 receives only accum1. No mask
 * is applied by these statements.
 */
207 c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
208 c[0] += ((uint64_t)(accum1));