1 /* Copyright (c) 2014-2016 Cryptography Research, Inc.
2 * Released under the MIT License. See LICENSE.txt for license information.
 */
5 #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
6 #define __ARCH_X86_64_ARCH_INTRINSICS_H__
8 #define ARCH_WORD_BITS 64
10 #include <openssl/e_os2.h>
12 /* FUTURE: autogenerate */
/*
 * widemul - widening multiply involving the 64-bit words at a and b.
 *
 * Returns a 128-bit value assembled from two 64-bit asm outputs: d becomes
 * the upper 64 bits and c the lower 64 bits.
 *
 * Two asm operand sets are visible.  The first pins c to rax (early-clobber)
 * and d to rdx.  The second uses mulx, which multiplies rdx by its source
 * operand and does not modify the flags, so c and d may be any registers.
 *
 * NOTE(review): the embedded line numbers skip (13 -> 19 -> 25 -> 30), so
 * the asm statement openers, the instructions that load *a, whatever
 * preprocessor conditional selects between the two variants, the clobber
 * lists and the closing brace are not visible in this excerpt -- confirm
 * against the complete file before relying on this description.
 */
13 static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
19 : [c]"=&a"(c), [d]"=d"(d)
20 : [b]"m"(*b), [a]"m"(*a)
/* mulx variant: low half of the product goes to c, high half to d. */
25 "mulx %[b], %[c], %[d];"
26 : [c]"=r"(c), [d]"=r"(d)
27 : [b]"m"(*b), [a]"m"(*a)
/* Combine the halves: d is shifted into bits 64..127, c fills bits 0..63. */
30 return (((__uint128_t)(d))<<64) | c;
/*
 * widemul_rm - widening multiply of a by-value word a and the word at b
 * (the "_rm" suffix presumably means register * memory).
 *
 * Returns a 128-bit value with d as the upper 64 bits and c as the lower
 * 64 bits.
 *
 * Two asm operand sets are visible.  The first pins c to rax (early-clobber)
 * and d to rdx and takes a in any general register.  The second is a single
 * mulx whose implicit multiplicand rdx is supplied directly by the "d"
 * constraint on a, so no explicit load is needed.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 33 -> 39 and 40 -> 44): the first variant's instructions, the
 * declarations of c and d, the conditional selecting a variant and the
 * closing brace are not visible -- confirm against the complete file.
 */
33 static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
39 : [c]"=&a"(c), [d]"=d"(d)
40 : [b]"m"(*b), [a]"r"(a)
/* mulx variant: rdx (= a) times *b; low half -> c, high half -> d. */
44 ("mulx %[b], %[c], %[d];"
45 : [c]"=r"(c), [d]"=r"(d)
46 : [b]"m"(*b), [a]"d"(a));
/* Combine the halves: d is shifted into bits 64..127, c fills bits 0..63. */
48 return (((__uint128_t)(d))<<64) | c;
/*
 * widemul_rr - widening multiply of two by-value 64-bit words
 * (the "_rr" suffix presumably means register * register).
 *
 * Returns a 128-bit value with d as the upper 64 bits and c as the lower
 * 64 bits.
 *
 * Two asm operand sets are visible.  The first produces c in rax and d in
 * rdx; its instruction and input operands are not visible here.  The second
 * is a single mulx with a supplied in rdx via the "d" constraint and b in
 * any general register.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 51 -> 56 -> 61): the first variant's instruction and inputs, the
 * declarations of c and d, the conditional selecting a variant and the
 * closing brace are not visible -- confirm against the complete file.
 */
51 static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
56 : [c]"=a"(c), [d]"=d"(d)
/* mulx variant: rdx (= a) times b; low half -> c, high half -> d. */
61 ("mulx %[b], %[c], %[d];"
62 : [c]"=r"(c), [d]"=r"(d)
63 : [b]"r"(b), [a]"d"(a));
/* Combine the halves: d is shifted into bits 64..127, c fills bits 0..63. */
65 return (((__uint128_t)(d))<<64) | c;
/*
 * widemul2 - widening multiply involving the words at a and b in which one
 * factor is doubled before the multiplication.
 *
 * Returns a 128-bit value with d as the upper 64 bits and c as the lower
 * 64 bits.
 *
 * In the visible mulx variant, rdx is doubled with leaq (which does not
 * touch the flags) and then multiplied by *b.  The doubling happens in a
 * 64-bit register, so the top bit of the doubled value is discarded.
 * NOTE(review): callers presumably guarantee the doubled operand is below
 * 2^63 -- confirm.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 68 -> 75 and 76 -> 81): the load of *a into rdx, the whole
 * instruction sequence of the rax/rdx variant, the conditional selecting a
 * variant, the clobber lists and the closing brace are not visible --
 * confirm against the complete file.
 */
68 static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
75 : [c]"=&a"(c), [d]"=d"(d)
76 : [b]"m"(*b), [a]"m"(*a)
/* rdx = 2 * rdx, computed with an address calculation so flags are kept. */
81 "leaq (,%%rdx,2), %%rdx;"
/* rdx times *b; low half -> c, high half -> d. */
82 "mulx %[b], %[c], %[d];"
83 : [c]"=r"(c), [d]"=r"(d)
84 : [b]"m"(*b), [a]"m"(*a)
/* Combine the halves: d is shifted into bits 64..127, c fills bits 0..63. */
87 return (((__uint128_t)(d))<<64) | c;
/*
 * mac - multiply-accumulate into a 128-bit accumulator; appears to compute
 * *acc += (*a) * (*b).
 *
 * The accumulator is split into 64-bit halves lo/hi, updated in asm, and
 * written back through acc.
 *
 * Two asm variants are visible:
 *  - a mulx variant that places the product halves in scratch outputs c and
 *    d (early-clobber) with lo and hi as read-write operands;
 *  - a variant that works in rax/rdx: rax is added to lo, then rdx is added
 *    to hi together with the carry from the first addition.  It declares
 *    rax, rdx and the condition codes as clobbered.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 91 -> 97 -> 100 and 105 -> 107): the load of *a and the add/adc
 * steps of the mulx variant, the multiply instruction of the rax/rdx
 * variant (presumably mulq on *b), the conditional selecting a variant and
 * the closing brace are not visible -- confirm against the complete file.
 */
90 static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
91 uint64_t lo = *acc, hi = *acc>>64;
97 "mulx %[b], %[c], %[d]; "
100 : [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
101 : [b]"m"(*b), [a]"m"(*a)
105 ("movq %[a], %%rax; "
/* 128-bit add: low halves first, then high halves plus carry. */
107 "addq %%rax, %[lo]; "
108 "adcq %%rdx, %[hi]; "
109 : [lo]"+r"(lo), [hi]"+r"(hi)
110 : [b]"m"(*b), [a]"m"(*a)
111 : "rax", "rdx", "cc");
/* Reassemble the accumulator from its updated halves. */
114 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * macac - multiply-accumulate into two 128-bit accumulators at once; appears
 * to add the same product (*a) * (*b) to both *acc and *acc2.
 *
 * Each accumulator is split into 64-bit halves (lo/hi and lo2/hi2), updated
 * in asm, and written back through its pointer.
 *
 * Two asm variants are visible:
 *  - a mulx variant: *a is loaded into rdx, multiplied by *b into c (low)
 *    and d (high), and c:d is added to lo2:hi2 with carry;
 *  - a rax/rdx variant: rax:rdx is added with carry to lo:hi and then to
 *    lo2:hi2.  It declares rax, rdx and the condition codes as clobbered.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 119 -> 124, 125 -> 128 and 135 -> 137): the mulx variant's update
 * of lo:hi, its clobber list, the multiply instruction of the rax/rdx
 * variant (presumably mulq on *b), the declarations of c and d, the
 * conditional selecting a variant and the closing brace are not visible --
 * confirm against the complete file.
 */
117 static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
/* Split both accumulators into low (implicit truncation) and high halves. */
118 uint64_t lo = *acc, hi = *acc>>64;
119 uint64_t lo2 = *acc2, hi2 = *acc2>>64;
124 ("movq %[a], %%rdx; "
125 "mulx %[b], %[c], %[d]; "
/* Add the product c:d into the second accumulator, carrying into hi2. */
128 "addq %[c], %[lo2]; "
129 "adcq %[d], %[hi2]; "
130 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
131 : [b]"m"(*b), [a]"m"(*a)
135 ("movq %[a], %%rax; "
/* Add rax:rdx into the first accumulator, then into the second. */
137 "addq %%rax, %[lo]; "
138 "adcq %%rdx, %[hi]; "
139 "addq %%rax, %[lo2]; "
140 "adcq %%rdx, %[hi2]; "
141 : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
142 : [b]"m"(*b), [a]"m"(*a)
143 : "rax", "rdx", "cc");
/* Reassemble both accumulators from their updated halves. */
146 *acc = (((__uint128_t)(hi))<<64) | lo;
147 *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
/*
 * mac_rm - multiply-accumulate with a by-value word a and the word at b;
 * appears to compute *acc += a * (*b).
 *
 * The accumulator is split into 64-bit halves lo/hi, updated in asm, and
 * written back through acc.
 *
 * Two asm variants are visible:
 *  - a mulx variant in which a is supplied in rdx through the "d"
 *    constraint and the product halves land in c (low) and d (high);
 *  - a rax/rdx variant that copies a into rax, then adds rax:rdx to lo:hi
 *    with carry.  It declares rax, rdx and the condition codes as clobbered.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 151 -> 156 -> 159 and 164 -> 166): the add/adc steps and clobber
 * list of the mulx variant, the multiply instruction of the rax/rdx variant
 * (presumably mulq on *b), the declarations of c and d, the conditional
 * selecting a variant and the closing brace are not visible -- confirm
 * against the complete file.
 */
150 static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
151 uint64_t lo = *acc, hi = *acc>>64;
156 ("mulx %[b], %[c], %[d]; "
159 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
160 : [b]"m"(*b), [a]"d"(a)
164 ("movq %[a], %%rax; "
/* 128-bit add: low halves first, then high halves plus carry. */
166 "addq %%rax, %[lo]; "
167 "adcq %%rdx, %[hi]; "
168 : [lo]"+r"(lo), [hi]"+r"(hi)
169 : [b]"m"(*b), [a]"r"(a)
170 : "rax", "rdx", "cc");
/* Reassemble the accumulator from its updated halves. */
173 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * mac_rr - multiply-accumulate with two by-value words; appears to compute
 * *acc += a * b.
 *
 * The accumulator is split into 64-bit halves lo/hi, updated in asm, and
 * written back through acc.
 *
 * Two asm variants are visible:
 *  - a mulx variant in which a is supplied in rdx through the "d"
 *    constraint, b is in any general register, and the product halves land
 *    in c (low) and d (high);
 *  - a rax/rdx variant that adds rax:rdx to lo:hi with carry.  Here a is a
 *    read-write operand tied to rax ("+a"), which tells the compiler that
 *    the asm both consumes a from rax and overwrites it.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 177 -> 182 -> 185 and 186 -> 191): the add/adc steps and clobber
 * list of the mulx variant, the multiply instruction, inputs and clobbers
 * of the rax/rdx variant, the declarations of c and d, the conditional
 * selecting a variant and the closing brace are not visible -- confirm
 * against the complete file.
 */
176 static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
177 uint64_t lo = *acc, hi = *acc>>64;
182 ("mulx %[b], %[c], %[d]; "
185 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
186 : [b]"r"(b), [a]"d"(a)
/* 128-bit add: low halves first, then high halves plus carry. */
191 "addq %%rax, %[lo]; "
192 "adcq %%rdx, %[hi]; "
193 : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
/* Reassemble the accumulator from its updated halves. */
198 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * mac2 - multiply-accumulate with a doubled factor; appears to compute
 * *acc += 2 * (*a) * (*b).
 *
 * The accumulator is split into 64-bit halves lo/hi, updated in asm, and
 * written back through acc.
 *
 * In both visible variants *a is loaded into a register (rdx for mulx, rax
 * otherwise) and doubled by adding the register to itself before the
 * multiply.  The doubling is a 64-bit operation, so the top bit of *a is
 * discarded.  NOTE(review): callers presumably guarantee *a < 2^63 --
 * confirm.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 202 -> 207, 209 -> 212 and 218 -> 220): the add/adc steps and
 * clobber list of the mulx variant, the multiply instruction of the rax/rdx
 * variant (presumably mulq on *b), the declarations of c and d, the
 * conditional selecting a variant and the closing brace are not visible --
 * confirm against the complete file.
 */
201 static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
202 uint64_t lo = *acc, hi = *acc>>64;
207 ("movq %[a], %%rdx; "
/* rdx = 2 * (*a), wrapping at 64 bits. */
208 "addq %%rdx, %%rdx; "
209 "mulx %[b], %[c], %[d]; "
212 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
213 : [b]"m"(*b), [a]"m"(*a)
217 ("movq %[a], %%rax; "
/* rax = 2 * (*a), wrapping at 64 bits. */
218 "addq %%rax, %%rax; "
/* 128-bit add: low halves first, then high halves plus carry. */
220 "addq %%rax, %[lo]; "
221 "adcq %%rdx, %[hi]; "
222 : [lo]"+r"(lo), [hi]"+r"(hi)
223 : [b]"m"(*b), [a]"m"(*a)
224 : "rax", "rdx", "cc");
/* Reassemble the accumulator from its updated halves. */
227 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * msb - multiply-subtract from a 128-bit accumulator; appears to compute
 * *acc -= (*a) * (*b).
 *
 * The accumulator is split into 64-bit halves lo/hi, updated in asm, and
 * written back through acc.
 *
 * Two asm variants are visible:
 *  - a mulx variant: *a is loaded into rdx and multiplied by *b into c (low)
 *    and d (high);
 *  - a rax/rdx variant: rax is subtracted from lo, then rdx is subtracted
 *    from hi together with the borrow.  It declares rax, rdx and the
 *    condition codes as clobbered.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 231 -> 235, 236 -> 239 and 244 -> 246): the sub/sbb steps and
 * clobber list of the mulx variant, the multiply instruction of the rax/rdx
 * variant (presumably mulq on *b), the declarations of c and d, the
 * conditional selecting a variant and the closing brace are not visible --
 * confirm against the complete file.
 */
230 static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
231 uint64_t lo = *acc, hi = *acc>>64;
235 ("movq %[a], %%rdx; "
236 "mulx %[b], %[c], %[d]; "
239 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
240 : [b]"m"(*b), [a]"m"(*a)
244 ("movq %[a], %%rax; "
/* 128-bit subtract: low halves first, then high halves minus borrow. */
246 "subq %%rax, %[lo]; "
247 "sbbq %%rdx, %[hi]; "
248 : [lo]"+r"(lo), [hi]"+r"(hi)
249 : [b]"m"(*b), [a]"m"(*a)
250 : "rax", "rdx", "cc");
/* Reassemble the accumulator from its updated halves. */
252 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * msb2 - multiply-subtract with a doubled factor; appears to compute
 * *acc -= 2 * (*a) * (*b).
 *
 * The accumulator is split into 64-bit halves lo/hi, updated in asm, and
 * written back through acc.
 *
 * In both visible variants *a is loaded into a register (rdx for mulx, rax
 * otherwise) and doubled by adding the register to itself before the
 * multiply.  The doubling is a 64-bit operation, so the top bit of *a is
 * discarded.  NOTE(review): callers presumably guarantee *a < 2^63 --
 * confirm.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 256 -> 260, 262 -> 265 and 271 -> 273): the sub/sbb steps and
 * clobber list of the mulx variant, the multiply instruction of the rax/rdx
 * variant (presumably mulq on *b), the declarations of c and d, the
 * conditional selecting a variant and the closing brace are not visible --
 * confirm against the complete file.
 */
255 static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
256 uint64_t lo = *acc, hi = *acc>>64;
260 ("movq %[a], %%rdx; "
/* rdx = 2 * (*a), wrapping at 64 bits. */
261 "addq %%rdx, %%rdx; "
262 "mulx %[b], %[c], %[d]; "
265 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
266 : [b]"m"(*b), [a]"m"(*a)
270 ("movq %[a], %%rax; "
/* rax = 2 * (*a), wrapping at 64 bits. */
271 "addq %%rax, %%rax; "
/* 128-bit subtract: low halves first, then high halves minus borrow. */
273 "subq %%rax, %[lo]; "
274 "sbbq %%rdx, %[hi]; "
275 : [lo]"+r"(lo), [hi]"+r"(hi)
276 : [b]"m"(*b), [a]"m"(*a)
277 : "rax", "rdx", "cc");
/* Reassemble the accumulator from its updated halves. */
279 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * mrs - multiply the words at a and b and store a result derived from the
 * product into *acc.
 *
 * Unlike an accumulate-in-place update, the value written back is built
 * from c and d (the mulx product registers), not from lo and hi.  lo and hi
 * hold the old accumulator halves and are passed to the asm as read-write
 * operands.  NOTE(review): the name and operand set suggest a reverse
 * subtract, i.e. *acc = (*a) * (*b) - *acc, but the instructions that would
 * combine lo:hi with c:d are not visible in this excerpt -- confirm.
 *
 * Only a mulx-based asm sequence is visible for this function.
 *
 * NOTE(review): lines are missing from this excerpt (embedded numbering
 * jumps 284 -> 286, 287 -> 290 and 291 -> 293): the asm statement opener,
 * the instructions after mulx, the clobber list and the closing brace are
 * not visible -- confirm against the complete file.
 */
283 static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
/* lo = low 64 bits of *acc (implicit truncation), hi = high 64 bits. */
284 uint64_t c,d, lo = *acc, hi = *acc>>64;
286 ("movq %[a], %%rdx; "
/* rdx (= *a) times *b; low half -> c, high half -> d. */
287 "mulx %[b], %[c], %[d]; "
290 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
291 : [b]"m"(*b), [a]"m"(*a)
/* Overwrite the accumulator with d (upper half) and c (lower half). */
293 *acc = (((__uint128_t)(d))<<64) | c;
/*
 * word_is_zero - branch-free test of a 64-bit word against zero.
 *
 * The asm negates x, which sets the carry flag exactly when x was non-zero,
 * and then computes x - x - carry.  Afterwards x is 0 if the input was zero
 * and all-ones otherwise.
 *
 * NOTE(review): the return statement and closing brace are not visible in
 * this excerpt.  Given the function name, the result is presumably the
 * complement of x (all-ones for a zero input) -- confirm against the
 * complete file.
 * NOTE(review): the asm changes the flags but no "cc" clobber is visible on
 * this statement; GCC treats flags as clobbered on x86 by default, but
 * confirm this is intentional.
 */
296 static __inline__ uint64_t word_is_zero(uint64_t x) {
297 __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
/*
 * shrld - takes a 128-bit value x and a count n and returns a 64-bit word.
 *
 * NOTE(review): only the signature is visible in this excerpt.  The name
 * suggests a right shift of x by n truncated to 64 bits, but the body is
 * not shown -- confirm against the complete file.
 * NOTE(review): this is declared "static inline" whereas the other helpers
 * visible in this file use "static __inline__".
 */
301 static inline uint64_t shrld(__uint128_t x, int n) {
305 #endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */