/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014-2016 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */
12 #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
13 #define __ARCH_X86_64_ARCH_INTRINSICS_H__
15 #define ARCH_WORD_BITS 64
17 #include <openssl/e_os2.h>
19 /* FUTURE: autogenerate */
/*
 * Full 64x64 -> 128-bit unsigned multiply of two words held in memory.
 *
 * a, b: pointers to the two 64-bit factors; both are only read.
 * Returns the 128-bit product (*a) * (*b).
 */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
    uint64_t c, d;  /* c: low 64 bits of the product, d: high 64 bits */

#ifndef __BMI2__
    /*
     * MULQ multiplies RAX by its operand and leaves the result in RDX:RAX.
     * c is early-clobber ("=&a") because RAX is written by the movq before
     * *b has been read, so *b's address must not be allocated to RAX.
     */
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=&a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#else
    /* MULX takes its implicit factor from RDX and does not touch the flags. */
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
#endif

    return (((__uint128_t)(d))<<64) | c;
}
/*
 * Full 64x64 -> 128-bit unsigned multiply, register operand times memory
 * operand.
 *
 * a: first factor, passed by value.
 * b: pointer to the second factor; only read.
 * Returns the 128-bit product a * (*b).
 */
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
    uint64_t c, d;  /* c: low 64 bits of the product, d: high 64 bits */

#ifndef __BMI2__
    /*
     * MULQ needs one factor in RAX and writes RDX:RAX. c is early-clobber
     * so that neither a nor *b's address is allocated to RAX, which the
     * movq overwrites before they have been consumed.
     */
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=&a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"r"(a)
         : "cc");
#else
    /* The "d" constraint places a directly in RDX, MULX's implicit factor. */
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"d"(a));
#endif

    return (((__uint128_t)(d))<<64) | c;
}
/*
 * Full 64x64 -> 128-bit unsigned multiply of two values passed by value.
 *
 * Returns the 128-bit product a * b.
 */
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
    uint64_t c, d;  /* c: low 64 bits of the product, d: high 64 bits */

#ifndef __BMI2__
    /* a is supplied in RAX; MULQ leaves the product in RDX:RAX. */
    __asm__ volatile
        ("mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"r"(b), "a"(a)
         : "cc");
#else
    /* a is supplied in RDX, MULX's implicit factor; flags are untouched. */
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"r"(b), [a]"d"(a));
#endif

    return (((__uint128_t)(d))<<64) | c;
}
/*
 * Doubled wide multiply of two words held in memory.
 *
 * *a is doubled in a 64-bit register first, so its top bit is discarded,
 * and the doubled value is then multiplied by *b.
 *
 * Returns the 128-bit product ((2 * *a) mod 2^64) * (*b).
 */
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
    uint64_t c, d;  /* c: low 64 bits of the product, d: high 64 bits */

#ifndef __BMI2__
    /*
     * c is early-clobber: RAX is written before *b is read, so *b's
     * address must not live in RAX.
     */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b];"
         : [c]"=&a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#else
    /* LEA doubles RDX without touching the flags, and neither does MULX. */
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "leaq (,%%rdx,2), %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
#endif

    return (((__uint128_t)(d))<<64) | c;
}
/*
 * Multiply-accumulate: *acc += (*a) * (*b), modulo 2^128.
 *
 * acc:  128-bit accumulator, updated in place.
 * a, b: pointers to the 64-bit factors; both are only read.
 */
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    /* Split the accumulator into two 64-bit halves for the add/adc chain. */
    uint64_t lo = *acc, hi = *acc>>64;

#ifdef __BMI2__
    uint64_t c,d;  /* low / high half of the product */
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* MULQ leaves the product in RDX:RAX; both are declared as clobbers. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif

    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/*
 * Multiply once, accumulate twice:
 *   *acc  += (*a) * (*b)
 *   *acc2 += (*a) * (*b)
 * both modulo 2^128.
 *
 * acc, acc2: 128-bit accumulators, updated in place.
 * a, b:      pointers to the 64-bit factors; both are only read.
 */
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
    /* Both accumulators are loaded before either is stored back. */
    uint64_t lo = *acc, hi = *acc>>64;
    uint64_t lo2 = *acc2, hi2 = *acc2>>64;

#ifdef __BMI2__
    uint64_t c,d;  /* low / high half of the product */
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         "addq %[c], %[lo2]; "
         "adcq %[d], %[hi2]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* The product stays in RDX:RAX and is added into each accumulator. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         "addq %%rax, %[lo2]; "
         "adcq %%rdx, %[hi2]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif

    *acc = (((__uint128_t)(hi))<<64) | lo;
    *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}
157 static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
158 uint64_t lo = *acc, hi = *acc>>64;
163 ("mulx %[b], %[c], %[d]; "
166 : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
167 : [b]"m"(*b), [a]"d"(a)
171 ("movq %[a], %%rax; "
173 "addq %%rax, %[lo]; "
174 "adcq %%rdx, %[hi]; "
175 : [lo]"+r"(lo), [hi]"+r"(hi)
176 : [b]"m"(*b), [a]"r"(a)
177 : "rax", "rdx", "cc");
180 *acc = (((__uint128_t)(hi))<<64) | lo;
/*
 * Multiply-accumulate with both factors passed by value:
 *   *acc += a * b, modulo 2^128.
 *
 * acc: 128-bit accumulator, updated in place.
 */
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
    uint64_t lo = *acc, hi = *acc>>64;

#ifdef __BMI2__
    uint64_t c,d;  /* low / high half of the product */
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"r"(b), [a]"d"(a)
         : "cc");
#else
    /*
     * a is an in/out operand pinned to RAX ("+a") because MULQ both reads
     * and overwrites RAX. The local copy of a is dead afterwards.
     */
    __asm__ volatile
        ("mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
         : [b]"r"(b)
         : "rdx", "cc");
#endif

    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/*
 * Doubled multiply-accumulate:
 *   *acc += ((2 * *a) mod 2^64) * (*b), modulo 2^128.
 *
 * *a is doubled in a 64-bit register before the multiply, so its top bit
 * is discarded.
 *
 * acc:  128-bit accumulator, updated in place.
 * a, b: pointers to the 64-bit factors; both are only read.
 */
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;

#ifdef __BMI2__
    uint64_t c,d;  /* low / high half of the product */
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif

    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/*
 * Multiply-subtract: *acc -= (*a) * (*b), modulo 2^128.
 *
 * acc:  128-bit accumulator, updated in place.
 * a, b: pointers to the 64-bit factors; both are only read.
 */
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;  /* low / high half of the product */
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* sub/sbb propagate the borrow from the low half into the high half. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/*
 * Doubled multiply-subtract:
 *   *acc -= ((2 * *a) mod 2^64) * (*b), modulo 2^128.
 *
 * *a is doubled in a 64-bit register before the multiply, so its top bit
 * is discarded.
 *
 * acc:  128-bit accumulator, updated in place.
 * a, b: pointers to the 64-bit factors; both are only read.
 */
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;  /* low / high half of the product */
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/*
 * Multiply, reverse-subtract: *acc = (*a) * (*b) - *acc, modulo 2^128.
 *
 * acc:  128-bit value, replaced by the result.
 * a, b: pointers to the 64-bit factors; both are only read.
 *
 * MULX is only used when the build targets BMI2; otherwise the MULQ
 * sequence is used, matching the other helpers in this file.
 */
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c,d, lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /*
     * The product lands in RDX:RAX, which are bound to d and c. Both are
     * early-clobber so that lo, hi and the addresses of *a / *b are never
     * allocated to RAX or RDX, which are written before they are consumed.
     */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=&a"(c), [d]"=&d"(d)
         : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
         : "cc");
#endif
    *acc = (((__uint128_t)(d))<<64) | c;
}
/*
 * Branch-free zero test.
 *
 * Returns an all-ones mask (0xFFFFFFFFFFFFFFFF) when x == 0 and 0 otherwise.
 */
static __inline__ uint64_t word_is_zero(uint64_t x) {
    /*
     * NEG sets the carry flag exactly when x != 0; SBB of a register with
     * itself then yields -CF, i.e. all ones for non-zero x and 0 for zero.
     * Both instructions modify the flags, hence the "cc" clobber.
     */
    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc");
    /* Invert so that the mask is set for the zero case. */
    return ~x;
}
/*
 * Shift a 128-bit value right by n bits and return the low 64 bits of the
 * result.
 *
 * n must be in the range 0..127; any other shift count is undefined
 * behaviour in C.
 */
static inline uint64_t shrld(__uint128_t x, int n) {
    return (uint64_t)(x >> n);
}
312 #endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */