/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
8 /* for posix_memalign */
9 #define _XOPEN_SOURCE 600
10 #define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
12 #if defined(__sun) && defined(__SVR4)
13 extern int posix_memalign(void **, size_t, size_t);
18 #include "arch_intrinsics.h"
20 #include <decaf/common.h>
26 #ifndef _DEFAULT_SOURCE
27 #define _DEFAULT_SOURCE 1
30 #include "portable_endian.h"
33 #include <sys/types.h>
36 #if defined(__ARM_NEON__)
38 #elif defined(__SSE2__)
39 #if !defined(__GNUC__) || __clang__ || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
40 #include <immintrin.h>
42 #include <emmintrin.h>
46 #if (ARCH_WORD_BITS == 64)
/* Native word types used when ARCH_WORD_BITS == 64. */
typedef uint64_t word_t;        /* unsigned machine word */
typedef uint64_t mask_t;        /* same representation as word_t */
typedef __uint128_t dword_t;    /* unsigned, twice the width of word_t (compiler extension) */
typedef int32_t hsword_t;       /* signed, half the width of word_t */
typedef int64_t sword_t;        /* signed, same width as word_t */
typedef __int128_t dsword_t;    /* signed, twice the width of word_t (compiler extension) */
52 #elif (ARCH_WORD_BITS == 32)
/* Native word types used when ARCH_WORD_BITS == 32. */
typedef uint32_t word_t;    /* unsigned machine word */
typedef uint32_t mask_t;    /* same representation as word_t */
typedef uint64_t dword_t;   /* unsigned, twice the width of word_t */
typedef int16_t hsword_t;   /* signed, half the width of word_t */
typedef int32_t sword_t;    /* signed, same width as word_t */
typedef int64_t dsword_t;   /* signed, twice the width of word_t */
59 #error "For now, libdecaf only supports 32- and 64-bit architectures."
62 /* Scalar limbs are keyed off of the API word size instead of the arch word size. */
63 #if DECAF_WORD_BITS == 64
64 #define SC_LIMB(x) (x##ull)
65 #elif DECAF_WORD_BITS == 32
/* 32-bit limbs: expand one 64-bit constant into two comma-separated
 * initializers, low half first.  Both halves are cast so that each emitted
 * element has type uint32_t. */
#define SC_LIMB(x) ((uint32_t)x##ull),((uint32_t)(x##ull>>32))
68 #error "For now, libdecaf only supports 32- and 64-bit architectures."
72 typedef uint32x4_t vecmask_t;
/* Vector types declared with the ext_vector_type attribute, grouped by
 * total width. */

/* 8 bytes */
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));

/* 16 bytes */
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

/* 32 bytes */
typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
84 typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
85 #else /* GCC, hopefully? */
/* Vector types declared with the vector_size attribute (size in bytes),
 * grouped by total width. */

/* 8 bytes */
typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
typedef int32_t int32x2_t __attribute__((vector_size(8)));

/* 16 bytes */
typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
typedef int64_t int64x2_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));

/* 32 bytes */
typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
typedef int64_t int64x4_t __attribute__((vector_size(32)));
typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
typedef int32_t int32x8_t __attribute__((vector_size(32)));
96 typedef word_t vecmask_t __attribute__((vector_size(32)));
100 #define VECTOR_ALIGNED __attribute__((aligned(32)))
101 typedef uint32x8_t big_register_t;
102 typedef uint64x4_t uint64xn_t;
103 typedef uint32x8_t uint32xn_t;
105 static DECAF_INLINE big_register_t
106 br_set_to_mask(mask_t x) {
107 uint32_t y = (uint32_t)x;
108 big_register_t ret = {y,y,y,y,y,y,y,y};
112 #define VECTOR_ALIGNED __attribute__((aligned(16)))
113 typedef uint32x4_t big_register_t;
114 typedef uint64x2_t uint64xn_t;
115 typedef uint32x4_t uint32xn_t;
117 static DECAF_INLINE big_register_t
118 br_set_to_mask(mask_t x) {
120 big_register_t ret = {y,y,y,y};
124 #define VECTOR_ALIGNED __attribute__((aligned(16)))
125 typedef uint32x4_t big_register_t;
126 typedef uint64x2_t uint64xn_t;
127 typedef uint32x4_t uint32xn_t;
129 static DECAF_INLINE big_register_t
130 br_set_to_mask(mask_t x) {
131 return vdupq_n_u32(x);
133 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
134 #define VECTOR_ALIGNED __attribute__((aligned(8)))
135 typedef uint64_t big_register_t, uint64xn_t;
137 typedef uint32_t uint32xn_t;
138 static DECAF_INLINE big_register_t
139 br_set_to_mask(mask_t x) {
140 return (big_register_t)x;
143 #define VECTOR_ALIGNED __attribute__((aligned(4)))
144 typedef uint64_t uint64xn_t;
145 typedef uint32_t uint32xn_t;
146 typedef uint32_t big_register_t;
148 static DECAF_INLINE big_register_t
149 br_set_to_mask(mask_t x) {
150 return (big_register_t)x;
155 uint64xn_t unaligned;
156 } __attribute__((packed)) unaligned_uint64xn_t;
159 uint32xn_t unaligned;
160 } __attribute__((packed)) unaligned_uint32xn_t;
163 static DECAF_INLINE big_register_t
164 br_is_zero(big_register_t x) {
165 return (big_register_t)(x == br_set_to_mask(0));
168 static DECAF_INLINE big_register_t
169 br_is_zero(big_register_t x) {
170 return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
171 //return (big_register_t)(x == br_set_to_mask(0));
174 static DECAF_INLINE big_register_t
175 br_is_zero(big_register_t x) {
176 return vceqq_u32(x,x^x);
179 #define br_is_zero word_is_zero
/**
 * Really call memset, in a way that prevents the compiler from optimizing it out.
 * @param p The object to zeroize.
 * @param c The char to set it to (probably zero).
 * @param s The size of the object.
 */
188 #if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__)
192 #if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1
193 #define NEED_MEMSET_S_EXTERN
197 #ifdef NEED_MEMSET_S_EXTERN
198 extern int memset_s(void *, size_t, int, size_t);
200 static DECAF_INLINE void
201 really_memset(void *p, char c, size_t s) {
202 memset_s(p, s, c, s);
205 /* PERF: use words? */
206 static DECAF_INLINE void
207 really_memset(void *p, char c, size_t s) {
208 volatile char *pv = (volatile char *)p;
210 for (i=0; i<s; i++) pv[i] = c;
215 * Allocate memory which is sufficiently aligned to be used for the
216 * largest vector on the system (for now that's a big_register_t).
218 * Man malloc says that it does this, but at least for AVX2 on MacOS X,
221 * @param size The size of the region to allocate.
222 * @return A suitable pointer, which can be free'd with free(),
223 * or NULL if no memory can be allocated.
225 static DECAF_INLINE void *
226 malloc_vector(size_t size) {
229 int ret = posix_memalign(&out, sizeof(big_register_t), size);
238 /* PERF: vectorize vs unroll */
240 #if 100*__clang_major__ + __clang_minor__ > 305
241 #define UNROLL _Pragma("clang loop unroll(full)")
/* The plan on booleans:
 *
 * The external interface uses decaf_bool_t, but this might be a different
 * size than our particular arch's word_t (and thus mask_t).  Also, the caller
 * isn't guaranteed to pass it as nonzero.  So bool_to_mask converts word sizes
 * and checks nonzero.
 *
 * On the flip side, mask_t is always -1 or 0, but it might be a different size
 * than decaf_bool_t, so mask_to_bool converts it to that type.
 *
 * On the third hand, we have success vs boolean types, but that's handled in
 * common.h: it converts between decaf_bool_t and decaf_error_t.
 */
262 static DECAF_INLINE decaf_bool_t mask_to_bool (mask_t m) {
263 return (decaf_sword_t)(sword_t)m;
266 static DECAF_INLINE mask_t bool_to_mask (decaf_bool_t m) {
267 /* On most arches this will be optimized to a simple cast. */
269 unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
270 if (limit < 1) limit = 1;
271 for (unsigned int i=0; i<limit; i++) {
272 ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
277 static DECAF_INLINE void ignore_result ( decaf_bool_t boo ) {
281 #endif /* __WORD_H__ */