1 /* Copyright (c) 2014 Cryptography Research, Inc.
2 * Released under the MIT License. See LICENSE.txt for license information.
12 #include "arch_intrinsics.h"
14 #include "curve448utils.h"
20 #ifndef _DEFAULT_SOURCE
21 #define _DEFAULT_SOURCE 1
24 #include "portable_endian.h"
27 #include <sys/types.h>
30 #if defined(__ARM_NEON__)
32 #elif defined(__SSE2__)
33 #if !defined(__GNUC__) || __clang__ || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
34 #include <immintrin.h>
36 #include <emmintrin.h>
/* Machine-word type family, chosen by ARCH_WORD_BITS:
 *   word_t, mask_t  unsigned, one machine word wide
 *   dword_t         unsigned, twice the word width
 *   hsword_t        signed, half the word width
 *   sword_t         signed, one word wide
 *   dsword_t        signed, twice the word width
 */
40 #if (ARCH_WORD_BITS == 64)
/* 64-bit words: the double-width types use the compiler's 128-bit integer
 * extension types (__uint128_t / __int128_t). */
41 typedef uint64_t word_t, mask_t;
42 typedef __uint128_t dword_t;
43 typedef int32_t hsword_t;
44 typedef int64_t sword_t;
45 typedef __int128_t dsword_t;
46 #elif (ARCH_WORD_BITS == 32)
/* 32-bit words: every type is a standard <stdint.h> fixed-width integer. */
47 typedef uint32_t word_t, mask_t;
48 typedef uint64_t dword_t;
49 typedef int16_t hsword_t;
50 typedef int32_t sword_t;
51 typedef int64_t dsword_t;
/* NOTE(review): the directive selecting this #error is not visible in this
 * excerpt; presumably it is the #else arm for any other ARCH_WORD_BITS. */
53 #error "For now, libdecaf only supports 32- and 64-bit architectures."
56 /* Scalar limbs are keyed off of the API word size instead of the arch word size. */
/* SC_LIMB(x) token-pastes an `ull` suffix onto x, so x must be a bare
 * integer literal rather than an arbitrary expression. */
57 #if DECAF_WORD_BITS == 64
/* One 64-bit limb per literal. */
58 #define SC_LIMB(x) (x##ull)
59 #elif DECAF_WORD_BITS == 32
/* Two comma-separated limbs per literal, low half first: the literal
 * truncated to uint32_t, then the literal shifted right by 32 bits.
 * The expansion is intentionally not parenthesized as a whole, since it
 * has to yield two separate list elements. */
60 #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
/* NOTE(review): the directive selecting this #error is not visible in this
 * excerpt; presumably it is the #else arm for other DECAF_WORD_BITS values. */
62 #error "For now, libdecaf only supports 32- and 64-bit architectures."
/* SIMD vector typedefs (uintNxM_t / intNxM_t = M lanes of N-bit integers)
 * plus vecmask_t.
 * NOTE(review): the #if/#elif directives that choose between the three
 * groups below are not visible in this excerpt. */
/* Group 1: vecmask_t aliases an already-declared uint32x4_t (presumably the
 * type supplied by the NEON header -- confirm against the missing guard). */
66 typedef uint32x4_t vecmask_t;
/* Group 2: vectors declared by lane count through ext_vector_type
 * (presumably the Clang arm, given the "GCC" remark on the #else below). */
68 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
69 typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
70 typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
71 typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
72 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
73 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
74 typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
75 typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
76 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
77 typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
/* Here vecmask_t is four word_t lanes. */
78 typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
79 #else /* GCC, hopefully? */
/* Group 3: the same names declared by total size in bytes through
 * vector_size (8, 16 or 32 bytes = lane width times lane count). */
80 typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
81 typedef int64_t int64x2_t __attribute__((vector_size(16)));
82 typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
83 typedef int64_t int64x4_t __attribute__((vector_size(32)));
84 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
85 typedef int32_t int32x4_t __attribute__((vector_size(16)));
86 typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
87 typedef int32_t int32x2_t __attribute__((vector_size(8)));
88 typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
89 typedef int32_t int32x8_t __attribute__((vector_size(32)));
/* Here vecmask_t is 32 bytes in total, i.e. 32/sizeof(word_t) lanes. */
90 typedef word_t vecmask_t __attribute__((vector_size(32)));
/* Big-register configuration using 32-byte vectors: big_register_t and
 * uint32xn_t are eight 32-bit lanes, uint64xn_t is four 64-bit lanes, and
 * VECTOR_ALIGNED requests 32-byte alignment.
 * NOTE(review): the preprocessor guard selecting this configuration is not
 * visible in this excerpt. */
94 #define VECTOR_ALIGNED __attribute__((aligned(32)))
95 typedef uint32x8_t big_register_t;
96 typedef uint64x4_t uint64xn_t;
97 typedef uint32x8_t uint32xn_t;
/* br_set_to_mask: truncate x to 32 bits and place that value in all eight
 * lanes of `ret`. The return statement and closing brace fall outside the
 * visible lines. */
99 static DECAF_INLINE big_register_t
100 br_set_to_mask(mask_t x) {
101 uint32_t y = (uint32_t)x;
102 big_register_t ret = {y,y,y,y,y,y,y,y};
/* Big-register configuration using 16-byte vectors: big_register_t and
 * uint32xn_t are four 32-bit lanes, uint64xn_t is two 64-bit lanes, and
 * VECTOR_ALIGNED requests 16-byte alignment.
 * NOTE(review): the preprocessor guard selecting this configuration is not
 * visible in this excerpt. */
106 #define VECTOR_ALIGNED __attribute__((aligned(16)))
107 typedef uint32x4_t big_register_t;
108 typedef uint64x2_t uint64xn_t;
109 typedef uint32x4_t uint32xn_t;
/* br_set_to_mask: fill all four lanes of `ret` with y.
 * NOTE(review): the declaration of y, the return statement and the closing
 * brace are on lines not visible here; y is presumably x narrowed to 32
 * bits, as in the eight-lane variant -- confirm. */
111 static DECAF_INLINE big_register_t
112 br_set_to_mask(mask_t x) {
114 big_register_t ret = {y,y,y,y};
/* Big-register configuration using 16-byte vectors and an intrinsic
 * broadcast: big_register_t and uint32xn_t are uint32x4_t, uint64xn_t is
 * uint64x2_t, and VECTOR_ALIGNED requests 16-byte alignment.
 * NOTE(review): the preprocessor guard is not visible; the vdupq_n_u32 call
 * suggests this is the ARM NEON arm -- confirm. */
118 #define VECTOR_ALIGNED __attribute__((aligned(16)))
119 typedef uint32x4_t big_register_t;
120 typedef uint64x2_t uint64xn_t;
121 typedef uint32x4_t uint32xn_t;
/* br_set_to_mask: return vdupq_n_u32(x), which (per the NEON intrinsic
 * reference) duplicates a 32-bit scalar into every lane. The closing brace
 * is outside the visible lines. */
123 static DECAF_INLINE big_register_t
124 br_set_to_mask(mask_t x) {
125 return vdupq_n_u32(x);
/* Scalar fallback for 64-bit targets: the "big register" is a single
 * uint64_t, aligned to 8 bytes.
 * NOTE(review): GCC and Clang spell the x86-64 macro `__x86_64__`; the
 * upper-case `__X86_64__` tested here may never be defined. `__amd64__`
 * covers the same targets on those compilers, so verify before changing. */
127 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
128 #define VECTOR_ALIGNED __attribute__((aligned(8)))
129 typedef uint64_t big_register_t, uint64xn_t;
131 typedef uint32_t uint32xn_t;
/* br_set_to_mask: with a scalar register this is only a cast of x to
 * big_register_t. The closing brace is outside the visible lines. */
132 static DECAF_INLINE big_register_t
133 br_set_to_mask(mask_t x) {
134 return (big_register_t)x;
/* Scalar configuration with a 32-bit "big register": big_register_t and
 * uint32xn_t are uint32_t, uint64xn_t is uint64_t, alignment is 4 bytes.
 * NOTE(review): the directive selecting this arm is not visible; presumably
 * it is the final #else for remaining (32-bit) targets. */
137 #define VECTOR_ALIGNED __attribute__((aligned(4)))
138 typedef uint64_t uint64xn_t;
139 typedef uint32_t uint32xn_t;
140 typedef uint32_t big_register_t;
/* br_set_to_mask: only a cast of x to big_register_t. The closing brace is
 * outside the visible lines. */
142 static DECAF_INLINE big_register_t
143 br_set_to_mask(mask_t x) {
144 return (big_register_t)x;
/* br_is_zero: compare x with the register produced by br_set_to_mask(0)
 * and cast the comparison result back to big_register_t. For GCC/Clang
 * vector types, == works lane by lane and yields all-ones in equal lanes
 * and zero in the others.
 * NOTE(review): the guard selecting this variant and the closing brace are
 * not visible in this excerpt. */
149 static DECAF_INLINE big_register_t
150 br_is_zero(big_register_t x) {
151 return (big_register_t)(x == br_set_to_mask(0));
/* br_is_zero: reinterpret x as __m128i and compare each 32-bit lane with
 * zero using _mm_cmpeq_epi32; per the intrinsic reference, equal lanes
 * become all-ones and the rest become zero. The result is cast back to
 * big_register_t.
 * NOTE(review): the guard selecting this variant and the closing brace are
 * not visible in this excerpt. */
154 static DECAF_INLINE big_register_t
155 br_is_zero(big_register_t x) {
156 return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
157 //return (big_register_t)(x == br_set_to_mask(0));
/* br_is_zero: x^x is an all-zero vector, so vceqq_u32(x, x^x) compares
 * every 32-bit lane of x against zero (per the NEON intrinsic reference,
 * equal lanes become all-ones, others zero).
 * NOTE(review): the guard selecting this variant and the closing brace are
 * not visible in this excerpt. */
160 static DECAF_INLINE big_register_t
161 br_is_zero(big_register_t x) {
162 return vceqq_u32(x,x^x);
/* In this configuration br_is_zero is just another name for word_is_zero,
 * which is declared outside this file excerpt.
 * NOTE(review): presumably the scalar (non-vector) arm -- the guard is not
 * visible here. */
165 #define br_is_zero word_is_zero
168 /* PERF: vectorize vs unroll */
/* 100*major + minor > 305 selects Clang releases newer than 3.5; for those,
 * UNROLL expands to a pragma requesting full unrolling of the next loop.
 * NOTE(review): the definitions of UNROLL for other compilers are on lines
 * not visible in this excerpt. */
170 #if 100*__clang_major__ + __clang_minor__ > 305
171 #define UNROLL _Pragma("clang loop unroll(full)")
179 /* The plan on booleans:
181 * The external interface uses decaf_bool_t, but this might be a different
182 * size than our particular arch's word_t (and thus mask_t). Also, the caller
183 * isn't guaranteed to pass it as nonzero. So bool_to_mask converts word sizes
184 * and checks nonzero.
186 * On the flip side, mask_t is always -1 or 0, but it might be a different size than decaf_bool_t, so mask_to_bool converts it back.
189 * On the third hand, we have success vs boolean types, but that's handled in
190 * common.h: it converts between decaf_bool_t and decaf_error_t.
/* mask_to_bool: convert an internal mask_t into the external decaf_bool_t.
 * The value is first cast to the signed word type (sword_t) and then to
 * decaf_sword_t, so that a change of width is a signed conversion;
 * presumably this keeps an all-ones mask all-ones when decaf_bool_t is
 * wider than mask_t -- confirm against the decaf_sword_t definition.
 * The closing brace is outside the visible lines. */
192 static DECAF_INLINE decaf_bool_t mask_to_bool (mask_t m) {
193 return (decaf_sword_t)(sword_t)m;
/* bool_to_mask: convert an external decaf_bool_t into an internal mask_t.
 * m is examined one word_t-sized chunk at a time, and the complement of
 * word_is_zero() for each chunk is OR-ed into `ret`.
 * NOTE(review): the declaration of `ret`, the return statement and the
 * closing braces are on lines not visible in this excerpt; word_is_zero is
 * defined elsewhere and presumably returns an all-ones mask for zero. */
196 static DECAF_INLINE mask_t bool_to_mask (decaf_bool_t m) {
197 /* On most arches this will be optimized to a simple cast. */
/* Number of mask_t-sized chunks in a decaf_bool_t, never less than one. */
199 unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
200 if (limit < 1) limit = 1;
201 for (unsigned int i=0; i<limit; i++) {
/* Shift chunk i down to the low bits; the shift count is in bits. */
202 ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
/* ignore_result: takes a decaf_bool_t by value and returns nothing.
 * NOTE(review): the body is not visible in this excerpt; judging by the
 * name it presumably exists to discard a result explicitly -- confirm. */
207 static DECAF_INLINE void ignore_result ( decaf_bool_t boo ) {
211 #endif /* __WORD_H__ */