1 /* Copyright (c) 2014 Cryptography Research, Inc.
2 * Released under the MIT License. See LICENSE.txt for license information.
11 #include <openssl/e_os2.h>
12 #include "arch_intrinsics.h"
14 #include "curve448utils.h"
20 #ifndef _DEFAULT_SOURCE
21 #define _DEFAULT_SOURCE 1
26 #if defined(__ARM_NEON__)
28 #elif defined(__SSE2__)
29 #if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
30 #include <immintrin.h>
32 #include <emmintrin.h>
/*
 * Integer types sized relative to the machine word, chosen by ARCH_WORD_BITS:
 *   word_t, mask_t - unsigned, one word wide
 *   dword_t        - unsigned, two words wide
 *   hsword_t       - signed, half a word wide
 *   sword_t        - signed, one word wide
 *   dsword_t       - signed, two words wide
 * The 64-bit case relies on the compiler-provided __uint128_t / __int128_t.
 */
36 #if (ARCH_WORD_BITS == 64)
37 typedef uint64_t word_t, mask_t;
38 typedef __uint128_t dword_t;
39 typedef int32_t hsword_t;
40 typedef int64_t sword_t;
41 typedef __int128_t dsword_t;
42 #elif (ARCH_WORD_BITS == 32)
43 typedef uint32_t word_t, mask_t;
44 typedef uint64_t dword_t;
45 typedef int16_t hsword_t;
46 typedef int32_t sword_t;
47 typedef int64_t dsword_t;
/* NOTE(review): the #else before this #error and the closing #endif are not present in this listing. */
49 #error "For now, libdecaf only supports 32- and 64-bit architectures."
52 /* Scalar limbs are keyed off of the API word size instead of the arch word size. */
/*
 * SC_LIMB(x) expands a limb constant into an initializer-list fragment:
 *   DECAF_WORD_BITS == 64: a single element, x.
 *   DECAF_WORD_BITS == 32: two comma-separated elements, x converted to
 *                          uint32_t followed by x shifted right by 32.
 * The argument is parenthesized at every use so that an expression argument
 * (e.g. a | b) is converted and shifted as a whole instead of being split by
 * operator precedence.
 */
53 #if DECAF_WORD_BITS == 64
54 #define SC_LIMB(x) (x)
55 #elif DECAF_WORD_BITS == 32
56 #define SC_LIMB(x) ((uint32_t)(x)),((x)>>32)
/* NOTE(review): the #else before this #error and the closing #endif are not present in this listing. */
58 #error "For now, libdecaf only supports 32- and 64-bit architectures."
/*
 * Vector typedefs. Names follow <element type>x<lane count>_t.
 *
 * NOTE(review): the #if line that guards the first branch is not present in
 * this listing; that branch uses uint32x4_t without defining it, so it
 * presumably relies on a platform header (likely the NEON one) - confirm.
 */
62 typedef uint32x4_t vecmask_t;
63 #elif defined(__clang__)
/* Clang spelling: ext_vector_type(N) takes the number of elements. */
64 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
65 typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
66 typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
67 typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
68 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
69 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
70 typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
71 typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
72 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
73 typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
/* Four word_t elements. */
74 typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
75 #else /* GCC, hopefully? */
/* GCC spelling: vector_size(N) takes the total size in bytes, not the element count. */
76 typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
77 typedef int64_t int64x2_t __attribute__((vector_size(16)));
78 typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
79 typedef int64_t int64x4_t __attribute__((vector_size(32)));
80 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
81 typedef int32_t int32x4_t __attribute__((vector_size(16)));
82 typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
83 typedef int32_t int32x2_t __attribute__((vector_size(8)));
84 typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
85 typedef int32_t int32x8_t __attribute__((vector_size(32)));
/* 32 bytes of word_t elements. */
86 typedef word_t vecmask_t __attribute__((vector_size(32)));
/*
 * Per-target selection of:
 *   VECTOR_ALIGNED  - alignment attribute matching the register width
 *   big_register_t  - the type returned by br_set_to_mask / taken by br_is_zero
 *   uint64xn_t      - companion type built from 64-bit elements
 *   uint32xn_t      - companion type built from 32-bit elements
 *   br_set_to_mask  - converts a mask_t into a big_register_t
 *
 * NOTE(review): the opening #if of this chain, the tails of several function
 * bodies (return statements / closing braces) and the final #else / #endif
 * are not present in this listing.
 */
/* 32-byte vectors, eight 32-bit elements (presumably the __AVX2__ case - confirm). */
90 #define VECTOR_ALIGNED __attribute__((aligned(32)))
91 typedef uint32x8_t big_register_t;
92 typedef uint64x4_t uint64xn_t;
93 typedef uint32x8_t uint32xn_t;
95 static ossl_inline big_register_t
96 br_set_to_mask(mask_t x) {
/* Keep only the low 32 bits of x, then place that value in all eight elements. */
97 uint32_t y = (uint32_t)x;
98 big_register_t ret = {y,y,y,y,y,y,y,y};
/* 16-byte SSE2 vectors, four 32-bit elements. */
101 #elif defined(__SSE2__)
102 #define VECTOR_ALIGNED __attribute__((aligned(16)))
103 typedef uint32x4_t big_register_t;
104 typedef uint64x2_t uint64xn_t;
105 typedef uint32x4_t uint32xn_t;
107 static ossl_inline big_register_t
108 br_set_to_mask(mask_t x) {
/*
 * Places y in all four elements. NOTE(review): the declaration of y is not
 * present in this listing; presumably it is the low 32 bits of x, as in the
 * branch above.
 */
110 big_register_t ret = {y,y,y,y};
/* 16-byte NEON vectors, four 32-bit elements. */
113 #elif defined(__ARM_NEON__)
114 #define VECTOR_ALIGNED __attribute__((aligned(16)))
115 typedef uint32x4_t big_register_t;
116 typedef uint64x2_t uint64xn_t;
117 typedef uint32x4_t uint32xn_t;
119 static ossl_inline big_register_t
120 br_set_to_mask(mask_t x) {
/* vdupq_n_u32 copies its scalar argument into every element of the vector. */
121 return vdupq_n_u32(x);
/*
 * No vector unit selected, 64-bit target: a plain uint64_t serves as the register.
 * NOTE(review): GCC and Clang define __x86_64__ (lowercase x); confirm that the
 * __X86_64__ spelling is intended. The __amd64__ test already covers those compilers.
 */
123 #elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
124 || defined(__aarch64__)
125 #define VECTOR_ALIGNED __attribute__((aligned(8)))
126 typedef uint64_t big_register_t, uint64xn_t;
128 typedef uint32_t uint32xn_t;
129 static ossl_inline big_register_t
130 br_set_to_mask(mask_t x) {
/* Scalar case: the mask is only converted to the register type. */
131 return (big_register_t)x;
/* Remaining targets: a 32-bit scalar register; uint64xn_t is still uint64_t here. */
134 #define VECTOR_ALIGNED __attribute__((aligned(4)))
135 typedef uint64_t uint64xn_t;
136 typedef uint32_t uint32xn_t;
137 typedef uint32_t big_register_t;
139 static ossl_inline big_register_t
140 br_set_to_mask(mask_t x) {
/* Scalar case: the mask is only converted to the register type. */
141 return (big_register_t)x;
/*
 * br_is_zero(x): compares a big_register_t against zero and returns the
 * comparison result as a big_register_t.
 *
 * NOTE(review): the closing braces of the function bodies and the
 * #else / #endif around the final #define are not present in this listing.
 */
145 #if defined(__AVX2__)
146 static ossl_inline big_register_t
147 br_is_zero(big_register_t x) {
/* Vector-extension == against an all-zero register built by br_set_to_mask(0). */
148 return (big_register_t)(x == br_set_to_mask(0));
150 #elif defined(__SSE2__)
151 static ossl_inline big_register_t
152 br_is_zero(big_register_t x) {
/* _mm_cmpeq_epi32 compares the 32-bit elements of x with those of a zero vector. */
153 return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
154 //return (big_register_t)(x == br_set_to_mask(0));
156 #elif defined(__ARM_NEON__)
157 static ossl_inline big_register_t
158 br_is_zero(big_register_t x) {
/* x^x is an all-zero vector, so this compares each element of x with zero. */
159 return vceqq_u32(x,x^x);
/* Scalar targets reuse word_is_zero, which is defined outside this listing. */
162 #define br_is_zero word_is_zero
165 /* PERF: vectorize vs unroll */
/*
 * UNROLL is placed in front of a loop. When the compiler reports a clang
 * version above 3.5 (100*major + minor > 305) it expands to the clang
 * full-unroll loop pragma. On compilers that do not define __clang_major__
 * the undefined macros evaluate to 0 in #if, so this branch is not taken.
 * NOTE(review): the fallback definition and #endif are not present in this listing.
 */
167 #if 100*__clang_major__ + __clang_minor__ > 305
168 #define UNROLL _Pragma("clang loop unroll(full)")
176 /* The plan on booleans:
178 * The external interface uses decaf_bool_t, but this might be a different
179 * size than our particular arch's word_t (and thus mask_t). Also, the caller
180 * isn't guaranteed to pass it as nonzero. So bool_to_mask converts word sizes
181 * and checks nonzero.
183 * On the flip side, mask_t is always -1 or 0, but it might be a different size than decaf_bool_t, so mask_to_bool converts it back.
186 * On the third hand, we have success vs boolean types, but that's handled in
187 * common.h: it converts between decaf_bool_t and decaf_error_t. */
/*
 * Convert an internal mask_t into the external decaf_bool_t.
 * The value is first reinterpreted as the signed sword_t and then converted
 * to decaf_sword_t; going through the signed types means that, if
 * decaf_sword_t is wider than sword_t, the value is sign-extended rather than
 * zero-extended.
 * NOTE(review): the closing brace is not present in this listing.
 */
189 static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
190 return (decaf_sword_t)(sword_t)m;
/*
 * Convert an external decaf_bool_t into an internal mask_t.
 * m is examined in word_t-sized pieces; ~word_is_zero(piece) is OR-ed into
 * ret for each piece.
 * NOTE(review): the declarations of ret and i, the return statement and the
 * closing braces are not present in this listing; word_is_zero is defined
 * elsewhere and presumably yields an all-ones word for a zero argument.
 */
193 static ossl_inline mask_t bool_to_mask (decaf_bool_t m) {
194 /* On most arches this will be optimized to a simple cast. */
/* Number of mask_t-sized pieces in a decaf_bool_t, never fewer than one. */
198 unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
199 if (limit < 1) limit = 1;
200 for (i=0; i<limit; i++) {
/* Shift piece i (i * bits-per-word_t) down to the bottom before testing it. */
201 ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
/*
 * Takes a decaf_bool_t and returns nothing.
 * NOTE(review): the body is not present in this listing; presumably it just
 * discards boo so callers can drop a result without a warning - confirm.
 */
206 static ossl_inline void ignore_result ( decaf_bool_t boo ) {
210 #endif /* __WORD_H__ */