2 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
3 * Copyright 2014 Cryptography Research, Inc.
5 * Licensed under the OpenSSL license (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
10 * Originally written by Mike Hamburg
19 # include <openssl/e_os2.h>
20 # include "arch_intrinsics.h"
22 # include "curve448utils.h"
/*
 * Feature-test macros: both _BSD_SOURCE and _DEFAULT_SOURCE are set to 1.
 * NOTE(review): feature-test macros normally only take effect when they are
 * defined before the first system header is pulled in; this listing shows
 * them after the #include lines -- confirm that no earlier include makes
 * them ineffective.
 */
25 # define _BSD_SOURCE 1
28 # ifndef _DEFAULT_SOURCE
29 # define _DEFAULT_SOURCE 1
34 # if defined(__ARM_NEON__)
35 # include <arm_neon.h>
36 # elif defined(__SSE2__)
37 # if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
38 # include <immintrin.h>
40 # include <emmintrin.h>
/*
 * Machine-word types, selected by ARCH_WORD_BITS (64 or 32):
 *   word_t, mask_t  unsigned, ARCH_WORD_BITS wide
 *   dword_t         unsigned, twice as wide as word_t
 *   hsword_t        signed, half as wide as word_t
 *   sword_t         signed, same width as word_t
 *   dsword_t        signed, twice as wide as word_t
 * The 64-bit variant relies on the compiler-provided __uint128_t and
 * __int128_t types. The #error at the end reports unsupported word sizes.
 */
44 # if (ARCH_WORD_BITS == 64)
45 typedef uint64_t word_t, mask_t;
46 typedef __uint128_t dword_t;
47 typedef int32_t hsword_t;
48 typedef int64_t sword_t;
49 typedef __int128_t dsword_t;
50 # elif (ARCH_WORD_BITS == 32)
51 typedef uint32_t word_t, mask_t;
52 typedef uint64_t dword_t;
53 typedef int16_t hsword_t;
54 typedef int32_t sword_t;
55 typedef int64_t dsword_t;
57 # error "For now, libdecaf only supports 32- and 64-bit architectures."
61 * Scalar limbs are keyed off of the API word size instead of the arch word
64 # if DECAF_WORD_BITS == 64
65 # define SC_LIMB(x) (x)
66 # elif DECAF_WORD_BITS == 32
/*
 * 32-bit API words: expand one 64-bit scalar value into two comma-separated
 * 32-bit limbs, low half first. The argument is parenthesised so that a
 * compound expression such as SC_LIMB(a + b) is cast and shifted as a whole.
 * Note that the argument is evaluated twice.
 */
# define SC_LIMB(x) ((uint32_t)(x)), ((x) >> 32)
69 # error "For now, libdecaf only supports 32- and 64-bit architectures."
/*
 * Vector type names used by the rest of this header.
 *
 * First branch: vecmask_t is an alias of an already-declared uint32x4_t
 * (presumably the one from <arm_neon.h>; the opening #if of this chain is
 * not visible here -- confirm).
 */
73 typedef uint32x4_t vecmask_t;
74 # elif defined(__clang__)
/* clang: ext_vector_type(N) declares a vector of N elements. */
75 typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
76 typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
77 typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
78 typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
79 typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
80 typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
81 typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
82 typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
83 typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
84 typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
85 typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
86 # else /* GCC, hopefully? */
/*
 * GCC-style: vector_size(B) declares a vector of B bytes in total, so the
 * element count is B / sizeof(element).
 * NOTE(review): vecmask_t here is 32 bytes of word_t, while the clang branch
 * uses 4 word_t lanes; the two agree only when word_t is 8 bytes -- confirm
 * this is intended for 32-bit words.
 */
87 typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
88 typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
89 typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
90 typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
91 typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
92 typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
93 typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
94 typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
95 typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
96 typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
97 typedef word_t vecmask_t __attribute__ ((vector_size(32)));
/*
 * AVX2 build: big_register_t and uint32xn_t are eight 32-bit lanes
 * (uint32x8_t), uint64xn_t is four 64-bit lanes, and VECTOR_ALIGNED requests
 * 32-byte alignment.
 */
100 # if defined(__AVX2__)
101 # define VECTOR_ALIGNED __attribute__((aligned(32)))
102 typedef uint32x8_t big_register_t;
103 typedef uint64x4_t uint64xn_t;
104 typedef uint32x8_t uint32xn_t;
/*
 * br_set_to_mask: truncate x to 32 bits and build a big_register_t holding
 * that value in each of its eight lanes.
 */
106 static ossl_inline big_register_t br_set_to_mask(mask_t x)
108 uint32_t y = (uint32_t)x;
109 big_register_t ret = { y, y, y, y, y, y, y, y };
/*
 * SSE2 build: big_register_t and uint32xn_t are four 32-bit lanes
 * (uint32x4_t), uint64xn_t is two 64-bit lanes, and VECTOR_ALIGNED requests
 * 16-byte alignment.
 */
112 # elif defined(__SSE2__)
113 # define VECTOR_ALIGNED __attribute__((aligned(16)))
114 typedef uint32x4_t big_register_t;
115 typedef uint64x2_t uint64xn_t;
116 typedef uint32x4_t uint32xn_t;
/*
 * br_set_to_mask: build a big_register_t with the same value y in all four
 * lanes. The declaration of y is not visible in this listing -- presumably
 * the 32-bit truncation of x, as in the AVX2 variant; confirm.
 */
118 static ossl_inline big_register_t br_set_to_mask(mask_t x)
121 big_register_t ret = { y, y, y, y };
/*
 * NEON build: big_register_t and uint32xn_t are uint32x4_t, uint64xn_t is
 * uint64x2_t, and VECTOR_ALIGNED requests 16-byte alignment.
 */
124 # elif defined(__ARM_NEON__)
125 # define VECTOR_ALIGNED __attribute__((aligned(16)))
126 typedef uint32x4_t big_register_t;
127 typedef uint64x2_t uint64xn_t;
128 typedef uint32x4_t uint32xn_t;
/*
 * br_set_to_mask: return the NEON vector produced by vdupq_n_u32(x), i.e. x
 * (as a 32-bit value) duplicated into every lane.
 */
130 static ossl_inline big_register_t br_set_to_mask(mask_t x)
132 return vdupq_n_u32(x);
/*
 * Scalar 64-bit build (no vector unit selected above): big_register_t and
 * uint64xn_t are plain uint64_t, uint32xn_t is uint32_t, and VECTOR_ALIGNED
 * requests 8-byte alignment.
 * NOTE(review): the condition tests __X86_64__; GCC and clang spell their
 * predefined macro __x86_64__ (lower case), so x86-64 builds presumably get
 * here through __amd64__ or _WIN64 -- confirm and consider fixing the
 * spelling.
 */
134 # elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
135 || defined(__aarch64__)
136 # define VECTOR_ALIGNED __attribute__((aligned(8)))
137 typedef uint64_t big_register_t, uint64xn_t;
139 typedef uint32_t uint32xn_t;
/* br_set_to_mask: the "register" is a single word, so just convert x. */
140 static ossl_inline big_register_t br_set_to_mask(mask_t x)
142 return (big_register_t) x;
/*
 * Remaining branch of the chain (the directive that opens it is not visible
 * in this listing): big_register_t and uint32xn_t are uint32_t, uint64xn_t
 * stays uint64_t, and VECTOR_ALIGNED requests 4-byte alignment.
 */
145 # define VECTOR_ALIGNED __attribute__((aligned(4)))
146 typedef uint64_t uint64xn_t;
147 typedef uint32_t uint32xn_t;
148 typedef uint32_t big_register_t;
/* br_set_to_mask: convert x to the 32-bit big_register_t. */
150 static ossl_inline big_register_t br_set_to_mask(mask_t x)
152 return (big_register_t) x;
/*
 * br_is_zero (AVX2): compare x lane by lane against br_set_to_mask(0) using
 * the compiler's vector == operator and return the comparison result
 * converted to big_register_t.
 */
156 # if defined(__AVX2__)
157 static ossl_inline big_register_t br_is_zero(big_register_t x)
159 return (big_register_t) (x == br_set_to_mask(0));
/*
 * br_is_zero (SSE2): compare the 32-bit lanes of x against an all-zero
 * register with _mm_cmpeq_epi32 and return the result as a big_register_t.
 * The trailing line comment keeps the vector-extension form of the same
 * comparison for reference.
 */
161 # elif defined(__SSE2__)
162 static ossl_inline big_register_t br_is_zero(big_register_t x)
164 return (big_register_t) _mm_cmpeq_epi32((__m128i) x, _mm_setzero_si128());
165 // return (big_register_t)(x == br_set_to_mask(0));
/*
 * br_is_zero (NEON): x ^ x is the all-zero vector, so this is a lane-wise
 * equality test of x against zero via vceqq_u32.
 */
167 # elif defined(__ARM_NEON__)
168 static ossl_inline big_register_t br_is_zero(big_register_t x)
170 return vceqq_u32(x, x ^ x);
/*
 * Fallback: br_is_zero is an alias for word_is_zero, which is not defined in
 * this listing.
 */
173 # define br_is_zero word_is_zero
176 /* PERF: vectorize vs unroll */
/*
 * When 100 * major + minor of the clang version exceeds 305, UNROLL expands
 * to a pragma asking clang to fully unroll the loop that follows it.
 */
178 # if 100*__clang_major__ + __clang_minor__ > 305
179 # define UNROLL _Pragma("clang loop unroll(full)")
188 * The plan on booleans: The external interface uses decaf_bool_t, but this
189 * might be a different size than our particular arch's word_t (and thus
190 * mask_t). Also, the caller isn't guaranteed to pass it as nonzero. So
191 * bool_to_mask converts word sizes and checks nonzero. On the flip side,
192 * mask_t is always -1 or 0, but it might be a different size than
193 * decaf_bool_t. On the third hand, we have success vs boolean types, but
194 * that's handled in common.h: it converts between decaf_bool_t and
/*
 * mask_to_bool: convert an internal mask_t into the external decaf_bool_t.
 * The value is first reinterpreted as the signed sword_t and then converted
 * to decaf_sword_t; assuming decaf_sword_t is a signed type (it is declared
 * elsewhere -- confirm), a wider target sign-extends, so an all-ones mask
 * stays all-ones.
 */
197 static ossl_inline decaf_bool_t mask_to_bool(mask_t m)
199 return (decaf_sword_t) (sword_t) m;
/*
 * bool_to_mask: convert an external decaf_bool_t into an internal mask_t.
 *
 * limit is the number of whole mask_t-sized pieces in a decaf_bool_t
 * (integer division, so it is 0 when decaf_bool_t is narrower than mask_t).
 * For each piece i, m is shifted right by i word widths (in bits) and the
 * complement of word_is_zero() of that value is OR-ed into ret.
 * word_is_zero is defined elsewhere; presumably it yields an all-ones mask
 * for a zero word, making the complement all-ones for a nonzero piece --
 * confirm.
 *
 * NOTE(review): the declarations of ret and i, and any handling of
 * limit == 0, are not visible in this listing.
 */
202 static ossl_inline mask_t bool_to_mask(decaf_bool_t m)
204 /* On most arches this will be optimized to a simple cast. */
208 unsigned int limit = sizeof(decaf_bool_t) / sizeof(mask_t);
211 for (i = 0; i < limit; i++) {
212 ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
/*
 * ignore_result: takes a decaf_bool_t by value and returns nothing.
 * NOTE(review): the body is not visible in this listing; presumably it
 * exists so callers can discard a result explicitly -- confirm.
 */
217 static ossl_inline void ignore_result(decaf_bool_t boo)
222 #endif /* __WORD_H__ */