crypto/ec/curve448/word.h

   1 /* Copyright (c) 2014 Cryptography Research, Inc.
   2  * Released under the MIT License.  See LICENSE.txt for license information.
   3  */
   4
   5 #ifndef __WORD_H__
   6 #define __WORD_H__
   7
   8 #include <string.h>
   9
  10 #include <assert.h>
  11 #include <openssl/e_os2.h>
  12 #include "arch_intrinsics.h"
  13
  14 #include "curve448utils.h"
  15
  16 #ifndef _BSD_SOURCE
  17 #define _BSD_SOURCE 1
  18 #endif
  19
  20 #ifndef _DEFAULT_SOURCE
  21 #define _DEFAULT_SOURCE 1
  22 #endif
  23
  24 #include <stdlib.h>
  25
  26 #if defined(__ARM_NEON__)
  27 #include <arm_neon.h>
  28 #elif defined(__SSE2__)
  29     #if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
  30         #include <immintrin.h>
  31     #else
  32         #include <emmintrin.h>
  33     #endif
  34 #endif
  35
  36 #if (ARCH_WORD_BITS == 64)
  37     typedef uint64_t word_t, mask_t;
  38     typedef __uint128_t dword_t;
  39     typedef int32_t hsword_t;
  40     typedef int64_t sword_t;
  41     typedef __int128_t dsword_t;
  42 #elif (ARCH_WORD_BITS == 32)
  43     typedef uint32_t word_t, mask_t;
  44     typedef uint64_t dword_t;
  45     typedef int16_t hsword_t;
  46     typedef int32_t sword_t;
  47     typedef int64_t dsword_t;
  48 #else
  49     #error "For now, libdecaf only supports 32- and 64-bit architectures."
  50 #endif
  51
  52 /* Scalar limbs are keyed off of the API word size instead of the arch word size. */
  53 #if DECAF_WORD_BITS == 64
  54     #define SC_LIMB(x) (x)
  55 #elif DECAF_WORD_BITS == 32
  56     #define SC_LIMB(x) ((uint32_t)x),(x>>32)
  57 #else
  58     #error "For now, libdecaf only supports 32- and 64-bit architectures."
  59 #endif
  60
  61 #ifdef __ARM_NEON__
  62     typedef uint32x4_t vecmask_t;
  63 #elif defined(__clang__)
  64     typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
  65     typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
  66     typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
  67     typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
  68     typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
  69     typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
  70     typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
  71     typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
  72     typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
  73     typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
  74     typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
  75 #else /* GCC, hopefully? */
  76     typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
  77     typedef int64_t  int64x2_t __attribute__((vector_size(16)));
  78     typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
  79     typedef int64_t  int64x4_t __attribute__((vector_size(32)));
  80     typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
  81     typedef int32_t  int32x4_t __attribute__((vector_size(16)));
  82     typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
  83     typedef int32_t  int32x2_t __attribute__((vector_size(8)));
  84     typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
  85     typedef int32_t  int32x8_t __attribute__((vector_size(32)));
  86     typedef word_t vecmask_t __attribute__((vector_size(32)));
  87 #endif
  88
  89 #if defined(__AVX2__)
  90     #define VECTOR_ALIGNED __attribute__((aligned(32)))
  91     typedef uint32x8_t big_register_t;
  92     typedef uint64x4_t uint64xn_t;
  93     typedef uint32x8_t uint32xn_t;
  94
  95     static ossl_inline big_register_t
  96     br_set_to_mask(mask_t x) {
  97         uint32_t y = (uint32_t)x;
  98         big_register_t ret = {y,y,y,y,y,y,y,y};
  99         return ret;
 100     }
 101 #elif defined(__SSE2__)
 102     #define VECTOR_ALIGNED __attribute__((aligned(16)))
 103     typedef uint32x4_t big_register_t;
 104     typedef uint64x2_t uint64xn_t;
 105     typedef uint32x4_t uint32xn_t;
 106
 107     static ossl_inline big_register_t
 108     br_set_to_mask(mask_t x) {
 109         uint32_t y = x;
 110         big_register_t ret = {y,y,y,y};
 111         return ret;
 112     }
 113 #elif defined(__ARM_NEON__)
 114     #define VECTOR_ALIGNED __attribute__((aligned(16)))
 115     typedef uint32x4_t big_register_t;
 116     typedef uint64x2_t uint64xn_t;
 117     typedef uint32x4_t uint32xn_t;
 118
 119     static ossl_inline big_register_t
 120     br_set_to_mask(mask_t x) {
 121         return vdupq_n_u32(x);
 122     }
 123 #elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
 124       || defined(__aarch64__)
 125     #define VECTOR_ALIGNED __attribute__((aligned(8)))
 126     typedef uint64_t big_register_t, uint64xn_t;
 127
 128     typedef uint32_t uint32xn_t;
 129     static ossl_inline big_register_t
 130     br_set_to_mask(mask_t x) {
 131         return (big_register_t)x;
 132     }
 133 #else
 134     #define VECTOR_ALIGNED __attribute__((aligned(4)))
 135     typedef uint64_t uint64xn_t;
 136     typedef uint32_t uint32xn_t;
 137     typedef uint32_t big_register_t;
 138
 139     static ossl_inline big_register_t
 140     br_set_to_mask(mask_t x) {
 141         return (big_register_t)x;
 142     }
 143 #endif
 144
 145 #if defined(__AVX2__)
 146     static ossl_inline big_register_t
 147     br_is_zero(big_register_t x) {
 148         return (big_register_t)(x == br_set_to_mask(0));
 149     }
 150 #elif defined(__SSE2__)
 151     static ossl_inline big_register_t
 152     br_is_zero(big_register_t x) {
 153         return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
 154         //return (big_register_t)(x == br_set_to_mask(0));
 155     }
 156 #elif defined(__ARM_NEON__)
 157     static ossl_inline big_register_t
 158     br_is_zero(big_register_t x) {
 159         return vceqq_u32(x,x^x);
 160     }
 161 #else
    /*
     * No SIMD branch selected, so big_register_t is a plain integer here:
     * br_is_zero is simply an alias for word_is_zero.
     */
    #define br_is_zero word_is_zero
 163 #endif
 164
 165 /* PERF: vectorize vs unroll */
 166 #ifdef __clang__
 167 #if 100*__clang_major__ + __clang_minor__ > 305
 168 #define UNROLL _Pragma("clang loop unroll(full)")
 169 #endif
 170 #endif
 171
 172 #ifndef UNROLL
 173 #define UNROLL
 174 #endif
 175
 176 /* The plan on booleans:
 177  *
 178  * The external interface uses decaf_bool_t, but this might be a different
 * size than our particular arch's word_t (and thus mask_t).  Also, the caller
 * isn't guaranteed to pass a true value as all-ones; any nonzero value may be
 * used.  So bool_to_mask converts word sizes and checks for nonzero.
 182  *
 183  * On the flip side, mask_t is always -1 or 0, but it might be a different size
 184  * than decaf_bool_t.
 185  *
 186  * On the third hand, we have success vs boolean types, but that's handled in
 187  * common.h: it converts between decaf_bool_t and decaf_error_t.
 188  */
 189 static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
 190     return (decaf_sword_t)(sword_t)m;
 191 }
 192
 193 static ossl_inline mask_t bool_to_mask (decaf_bool_t m) {
 194     /* On most arches this will be optimized to a simple cast. */
 195     mask_t ret = 0;
 196     unsigned int i;
 197
 198     unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
 199     if (limit < 1) limit = 1;
 200     for (i=0; i<limit; i++) {
 201         ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
 202     }
 203     return ret;
 204 }
 205
 206 static ossl_inline void ignore_result ( decaf_bool_t boo ) {
 207     (void)boo;
 208 }
 209
 210 #endif /* __WORD_H__ */