crypto/ec/curve448/word.h

   1 /* Copyright (c) 2014 Cryptography Research, Inc.
   2  * Released under the MIT License.  See LICENSE.txt for license information.
   3  */
   4
   5 #ifndef __WORD_H__
   6 #define __WORD_H__
   7
   8 #include <string.h>
   9
  10 #include <assert.h>
  11 #include <openssl/e_os2.h>
  12 #include "arch_intrinsics.h"
  13
  14 #include "curve448utils.h"
  15
  16 #ifndef _BSD_SOURCE
  17 #define _BSD_SOURCE 1
  18 #endif
  19
  20 #ifndef _DEFAULT_SOURCE
  21 #define _DEFAULT_SOURCE 1
  22 #endif
  23
  24 #include "portable_endian.h"
  25
  26 #include <stdlib.h>
  27
  28 #if defined(__ARM_NEON__)
  29 #include <arm_neon.h>
  30 #elif defined(__SSE2__)
  31     #if !defined(__GNUC__) || __clang__ || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
  32         #include <immintrin.h>
  33     #else
  34         #include <emmintrin.h>
  35     #endif
  36 #endif
  37
  38 #if (ARCH_WORD_BITS == 64)
  39     typedef uint64_t word_t, mask_t;
  40     typedef __uint128_t dword_t;
  41     typedef int32_t hsword_t;
  42     typedef int64_t sword_t;
  43     typedef __int128_t dsword_t;
  44 #elif (ARCH_WORD_BITS == 32)
  45     typedef uint32_t word_t, mask_t;
  46     typedef uint64_t dword_t;
  47     typedef int16_t hsword_t;
  48     typedef int32_t sword_t;
  49     typedef int64_t dsword_t;
  50 #else
  51     #error "For now, libdecaf only supports 32- and 64-bit architectures."
  52 #endif
  53
  54 /* Scalar limbs are keyed off of the API word size instead of the arch word size. */
  55 #if DECAF_WORD_BITS == 64
  56     #define SC_LIMB(x) (x##ull)
  57 #elif DECAF_WORD_BITS == 32
  58     #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
  59 #else
  60     #error "For now, libdecaf only supports 32- and 64-bit architectures."
  61 #endif
  62
  63 #ifdef __ARM_NEON__
  64     typedef uint32x4_t vecmask_t;
  65 #elif __clang__
  66     typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
  67     typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
  68     typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
  69     typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
  70     typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
  71     typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
  72     typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
  73     typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
  74     typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
  75     typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
  76     typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
  77 #else /* GCC, hopefully? */
  78     typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
  79     typedef int64_t  int64x2_t __attribute__((vector_size(16)));
  80     typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
  81     typedef int64_t  int64x4_t __attribute__((vector_size(32)));
  82     typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
  83     typedef int32_t  int32x4_t __attribute__((vector_size(16)));
  84     typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
  85     typedef int32_t  int32x2_t __attribute__((vector_size(8)));
  86     typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
  87     typedef int32_t  int32x8_t __attribute__((vector_size(32)));
  88     typedef word_t vecmask_t __attribute__((vector_size(32)));
  89 #endif
  90
  91 #if __AVX2__
  92     #define VECTOR_ALIGNED __attribute__((aligned(32)))
  93     typedef uint32x8_t big_register_t;
  94     typedef uint64x4_t uint64xn_t;
  95     typedef uint32x8_t uint32xn_t;
  96
  97     static ossl_inline big_register_t
  98     br_set_to_mask(mask_t x) {
  99         uint32_t y = (uint32_t)x;
 100         big_register_t ret = {y,y,y,y,y,y,y,y};
 101         return ret;
 102     }
 103 #elif __SSE2__
 104     #define VECTOR_ALIGNED __attribute__((aligned(16)))
 105     typedef uint32x4_t big_register_t;
 106     typedef uint64x2_t uint64xn_t;
 107     typedef uint32x4_t uint32xn_t;
 108
 109     static ossl_inline big_register_t
 110     br_set_to_mask(mask_t x) {
 111         uint32_t y = x;
 112         big_register_t ret = {y,y,y,y};
 113         return ret;
 114     }
 115 #elif __ARM_NEON__
 116     #define VECTOR_ALIGNED __attribute__((aligned(16)))
 117     typedef uint32x4_t big_register_t;
 118     typedef uint64x2_t uint64xn_t;
 119     typedef uint32x4_t uint32xn_t;
 120
 121     static ossl_inline big_register_t
 122     br_set_to_mask(mask_t x) {
 123         return vdupq_n_u32(x);
 124     }
 125 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
 126     #define VECTOR_ALIGNED __attribute__((aligned(8)))
 127     typedef uint64_t big_register_t, uint64xn_t;
 128
 129     typedef uint32_t uint32xn_t;
 130     static ossl_inline big_register_t
 131     br_set_to_mask(mask_t x) {
 132         return (big_register_t)x;
 133     }
 134 #else
 135     #define VECTOR_ALIGNED __attribute__((aligned(4)))
 136     typedef uint64_t uint64xn_t;
 137     typedef uint32_t uint32xn_t;
 138     typedef uint32_t big_register_t;
 139
 140     static ossl_inline big_register_t
 141     br_set_to_mask(mask_t x) {
 142         return (big_register_t)x;
 143     }
 144 #endif
 145
 146 #if __AVX2__
 147     static ossl_inline big_register_t
 148     br_is_zero(big_register_t x) {
 149         return (big_register_t)(x == br_set_to_mask(0));
 150     }
 151 #elif __SSE2__
 152     static ossl_inline big_register_t
 153     br_is_zero(big_register_t x) {
 154         return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
 155         //return (big_register_t)(x == br_set_to_mask(0));
 156     }
 157 #elif __ARM_NEON__
 158     static ossl_inline big_register_t
 159     br_is_zero(big_register_t x) {
 160         return vceqq_u32(x,x^x);
 161     }
 162 #else
 163     #define br_is_zero word_is_zero
 164 #endif
 165
 166 /* PERF: vectorize vs unroll */
 167 #ifdef __clang__
 168 #if 100*__clang_major__ + __clang_minor__ > 305
 169 #define UNROLL _Pragma("clang loop unroll(full)")
 170 #endif
 171 #endif
 172
 173 #ifndef UNROLL
 174 #define UNROLL
 175 #endif
 176
 177 /* The plan on booleans:
 178  *
 * The external interface uses decaf_bool_t, but this might be a different
 * size than our particular arch's word_t (and thus mask_t).  Also, the caller
 * isn't guaranteed to pass "true" as all-ones; any nonzero value must count
 * as true.  So bool_to_mask converts word sizes and checks for nonzero.
 183  *
 184  * On the flip side, mask_t is always -1 or 0, but it might be a different size
 185  * than decaf_bool_t.
 186  *
 187  * On the third hand, we have success vs boolean types, but that's handled in
 188  * common.h: it converts between decaf_bool_t and decaf_error_t.
 189  */
 190 static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
 191     return (decaf_sword_t)(sword_t)m;
 192 }
 193
 194 static ossl_inline mask_t bool_to_mask (decaf_bool_t m) {
 195     /* On most arches this will be optimized to a simple cast. */
 196     mask_t ret = 0;
 197     unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
 198     if (limit < 1) limit = 1;
 199     for (unsigned int i=0; i<limit; i++) {
 200         ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
 201     }
 202     return ret;
 203 }
 204
 205 static ossl_inline void ignore_result ( decaf_bool_t boo ) {
 206     (void)boo;
 207 }
 208
 209 #endif /* __WORD_H__ */