crypto/ec/curve448/word.h

   1 /*
   2  * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3  * Copyright 2014 Cryptography Research, Inc.
   4  *
   5  * Licensed under the OpenSSL license (the "License").  You may not use
   6  * this file except in compliance with the License.  You can obtain a copy
   7  * in the file LICENSE in the source distribution or at
   8  * https://www.openssl.org/source/license.html
   9  *
  10  * Originally written by Mike Hamburg
  11  */
  12
  13 #ifndef __WORD_H__
  14 #define __WORD_H__
  15
  16 #include <string.h>
  17
  18 #include <assert.h>
  19 #include <openssl/e_os2.h>
  20 #include "arch_intrinsics.h"
  21
  22 #include "curve448utils.h"
  23
  24 #ifndef _BSD_SOURCE
  25 #define _BSD_SOURCE 1
  26 #endif
  27
  28 #ifndef _DEFAULT_SOURCE
  29 #define _DEFAULT_SOURCE 1
  30 #endif
  31
  32 #include <stdlib.h>
  33
  34 #if defined(__ARM_NEON__)
  35 #include <arm_neon.h>
  36 #elif defined(__SSE2__)
  37     #if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
  38         #include <immintrin.h>
  39     #else
  40         #include <emmintrin.h>
  41     #endif
  42 #endif
  43
  44 #if (ARCH_WORD_BITS == 64)
  45     typedef uint64_t word_t, mask_t;
  46     typedef __uint128_t dword_t;
  47     typedef int32_t hsword_t;
  48     typedef int64_t sword_t;
  49     typedef __int128_t dsword_t;
  50 #elif (ARCH_WORD_BITS == 32)
  51     typedef uint32_t word_t, mask_t;
  52     typedef uint64_t dword_t;
  53     typedef int16_t hsword_t;
  54     typedef int32_t sword_t;
  55     typedef int64_t dsword_t;
  56 #else
  57     #error "For now, libdecaf only supports 32- and 64-bit architectures."
  58 #endif
  59
  60 /* Scalar limbs are keyed off of the API word size instead of the arch word size. */
  61 #if DECAF_WORD_BITS == 64
  62     #define SC_LIMB(x) (x)
  63 #elif DECAF_WORD_BITS == 32
  64     #define SC_LIMB(x) ((uint32_t)x),(x>>32)
  65 #else
  66     #error "For now, libdecaf only supports 32- and 64-bit architectures."
  67 #endif
  68
  69 #ifdef __ARM_NEON__
  70     typedef uint32x4_t vecmask_t;
  71 #elif defined(__clang__)
  72     typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
  73     typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
  74     typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
  75     typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
  76     typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
  77     typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
  78     typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
  79     typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
  80     typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
  81     typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
  82     typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
  83 #else /* GCC, hopefully? */
  84     typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
  85     typedef int64_t  int64x2_t __attribute__((vector_size(16)));
  86     typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
  87     typedef int64_t  int64x4_t __attribute__((vector_size(32)));
  88     typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
  89     typedef int32_t  int32x4_t __attribute__((vector_size(16)));
  90     typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
  91     typedef int32_t  int32x2_t __attribute__((vector_size(8)));
  92     typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
  93     typedef int32_t  int32x8_t __attribute__((vector_size(32)));
  94     typedef word_t vecmask_t __attribute__((vector_size(32)));
  95 #endif
  96
  97 #if defined(__AVX2__)
  98     #define VECTOR_ALIGNED __attribute__((aligned(32)))
  99     typedef uint32x8_t big_register_t;
 100     typedef uint64x4_t uint64xn_t;
 101     typedef uint32x8_t uint32xn_t;
 102
 103     static ossl_inline big_register_t
 104     br_set_to_mask(mask_t x) {
 105         uint32_t y = (uint32_t)x;
 106         big_register_t ret = {y,y,y,y,y,y,y,y};
 107         return ret;
 108     }
 109 #elif defined(__SSE2__)
 110     #define VECTOR_ALIGNED __attribute__((aligned(16)))
 111     typedef uint32x4_t big_register_t;
 112     typedef uint64x2_t uint64xn_t;
 113     typedef uint32x4_t uint32xn_t;
 114
 115     static ossl_inline big_register_t
 116     br_set_to_mask(mask_t x) {
 117         uint32_t y = x;
 118         big_register_t ret = {y,y,y,y};
 119         return ret;
 120     }
 121 #elif defined(__ARM_NEON__)
 122     #define VECTOR_ALIGNED __attribute__((aligned(16)))
 123     typedef uint32x4_t big_register_t;
 124     typedef uint64x2_t uint64xn_t;
 125     typedef uint32x4_t uint32xn_t;
 126
 127     static ossl_inline big_register_t
 128     br_set_to_mask(mask_t x) {
 129         return vdupq_n_u32(x);
 130     }
 131 #elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
 132       || defined(__aarch64__)
 133     #define VECTOR_ALIGNED __attribute__((aligned(8)))
 134     typedef uint64_t big_register_t, uint64xn_t;
 135
 136     typedef uint32_t uint32xn_t;
 137     static ossl_inline big_register_t
 138     br_set_to_mask(mask_t x) {
 139         return (big_register_t)x;
 140     }
 141 #else
 142     #define VECTOR_ALIGNED __attribute__((aligned(4)))
 143     typedef uint64_t uint64xn_t;
 144     typedef uint32_t uint32xn_t;
 145     typedef uint32_t big_register_t;
 146
 147     static ossl_inline big_register_t
 148     br_set_to_mask(mask_t x) {
 149         return (big_register_t)x;
 150     }
 151 #endif
 152
 153 #if defined(__AVX2__)
 154     static ossl_inline big_register_t
 155     br_is_zero(big_register_t x) {
 156         return (big_register_t)(x == br_set_to_mask(0));
 157     }
 158 #elif defined(__SSE2__)
 159     static ossl_inline big_register_t
 160     br_is_zero(big_register_t x) {
 161         return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
 162         //return (big_register_t)(x == br_set_to_mask(0));
 163     }
 164 #elif defined(__ARM_NEON__)
 165     static ossl_inline big_register_t
 166     br_is_zero(big_register_t x) {
 167         return vceqq_u32(x,x^x);
 168     }
 169 #else
 170     #define br_is_zero word_is_zero
 171 #endif
 172
 173 /* PERF: vectorize vs unroll */
 174 #ifdef __clang__
 175 #if 100*__clang_major__ + __clang_minor__ > 305
 176 #define UNROLL _Pragma("clang loop unroll(full)")
 177 #endif
 178 #endif
 179
 180 #ifndef UNROLL
 181 #define UNROLL
 182 #endif
 183
 184 /* The plan on booleans:
 185  *
 186  * The external interface uses decaf_bool_t, but this might be a different
 187  * size than our particular arch's word_t (and thus mask_t).  Also, the caller
 188  * isn't guaranteed to pass it as nonzero.  So bool_to_mask converts word sizes
 189  * and checks nonzero.
 190  *
 191  * On the flip side, mask_t is always -1 or 0, but it might be a different size
 192  * than decaf_bool_t.
 193  *
 194  * On the third hand, we have success vs boolean types, but that's handled in
 195  * common.h: it converts between decaf_bool_t and decaf_error_t.
 196  */
 197 static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
 198     return (decaf_sword_t)(sword_t)m;
 199 }
 200
 201 static ossl_inline mask_t bool_to_mask (decaf_bool_t m) {
 202     /* On most arches this will be optimized to a simple cast. */
 203     mask_t ret = 0;
 204     unsigned int i;
 205
 206     unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
 207     if (limit < 1) limit = 1;
 208     for (i=0; i<limit; i++) {
 209         ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
 210     }
 211     return ret;
 212 }
 213
 214 static ossl_inline void ignore_result ( decaf_bool_t boo ) {
 215     (void)boo;
 216 }
 217
 218 #endif /* __WORD_H__ */