crypto/ec/curve448/word.h

   1 /*
   2  * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3  * Copyright 2014 Cryptography Research, Inc.
   4  *
   5  * Licensed under the OpenSSL license (the "License").  You may not use
   6  * this file except in compliance with the License.  You can obtain a copy
   7  * in the file LICENSE in the source distribution or at
   8  * https://www.openssl.org/source/license.html
   9  *
  10  * Originally written by Mike Hamburg
  11  */
  12
  13 #ifndef __WORD_H__
  14 # define __WORD_H__
  15
  16 # include <string.h>
  17
  18 # include <assert.h>
  19 # include <openssl/e_os2.h>
  20 # include "arch_intrinsics.h"
  21
  22 # include "curve448utils.h"
  23
  24 # ifndef _BSD_SOURCE
  25 #  define _BSD_SOURCE 1
  26 # endif
  27
  28 # ifndef _DEFAULT_SOURCE
  29 #  define _DEFAULT_SOURCE 1
  30 # endif
  31
  32 # include <stdlib.h>
  33
  34 # if defined(__ARM_NEON__)
  35 #  include <arm_neon.h>
  36 # elif defined(__SSE2__)
  37 #  if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 \
  38       || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
  39 #   include <immintrin.h>
  40 #  else
  41 #   include <emmintrin.h>
  42 #  endif
  43 # endif
  44
  45 # if (ARCH_WORD_BITS == 64)
  46 typedef uint64_t word_t, mask_t;
  47 typedef __uint128_t dword_t;
  48 typedef int32_t hsword_t;
  49 typedef int64_t sword_t;
  50 typedef __int128_t dsword_t;
  51 # elif (ARCH_WORD_BITS == 32)
  52 typedef uint32_t word_t, mask_t;
  53 typedef uint64_t dword_t;
  54 typedef int16_t hsword_t;
  55 typedef int32_t sword_t;
  56 typedef int64_t dsword_t;
  57 # else
  58 #  error "For now, we only support 32- and 64-bit architectures."
  59 # endif
  60
  61 /*
  62  * Scalar limbs are keyed off of the API word size instead of the arch word
  63  * size.
  64  */
  65 # if C448_WORD_BITS == 64
  66 #  define SC_LIMB(x) (x)
  67 # elif C448_WORD_BITS == 32
  68 #  define SC_LIMB(x) ((uint32_t)x),(x>>32)
  69 # else
  70 #  error "For now we only support 32- and 64-bit architectures."
  71 # endif
  72
  73 # ifdef __ARM_NEON__
  74 typedef uint32x4_t vecmask_t;
  75 # elif defined(__clang__)
  76 typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
  77 typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
  78 typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
  79 typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
  80 typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
  81 typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
  82 typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
  83 typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
  84 typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
  85 typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
  86 typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
  87 # elif defined(__GNUC__) \
  88        && (__GNUC__ >= 4 || (__GNUC__== 3 && __GNUC_MINOR__ >= 1))
  89 typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
  90 typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
  91 typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
  92 typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
  93 typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
  94 typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
  95 typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
  96 typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
  97 typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
  98 typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
  99 typedef word_t vecmask_t __attribute__ ((vector_size(32)));
 100 # endif
 101
 102 # if defined(__AVX2__)
 103 #  define VECTOR_ALIGNED __attribute__((aligned(32)))
 104 typedef uint32x8_t big_register_t;
 105 typedef uint64x4_t uint64xn_t;
 106 typedef uint32x8_t uint32xn_t;
 107
 108 static ossl_inline big_register_t br_set_to_mask(mask_t x)
 109 {
 110     uint32_t y = (uint32_t)x;
 111     big_register_t ret = { y, y, y, y, y, y, y, y };
 112     return ret;
 113 }
 114 # elif defined(__SSE2__)
 115 #  define VECTOR_ALIGNED __attribute__((aligned(16)))
 116 typedef uint32x4_t big_register_t;
 117 typedef uint64x2_t uint64xn_t;
 118 typedef uint32x4_t uint32xn_t;
 119
 120 static ossl_inline big_register_t br_set_to_mask(mask_t x)
 121 {
 122     uint32_t y = x;
 123     big_register_t ret = { y, y, y, y };
 124     return ret;
 125 }
 126 # elif defined(__ARM_NEON__)
 127 #  define VECTOR_ALIGNED __attribute__((aligned(16)))
 128 typedef uint32x4_t big_register_t;
 129 typedef uint64x2_t uint64xn_t;
 130 typedef uint32x4_t uint32xn_t;
 131
 132 static ossl_inline big_register_t br_set_to_mask(mask_t x)
 133 {
 134     return vdupq_n_u32(x);
 135 }
 136 # elif !defined(_MSC_VER) \
 137        && (defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
 138            || defined(__aarch64__))
 139 #  define VECTOR_ALIGNED __attribute__((aligned(8)))
 140 typedef uint64_t big_register_t, uint64xn_t;
 141
 142 typedef uint32_t uint32xn_t;
 143 static ossl_inline big_register_t br_set_to_mask(mask_t x)
 144 {
 145     return (big_register_t) x;
 146 }
 147 # else
 148 #  ifdef __GNUC__
 149 #   define VECTOR_ALIGNED __attribute__((aligned(4)))
 150 #  else
 151 /*
 152  * This shouldn't be a problem because a big_register_t isn't actually a vector
 153  * type anyway in this case.
 154  */
 155 #   define VECTOR_ALIGNED
 156 #  endif
 157 typedef uint64_t uint64xn_t;
 158 typedef uint32_t uint32xn_t;
 159 typedef uint32_t big_register_t;
 160
 161 static ossl_inline big_register_t br_set_to_mask(mask_t x)
 162 {
 163     return (big_register_t) x;
 164 }
 165 # endif
 166
 167 # if defined(__AVX2__)
 168 static ossl_inline big_register_t br_is_zero(big_register_t x)
 169 {
 170     return (big_register_t) (x == br_set_to_mask(0));
 171 }
 172 # elif defined(__SSE2__)
 173 static ossl_inline big_register_t br_is_zero(big_register_t x)
 174 {
 175     return (big_register_t) _mm_cmpeq_epi32((__m128i) x, _mm_setzero_si128());
 176 }
 177 # elif defined(__ARM_NEON__)
 178 static ossl_inline big_register_t br_is_zero(big_register_t x)
 179 {
 180     return vceqq_u32(x, x ^ x);
 181 }
 182 # else
 183 #  define br_is_zero word_is_zero
 184 # endif
 185
 186 /* PERF: vectorize vs unroll */
 187 # ifdef __clang__
 188 #  if 100*__clang_major__ + __clang_minor__ > 305
 189 #   define UNROLL _Pragma("clang loop unroll(full)")
 190 #  endif
 191 # endif
 192
 193 # ifndef UNROLL
 194 #  define UNROLL
 195 # endif
 196
 197 /*
 198  * The plan on booleans: The external interface uses c448_bool_t, but this
 199  * might be a different size than our particular arch's word_t (and thus
 200  * mask_t).  Also, the caller isn't guaranteed to pass it as nonzero.  So
 201  * bool_to_mask converts word sizes and checks nonzero. On the flip side,
 202  * mask_t is always -1 or 0, but it might be a different size than
 203  * c448_bool_t. On the third hand, we have success vs boolean types, but
 204  * that's handled in common.h: it converts between c448_bool_t and
 205  * c448_error_t.
 206  */
 207 static ossl_inline c448_bool_t mask_to_bool(mask_t m)
 208 {
 209     return (c448_sword_t)(sword_t)m;
 210 }
 211
 212 static ossl_inline mask_t bool_to_mask(c448_bool_t m)
 213 {
 214     /* On most arches this will be optimized to a simple cast. */
 215     mask_t ret = 0;
 216     unsigned int i;
 217     unsigned int limit = sizeof(c448_bool_t) / sizeof(mask_t);
 218
 219     if (limit < 1)
 220         limit = 1;
 221     for (i = 0; i < limit; i++)
 222         ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
 223
 224     return ret;
 225 }
 226
 227 static ossl_inline void ignore_result(c448_bool_t boo)
 228 {
 229     (void)boo;
 230 }
 231
 232 #endif                          /* __WORD_H__ */