X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fec%2Fcurve448%2Fword.h;h=7d47a58c327edb23e3690f8c95b8d22d9d0c3fcb;hp=9f62654b6e24532cecf6bdb56c85c439f494a438;hb=52a9587c78a135ff200b8c92f8aad7ea1bd4de75;hpb=094c071cbf3bca19eee73ccb8dfc0f7498f5d8e1;ds=sidebyside

diff --git a/crypto/ec/curve448/word.h b/crypto/ec/curve448/word.h
index 9f62654b6e..7d47a58c32 100644
--- a/crypto/ec/curve448/word.h
+++ b/crypto/ec/curve448/word.h
@@ -1,209 +1,232 @@
-/* Copyright (c) 2014 Cryptography Research, Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
+/*
+ * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
  */
 
 #ifndef __WORD_H__
-#define __WORD_H__
-
-#include <string.h>
-
-#include <assert.h>
-#include <stdint.h>
-#include "arch_intrinsics.h"
-
-#include "curve448utils.h"
-
-#ifndef _BSD_SOURCE
-#define _BSD_SOURCE 1
-#endif
-
-#ifndef _DEFAULT_SOURCE
-#define _DEFAULT_SOURCE 1
-#endif
-
-#include <stdlib.h>
-
-#if defined(__ARM_NEON__)
-#include <arm_neon.h>
-#elif defined(__SSE2__)
-    #if !defined(__GNUC__) || __clang__ || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
-        #include <immintrin.h>
-    #else
-        #include <emmintrin.h>
-    #endif
-#endif
-
-#if (ARCH_WORD_BITS == 64)
-    typedef uint64_t word_t, mask_t;
-    typedef __uint128_t dword_t;
-    typedef int32_t hsword_t;
-    typedef int64_t sword_t;
-    typedef __int128_t dsword_t;
-#elif (ARCH_WORD_BITS == 32)
-    typedef uint32_t word_t, mask_t;
-    typedef uint64_t dword_t;
-    typedef int16_t hsword_t;
-    typedef int32_t sword_t;
-    typedef int64_t dsword_t;
-#else
-    #error "For now, libdecaf only supports 32- and 64-bit architectures."
-#endif
-
-/* Scalar limbs are keyed off of the API word size instead of the arch word size. */
-#if DECAF_WORD_BITS == 64
-    #define SC_LIMB(x) (x)
-#elif DECAF_WORD_BITS == 32
-    #define SC_LIMB(x) ((uint32_t)x),(x>>32)
-#else
-    #error "For now, libdecaf only supports 32- and 64-bit architectures."
-#endif
-
-#ifdef __ARM_NEON__
-    typedef uint32x4_t vecmask_t;
-#elif __clang__
-    typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
-    typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
-    typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
-    typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
-    typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
-    typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
-    typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
-    typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
-    typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
-    typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
-    typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
-#else /* GCC, hopefully? */
-    typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
-    typedef int64_t int64x2_t __attribute__((vector_size(16)));
-    typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
-    typedef int64_t int64x4_t __attribute__((vector_size(32)));
-    typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
-    typedef int32_t int32x4_t __attribute__((vector_size(16)));
-    typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
-    typedef int32_t int32x2_t __attribute__((vector_size(8)));
-    typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
-    typedef int32_t int32x8_t __attribute__((vector_size(32)));
-    typedef word_t vecmask_t __attribute__((vector_size(32)));
-#endif
-
-#if __AVX2__
-    #define VECTOR_ALIGNED __attribute__((aligned(32)))
-    typedef uint32x8_t big_register_t;
-    typedef uint64x4_t uint64xn_t;
-    typedef uint32x8_t uint32xn_t;
-
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        uint32_t y = (uint32_t)x;
-        big_register_t ret = {y,y,y,y,y,y,y,y};
-        return ret;
-    }
-#elif __SSE2__
-    #define VECTOR_ALIGNED __attribute__((aligned(16)))
-    typedef uint32x4_t big_register_t;
-    typedef uint64x2_t uint64xn_t;
-    typedef uint32x4_t uint32xn_t;
-
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        uint32_t y = x;
-        big_register_t ret = {y,y,y,y};
-        return ret;
-    }
-#elif __ARM_NEON__
-    #define VECTOR_ALIGNED __attribute__((aligned(16)))
-    typedef uint32x4_t big_register_t;
-    typedef uint64x2_t uint64xn_t;
-    typedef uint32x4_t uint32xn_t;
-
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        return vdupq_n_u32(x);
-    }
-#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
-    #define VECTOR_ALIGNED __attribute__((aligned(8)))
-    typedef uint64_t big_register_t, uint64xn_t;
-
-    typedef uint32_t uint32xn_t;
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        return (big_register_t)x;
-    }
-#else
-    #define VECTOR_ALIGNED __attribute__((aligned(4)))
-    typedef uint64_t uint64xn_t;
-    typedef uint32_t uint32xn_t;
-    typedef uint32_t big_register_t;
-
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        return (big_register_t)x;
-    }
-#endif
-
-#if __AVX2__
-    static ossl_inline big_register_t
-    br_is_zero(big_register_t x) {
-        return (big_register_t)(x == br_set_to_mask(0));
-    }
-#elif __SSE2__
-    static ossl_inline big_register_t
-    br_is_zero(big_register_t x) {
-        return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
-        //return (big_register_t)(x == br_set_to_mask(0));
-    }
-#elif __ARM_NEON__
-    static ossl_inline big_register_t
-    br_is_zero(big_register_t x) {
-        return vceqq_u32(x,x^x);
-    }
-#else
-    #define br_is_zero word_is_zero
-#endif
-
-/* PERF: vectorize vs unroll */
-#ifdef __clang__
-#if 100*__clang_major__ + __clang_minor__ > 305
-#define UNROLL _Pragma("clang loop unroll(full)")
-#endif
-#endif
+# define __WORD_H__
+
+# include <string.h>
+
+# include <assert.h>
+# include <stdint.h>
+# include "arch_intrinsics.h"
+
+# include "curve448utils.h"
+
+# ifndef _BSD_SOURCE
+#  define _BSD_SOURCE 1
+# endif
+
+# ifndef _DEFAULT_SOURCE
+#  define _DEFAULT_SOURCE 1
+# endif
+
+# include <stdlib.h>
+
+# if defined(__ARM_NEON__)
+#  include <arm_neon.h>
+# elif defined(__SSE2__)
+#  if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 \
+     || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
+#   include <immintrin.h>
+#  else
+#   include <emmintrin.h>
+#  endif
+# endif
+
+# if (ARCH_WORD_BITS == 64)
+typedef uint64_t word_t, mask_t;
+typedef __uint128_t dword_t;
+typedef int32_t hsword_t;
+typedef int64_t sword_t;
+typedef __int128_t dsword_t;
+# elif (ARCH_WORD_BITS == 32)
+typedef uint32_t word_t, mask_t;
+typedef uint64_t dword_t;
+typedef int16_t hsword_t;
+typedef int32_t sword_t;
+typedef int64_t dsword_t;
+# else
+#  error "For now, we only support 32- and 64-bit architectures."
+# endif
+
+/*
+ * Scalar limbs are keyed off of the API word size instead of the arch word
+ * size.
+ */
+# if C448_WORD_BITS == 64
+#  define SC_LIMB(x) (x)
+# elif C448_WORD_BITS == 32
+#  define SC_LIMB(x) ((uint32_t)x),(x>>32)
+# else
+#  error "For now we only support 32- and 64-bit architectures."
+# endif
+
+# ifdef __ARM_NEON__
+typedef uint32x4_t vecmask_t;
+# elif defined(__clang__)
+typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
+typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
+typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
+typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
+typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
+typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
+typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
+typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
+typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
+typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
+typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
+# elif defined(__GNUC__) \
+    && (__GNUC__ >= 4 || (__GNUC__== 3 && __GNUC_MINOR__ >= 1))
+typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
+typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
+typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
+typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
+typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
+typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
+typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
+typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
+typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
+typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
+typedef word_t vecmask_t __attribute__ ((vector_size(32)));
+# endif
+
+# if defined(__AVX2__)
+#  define VECTOR_ALIGNED __attribute__((aligned(32)))
+typedef uint32x8_t big_register_t;
+typedef uint64x4_t uint64xn_t;
+typedef uint32x8_t uint32xn_t;
+
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    uint32_t y = (uint32_t)x;
+    big_register_t ret = { y, y, y, y, y, y, y, y };
+    return ret;
+}
+# elif defined(__SSE2__)
+#  define VECTOR_ALIGNED __attribute__((aligned(16)))
+typedef uint32x4_t big_register_t;
+typedef uint64x2_t uint64xn_t;
+typedef uint32x4_t uint32xn_t;
+
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    uint32_t y = x;
+    big_register_t ret = { y, y, y, y };
+    return ret;
+}
+# elif defined(__ARM_NEON__)
+#  define VECTOR_ALIGNED __attribute__((aligned(16)))
+typedef uint32x4_t big_register_t;
+typedef uint64x2_t uint64xn_t;
+typedef uint32x4_t uint32xn_t;
+
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    return vdupq_n_u32(x);
+}
+# elif !defined(_MSC_VER) \
+    && (defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
+        || defined(__aarch64__))
+#  define VECTOR_ALIGNED __attribute__((aligned(8)))
+typedef uint64_t big_register_t, uint64xn_t;
+
+typedef uint32_t uint32xn_t;
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    return (big_register_t) x;
+}
+# else
+#  ifdef __GNUC__
+#   define VECTOR_ALIGNED __attribute__((aligned(4)))
+#  else
+/*
+ * This shouldn't be a problem because a big_register_t isn't actually a vector
+ * type anyway in this case.
+ */
+#   define VECTOR_ALIGNED
+#  endif
+typedef uint64_t uint64xn_t;
+typedef uint32_t uint32xn_t;
+typedef uint32_t big_register_t;
+
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    return (big_register_t) x;
+}
+# endif
 
-#ifndef UNROLL
-#define UNROLL
-#endif
+# if defined(__AVX2__)
+static ossl_inline big_register_t br_is_zero(big_register_t x)
+{
+    return (big_register_t) (x == br_set_to_mask(0));
+}
+# elif defined(__SSE2__)
+static ossl_inline big_register_t br_is_zero(big_register_t x)
+{
+    return (big_register_t) _mm_cmpeq_epi32((__m128i) x, _mm_setzero_si128());
+}
+# elif defined(__ARM_NEON__)
+static ossl_inline big_register_t br_is_zero(big_register_t x)
+{
+    return vceqq_u32(x, x ^ x);
+}
+# else
+#  define br_is_zero word_is_zero
+# endif
 
-/* The plan on booleans:
- *
- * The external interface uses decaf_bool_t, but this might be a different
- * size than our particular arch's word_t (and thus mask_t). Also, the caller
- * isn't guaranteed to pass it as nonzero. So bool_to_mask converts word sizes
- * and checks nonzero.
- *
- * On the flip side, mask_t is always -1 or 0, but it might be a different size
- * than decaf_bool_t.
- *
- * On the third hand, we have success vs boolean types, but that's handled in
- * common.h: it converts between decaf_bool_t and decaf_error_t.
+/* PERF: vectorize vs unroll */
+# ifdef __clang__
+#  if 100*__clang_major__ + __clang_minor__ > 305
+#   define UNROLL _Pragma("clang loop unroll(full)")
+#  endif
+# endif
+
+# ifndef UNROLL
+#  define UNROLL
+# endif
+
+/*
+ * The plan on booleans: The external interface uses c448_bool_t, but this
+ * might be a different size than our particular arch's word_t (and thus
+ * mask_t). Also, the caller isn't guaranteed to pass it as nonzero. So
+ * bool_to_mask converts word sizes and checks nonzero. On the flip side,
+ * mask_t is always -1 or 0, but it might be a different size than
+ * c448_bool_t. On the third hand, we have success vs boolean types, but
+ * that's handled in common.h: it converts between c448_bool_t and
+ * c448_error_t.
  */
-static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
-    return (decaf_sword_t)(sword_t)m;
+static ossl_inline c448_bool_t mask_to_bool(mask_t m)
+{
+    return (c448_sword_t)(sword_t)m;
 }
 
-static ossl_inline mask_t bool_to_mask (decaf_bool_t m) {
+static ossl_inline mask_t bool_to_mask(c448_bool_t m)
+{
     /* On most arches this will be optimized to a simple cast. */
     mask_t ret = 0;
     unsigned int i;
+    unsigned int limit = sizeof(c448_bool_t) / sizeof(mask_t);
+
+    if (limit < 1)
+        limit = 1;
+    for (i = 0; i < limit; i++)
+        ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
-    unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
-    if (limit < 1) limit = 1;
-    for (i=0; i<limit; i++) {
-        ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
-    }
     return ret;
 }
 
-static ossl_inline void ignore_result ( decaf_bool_t boo ) {
+static ossl_inline void ignore_result(c448_bool_t boo)
+{
    (void)boo;
 }
 
-#endif /* __WORD_H__ */
+#endif                          /* __WORD_H__ */
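
As a quick illustration of the boolean/mask plan described in the comment above, and of how SC_LIMB splits a scalar constant on 32-bit builds, here is a standalone sketch that is not part of the patch. It models only the 64-bit branch of word.h with plain C99 types: word_is_zero below is a simplified stand-in with the same all-ones-iff-zero contract as the helper in arch_intrinsics.h, and SC_LIMB32 is a hypothetical local copy of the C448_WORD_BITS == 32 form of SC_LIMB.

/*
 * Standalone sketch (not part of the patch above).  It mirrors the 64-bit
 * branch of word.h with plain C99 types; word_is_zero is a simplified
 * stand-in for the helper in arch_intrinsics.h (same contract: all-ones
 * iff the argument is zero), and SC_LIMB32 is a hypothetical local copy
 * of the C448_WORD_BITS == 32 form of SC_LIMB.
 */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

typedef uint64_t word_t, mask_t;   /* ARCH_WORD_BITS == 64 branch */
typedef int64_t sword_t;
typedef uint64_t c448_bool_t;      /* API boolean: any nonzero value */
typedef int64_t c448_sword_t;

/* All-ones if a == 0, else 0 (simplified; the real one avoids branches). */
static mask_t word_is_zero(word_t a)
{
    return a == 0 ? ~(mask_t)0 : 0;
}

/* Same shape as bool_to_mask in word.h: any nonzero bool -> all-ones mask. */
static mask_t bool_to_mask(c448_bool_t m)
{
    mask_t ret = 0;
    unsigned int i;
    unsigned int limit = sizeof(c448_bool_t) / sizeof(mask_t);

    if (limit < 1)
        limit = 1;
    for (i = 0; i < limit; i++)
        ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
    return ret;
}

/* mask_to_bool: sign-extend the mask back to the API boolean width. */
static c448_bool_t mask_to_bool(mask_t m)
{
    return (c448_bool_t)(c448_sword_t)(sword_t)m;
}

/* On a 32-bit build SC_LIMB(x) expands to two 32-bit limbs, low then high. */
#define SC_LIMB32(x) ((uint32_t)(x)), ((uint32_t)((x) >> 32))

int main(void)
{
    uint32_t limbs[2] = { SC_LIMB32(UINT64_C(0x0123456789abcdef)) };

    printf("bool_to_mask(7)  = %016" PRIx64 "\n", bool_to_mask(7));
    printf("bool_to_mask(0)  = %016" PRIx64 "\n", bool_to_mask(0));
    printf("mask_to_bool(-1) = %016" PRIx64 "\n", mask_to_bool(~(mask_t)0));
    printf("SC_LIMB32 low = %08" PRIx32 ", high = %08" PRIx32 "\n",
           limbs[0], limbs[1]);
    return 0;
}

Built with a C99 compiler, the sketch prints an all-ones mask for any nonzero c448_bool_t and zero for zero, and mask_to_bool sign-extends the mask back to the API width, which is why callers never need to normalise their booleans before passing them in.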