crypto/ec/ecp_nistp224.c

   1 /*
   2  * Copyright 2010-2019 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  28  *
  29  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  30  * and Adam Langley's public domain 64-bit C implementation of curve25519
  31  */
  32
  33 #include <openssl/opensslconf.h>
  34 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  35 NON_EMPTY_TRANSLATION_UNIT
  36 #else
  37
  38 # include <stdint.h>
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
/*
 * 128-bit unsigned integer support. The test below accepts GCC 3.1 or later
 * (or any compiler that reports such a version through __GNUC__); every other
 * compiler stops at the #error.
 */
# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Need GCC 3.1 or later to define type uint128_t"
# endif

/* Short aliases for the fixed-width types from <stdint.h>. */
typedef uint8_t u8;
typedef uint64_t u64;
  53
  54 /******************************************************************************/
  55 /*-
  56  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  57  *
  58  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  59  * using 64-bit coefficients called 'limbs',
  60  * and sometimes (for multiplication results) as
  61  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  62  * using 128-bit coefficients called 'widelimbs'.
  63  * A 4-limb representation is an 'felem';
  64  * a 7-widelimb representation is a 'widefelem'.
  65  * Even within felems, bits of adjacent limbs overlap, and we don't always
  66  * reduce the representations: we ensure that inputs to each felem
  67  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  68  * and fit into a 128-bit word without overflow. The coefficients are then
  69  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  70  * We only reduce to the unique minimal representation at the end of the
  71  * computation.
  72  */
  73
/* One 64-bit coefficient, and one 128-bit coefficient for products. */
typedef uint64_t limb;
typedef uint128_t widelimb;

/* Four limbs form a field element; seven widelimbs hold an unreduced product. */
typedef limb felem[4];
typedef widelimb widefelem[7];
  79
/*
 * Field element represented as a byte array. 28*8 = 224 bits is also the
 * group order size for the elliptic curve, and we also use this type for
 * scalars for point multiplication.
 */
typedef u8 felem_bytearray[28];
  86
/*
 * Curve constants as 28-byte strings, most significant byte first (the first
 * entry spells p = 2^224 - 2^96 + 1 that way). In order: the field prime p,
 * the coefficients a and b, and the coordinates x and y of the generator.
 */
static const felem_bytearray nistp224_curve_params[5] = {
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
    {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
     0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
     0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
    {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
     0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
     0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
    {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
     0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
     0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
};
 104
 105 /*-
 106  * Precomputed multiples of the standard generator
 107  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 108  * (0 for the point at infinity).
 109  * For each field element, slice a_0 is word 0, etc.
 110  *
 111  * The table has 2 * 16 elements, starting with the following:
 112  * index | bits    | point
 113  * ------+---------+------------------------------
 114  *     0 | 0 0 0 0 | 0G
 115  *     1 | 0 0 0 1 | 1G
 116  *     2 | 0 0 1 0 | 2^56G
 117  *     3 | 0 0 1 1 | (2^56 + 1)G
 118  *     4 | 0 1 0 0 | 2^112G
 119  *     5 | 0 1 0 1 | (2^112 + 1)G
 120  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 121  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 122  *     8 | 1 0 0 0 | 2^168G
 123  *     9 | 1 0 0 1 | (2^168 + 1)G
 124  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 125  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 126  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 127  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 128  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 129  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 130  * followed by a copy of this with each element multiplied by 2^28.
 131  *
 132  * The reason for this is so that we can clock bits into four different
 133  * locations when doing simple scalar multiplies against the base point,
 134  * and then another four locations using the second 16 elements.
 135  */
/*
 * gmul[t][i] is point number i of sub-table t, stored as three felems
 * (X, Y, Z) with four 56-bit limbs each, least significant limb first.
 */
static const felem gmul[2][16][3] = {
/* gmul[0]: combinations of G, 2^56G, 2^112G and 2^168G (index bits 0..3) */
{{{0, 0, 0, 0},
  {0, 0, 0, 0},
  {0, 0, 0, 0}},
 {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
  {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
  {1, 0, 0, 0}},
 {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
  {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
  {1, 0, 0, 0}},
 {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
  {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
  {1, 0, 0, 0}},
 {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
  {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
  {1, 0, 0, 0}},
 {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
  {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
  {1, 0, 0, 0}},
 {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
  {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
  {1, 0, 0, 0}},
 {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
  {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
  {1, 0, 0, 0}},
 {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
  {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
  {1, 0, 0, 0}},
 {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
  {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
  {1, 0, 0, 0}},
 {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
  {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
  {1, 0, 0, 0}},
 {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
  {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
  {1, 0, 0, 0}},
 {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
  {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
  {1, 0, 0, 0}},
 {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
  {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
  {1, 0, 0, 0}},
 {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
  {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
  {1, 0, 0, 0}},
 {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
  {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
  {1, 0, 0, 0}}},
/* gmul[1]: the same sixteen combinations, each multiplied by 2^28 */
{{{0, 0, 0, 0},
  {0, 0, 0, 0},
  {0, 0, 0, 0}},
 {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
  {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
  {1, 0, 0, 0}},
 {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
  {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
  {1, 0, 0, 0}},
 {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
  {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
  {1, 0, 0, 0}},
 {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
  {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
  {1, 0, 0, 0}},
 {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
  {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
  {1, 0, 0, 0}},
 {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
  {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
  {1, 0, 0, 0}},
 {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
  {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
  {1, 0, 0, 0}},
 {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
  {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
  {1, 0, 0, 0}},
 {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
  {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
  {1, 0, 0, 0}},
 {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
  {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
  {1, 0, 0, 0}},
 {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
  {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
  {1, 0, 0, 0}},
 {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
  {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
  {1, 0, 0, 0}},
 {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
  {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
  {1, 0, 0, 0}},
 {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
  {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
  {1, 0, 0, 0}},
 {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
  {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
  {1, 0, 0, 0}}}
};
 234
/* Precomputation for the group generator. */
struct nistp224_pre_comp_st {
    /* Table of generator multiples; same dimensions as the static gmul table */
    felem g_pre_comp[2][16][3];
    /* NOTE(review): presumably a reference count - confirm against users */
    int references;
    /* NOTE(review): presumably guards 'references' - confirm against users */
    CRYPTO_RWLOCK *lock;
};
 241
/*
 * Return the EC_METHOD table for this implementation.
 *
 * The table is a function-local static const object, so every call returns
 * the same address. The initializer is positional: entries must stay in the
 * order of the EC_METHOD members. Entries written as 0 carry a comment naming
 * the member they stand for.
 */
const EC_METHOD *EC_GFp_nistp224_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp224_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp224_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp224_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp224_points_mul,
        ec_GFp_nistp224_precompute_mult,
        ec_GFp_nistp224_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        ec_GFp_simple_field_inv,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key,
        0  /* blind_coordinates */
    };

    return &ret;
}
 300
 301 /*
 302  * Helper functions to convert field elements to/from internal representation
 303  */
 304 static void bin28_to_felem(felem out, const u8 in[28])
 305 {
 306     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 307     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 308     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 309     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 310 }
 311
 312 static void felem_to_bin28(u8 out[28], const felem in)
 313 {
 314     unsigned i;
 315     for (i = 0; i < 7; ++i) {
 316         out[i] = in[0] >> (8 * i);
 317         out[i + 7] = in[1] >> (8 * i);
 318         out[i + 14] = in[2] >> (8 * i);
 319         out[i + 21] = in[3] >> (8 * i);
 320     }
 321 }
 322
 323 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 324 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 325 {
 326     unsigned i;
 327     for (i = 0; i < len; ++i)
 328         out[i] = in[len - 1 - i];
 329 }
 330
 331 /* From OpenSSL BIGNUM to internal representation */
 332 static int BN_to_felem(felem out, const BIGNUM *bn)
 333 {
 334     felem_bytearray b_in;
 335     felem_bytearray b_out;
 336     unsigned num_bytes;
 337
 338     num_bytes = BN_num_bytes(bn);
 339     if (num_bytes > sizeof(b_out)) {
 340         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 341         return 0;
 342     }
 343     if (BN_is_negative(bn)) {
 344         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 345         return 0;
 346     }
 347     num_bytes = BN_bn2binpad(bn, b_in, sizeof(b_in));
 348     flip_endian(b_out, b_in, num_bytes);
 349     bin28_to_felem(out, b_out);
 350     return 1;
 351 }
 352
 353 /* From internal representation to OpenSSL BIGNUM */
 354 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 355 {
 356     felem_bytearray b_in, b_out;
 357     felem_to_bin28(b_in, in);
 358     flip_endian(b_out, b_in, sizeof(b_out));
 359     return BN_bin2bn(b_out, sizeof(b_out), out);
 360 }
 361
 362 /******************************************************************************/
 363 /*-
 364  *                              FIELD OPERATIONS
 365  *
 366  * Field operations, using the internal representation of field elements.
 367  * NB! These operations are specific to our point multiplication and cannot be
 368  * expected to be correct in general - e.g., multiplication with a large scalar
 369  * will cause an overflow.
 370  *
 371  */
 372
 373 static void felem_one(felem out)
 374 {
 375     out[0] = 1;
 376     out[1] = 0;
 377     out[2] = 0;
 378     out[3] = 0;
 379 }
 380
 381 static void felem_assign(felem out, const felem in)
 382 {
 383     out[0] = in[0];
 384     out[1] = in[1];
 385     out[2] = in[2];
 386     out[3] = in[3];
 387 }
 388
 389 /* Sum two field elements: out += in */
 390 static void felem_sum(felem out, const felem in)
 391 {
 392     out[0] += in[0];
 393     out[1] += in[1];
 394     out[2] += in[2];
 395     out[3] += in[3];
 396 }
 397
 398 /* Get negative value: out = -in */
 399 /* Assumes in[i] < 2^57 */
 400 static void felem_neg(felem out, const felem in)
 401 {
 402     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 403     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 404     static const limb two58m42m2 = (((limb) 1) << 58) -
 405         (((limb) 1) << 42) - (((limb) 1) << 2);
 406
 407     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 408     out[0] = two58p2 - in[0];
 409     out[1] = two58m42m2 - in[1];
 410     out[2] = two58m2 - in[2];
 411     out[3] = two58m2 - in[3];
 412 }
 413
 414 /* Subtract field elements: out -= in */
 415 /* Assumes in[i] < 2^57 */
 416 static void felem_diff(felem out, const felem in)
 417 {
 418     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 419     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 420     static const limb two58m42m2 = (((limb) 1) << 58) -
 421         (((limb) 1) << 42) - (((limb) 1) << 2);
 422
 423     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 424     out[0] += two58p2;
 425     out[1] += two58m42m2;
 426     out[2] += two58m2;
 427     out[3] += two58m2;
 428
 429     out[0] -= in[0];
 430     out[1] -= in[1];
 431     out[2] -= in[2];
 432     out[3] -= in[3];
 433 }
 434
 435 /* Subtract in unreduced 128-bit mode: out -= in */
 436 /* Assumes in[i] < 2^119 */
 437 static void widefelem_diff(widefelem out, const widefelem in)
 438 {
 439     static const widelimb two120 = ((widelimb) 1) << 120;
 440     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 441         (((widelimb) 1) << 64);
 442     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 443         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 444
 445     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 446     out[0] += two120;
 447     out[1] += two120m64;
 448     out[2] += two120m64;
 449     out[3] += two120;
 450     out[4] += two120m104m64;
 451     out[5] += two120m64;
 452     out[6] += two120m64;
 453
 454     out[0] -= in[0];
 455     out[1] -= in[1];
 456     out[2] -= in[2];
 457     out[3] -= in[3];
 458     out[4] -= in[4];
 459     out[5] -= in[5];
 460     out[6] -= in[6];
 461 }
 462
 463 /* Subtract in mixed mode: out128 -= in64 */
 464 /* in[i] < 2^63 */
 465 static void felem_diff_128_64(widefelem out, const felem in)
 466 {
 467     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 468         (((widelimb) 1) << 8);
 469     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 470         (((widelimb) 1) << 8);
 471     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 472         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 473
 474     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 475     out[0] += two64p8;
 476     out[1] += two64m48m8;
 477     out[2] += two64m8;
 478     out[3] += two64m8;
 479
 480     out[0] -= in[0];
 481     out[1] -= in[1];
 482     out[2] -= in[2];
 483     out[3] -= in[3];
 484 }
 485
 486 /*
 487  * Multiply a field element by a scalar: out = out * scalar The scalars we
 488  * actually use are small, so results fit without overflow
 489  */
 490 static void felem_scalar(felem out, const limb scalar)
 491 {
 492     out[0] *= scalar;
 493     out[1] *= scalar;
 494     out[2] *= scalar;
 495     out[3] *= scalar;
 496 }
 497
 498 /*
 499  * Multiply an unreduced field element by a scalar: out = out * scalar The
 500  * scalars we actually use are small, so results fit without overflow
 501  */
 502 static void widefelem_scalar(widefelem out, const widelimb scalar)
 503 {
 504     out[0] *= scalar;
 505     out[1] *= scalar;
 506     out[2] *= scalar;
 507     out[3] *= scalar;
 508     out[4] *= scalar;
 509     out[5] *= scalar;
 510     out[6] *= scalar;
 511 }
 512
 513 /* Square a field element: out = in^2 */
 514 static void felem_square(widefelem out, const felem in)
 515 {
 516     limb tmp0, tmp1, tmp2;
 517     tmp0 = 2 * in[0];
 518     tmp1 = 2 * in[1];
 519     tmp2 = 2 * in[2];
 520     out[0] = ((widelimb) in[0]) * in[0];
 521     out[1] = ((widelimb) in[0]) * tmp1;
 522     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 523     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 524     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 525     out[5] = ((widelimb) in[3]) * tmp2;
 526     out[6] = ((widelimb) in[3]) * in[3];
 527 }
 528
 529 /* Multiply two field elements: out = in1 * in2 */
 530 static void felem_mul(widefelem out, const felem in1, const felem in2)
 531 {
 532     out[0] = ((widelimb) in1[0]) * in2[0];
 533     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 534     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 535              ((widelimb) in1[2]) * in2[0];
 536     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 537              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 538     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 539              ((widelimb) in1[3]) * in2[1];
 540     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 541     out[6] = ((widelimb) in1[3]) * in2[3];
 542 }
 543
/*-
 * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 * Requires in[i] < 2^126,
 * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
 *
 * Folding rule used throughout: 2^224 = 2^96 - 1 (mod p) and
 * 2^96 = 2^40 * 2^56. A coefficient c sitting three limbs above limb k
 * therefore contributes (c >> 16) to limb k+2, (c & 0xffff) << 40 to limb k+1
 * and -c to limb k.
 */
static void felem_reduce(felem out, const widefelem in)
{
    /*
     * The three constants below, placed at limbs 0, 1 and 2, add up to
     * 2^15 * p, i.e. to zero modulo p.
     */
    static const widelimb two127p15 = (((widelimb) 1) << 127) +
        (((widelimb) 1) << 15);
    static const widelimb two127m71 = (((widelimb) 1) << 127) -
        (((widelimb) 1) << 71);
    static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
        (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
    widelimb output[5];

    /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
    output[0] = in[0] + two127p15;
    output[1] = in[1] + two127m71m55;
    output[2] = in[2] + two127m71;
    output[3] = in[3];
    output[4] = in[4];

    /* Eliminate in[4], in[5], in[6] */
    /* in[6] folds into limbs 4, 3 and 2 */
    output[4] += in[6] >> 16;
    output[3] += (in[6] & 0xffff) << 40;
    output[2] -= in[6];

    /* in[5] folds into limbs 3, 2 and 1 */
    output[3] += in[5] >> 16;
    output[2] += (in[5] & 0xffff) << 40;
    output[1] -= in[5];

    /* the accumulated limb 4 folds into limbs 2, 1 and 0 */
    output[2] += output[4] >> 16;
    output[1] += (output[4] & 0xffff) << 40;
    output[0] -= output[4];

    /* Carry 2 -> 3 -> 4 */
    output[3] += output[2] >> 56;
    output[2] &= 0x00ffffffffffffff;

    /* output[4] is reused here for the carry out of limb 3 */
    output[4] = output[3] >> 56;
    output[3] &= 0x00ffffffffffffff;

    /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */

    /* Eliminate output[4] */
    output[2] += output[4] >> 16;
    /* output[2] < 2^56 + 2^56 = 2^57 */
    output[1] += (output[4] & 0xffff) << 40;
    output[0] -= output[4];

    /* Carry 0 -> 1 -> 2 -> 3 */
    output[1] += output[0] >> 56;
    out[0] = output[0] & 0x00ffffffffffffff;

    output[2] += output[1] >> 56;
    /* output[2] < 2^57 + 2^72 */
    out[1] = output[1] & 0x00ffffffffffffff;
    output[3] += output[2] >> 56;
    /* output[3] <= 2^56 + 2^16 */
    out[2] = output[2] & 0x00ffffffffffffff;

    /*-
     * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
     * out[3] <= 2^56 + 2^16 (due to final carry),
     * so out < 2*p
     */
    out[3] = output[3];
}
 611
 612 static void felem_square_reduce(felem out, const felem in)
 613 {
 614     widefelem tmp;
 615     felem_square(tmp, in);
 616     felem_reduce(out, tmp);
 617 }
 618
 619 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 620 {
 621     widefelem tmp;
 622     felem_mul(tmp, in1, in2);
 623     felem_reduce(out, tmp);
 624 }
 625
/*
 * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 * call felem_reduce first)
 *
 * The function contains no branches; every decision is applied through the
 * 0 / all-one masks computed in 'a'. The work is done on signed copies of
 * the limbs so that intermediate limbs may go negative.
 */
static void felem_contract(felem out, const felem in)
{
    static const int64_t two56 = ((limb) 1) << 56;
    /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
    /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
    int64_t tmp[4], a;
    tmp[0] = in[0];
    tmp[1] = in[1];
    tmp[2] = in[2];
    tmp[3] = in[3];
    /* Case 1: a = 1 iff in >= 2^224 */
    a = (in[3] >> 56);
    /* subtract a * p: -1 at limb 0, +2^96 = 2^40 * 2^56 at limb 1, and the
     * 2^224 bit is cleared from limb 3 */
    tmp[0] -= a;
    tmp[1] += a << 40;
    tmp[3] &= 0x00ffffffffffffff;
    /*
     * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
     * and the lower part is non-zero
     */
    a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
        (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
    a &= 0x00ffffffffffffff;
    /* turn a into an all-one mask (if a = 0) or an all-zero mask */
    a = (a - 1) >> 63;
    /* subtract 2^224 - 2^96 + 1 if a is all-one */
    /* (the high 128 bits are all 1 in this case, so clearing them subtracts
     * 2^224 - 2^96; the low 40 bits of limb 1 are kept) */
    tmp[3] &= a ^ 0xffffffffffffffff;
    tmp[2] &= a ^ 0xffffffffffffffff;
    tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
    tmp[0] -= 1 & a;

    /*
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
     * non-zero, so we only need one step
     */
    a = tmp[0] >> 63;
    tmp[0] += two56 & a;
    tmp[1] -= 1 & a;

    /* carry 1 -> 2 -> 3 */
    tmp[2] += tmp[1] >> 56;
    tmp[1] &= 0x00ffffffffffffff;

    tmp[3] += tmp[2] >> 56;
    tmp[2] &= 0x00ffffffffffffff;

    /* Now 0 <= out < p */
    out[0] = tmp[0];
    out[1] = tmp[1];
    out[2] = tmp[2];
    out[3] = tmp[3];
}
 681
 682 /*
 683  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 684  * elements are reduced to in < 2^225, so we only need to check three cases:
 685  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 686  */
 687 static limb felem_is_zero(const felem in)
 688 {
 689     limb zero, two224m96p1, two225m97p2;
 690
 691     zero = in[0] | in[1] | in[2] | in[3];
 692     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 693     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 694         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 695     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 696     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 697         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 698     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 699     return (zero | two224m96p1 | two225m97p2);
 700 }
 701
 702 static int felem_is_zero_int(const void *in)
 703 {
 704     return (int)(felem_is_zero(in) & ((limb) 1));
 705 }
 706
 707 /* Invert a field element */
 708 /* Computation chain copied from djb's code */
 709 static void felem_inv(felem out, const felem in)
 710 {
 711     felem ftmp, ftmp2, ftmp3, ftmp4;
 712     widefelem tmp;
 713     unsigned i;
 714
 715     felem_square(tmp, in);
 716     felem_reduce(ftmp, tmp);    /* 2 */
 717     felem_mul(tmp, in, ftmp);
 718     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 719     felem_square(tmp, ftmp);
 720     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 721     felem_mul(tmp, in, ftmp);
 722     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 723     felem_square(tmp, ftmp);
 724     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 725     felem_square(tmp, ftmp2);
 726     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 727     felem_square(tmp, ftmp2);
 728     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 729     felem_mul(tmp, ftmp2, ftmp);
 730     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 731     felem_square(tmp, ftmp);
 732     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 733     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 734         felem_square(tmp, ftmp2);
 735         felem_reduce(ftmp2, tmp);
 736     }
 737     felem_mul(tmp, ftmp2, ftmp);
 738     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 739     felem_square(tmp, ftmp2);
 740     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 741     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 742         felem_square(tmp, ftmp3);
 743         felem_reduce(ftmp3, tmp);
 744     }
 745     felem_mul(tmp, ftmp3, ftmp2);
 746     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 747     felem_square(tmp, ftmp2);
 748     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 749     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 750         felem_square(tmp, ftmp3);
 751         felem_reduce(ftmp3, tmp);
 752     }
 753     felem_mul(tmp, ftmp3, ftmp2);
 754     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 755     felem_square(tmp, ftmp3);
 756     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 757     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 758         felem_square(tmp, ftmp4);
 759         felem_reduce(ftmp4, tmp);
 760     }
 761     felem_mul(tmp, ftmp3, ftmp4);
 762     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 763     felem_square(tmp, ftmp3);
 764     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 765     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 766         felem_square(tmp, ftmp4);
 767         felem_reduce(ftmp4, tmp);
 768     }
 769     felem_mul(tmp, ftmp2, ftmp4);
 770     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 771     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 772         felem_square(tmp, ftmp2);
 773         felem_reduce(ftmp2, tmp);
 774     }
 775     felem_mul(tmp, ftmp2, ftmp);
 776     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 777     felem_square(tmp, ftmp);
 778     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 779     felem_mul(tmp, ftmp, in);
 780     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 781     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 782         felem_square(tmp, ftmp);
 783         felem_reduce(ftmp, tmp);
 784     }
 785     felem_mul(tmp, ftmp, ftmp3);
 786     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 787 }
 788
 789 /*
 790  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 791  * out to itself.
 792  */
 793 static void copy_conditional(felem out, const felem in, limb icopy)
 794 {
 795     unsigned i;
 796     /*
 797      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 798      */
 799     const limb copy = -icopy;
 800     for (i = 0; i < 4; ++i) {
 801         const limb tmp = copy & (in[i] ^ out[i]);
 802         out[i] ^= tmp;
 803     }
 804 }
 805
 806 /******************************************************************************/
 807 /*-
 808  *                       ELLIPTIC CURVE POINT OPERATIONS
 809  *
 810  * Points are represented in Jacobian projective coordinates:
 811  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 812  * or to the point at infinity if Z == 0.
 813  *
 814  */
 815
/*-
 * Double an elliptic curve point:
 * (X', Y', Z') = 2 * (X, Y, Z), where
 * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^4
 * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * The inline "< 2^k" comments track the per-limb magnitude of each
 * intermediate value before it is handed to felem_reduce().
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    widefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* two working copies of x, later turned into x - delta and x + delta */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff(ftmp, delta);
    /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
    felem_sum(ftmp2, delta);
    /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
    felem_scalar(ftmp2, 3);
    /* ftmp2[i] < 3 * 2^58 < 2^60 */
    felem_mul(tmp, ftmp, ftmp2);
    /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
    felem_assign(ftmp, beta);
    felem_scalar(ftmp, 8);
    /* ftmp[i] < 8 * 2^57 = 2^60 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is overwritten with gamma + delta so one subtraction suffices */
    felem_sum(delta, gamma);
    /* delta[i] < 2^57 + 2^57 = 2^58 */
    felem_assign(ftmp, y_in);
    felem_sum(ftmp, z_in);
    /* ftmp[i] < 2^57 + 2^57 = 2^58 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar(beta, 4);
    /* beta[i] < 4 * 2^57 = 2^59 */
    felem_diff(beta, x_out);
    /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
    felem_mul(tmp, alpha, beta);
    /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
    felem_square(tmp2, gamma);
    /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
    widefelem_scalar(tmp2, 8);
    /* tmp2[i] < 8 * 2^116 = 2^119 */
    widefelem_diff(tmp, tmp2);
    /* tmp[i] < 2^119 + 2^120 < 2^121 */
    felem_reduce(y_out, tmp);
}
 895
 896 /*-
 897  * Add two elliptic curve points:
 898  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 899  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 900  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 901  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 902  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 903  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 904  *
 905  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 906  */
 907
 908 /*
 909  * This function is not entirely constant-time: it includes a branch for
 910  * checking whether the two input points are equal, (while not equal to the
 911  * point at infinity). This case never happens during single point
 912  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 913  */
 914 static void point_add(felem x3, felem y3, felem z3,
 915                       const felem x1, const felem y1, const felem z1,
 916                       const int mixed, const felem x2, const felem y2,
 917                       const felem z2)
 918 {
 919     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 920     widefelem tmp, tmp2;
 921     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 922
 923     if (!mixed) {
 924         /* ftmp2 = z2^2 */
 925         felem_square(tmp, z2);
 926         felem_reduce(ftmp2, tmp);
 927
 928         /* ftmp4 = z2^3 */
 929         felem_mul(tmp, ftmp2, z2);
 930         felem_reduce(ftmp4, tmp);
 931
 932         /* ftmp4 = z2^3*y1 */
 933         felem_mul(tmp2, ftmp4, y1);
 934         felem_reduce(ftmp4, tmp2);
 935
 936         /* ftmp2 = z2^2*x1 */
 937         felem_mul(tmp2, ftmp2, x1);
 938         felem_reduce(ftmp2, tmp2);
 939     } else {
 940         /*
 941          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 942          */
 943
 944         /* ftmp4 = z2^3*y1 */
 945         felem_assign(ftmp4, y1);
 946
 947         /* ftmp2 = z2^2*x1 */
 948         felem_assign(ftmp2, x1);
 949     }
 950
 951     /* ftmp = z1^2 */
 952     felem_square(tmp, z1);
 953     felem_reduce(ftmp, tmp);
 954
 955     /* ftmp3 = z1^3 */
 956     felem_mul(tmp, ftmp, z1);
 957     felem_reduce(ftmp3, tmp);
 958
 959     /* tmp = z1^3*y2 */
 960     felem_mul(tmp, ftmp3, y2);
 961     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 962
 963     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 964     felem_diff_128_64(tmp, ftmp4);
 965     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 966     felem_reduce(ftmp3, tmp);
 967
 968     /* tmp = z1^2*x2 */
 969     felem_mul(tmp, ftmp, x2);
 970     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 971
 972     /* ftmp = z1^2*x2 - z2^2*x1 */
 973     felem_diff_128_64(tmp, ftmp2);
 974     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 975     felem_reduce(ftmp, tmp);
 976
 977     /*
 978      * the formulae are incorrect if the points are equal so we check for
 979      * this and do doubling if this happens
 980      */
 981     x_equal = felem_is_zero(ftmp);
 982     y_equal = felem_is_zero(ftmp3);
 983     z1_is_zero = felem_is_zero(z1);
 984     z2_is_zero = felem_is_zero(z2);
 985     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 986     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 987         point_double(x3, y3, z3, x1, y1, z1);
 988         return;
 989     }
 990
 991     /* ftmp5 = z1*z2 */
 992     if (!mixed) {
 993         felem_mul(tmp, z1, z2);
 994         felem_reduce(ftmp5, tmp);
 995     } else {
 996         /* special case z2 = 0 is handled later */
 997         felem_assign(ftmp5, z1);
 998     }
 999
1000     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
1001     felem_mul(tmp, ftmp, ftmp5);
1002     felem_reduce(z_out, tmp);
1003
1004     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
1005     felem_assign(ftmp5, ftmp);
1006     felem_square(tmp, ftmp);
1007     felem_reduce(ftmp, tmp);
1008
1009     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
1010     felem_mul(tmp, ftmp, ftmp5);
1011     felem_reduce(ftmp5, tmp);
1012
1013     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1014     felem_mul(tmp, ftmp2, ftmp);
1015     felem_reduce(ftmp2, tmp);
1016
1017     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1018     felem_mul(tmp, ftmp4, ftmp5);
1019     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1020
1021     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1022     felem_square(tmp2, ftmp3);
1023     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1024
1025     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1026     felem_diff_128_64(tmp2, ftmp5);
1027     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1028
1029     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1030     felem_assign(ftmp5, ftmp2);
1031     felem_scalar(ftmp5, 2);
1032     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1033
1034     /*-
1035      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1036      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1037      */
1038     felem_diff_128_64(tmp2, ftmp5);
1039     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1040     felem_reduce(x_out, tmp2);
1041
1042     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1043     felem_diff(ftmp2, x_out);
1044     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1045
1046     /*
1047      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1048      */
1049     felem_mul(tmp2, ftmp3, ftmp2);
1050     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1051
1052     /*-
1053      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1054      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1055      */
1056     widefelem_diff(tmp2, tmp);
1057     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1058     felem_reduce(y_out, tmp2);
1059
1060     /*
1061      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1062      * the point at infinity, so we need to check for this separately
1063      */
1064
1065     /*
1066      * if point 1 is at infinity, copy point 2 to output, and vice versa
1067      */
1068     copy_conditional(x_out, x2, z1_is_zero);
1069     copy_conditional(x_out, x1, z2_is_zero);
1070     copy_conditional(y_out, y2, z1_is_zero);
1071     copy_conditional(y_out, y1, z2_is_zero);
1072     copy_conditional(z_out, z2, z1_is_zero);
1073     copy_conditional(z_out, z1, z2_is_zero);
1074     felem_assign(x3, x_out);
1075     felem_assign(y3, y_out);
1076     felem_assign(z3, z_out);
1077 }
1078
1079 /*
1080  * select_point selects the |idx|th point from a precomputation table and
1081  * copies it to out.
1082  * The pre_comp array argument should be size of |size| argument
1083  */
1084 static void select_point(const u64 idx, unsigned int size,
1085                          const felem pre_comp[][3], felem out[3])
1086 {
1087     unsigned i, j;
1088     limb *outlimbs = &out[0][0];
1089
1090     memset(out, 0, sizeof(*out) * 3);
1091     for (i = 0; i < size; i++) {
1092         const limb *inlimbs = &pre_comp[i][0][0];
1093         u64 mask = i ^ idx;
1094         mask |= mask >> 4;
1095         mask |= mask >> 2;
1096         mask |= mask >> 1;
1097         mask &= 1;
1098         mask--;
1099         for (j = 0; j < 4 * 3; j++)
1100             outlimbs[j] |= inlimbs[j] & mask;
1101     }
1102 }
1103
1104 /* get_bit returns the |i|th bit in |in| */
1105 static char get_bit(const felem_bytearray in, unsigned i)
1106 {
1107     if (i >= 224)
1108         return 0;
1109     return (in[i >> 3] >> (i & 7)) & 1;
1110 }
1111
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * |num_points| is the number of entries in scalars[] and pre_comp[].
 * |mixed| is forwarded to point_add() for the additions of pre_comp[]
 * entries; the generator-table additions always pass mixed = 1.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[2][16][3])
{
    int i, skip;
    unsigned num;
    unsigned gen_mul = (g_scalar != NULL);
    /* tmp[0..2] hold the selected point, tmp[3] is scratch for the negated Y */
    felem nq[3], tmp[4];
    u64 bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (two in each of the last 28 rounds) and additions of
     * other points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    /* with only the generator to process, the 28 rounds 27..0 suffice */
    for (i = (num_points ? 220 : 27); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 27)) {
            /* first, look 28 bits upwards */
            bits = get_bit(g_scalar, i + 196) << 3;
            bits |= get_bit(g_scalar, i + 140) << 2;
            bits |= get_bit(g_scalar, i + 84) << 1;
            bits |= get_bit(g_scalar, i + 28);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[1], tmp);

            if (!skip) {
                /* value 1 below is argument for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* nq is still the point at infinity: just take the table entry */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }

            /* second, look at the current position */
            bits = get_bit(g_scalar, i + 168) << 3;
            bits |= get_bit(g_scalar, i + 112) << 2;
            bits |= get_bit(g_scalar, i + 56) << 1;
            bits |= get_bit(g_scalar, i);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[0], tmp);
            point_add(nq[0], nq[1], nq[2],
                      nq[0], nq[1], nq[2],
                      1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Gather the six bits i+4 .. i-1 of the scalar. For i == 0
                 * the index i - 1 converts to a huge unsigned value, which
                 * get_bit() treats as out of range and reads as 0.
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                /* turn the window into a sign and a table index (digit) */
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /* select the point to add or subtract */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                /* replace Y by -Y when sign is set, without branching */
                copy_conditional(tmp[1], tmp[3], sign);

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* nq is still the point at infinity: just take the table entry */
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1211
1212 /******************************************************************************/
1213 /*
1214  * FUNCTIONS TO MANAGE PRECOMPUTATION
1215  */
1216
1217 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1218 {
1219     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1220
1221     if (!ret) {
1222         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1223         return ret;
1224     }
1225
1226     ret->references = 1;
1227
1228     ret->lock = CRYPTO_THREAD_lock_new();
1229     if (ret->lock == NULL) {
1230         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1231         OPENSSL_free(ret);
1232         return NULL;
1233     }
1234     return ret;
1235 }
1236
1237 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1238 {
1239     int i;
1240     if (p != NULL)
1241         CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
1242     return p;
1243 }
1244
1245 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1246 {
1247     int i;
1248
1249     if (p == NULL)
1250         return;
1251
1252     CRYPTO_atomic_add(&p->references, -1, &i, p->lock);
1253     REF_PRINT_COUNT("EC_nistp224", x);
1254     if (i > 0)
1255         return;
1256     REF_ASSERT_ISNT(i < 0);
1257
1258     CRYPTO_THREAD_lock_free(p->lock);
1259     OPENSSL_free(p);
1260 }
1261
1262 /******************************************************************************/
1263 /*
1264  * OPENSSL EC_METHOD FUNCTIONS
1265  */
1266
1267 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1268 {
1269     int ret;
1270     ret = ec_GFp_simple_group_init(group);
1271     group->a_is_minus3 = 1;
1272     return ret;
1273 }
1274
1275 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1276                                     const BIGNUM *a, const BIGNUM *b,
1277                                     BN_CTX *ctx)
1278 {
1279     int ret = 0;
1280     BN_CTX *new_ctx = NULL;
1281     BIGNUM *curve_p, *curve_a, *curve_b;
1282
1283     if (ctx == NULL)
1284         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1285             return 0;
1286     BN_CTX_start(ctx);
1287     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1288         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1289         ((curve_b = BN_CTX_get(ctx)) == NULL))
1290         goto err;
1291     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1292     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1293     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1294     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1295         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1296               EC_R_WRONG_CURVE_PARAMETERS);
1297         goto err;
1298     }
1299     group->field_mod_func = BN_nist_mod_224;
1300     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1301  err:
1302     BN_CTX_end(ctx);
1303     BN_CTX_free(new_ctx);
1304     return ret;
1305 }
1306
1307 /*
1308  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1309  * (X/Z^2, Y/Z^3)
1310  */
1311 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1312                                                  const EC_POINT *point,
1313                                                  BIGNUM *x, BIGNUM *y,
1314                                                  BN_CTX *ctx)
1315 {
1316     felem z1, z2, x_in, y_in, x_out, y_out;
1317     widefelem tmp;
1318
1319     if (EC_POINT_is_at_infinity(group, point)) {
1320         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1321               EC_R_POINT_AT_INFINITY);
1322         return 0;
1323     }
1324     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1325         (!BN_to_felem(z1, point->Z)))
1326         return 0;
1327     felem_inv(z2, z1);
1328     felem_square(tmp, z2);
1329     felem_reduce(z1, tmp);
1330     felem_mul(tmp, x_in, z1);
1331     felem_reduce(x_in, tmp);
1332     felem_contract(x_out, x_in);
1333     if (x != NULL) {
1334         if (!felem_to_BN(x, x_out)) {
1335             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1336                   ERR_R_BN_LIB);
1337             return 0;
1338         }
1339     }
1340     felem_mul(tmp, z1, z2);
1341     felem_reduce(z1, tmp);
1342     felem_mul(tmp, y_in, z1);
1343     felem_reduce(y_in, tmp);
1344     felem_contract(y_out, y_in);
1345     if (y != NULL) {
1346         if (!felem_to_BN(y, y_out)) {
1347             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1348                   ERR_R_BN_LIB);
1349             return 0;
1350         }
1351     }
1352     return 1;
1353 }
1354
1355 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1356                                felem tmp_felems[ /* num+1 */ ])
1357 {
1358     /*
1359      * Runs in constant time, unless an input is the point at infinity (which
1360      * normally shouldn't happen).
1361      */
1362     ec_GFp_nistp_points_make_affine_internal(num,
1363                                              points,
1364                                              sizeof(felem),
1365                                              tmp_felems,
1366                                              (void (*)(void *))felem_one,
1367                                              felem_is_zero_int,
1368                                              (void (*)(void *, const void *))
1369                                              felem_assign,
1370                                              (void (*)(void *, const void *))
1371                                              felem_square_reduce, (void (*)
1372                                                                    (void *,
1373                                                                     const void
1374                                                                     *,
1375                                                                     const void
1376                                                                     *))
1377                                              felem_mul_reduce,
1378                                              (void (*)(void *, const void *))
1379                                              felem_inv,
1380                                              (void (*)(void *, const void *))
1381                                              felem_contract);
1382 }
1383
1384 /*
1385  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1386  * values Result is stored in r (r can equal one of the inputs).
1387  */
1388 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1389                                const BIGNUM *scalar, size_t num,
1390                                const EC_POINT *points[],
1391                                const BIGNUM *scalars[], BN_CTX *ctx)
1392 {
1393     int ret = 0;
1394     int j;
1395     unsigned i;
1396     int mixed = 0;
1397     BN_CTX *new_ctx = NULL;
1398     BIGNUM *x, *y, *z, *tmp_scalar;
1399     felem_bytearray g_secret;
1400     felem_bytearray *secrets = NULL;
1401     felem (*pre_comp)[17][3] = NULL;
1402     felem *tmp_felems = NULL;
1403     felem_bytearray tmp;
1404     unsigned num_bytes;
1405     int have_pre_comp = 0;
1406     size_t num_points = num;
1407     felem x_in, y_in, z_in, x_out, y_out, z_out;
1408     NISTP224_PRE_COMP *pre = NULL;
1409     const felem(*g_pre_comp)[16][3] = NULL;
1410     EC_POINT *generator = NULL;
1411     const EC_POINT *p = NULL;
1412     const BIGNUM *p_scalar = NULL;
1413
1414     if (ctx == NULL)
1415         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1416             return 0;
1417     BN_CTX_start(ctx);
1418     if (((x = BN_CTX_get(ctx)) == NULL) ||
1419         ((y = BN_CTX_get(ctx)) == NULL) ||
1420         ((z = BN_CTX_get(ctx)) == NULL) ||
1421         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1422         goto err;
1423
1424     if (scalar != NULL) {
1425         pre = group->pre_comp.nistp224;
1426         if (pre)
1427             /* we have precomputation, try to use it */
1428             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1429         else
1430             /* try to use the standard precomputation */
1431             g_pre_comp = &gmul[0];
1432         generator = EC_POINT_new(group);
1433         if (generator == NULL)
1434             goto err;
1435         /* get the generator from precomputation */
1436         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1437             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1438             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1439             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1440             goto err;
1441         }
1442         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1443                                                       generator, x, y, z,
1444                                                       ctx))
1445             goto err;
1446         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1447             /* precomputation matches generator */
1448             have_pre_comp = 1;
1449         else
1450             /*
1451              * we don't have valid precomputation: treat the generator as a
1452              * random point
1453              */
1454             num_points = num_points + 1;
1455     }
1456
1457     if (num_points > 0) {
1458         if (num_points >= 3) {
1459             /*
1460              * unless we precompute multiples for just one or two points,
1461              * converting those into affine form is time well spent
1462              */
1463             mixed = 1;
1464         }
1465         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1466         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1467         if (mixed)
1468             tmp_felems =
1469                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1470         if ((secrets == NULL) || (pre_comp == NULL)
1471             || (mixed && (tmp_felems == NULL))) {
1472             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1473             goto err;
1474         }
1475
1476         /*
1477          * we treat NULL scalars as 0, and NULL points as points at infinity,
1478          * i.e., they contribute nothing to the linear combination
1479          */
1480         for (i = 0; i < num_points; ++i) {
1481             if (i == num)
1482                 /* the generator */
1483             {
1484                 p = EC_GROUP_get0_generator(group);
1485                 p_scalar = scalar;
1486             } else
1487                 /* the i^th point */
1488             {
1489                 p = points[i];
1490                 p_scalar = scalars[i];
1491             }
1492             if ((p_scalar != NULL) && (p != NULL)) {
1493                 /* reduce scalar to 0 <= scalar < 2^224 */
1494                 if ((BN_num_bits(p_scalar) > 224)
1495                     || (BN_is_negative(p_scalar))) {
1496                     /*
1497                      * this is an unusual input, and we don't guarantee
1498                      * constant-timeness
1499                      */
1500                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1501                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1502                         goto err;
1503                     }
1504                     num_bytes = BN_bn2binpad(tmp_scalar, tmp, sizeof(tmp));
1505                 } else
1506                     num_bytes = BN_bn2binpad(p_scalar, tmp, sizeof(tmp));
1507                 flip_endian(secrets[i], tmp, num_bytes);
1508                 /* precompute multiples */
1509                 if ((!BN_to_felem(x_out, p->X)) ||
1510                     (!BN_to_felem(y_out, p->Y)) ||
1511                     (!BN_to_felem(z_out, p->Z)))
1512                     goto err;
1513                 felem_assign(pre_comp[i][1][0], x_out);
1514                 felem_assign(pre_comp[i][1][1], y_out);
1515                 felem_assign(pre_comp[i][1][2], z_out);
1516                 for (j = 2; j <= 16; ++j) {
1517                     if (j & 1) {
1518                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1519                                   pre_comp[i][j][2], pre_comp[i][1][0],
1520                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1521                                   pre_comp[i][j - 1][0],
1522                                   pre_comp[i][j - 1][1],
1523                                   pre_comp[i][j - 1][2]);
1524                     } else {
1525                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1526                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1527                                      pre_comp[i][j / 2][1],
1528                                      pre_comp[i][j / 2][2]);
1529                     }
1530                 }
1531             }
1532         }
1533         if (mixed)
1534             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1535     }
1536
1537     /* the scalar for the generator */
1538     if ((scalar != NULL) && (have_pre_comp)) {
1539         memset(g_secret, 0, sizeof(g_secret));
1540         /* reduce scalar to 0 <= scalar < 2^224 */
1541         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1542             /*
1543              * this is an unusual input, and we don't guarantee
1544              * constant-timeness
1545              */
1546             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1547                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1548                 goto err;
1549             }
1550             num_bytes = BN_bn2binpad(tmp_scalar, tmp, sizeof(tmp));
1551         } else
1552             num_bytes = BN_bn2binpad(scalar, tmp, sizeof(tmp));
1553         flip_endian(g_secret, tmp, num_bytes);
1554         /* do the multiplication with generator precomputation */
1555         batch_mul(x_out, y_out, z_out,
1556                   (const felem_bytearray(*))secrets, num_points,
1557                   g_secret,
1558                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1559     } else
1560         /* do the multiplication without generator precomputation */
1561         batch_mul(x_out, y_out, z_out,
1562                   (const felem_bytearray(*))secrets, num_points,
1563                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1564     /* reduce the output to its unique minimal representation */
1565     felem_contract(x_in, x_out);
1566     felem_contract(y_in, y_out);
1567     felem_contract(z_in, z_out);
1568     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1569         (!felem_to_BN(z, z_in))) {
1570         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1571         goto err;
1572     }
1573     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1574
1575  err:
1576     BN_CTX_end(ctx);
1577     EC_POINT_free(generator);
1578     BN_CTX_free(new_ctx);
1579     OPENSSL_free(secrets);
1580     OPENSSL_free(pre_comp);
1581     OPENSSL_free(tmp_felems);
1582     return ret;
1583 }
1584
1585 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1586 {
1587     int ret = 0;
1588     NISTP224_PRE_COMP *pre = NULL;
1589     int i, j;
1590     BN_CTX *new_ctx = NULL;
1591     BIGNUM *x, *y;
1592     EC_POINT *generator = NULL;
1593     felem tmp_felems[32];
1594
1595     /* throw away old precomputation */
1596     EC_pre_comp_free(group);
1597     if (ctx == NULL)
1598         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1599             return 0;
1600     BN_CTX_start(ctx);
1601     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1602         goto err;
1603     /* get the generator */
1604     if (group->generator == NULL)
1605         goto err;
1606     generator = EC_POINT_new(group);
1607     if (generator == NULL)
1608         goto err;
1609     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1610     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1611     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1612         goto err;
1613     if ((pre = nistp224_pre_comp_new()) == NULL)
1614         goto err;
1615     /*
1616      * if the generator is the standard one, use built-in precomputation
1617      */
1618     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1619         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1620         goto done;
1621     }
1622     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1623         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1624         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1625         goto err;
1626     /*
1627      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1628      * 2^140*G, 2^196*G for the second one
1629      */
1630     for (i = 1; i <= 8; i <<= 1) {
1631         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1632                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1633                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1634         for (j = 0; j < 27; ++j) {
1635             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1636                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1637                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1638         }
1639         if (i == 8)
1640             break;
1641         point_double(pre->g_pre_comp[0][2 * i][0],
1642                      pre->g_pre_comp[0][2 * i][1],
1643                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1644                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1645         for (j = 0; j < 27; ++j) {
1646             point_double(pre->g_pre_comp[0][2 * i][0],
1647                          pre->g_pre_comp[0][2 * i][1],
1648                          pre->g_pre_comp[0][2 * i][2],
1649                          pre->g_pre_comp[0][2 * i][0],
1650                          pre->g_pre_comp[0][2 * i][1],
1651                          pre->g_pre_comp[0][2 * i][2]);
1652         }
1653     }
1654     for (i = 0; i < 2; i++) {
1655         /* g_pre_comp[i][0] is the point at infinity */
1656         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1657         /* the remaining multiples */
1658         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1659         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1660                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1661                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1662                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1663                   pre->g_pre_comp[i][2][2]);
1664         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1665         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1666                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1667                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1668                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1669                   pre->g_pre_comp[i][2][2]);
1670         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1671         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1672                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1673                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1674                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1675                   pre->g_pre_comp[i][4][2]);
1676         /*
1677          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1678          */
1679         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1680                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1681                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1682                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1683                   pre->g_pre_comp[i][2][2]);
1684         for (j = 1; j < 8; ++j) {
1685             /* odd multiples: add G resp. 2^28*G */
1686             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1687                       pre->g_pre_comp[i][2 * j + 1][1],
1688                       pre->g_pre_comp[i][2 * j + 1][2],
1689                       pre->g_pre_comp[i][2 * j][0],
1690                       pre->g_pre_comp[i][2 * j][1],
1691                       pre->g_pre_comp[i][2 * j][2], 0,
1692                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1693                       pre->g_pre_comp[i][1][2]);
1694         }
1695     }
1696     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1697
1698  done:
1699     SETPRECOMP(group, nistp224, pre);
1700     pre = NULL;
1701     ret = 1;
1702  err:
1703     BN_CTX_end(ctx);
1704     EC_POINT_free(generator);
1705     BN_CTX_free(new_ctx);
1706     EC_nistp224_pre_comp_free(pre);
1707     return ret;
1708 }
1709
1710 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1711 {
1712     return HAVEPRECOMP(group, nistp224);
1713 }
1714
1715 #endif