crypto/ec/ecp_nistp224.c

   1 /*
   2  * Copyright 2010-2019 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  28  *
  29  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  30  * and Adam Langley's public domain 64-bit C implementation of curve25519
  31  */
  32
  33 #include <openssl/opensslconf.h>
  34 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  35 NON_EMPTY_TRANSLATION_UNIT
  36 #else
  37
  38 # include <stdint.h>
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_local.h"
  42
  43 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  44   /* even with gcc, the typedef won't work for 32-bit platforms */
  45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  46                                  * platforms */
  47 # else
  48 #  error "Your compiler doesn't appear to support 128-bit integer types"
  49 # endif
  50
  51 typedef uint8_t u8;
  52 typedef uint64_t u64;
  53
  54 /******************************************************************************/
  55 /*-
  56  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  57  *
  58  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  59  * using 64-bit coefficients called 'limbs',
  60  * and sometimes (for multiplication results) as
  61  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  62  * using 128-bit coefficients called 'widelimbs'.
  63  * A 4-limb representation is an 'felem';
  64  * a 7-widelimb representation is a 'widefelem'.
  65  * Even within felems, bits of adjacent limbs overlap, and we don't always
  66  * reduce the representations: we ensure that inputs to each felem
  67  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  68  * and fit into a 128-bit word without overflow. The coefficients are then
  69  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  70  * We only reduce to the unique minimal representation at the end of the
  71  * computation.
  72  */
  73
  74 typedef uint64_t limb;
  75 typedef uint128_t widelimb;
  76
  77 typedef limb felem[4];
  78 typedef widelimb widefelem[7];
  79
  80 /*
  81  * Field element represented as a byte array. 28*8 = 224 bits is also the
  82  * group order size for the elliptic curve, and we also use this type for
  83  * scalars for point multiplication.
  84  */
  85 typedef u8 felem_bytearray[28];
  86
  87 static const felem_bytearray nistp224_curve_params[5] = {
  88     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  89      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  90      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  91     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  92      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  93      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  94     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  95      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  96      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  97     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  98      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  99      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
 100     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
 101      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
 102      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
 103 };
 104
 105 /*-
 106  * Precomputed multiples of the standard generator
 107  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 108  * (0 for the point at infinity).
 109  * For each field element, slice a_0 is word 0, etc.
 110  *
 111  * The table has 2 * 16 elements, starting with the following:
 112  * index | bits    | point
 113  * ------+---------+------------------------------
 114  *     0 | 0 0 0 0 | 0G
 115  *     1 | 0 0 0 1 | 1G
 116  *     2 | 0 0 1 0 | 2^56G
 117  *     3 | 0 0 1 1 | (2^56 + 1)G
 118  *     4 | 0 1 0 0 | 2^112G
 119  *     5 | 0 1 0 1 | (2^112 + 1)G
 120  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 121  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 122  *     8 | 1 0 0 0 | 2^168G
 123  *     9 | 1 0 0 1 | (2^168 + 1)G
 124  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 125  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 126  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 127  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 128  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 129  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 130  * followed by a copy of this with each element multiplied by 2^28.
 131  *
 132  * The reason for this is so that we can clock bits into four different
 133  * locations when doing simple scalar multiplies against the base point,
 134  * and then another four locations using the second 16 elements.
 135  */
 136 static const felem gmul[2][16][3] = {
 137 {{{0, 0, 0, 0},
 138   {0, 0, 0, 0},
 139   {0, 0, 0, 0}},
 140  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 141   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 142   {1, 0, 0, 0}},
 143  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 144   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 145   {1, 0, 0, 0}},
 146  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 147   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 148   {1, 0, 0, 0}},
 149  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 150   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 151   {1, 0, 0, 0}},
 152  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 153   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 154   {1, 0, 0, 0}},
 155  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 156   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 157   {1, 0, 0, 0}},
 158  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 159   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 160   {1, 0, 0, 0}},
 161  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 162   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 163   {1, 0, 0, 0}},
 164  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 165   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 166   {1, 0, 0, 0}},
 167  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 168   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 169   {1, 0, 0, 0}},
 170  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 171   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 172   {1, 0, 0, 0}},
 173  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 174   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 175   {1, 0, 0, 0}},
 176  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 177   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 178   {1, 0, 0, 0}},
 179  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 180   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 181   {1, 0, 0, 0}},
 182  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 183   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 184   {1, 0, 0, 0}}},
 185 {{{0, 0, 0, 0},
 186   {0, 0, 0, 0},
 187   {0, 0, 0, 0}},
 188  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 189   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 190   {1, 0, 0, 0}},
 191  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 192   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 193   {1, 0, 0, 0}},
 194  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 195   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 196   {1, 0, 0, 0}},
 197  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 198   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 199   {1, 0, 0, 0}},
 200  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 201   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 202   {1, 0, 0, 0}},
 203  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 204   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 205   {1, 0, 0, 0}},
 206  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 207   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 208   {1, 0, 0, 0}},
 209  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 210   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 211   {1, 0, 0, 0}},
 212  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 213   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 214   {1, 0, 0, 0}},
 215  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 216   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 217   {1, 0, 0, 0}},
 218  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 219   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 220   {1, 0, 0, 0}},
 221  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 222   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 223   {1, 0, 0, 0}},
 224  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 225   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 226   {1, 0, 0, 0}},
 227  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 228   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 229   {1, 0, 0, 0}},
 230  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 231   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 232   {1, 0, 0, 0}}}
 233 };
 234
 235 /* Precomputation for the group generator. */
 236 struct nistp224_pre_comp_st {
 237     felem g_pre_comp[2][16][3];
 238     CRYPTO_REF_COUNT references;
 239     CRYPTO_RWLOCK *lock;
 240 };
 241
 242 const EC_METHOD *EC_GFp_nistp224_method(void)
 243 {
 244     static const EC_METHOD ret = {
 245         EC_FLAGS_DEFAULT_OCT,
 246         NID_X9_62_prime_field,
 247         ec_GFp_nistp224_group_init,
 248         ec_GFp_simple_group_finish,
 249         ec_GFp_simple_group_clear_finish,
 250         ec_GFp_nist_group_copy,
 251         ec_GFp_nistp224_group_set_curve,
 252         ec_GFp_simple_group_get_curve,
 253         ec_GFp_simple_group_get_degree,
 254         ec_group_simple_order_bits,
 255         ec_GFp_simple_group_check_discriminant,
 256         ec_GFp_simple_point_init,
 257         ec_GFp_simple_point_finish,
 258         ec_GFp_simple_point_clear_finish,
 259         ec_GFp_simple_point_copy,
 260         ec_GFp_simple_point_set_to_infinity,
 261         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 262         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 263         ec_GFp_simple_point_set_affine_coordinates,
 264         ec_GFp_nistp224_point_get_affine_coordinates,
 265         0 /* point_set_compressed_coordinates */ ,
 266         0 /* point2oct */ ,
 267         0 /* oct2point */ ,
 268         ec_GFp_simple_add,
 269         ec_GFp_simple_dbl,
 270         ec_GFp_simple_invert,
 271         ec_GFp_simple_is_at_infinity,
 272         ec_GFp_simple_is_on_curve,
 273         ec_GFp_simple_cmp,
 274         ec_GFp_simple_make_affine,
 275         ec_GFp_simple_points_make_affine,
 276         ec_GFp_nistp224_points_mul,
 277         ec_GFp_nistp224_precompute_mult,
 278         ec_GFp_nistp224_have_precompute_mult,
 279         ec_GFp_nist_field_mul,
 280         ec_GFp_nist_field_sqr,
 281         0 /* field_div */ ,
 282         ec_GFp_simple_field_inv,
 283         0 /* field_encode */ ,
 284         0 /* field_decode */ ,
 285         0,                      /* field_set_to_one */
 286         ec_key_simple_priv2oct,
 287         ec_key_simple_oct2priv,
 288         0, /* set private */
 289         ec_key_simple_generate_key,
 290         ec_key_simple_check_key,
 291         ec_key_simple_generate_public_key,
 292         0, /* keycopy */
 293         0, /* keyfinish */
 294         ecdh_simple_compute_key,
 295         0, /* field_inverse_mod_ord */
 296         0, /* blind_coordinates */
 297         0, /* ladder_pre */
 298         0, /* ladder_step */
 299         0  /* ladder_post */
 300     };
 301
 302     return &ret;
 303 }
 304
 305 /*
 306  * Helper functions to convert field elements to/from internal representation
 307  */
 308 static void bin28_to_felem(felem out, const u8 in[28])
 309 {
 310     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 311     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 312     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 313     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 314 }
 315
 316 static void felem_to_bin28(u8 out[28], const felem in)
 317 {
 318     unsigned i;
 319     for (i = 0; i < 7; ++i) {
 320         out[i] = in[0] >> (8 * i);
 321         out[i + 7] = in[1] >> (8 * i);
 322         out[i + 14] = in[2] >> (8 * i);
 323         out[i + 21] = in[3] >> (8 * i);
 324     }
 325 }
 326
 327 /* From OpenSSL BIGNUM to internal representation */
 328 static int BN_to_felem(felem out, const BIGNUM *bn)
 329 {
 330     felem_bytearray b_out;
 331     int num_bytes;
 332
 333     if (BN_is_negative(bn)) {
 334         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 335         return 0;
 336     }
 337     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 338     if (num_bytes < 0) {
 339         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 340         return 0;
 341     }
 342     bin28_to_felem(out, b_out);
 343     return 1;
 344 }
 345
 346 /* From internal representation to OpenSSL BIGNUM */
 347 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 348 {
 349     felem_bytearray b_out;
 350     felem_to_bin28(b_out, in);
 351     return BN_lebin2bn(b_out, sizeof(b_out), out);
 352 }
 353
 354 /******************************************************************************/
 355 /*-
 356  *                              FIELD OPERATIONS
 357  *
 358  * Field operations, using the internal representation of field elements.
 359  * NB! These operations are specific to our point multiplication and cannot be
 360  * expected to be correct in general - e.g., multiplication with a large scalar
 361  * will cause an overflow.
 362  *
 363  */
 364
 365 static void felem_one(felem out)
 366 {
 367     out[0] = 1;
 368     out[1] = 0;
 369     out[2] = 0;
 370     out[3] = 0;
 371 }
 372
 373 static void felem_assign(felem out, const felem in)
 374 {
 375     out[0] = in[0];
 376     out[1] = in[1];
 377     out[2] = in[2];
 378     out[3] = in[3];
 379 }
 380
 381 /* Sum two field elements: out += in */
 382 static void felem_sum(felem out, const felem in)
 383 {
 384     out[0] += in[0];
 385     out[1] += in[1];
 386     out[2] += in[2];
 387     out[3] += in[3];
 388 }
 389
 390 /* Subtract field elements: out -= in */
 391 /* Assumes in[i] < 2^57 */
 392 static void felem_diff(felem out, const felem in)
 393 {
 394     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 395     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 396     static const limb two58m42m2 = (((limb) 1) << 58) -
 397         (((limb) 1) << 42) - (((limb) 1) << 2);
 398
 399     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 400     out[0] += two58p2;
 401     out[1] += two58m42m2;
 402     out[2] += two58m2;
 403     out[3] += two58m2;
 404
 405     out[0] -= in[0];
 406     out[1] -= in[1];
 407     out[2] -= in[2];
 408     out[3] -= in[3];
 409 }
 410
 411 /* Subtract in unreduced 128-bit mode: out -= in */
 412 /* Assumes in[i] < 2^119 */
 413 static void widefelem_diff(widefelem out, const widefelem in)
 414 {
 415     static const widelimb two120 = ((widelimb) 1) << 120;
 416     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 417         (((widelimb) 1) << 64);
 418     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 419         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 420
 421     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 422     out[0] += two120;
 423     out[1] += two120m64;
 424     out[2] += two120m64;
 425     out[3] += two120;
 426     out[4] += two120m104m64;
 427     out[5] += two120m64;
 428     out[6] += two120m64;
 429
 430     out[0] -= in[0];
 431     out[1] -= in[1];
 432     out[2] -= in[2];
 433     out[3] -= in[3];
 434     out[4] -= in[4];
 435     out[5] -= in[5];
 436     out[6] -= in[6];
 437 }
 438
 439 /* Subtract in mixed mode: out128 -= in64 */
 440 /* in[i] < 2^63 */
 441 static void felem_diff_128_64(widefelem out, const felem in)
 442 {
 443     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 444         (((widelimb) 1) << 8);
 445     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 446         (((widelimb) 1) << 8);
 447     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 448         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 449
 450     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 451     out[0] += two64p8;
 452     out[1] += two64m48m8;
 453     out[2] += two64m8;
 454     out[3] += two64m8;
 455
 456     out[0] -= in[0];
 457     out[1] -= in[1];
 458     out[2] -= in[2];
 459     out[3] -= in[3];
 460 }
 461
 462 /*
 463  * Multiply a field element by a scalar: out = out * scalar The scalars we
 464  * actually use are small, so results fit without overflow
 465  */
 466 static void felem_scalar(felem out, const limb scalar)
 467 {
 468     out[0] *= scalar;
 469     out[1] *= scalar;
 470     out[2] *= scalar;
 471     out[3] *= scalar;
 472 }
 473
 474 /*
 475  * Multiply an unreduced field element by a scalar: out = out * scalar The
 476  * scalars we actually use are small, so results fit without overflow
 477  */
 478 static void widefelem_scalar(widefelem out, const widelimb scalar)
 479 {
 480     out[0] *= scalar;
 481     out[1] *= scalar;
 482     out[2] *= scalar;
 483     out[3] *= scalar;
 484     out[4] *= scalar;
 485     out[5] *= scalar;
 486     out[6] *= scalar;
 487 }
 488
 489 /* Square a field element: out = in^2 */
 490 static void felem_square(widefelem out, const felem in)
 491 {
 492     limb tmp0, tmp1, tmp2;
 493     tmp0 = 2 * in[0];
 494     tmp1 = 2 * in[1];
 495     tmp2 = 2 * in[2];
 496     out[0] = ((widelimb) in[0]) * in[0];
 497     out[1] = ((widelimb) in[0]) * tmp1;
 498     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 499     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 500     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 501     out[5] = ((widelimb) in[3]) * tmp2;
 502     out[6] = ((widelimb) in[3]) * in[3];
 503 }
 504
 505 /* Multiply two field elements: out = in1 * in2 */
 506 static void felem_mul(widefelem out, const felem in1, const felem in2)
 507 {
 508     out[0] = ((widelimb) in1[0]) * in2[0];
 509     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 510     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 511              ((widelimb) in1[2]) * in2[0];
 512     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 513              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 514     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 515              ((widelimb) in1[3]) * in2[1];
 516     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 517     out[6] = ((widelimb) in1[3]) * in2[3];
 518 }
 519
 520 /*-
 521  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 522  * Requires in[i] < 2^126,
 523  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 524 static void felem_reduce(felem out, const widefelem in)
 525 {
 526     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 527         (((widelimb) 1) << 15);
 528     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 529         (((widelimb) 1) << 71);
 530     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 531         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 532     widelimb output[5];
 533
 534     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 535     output[0] = in[0] + two127p15;
 536     output[1] = in[1] + two127m71m55;
 537     output[2] = in[2] + two127m71;
 538     output[3] = in[3];
 539     output[4] = in[4];
 540
 541     /* Eliminate in[4], in[5], in[6] */
 542     output[4] += in[6] >> 16;
 543     output[3] += (in[6] & 0xffff) << 40;
 544     output[2] -= in[6];
 545
 546     output[3] += in[5] >> 16;
 547     output[2] += (in[5] & 0xffff) << 40;
 548     output[1] -= in[5];
 549
 550     output[2] += output[4] >> 16;
 551     output[1] += (output[4] & 0xffff) << 40;
 552     output[0] -= output[4];
 553
 554     /* Carry 2 -> 3 -> 4 */
 555     output[3] += output[2] >> 56;
 556     output[2] &= 0x00ffffffffffffff;
 557
 558     output[4] = output[3] >> 56;
 559     output[3] &= 0x00ffffffffffffff;
 560
 561     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 562
 563     /* Eliminate output[4] */
 564     output[2] += output[4] >> 16;
 565     /* output[2] < 2^56 + 2^56 = 2^57 */
 566     output[1] += (output[4] & 0xffff) << 40;
 567     output[0] -= output[4];
 568
 569     /* Carry 0 -> 1 -> 2 -> 3 */
 570     output[1] += output[0] >> 56;
 571     out[0] = output[0] & 0x00ffffffffffffff;
 572
 573     output[2] += output[1] >> 56;
 574     /* output[2] < 2^57 + 2^72 */
 575     out[1] = output[1] & 0x00ffffffffffffff;
 576     output[3] += output[2] >> 56;
 577     /* output[3] <= 2^56 + 2^16 */
 578     out[2] = output[2] & 0x00ffffffffffffff;
 579
 580     /*-
 581      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 582      * out[3] <= 2^56 + 2^16 (due to final carry),
 583      * so out < 2*p
 584      */
 585     out[3] = output[3];
 586 }
 587
 588 static void felem_square_reduce(felem out, const felem in)
 589 {
 590     widefelem tmp;
 591     felem_square(tmp, in);
 592     felem_reduce(out, tmp);
 593 }
 594
 595 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 596 {
 597     widefelem tmp;
 598     felem_mul(tmp, in1, in2);
 599     felem_reduce(out, tmp);
 600 }
 601
 602 /*
 603  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 604  * call felem_reduce first)
 605  */
 606 static void felem_contract(felem out, const felem in)
 607 {
 608     static const int64_t two56 = ((limb) 1) << 56;
 609     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 610     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 611     int64_t tmp[4], a;
 612     tmp[0] = in[0];
 613     tmp[1] = in[1];
 614     tmp[2] = in[2];
 615     tmp[3] = in[3];
 616     /* Case 1: a = 1 iff in >= 2^224 */
 617     a = (in[3] >> 56);
 618     tmp[0] -= a;
 619     tmp[1] += a << 40;
 620     tmp[3] &= 0x00ffffffffffffff;
 621     /*
 622      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 623      * and the lower part is non-zero
 624      */
 625     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 626         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 627     a &= 0x00ffffffffffffff;
 628     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 629     a = (a - 1) >> 63;
 630     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 631     tmp[3] &= a ^ 0xffffffffffffffff;
 632     tmp[2] &= a ^ 0xffffffffffffffff;
 633     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 634     tmp[0] -= 1 & a;
 635
 636     /*
 637      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 638      * non-zero, so we only need one step
 639      */
 640     a = tmp[0] >> 63;
 641     tmp[0] += two56 & a;
 642     tmp[1] -= 1 & a;
 643
 644     /* carry 1 -> 2 -> 3 */
 645     tmp[2] += tmp[1] >> 56;
 646     tmp[1] &= 0x00ffffffffffffff;
 647
 648     tmp[3] += tmp[2] >> 56;
 649     tmp[2] &= 0x00ffffffffffffff;
 650
 651     /* Now 0 <= out < p */
 652     out[0] = tmp[0];
 653     out[1] = tmp[1];
 654     out[2] = tmp[2];
 655     out[3] = tmp[3];
 656 }
 657
 658 /*
 659  * Get negative value: out = -in
 660  * Requires in[i] < 2^63,
 661  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
 662  */
 663 static void felem_neg(felem out, const felem in)
 664 {
 665     widefelem tmp = {0};
 666     felem_diff_128_64(tmp, in);
 667     felem_reduce(out, tmp);
 668 }
 669
 670 /*
 671  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 672  * elements are reduced to in < 2^225, so we only need to check three cases:
 673  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 674  */
 675 static limb felem_is_zero(const felem in)
 676 {
 677     limb zero, two224m96p1, two225m97p2;
 678
 679     zero = in[0] | in[1] | in[2] | in[3];
 680     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 681     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 682         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 683     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 684     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 685         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 686     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 687     return (zero | two224m96p1 | two225m97p2);
 688 }
 689
 690 static int felem_is_zero_int(const void *in)
 691 {
 692     return (int)(felem_is_zero(in) & ((limb) 1));
 693 }
 694
 695 /* Invert a field element */
 696 /* Computation chain copied from djb's code */
 697 static void felem_inv(felem out, const felem in)
 698 {
 699     felem ftmp, ftmp2, ftmp3, ftmp4;
 700     widefelem tmp;
 701     unsigned i;
 702
 703     felem_square(tmp, in);
 704     felem_reduce(ftmp, tmp);    /* 2 */
 705     felem_mul(tmp, in, ftmp);
 706     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 707     felem_square(tmp, ftmp);
 708     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 709     felem_mul(tmp, in, ftmp);
 710     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 711     felem_square(tmp, ftmp);
 712     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 713     felem_square(tmp, ftmp2);
 714     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 715     felem_square(tmp, ftmp2);
 716     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 717     felem_mul(tmp, ftmp2, ftmp);
 718     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 719     felem_square(tmp, ftmp);
 720     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 721     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 722         felem_square(tmp, ftmp2);
 723         felem_reduce(ftmp2, tmp);
 724     }
 725     felem_mul(tmp, ftmp2, ftmp);
 726     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 727     felem_square(tmp, ftmp2);
 728     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 729     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 730         felem_square(tmp, ftmp3);
 731         felem_reduce(ftmp3, tmp);
 732     }
 733     felem_mul(tmp, ftmp3, ftmp2);
 734     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 735     felem_square(tmp, ftmp2);
 736     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 737     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 738         felem_square(tmp, ftmp3);
 739         felem_reduce(ftmp3, tmp);
 740     }
 741     felem_mul(tmp, ftmp3, ftmp2);
 742     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 743     felem_square(tmp, ftmp3);
 744     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 745     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 746         felem_square(tmp, ftmp4);
 747         felem_reduce(ftmp4, tmp);
 748     }
 749     felem_mul(tmp, ftmp3, ftmp4);
 750     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 751     felem_square(tmp, ftmp3);
 752     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 753     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 754         felem_square(tmp, ftmp4);
 755         felem_reduce(ftmp4, tmp);
 756     }
 757     felem_mul(tmp, ftmp2, ftmp4);
 758     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 759     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 760         felem_square(tmp, ftmp2);
 761         felem_reduce(ftmp2, tmp);
 762     }
 763     felem_mul(tmp, ftmp2, ftmp);
 764     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 765     felem_square(tmp, ftmp);
 766     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 767     felem_mul(tmp, ftmp, in);
 768     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 769     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 770         felem_square(tmp, ftmp);
 771         felem_reduce(ftmp, tmp);
 772     }
 773     felem_mul(tmp, ftmp, ftmp3);
 774     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 775 }
 776
 777 /*
 778  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 779  * out to itself.
 780  */
 781 static void copy_conditional(felem out, const felem in, limb icopy)
 782 {
 783     unsigned i;
 784     /*
 785      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 786      */
 787     const limb copy = -icopy;
 788     for (i = 0; i < 4; ++i) {
 789         const limb tmp = copy & (in[i] ^ out[i]);
 790         out[i] ^= tmp;
 791     }
 792 }
 793
 794 /******************************************************************************/
 795 /*-
 796  *                       ELLIPTIC CURVE POINT OPERATIONS
 797  *
 798  * Points are represented in Jacobian projective coordinates:
 799  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 800  * or to the point at infinity if Z == 0.
 801  *
 802  */
 803
 804 /*-
 805  * Double an elliptic curve point:
 806  * (X', Y', Z') = 2 * (X, Y, Z), where
 807  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 808  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^4
 809  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 810  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 811  * while x_out == y_in is not (maybe this works, but it's not tested).
 812  */
 813 static void
 814 point_double(felem x_out, felem y_out, felem z_out,
 815              const felem x_in, const felem y_in, const felem z_in)
 816 {
 817     widefelem tmp, tmp2;
 818     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 819
 820     felem_assign(ftmp, x_in);
 821     felem_assign(ftmp2, x_in);
 822
 823     /* delta = z^2 */
 824     felem_square(tmp, z_in);
 825     felem_reduce(delta, tmp);
 826
 827     /* gamma = y^2 */
 828     felem_square(tmp, y_in);
 829     felem_reduce(gamma, tmp);
 830
 831     /* beta = x*gamma */
 832     felem_mul(tmp, x_in, gamma);
 833     felem_reduce(beta, tmp);
 834
 835     /* alpha = 3*(x-delta)*(x+delta) */
 836     felem_diff(ftmp, delta);
 837     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 838     felem_sum(ftmp2, delta);
 839     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 840     felem_scalar(ftmp2, 3);
 841     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 842     felem_mul(tmp, ftmp, ftmp2);
 843     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 844     felem_reduce(alpha, tmp);
 845
 846     /* x' = alpha^2 - 8*beta */
 847     felem_square(tmp, alpha);
 848     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 849     felem_assign(ftmp, beta);
 850     felem_scalar(ftmp, 8);
 851     /* ftmp[i] < 8 * 2^57 = 2^60 */
 852     felem_diff_128_64(tmp, ftmp);
 853     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 854     felem_reduce(x_out, tmp);
 855
 856     /* z' = (y + z)^2 - gamma - delta */
 857     felem_sum(delta, gamma);
 858     /* delta[i] < 2^57 + 2^57 = 2^58 */
 859     felem_assign(ftmp, y_in);
 860     felem_sum(ftmp, z_in);
 861     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 862     felem_square(tmp, ftmp);
 863     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 864     felem_diff_128_64(tmp, delta);
 865     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 866     felem_reduce(z_out, tmp);
 867
 868     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 869     felem_scalar(beta, 4);
 870     /* beta[i] < 4 * 2^57 = 2^59 */
 871     felem_diff(beta, x_out);
 872     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 873     felem_mul(tmp, alpha, beta);
 874     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 875     felem_square(tmp2, gamma);
 876     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 877     widefelem_scalar(tmp2, 8);
 878     /* tmp2[i] < 8 * 2^116 = 2^119 */
 879     widefelem_diff(tmp, tmp2);
 880     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 881     felem_reduce(y_out, tmp);
 882 }
 883
 884 /*-
 885  * Add two elliptic curve points:
 886  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 887  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 888  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 889  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 890  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 891  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 892  *
 893  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 894  */
 895
 896 /*
 897  * This function is not entirely constant-time: it includes a branch for
 898  * checking whether the two input points are equal, (while not equal to the
 899  * point at infinity). This case never happens during single point
 900  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 901  */
 902 static void point_add(felem x3, felem y3, felem z3,
 903                       const felem x1, const felem y1, const felem z1,
 904                       const int mixed, const felem x2, const felem y2,
 905                       const felem z2)
 906 {
 907     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 908     widefelem tmp, tmp2;
 909     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 910     limb points_equal;
 911
 912     if (!mixed) {
 913         /* ftmp2 = z2^2 */
 914         felem_square(tmp, z2);
 915         felem_reduce(ftmp2, tmp);
 916
 917         /* ftmp4 = z2^3 */
 918         felem_mul(tmp, ftmp2, z2);
 919         felem_reduce(ftmp4, tmp);
 920
 921         /* ftmp4 = z2^3*y1 */
 922         felem_mul(tmp2, ftmp4, y1);
 923         felem_reduce(ftmp4, tmp2);
 924
 925         /* ftmp2 = z2^2*x1 */
 926         felem_mul(tmp2, ftmp2, x1);
 927         felem_reduce(ftmp2, tmp2);
 928     } else {
 929         /*
 930          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 931          */
 932
 933         /* ftmp4 = z2^3*y1 */
 934         felem_assign(ftmp4, y1);
 935
 936         /* ftmp2 = z2^2*x1 */
 937         felem_assign(ftmp2, x1);
 938     }
 939
 940     /* ftmp = z1^2 */
 941     felem_square(tmp, z1);
 942     felem_reduce(ftmp, tmp);
 943
 944     /* ftmp3 = z1^3 */
 945     felem_mul(tmp, ftmp, z1);
 946     felem_reduce(ftmp3, tmp);
 947
 948     /* tmp = z1^3*y2 */
 949     felem_mul(tmp, ftmp3, y2);
 950     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 951
 952     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 953     felem_diff_128_64(tmp, ftmp4);
 954     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 955     felem_reduce(ftmp3, tmp);
 956
 957     /* tmp = z1^2*x2 */
 958     felem_mul(tmp, ftmp, x2);
 959     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 960
 961     /* ftmp = z1^2*x2 - z2^2*x1 */
 962     felem_diff_128_64(tmp, ftmp2);
 963     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 964     felem_reduce(ftmp, tmp);
 965
 966     /*
 967      * The formulae are incorrect if the points are equal, in affine coordinates
 968      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
 969      * happens.
 970      *
 971      * We use bitwise operations to avoid potential side-channels introduced by
 972      * the short-circuiting behaviour of boolean operators.
 973      */
 974     x_equal = felem_is_zero(ftmp);
 975     y_equal = felem_is_zero(ftmp3);
 976     /*
 977      * The special case of either point being the point at infinity (z1 and/or
 978      * z2 are zero), is handled separately later on in this function, so we
 979      * avoid jumping to point_double here in those special cases.
 980      */
 981     z1_is_zero = felem_is_zero(z1);
 982     z2_is_zero = felem_is_zero(z2);
 983
 984     /*
 985      * Compared to `ecp_nistp256.c` and `ecp_nistp521.c`, in this
 986      * specific implementation `felem_is_zero()` returns truth as `0x1`
 987      * (rather than `0xff..ff`).
 988      *
 989      * This implies that `~true` in this implementation becomes
 990      * `0xff..fe` (rather than `0x0`): for this reason, to be used in
 991      * the if expression, we mask out only the last bit in the next
 992      * line.
 993      */
 994     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero)) & 1;
 995
 996     if (points_equal) {
 997         /*
 998          * This is obviously not constant-time but, as mentioned before, this
 999          * case never happens during single point multiplication, so there is no
1000          * timing leak for ECDH or ECDSA signing.
1001          */
1002         point_double(x3, y3, z3, x1, y1, z1);
1003         return;
1004     }
1005
1006     /* ftmp5 = z1*z2 */
1007     if (!mixed) {
1008         felem_mul(tmp, z1, z2);
1009         felem_reduce(ftmp5, tmp);
1010     } else {
1011         /* special case z2 = 0 is handled later */
1012         felem_assign(ftmp5, z1);
1013     }
1014
1015     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
1016     felem_mul(tmp, ftmp, ftmp5);
1017     felem_reduce(z_out, tmp);
1018
1019     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
1020     felem_assign(ftmp5, ftmp);
1021     felem_square(tmp, ftmp);
1022     felem_reduce(ftmp, tmp);
1023
1024     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
1025     felem_mul(tmp, ftmp, ftmp5);
1026     felem_reduce(ftmp5, tmp);
1027
1028     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1029     felem_mul(tmp, ftmp2, ftmp);
1030     felem_reduce(ftmp2, tmp);
1031
1032     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1033     felem_mul(tmp, ftmp4, ftmp5);
1034     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1035
1036     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1037     felem_square(tmp2, ftmp3);
1038     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1039
1040     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1041     felem_diff_128_64(tmp2, ftmp5);
1042     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1043
1044     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1045     felem_assign(ftmp5, ftmp2);
1046     felem_scalar(ftmp5, 2);
1047     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1048
1049     /*-
1050      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1051      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1052      */
1053     felem_diff_128_64(tmp2, ftmp5);
1054     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1055     felem_reduce(x_out, tmp2);
1056
1057     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1058     felem_diff(ftmp2, x_out);
1059     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1060
1061     /*
1062      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1063      */
1064     felem_mul(tmp2, ftmp3, ftmp2);
1065     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1066
1067     /*-
1068      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1069      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1070      */
1071     widefelem_diff(tmp2, tmp);
1072     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1073     felem_reduce(y_out, tmp2);
1074
1075     /*
1076      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1077      * the point at infinity, so we need to check for this separately
1078      */
1079
1080     /*
1081      * if point 1 is at infinity, copy point 2 to output, and vice versa
1082      */
1083     copy_conditional(x_out, x2, z1_is_zero);
1084     copy_conditional(x_out, x1, z2_is_zero);
1085     copy_conditional(y_out, y2, z1_is_zero);
1086     copy_conditional(y_out, y1, z2_is_zero);
1087     copy_conditional(z_out, z2, z1_is_zero);
1088     copy_conditional(z_out, z1, z2_is_zero);
1089     felem_assign(x3, x_out);
1090     felem_assign(y3, y_out);
1091     felem_assign(z3, z_out);
1092 }
1093
1094 /*
1095  * select_point selects the |idx|th point from a precomputation table and
1096  * copies it to out.
1097  * The pre_comp array argument should be size of |size| argument
1098  */
1099 static void select_point(const u64 idx, unsigned int size,
1100                          const felem pre_comp[][3], felem out[3])
1101 {
1102     unsigned i, j;
1103     limb *outlimbs = &out[0][0];
1104
1105     memset(out, 0, sizeof(*out) * 3);
1106     for (i = 0; i < size; i++) {
1107         const limb *inlimbs = &pre_comp[i][0][0];
1108         u64 mask = i ^ idx;
1109         mask |= mask >> 4;
1110         mask |= mask >> 2;
1111         mask |= mask >> 1;
1112         mask &= 1;
1113         mask--;
1114         for (j = 0; j < 4 * 3; j++)
1115             outlimbs[j] |= inlimbs[j] & mask;
1116     }
1117 }
1118
1119 /* get_bit returns the |i|th bit in |in| */
1120 static char get_bit(const felem_bytearray in, unsigned i)
1121 {
1122     if (i >= 224)
1123         return 0;
1124     return (in[i >> 3] >> (i & 7)) & 1;
1125 }
1126
1127 /*
1128  * Interleaved point multiplication using precomputed point multiples: The
1129  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1130  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1131  * generator, using certain (large) precomputed multiples in g_pre_comp.
1132  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1133  */
1134 static void batch_mul(felem x_out, felem y_out, felem z_out,
1135                       const felem_bytearray scalars[],
1136                       const unsigned num_points, const u8 *g_scalar,
1137                       const int mixed, const felem pre_comp[][17][3],
1138                       const felem g_pre_comp[2][16][3])
1139 {
1140     int i, skip;
1141     unsigned num;
1142     unsigned gen_mul = (g_scalar != NULL);
1143     felem nq[3], tmp[4];
1144     u64 bits;
1145     u8 sign, digit;
1146
1147     /* set nq to the point at infinity */
1148     memset(nq, 0, sizeof(nq));
1149
1150     /*
1151      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1152      * of the generator (two in each of the last 28 rounds) and additions of
1153      * other points multiples (every 5th round).
1154      */
1155     skip = 1;                   /* save two point operations in the first
1156                                  * round */
1157     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1158         /* double */
1159         if (!skip)
1160             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1161
1162         /* add multiples of the generator */
1163         if (gen_mul && (i <= 27)) {
1164             /* first, look 28 bits upwards */
1165             bits = get_bit(g_scalar, i + 196) << 3;
1166             bits |= get_bit(g_scalar, i + 140) << 2;
1167             bits |= get_bit(g_scalar, i + 84) << 1;
1168             bits |= get_bit(g_scalar, i + 28);
1169             /* select the point to add, in constant time */
1170             select_point(bits, 16, g_pre_comp[1], tmp);
1171
1172             if (!skip) {
1173                 /* value 1 below is argument for "mixed" */
1174                 point_add(nq[0], nq[1], nq[2],
1175                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1176             } else {
1177                 memcpy(nq, tmp, 3 * sizeof(felem));
1178                 skip = 0;
1179             }
1180
1181             /* second, look at the current position */
1182             bits = get_bit(g_scalar, i + 168) << 3;
1183             bits |= get_bit(g_scalar, i + 112) << 2;
1184             bits |= get_bit(g_scalar, i + 56) << 1;
1185             bits |= get_bit(g_scalar, i);
1186             /* select the point to add, in constant time */
1187             select_point(bits, 16, g_pre_comp[0], tmp);
1188             point_add(nq[0], nq[1], nq[2],
1189                       nq[0], nq[1], nq[2],
1190                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1191         }
1192
1193         /* do other additions every 5 doublings */
1194         if (num_points && (i % 5 == 0)) {
1195             /* loop over all scalars */
1196             for (num = 0; num < num_points; ++num) {
1197                 bits = get_bit(scalars[num], i + 4) << 5;
1198                 bits |= get_bit(scalars[num], i + 3) << 4;
1199                 bits |= get_bit(scalars[num], i + 2) << 3;
1200                 bits |= get_bit(scalars[num], i + 1) << 2;
1201                 bits |= get_bit(scalars[num], i) << 1;
1202                 bits |= get_bit(scalars[num], i - 1);
1203                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1204
1205                 /* select the point to add or subtract */
1206                 select_point(digit, 17, pre_comp[num], tmp);
1207                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1208                                             * point */
1209                 copy_conditional(tmp[1], tmp[3], sign);
1210
1211                 if (!skip) {
1212                     point_add(nq[0], nq[1], nq[2],
1213                               nq[0], nq[1], nq[2],
1214                               mixed, tmp[0], tmp[1], tmp[2]);
1215                 } else {
1216                     memcpy(nq, tmp, 3 * sizeof(felem));
1217                     skip = 0;
1218                 }
1219             }
1220         }
1221     }
1222     felem_assign(x_out, nq[0]);
1223     felem_assign(y_out, nq[1]);
1224     felem_assign(z_out, nq[2]);
1225 }
1226
1227 /******************************************************************************/
1228 /*
1229  * FUNCTIONS TO MANAGE PRECOMPUTATION
1230  */
1231
1232 static NISTP224_PRE_COMP *nistp224_pre_comp_new(void)
1233 {
1234     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1235
1236     if (!ret) {
1237         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1238         return ret;
1239     }
1240
1241     ret->references = 1;
1242
1243     ret->lock = CRYPTO_THREAD_lock_new();
1244     if (ret->lock == NULL) {
1245         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1246         OPENSSL_free(ret);
1247         return NULL;
1248     }
1249     return ret;
1250 }
1251
1252 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1253 {
1254     int i;
1255     if (p != NULL)
1256         CRYPTO_UP_REF(&p->references, &i, p->lock);
1257     return p;
1258 }
1259
1260 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1261 {
1262     int i;
1263
1264     if (p == NULL)
1265         return;
1266
1267     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1268     REF_PRINT_COUNT("EC_nistp224", x);
1269     if (i > 0)
1270         return;
1271     REF_ASSERT_ISNT(i < 0);
1272
1273     CRYPTO_THREAD_lock_free(p->lock);
1274     OPENSSL_free(p);
1275 }
1276
1277 /******************************************************************************/
1278 /*
1279  * OPENSSL EC_METHOD FUNCTIONS
1280  */
1281
1282 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1283 {
1284     int ret;
1285     ret = ec_GFp_simple_group_init(group);
1286     group->a_is_minus3 = 1;
1287     return ret;
1288 }
1289
1290 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1291                                     const BIGNUM *a, const BIGNUM *b,
1292                                     BN_CTX *ctx)
1293 {
1294     int ret = 0;
1295     BN_CTX *new_ctx = NULL;
1296     BIGNUM *curve_p, *curve_a, *curve_b;
1297
1298     if (ctx == NULL)
1299         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1300             return 0;
1301     BN_CTX_start(ctx);
1302     curve_p = BN_CTX_get(ctx);
1303     curve_a = BN_CTX_get(ctx);
1304     curve_b = BN_CTX_get(ctx);
1305     if (curve_b == NULL)
1306         goto err;
1307     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1308     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1309     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1310     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1311         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1312               EC_R_WRONG_CURVE_PARAMETERS);
1313         goto err;
1314     }
1315     group->field_mod_func = BN_nist_mod_224;
1316     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1317  err:
1318     BN_CTX_end(ctx);
1319     BN_CTX_free(new_ctx);
1320     return ret;
1321 }
1322
1323 /*
1324  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1325  * (X/Z^2, Y/Z^3)
1326  */
1327 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1328                                                  const EC_POINT *point,
1329                                                  BIGNUM *x, BIGNUM *y,
1330                                                  BN_CTX *ctx)
1331 {
1332     felem z1, z2, x_in, y_in, x_out, y_out;
1333     widefelem tmp;
1334
1335     if (EC_POINT_is_at_infinity(group, point)) {
1336         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1337               EC_R_POINT_AT_INFINITY);
1338         return 0;
1339     }
1340     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1341         (!BN_to_felem(z1, point->Z)))
1342         return 0;
1343     felem_inv(z2, z1);
1344     felem_square(tmp, z2);
1345     felem_reduce(z1, tmp);
1346     felem_mul(tmp, x_in, z1);
1347     felem_reduce(x_in, tmp);
1348     felem_contract(x_out, x_in);
1349     if (x != NULL) {
1350         if (!felem_to_BN(x, x_out)) {
1351             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1352                   ERR_R_BN_LIB);
1353             return 0;
1354         }
1355     }
1356     felem_mul(tmp, z1, z2);
1357     felem_reduce(z1, tmp);
1358     felem_mul(tmp, y_in, z1);
1359     felem_reduce(y_in, tmp);
1360     felem_contract(y_out, y_in);
1361     if (y != NULL) {
1362         if (!felem_to_BN(y, y_out)) {
1363             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1364                   ERR_R_BN_LIB);
1365             return 0;
1366         }
1367     }
1368     return 1;
1369 }
1370
1371 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1372                                felem tmp_felems[ /* num+1 */ ])
1373 {
1374     /*
1375      * Runs in constant time, unless an input is the point at infinity (which
1376      * normally shouldn't happen).
1377      */
1378     ec_GFp_nistp_points_make_affine_internal(num,
1379                                              points,
1380                                              sizeof(felem),
1381                                              tmp_felems,
1382                                              (void (*)(void *))felem_one,
1383                                              felem_is_zero_int,
1384                                              (void (*)(void *, const void *))
1385                                              felem_assign,
1386                                              (void (*)(void *, const void *))
1387                                              felem_square_reduce, (void (*)
1388                                                                    (void *,
1389                                                                     const void
1390                                                                     *,
1391                                                                     const void
1392                                                                     *))
1393                                              felem_mul_reduce,
1394                                              (void (*)(void *, const void *))
1395                                              felem_inv,
1396                                              (void (*)(void *, const void *))
1397                                              felem_contract);
1398 }
1399
1400 /*
1401  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1402  * values Result is stored in r (r can equal one of the inputs).
1403  */
1404 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1405                                const BIGNUM *scalar, size_t num,
1406                                const EC_POINT *points[],
1407                                const BIGNUM *scalars[], BN_CTX *ctx)
1408 {
1409     int ret = 0;
1410     int j;
1411     unsigned i;
1412     int mixed = 0;
1413     BIGNUM *x, *y, *z, *tmp_scalar;
1414     felem_bytearray g_secret;
1415     felem_bytearray *secrets = NULL;
1416     felem (*pre_comp)[17][3] = NULL;
1417     felem *tmp_felems = NULL;
1418     int num_bytes;
1419     int have_pre_comp = 0;
1420     size_t num_points = num;
1421     felem x_in, y_in, z_in, x_out, y_out, z_out;
1422     NISTP224_PRE_COMP *pre = NULL;
1423     const felem(*g_pre_comp)[16][3] = NULL;
1424     EC_POINT *generator = NULL;
1425     const EC_POINT *p = NULL;
1426     const BIGNUM *p_scalar = NULL;
1427
1428     BN_CTX_start(ctx);
1429     x = BN_CTX_get(ctx);
1430     y = BN_CTX_get(ctx);
1431     z = BN_CTX_get(ctx);
1432     tmp_scalar = BN_CTX_get(ctx);
1433     if (tmp_scalar == NULL)
1434         goto err;
1435
1436     if (scalar != NULL) {
1437         pre = group->pre_comp.nistp224;
1438         if (pre)
1439             /* we have precomputation, try to use it */
1440             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1441         else
1442             /* try to use the standard precomputation */
1443             g_pre_comp = &gmul[0];
1444         generator = EC_POINT_new(group);
1445         if (generator == NULL)
1446             goto err;
1447         /* get the generator from precomputation */
1448         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1449             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1450             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1451             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1452             goto err;
1453         }
1454         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1455                                                       generator, x, y, z,
1456                                                       ctx))
1457             goto err;
1458         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1459             /* precomputation matches generator */
1460             have_pre_comp = 1;
1461         else
1462             /*
1463              * we don't have valid precomputation: treat the generator as a
1464              * random point
1465              */
1466             num_points = num_points + 1;
1467     }
1468
1469     if (num_points > 0) {
1470         if (num_points >= 3) {
1471             /*
1472              * unless we precompute multiples for just one or two points,
1473              * converting those into affine form is time well spent
1474              */
1475             mixed = 1;
1476         }
1477         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1478         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1479         if (mixed)
1480             tmp_felems =
1481                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1482         if ((secrets == NULL) || (pre_comp == NULL)
1483             || (mixed && (tmp_felems == NULL))) {
1484             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1485             goto err;
1486         }
1487
1488         /*
1489          * we treat NULL scalars as 0, and NULL points as points at infinity,
1490          * i.e., they contribute nothing to the linear combination
1491          */
1492         for (i = 0; i < num_points; ++i) {
1493             if (i == num) {
1494                 /* the generator */
1495                 p = EC_GROUP_get0_generator(group);
1496                 p_scalar = scalar;
1497             } else {
1498                 /* the i^th point */
1499                 p = points[i];
1500                 p_scalar = scalars[i];
1501             }
1502             if ((p_scalar != NULL) && (p != NULL)) {
1503                 /* reduce scalar to 0 <= scalar < 2^224 */
1504                 if ((BN_num_bits(p_scalar) > 224)
1505                     || (BN_is_negative(p_scalar))) {
1506                     /*
1507                      * this is an unusual input, and we don't guarantee
1508                      * constant-timeness
1509                      */
1510                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1511                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1512                         goto err;
1513                     }
1514                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1515                                                secrets[i], sizeof(secrets[i]));
1516                 } else {
1517                     num_bytes = BN_bn2lebinpad(p_scalar,
1518                                                secrets[i], sizeof(secrets[i]));
1519                 }
1520                 if (num_bytes < 0) {
1521                     ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1522                     goto err;
1523                 }
1524                 /* precompute multiples */
1525                 if ((!BN_to_felem(x_out, p->X)) ||
1526                     (!BN_to_felem(y_out, p->Y)) ||
1527                     (!BN_to_felem(z_out, p->Z)))
1528                     goto err;
1529                 felem_assign(pre_comp[i][1][0], x_out);
1530                 felem_assign(pre_comp[i][1][1], y_out);
1531                 felem_assign(pre_comp[i][1][2], z_out);
1532                 for (j = 2; j <= 16; ++j) {
1533                     if (j & 1) {
1534                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1535                                   pre_comp[i][j][2], pre_comp[i][1][0],
1536                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1537                                   pre_comp[i][j - 1][0],
1538                                   pre_comp[i][j - 1][1],
1539                                   pre_comp[i][j - 1][2]);
1540                     } else {
1541                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1542                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1543                                      pre_comp[i][j / 2][1],
1544                                      pre_comp[i][j / 2][2]);
1545                     }
1546                 }
1547             }
1548         }
1549         if (mixed)
1550             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1551     }
1552
1553     /* the scalar for the generator */
1554     if ((scalar != NULL) && (have_pre_comp)) {
1555         memset(g_secret, 0, sizeof(g_secret));
1556         /* reduce scalar to 0 <= scalar < 2^224 */
1557         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1558             /*
1559              * this is an unusual input, and we don't guarantee
1560              * constant-timeness
1561              */
1562             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1563                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1564                 goto err;
1565             }
1566             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1567         } else {
1568             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1569         }
1570         /* do the multiplication with generator precomputation */
1571         batch_mul(x_out, y_out, z_out,
1572                   (const felem_bytearray(*))secrets, num_points,
1573                   g_secret,
1574                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1575     } else {
1576         /* do the multiplication without generator precomputation */
1577         batch_mul(x_out, y_out, z_out,
1578                   (const felem_bytearray(*))secrets, num_points,
1579                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1580     }
1581     /* reduce the output to its unique minimal representation */
1582     felem_contract(x_in, x_out);
1583     felem_contract(y_in, y_out);
1584     felem_contract(z_in, z_out);
1585     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1586         (!felem_to_BN(z, z_in))) {
1587         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1588         goto err;
1589     }
1590     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1591
1592  err:
1593     BN_CTX_end(ctx);
1594     EC_POINT_free(generator);
1595     OPENSSL_free(secrets);
1596     OPENSSL_free(pre_comp);
1597     OPENSSL_free(tmp_felems);
1598     return ret;
1599 }
1600
1601 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1602 {
1603     int ret = 0;
1604     NISTP224_PRE_COMP *pre = NULL;
1605     int i, j;
1606     BN_CTX *new_ctx = NULL;
1607     BIGNUM *x, *y;
1608     EC_POINT *generator = NULL;
1609     felem tmp_felems[32];
1610
1611     /* throw away old precomputation */
1612     EC_pre_comp_free(group);
1613     if (ctx == NULL)
1614         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1615             return 0;
1616     BN_CTX_start(ctx);
1617     x = BN_CTX_get(ctx);
1618     y = BN_CTX_get(ctx);
1619     if (y == NULL)
1620         goto err;
1621     /* get the generator */
1622     if (group->generator == NULL)
1623         goto err;
1624     generator = EC_POINT_new(group);
1625     if (generator == NULL)
1626         goto err;
1627     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1628     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1629     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1630         goto err;
1631     if ((pre = nistp224_pre_comp_new()) == NULL)
1632         goto err;
1633     /*
1634      * if the generator is the standard one, use built-in precomputation
1635      */
1636     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1637         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1638         goto done;
1639     }
1640     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1641         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1642         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1643         goto err;
1644     /*
1645      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1646      * 2^140*G, 2^196*G for the second one
1647      */
1648     for (i = 1; i <= 8; i <<= 1) {
1649         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1650                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1651                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1652         for (j = 0; j < 27; ++j) {
1653             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1654                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1655                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1656         }
1657         if (i == 8)
1658             break;
1659         point_double(pre->g_pre_comp[0][2 * i][0],
1660                      pre->g_pre_comp[0][2 * i][1],
1661                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1662                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1663         for (j = 0; j < 27; ++j) {
1664             point_double(pre->g_pre_comp[0][2 * i][0],
1665                          pre->g_pre_comp[0][2 * i][1],
1666                          pre->g_pre_comp[0][2 * i][2],
1667                          pre->g_pre_comp[0][2 * i][0],
1668                          pre->g_pre_comp[0][2 * i][1],
1669                          pre->g_pre_comp[0][2 * i][2]);
1670         }
1671     }
1672     for (i = 0; i < 2; i++) {
1673         /* g_pre_comp[i][0] is the point at infinity */
1674         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1675         /* the remaining multiples */
1676         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1677         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1678                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1679                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1680                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1681                   pre->g_pre_comp[i][2][2]);
1682         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1683         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1684                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1685                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1686                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1687                   pre->g_pre_comp[i][2][2]);
1688         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1689         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1690                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1691                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1692                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1693                   pre->g_pre_comp[i][4][2]);
1694         /*
1695          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1696          */
1697         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1698                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1699                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1700                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1701                   pre->g_pre_comp[i][2][2]);
1702         for (j = 1; j < 8; ++j) {
1703             /* odd multiples: add G resp. 2^28*G */
1704             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1705                       pre->g_pre_comp[i][2 * j + 1][1],
1706                       pre->g_pre_comp[i][2 * j + 1][2],
1707                       pre->g_pre_comp[i][2 * j][0],
1708                       pre->g_pre_comp[i][2 * j][1],
1709                       pre->g_pre_comp[i][2 * j][2], 0,
1710                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1711                       pre->g_pre_comp[i][1][2]);
1712         }
1713     }
1714     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1715
1716  done:
1717     SETPRECOMP(group, nistp224, pre);
1718     pre = NULL;
1719     ret = 1;
1720  err:
1721     BN_CTX_end(ctx);
1722     EC_POINT_free(generator);
1723     BN_CTX_free(new_ctx);
1724     EC_nistp224_pre_comp_free(pre);
1725     return ret;
1726 }
1727
1728 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1729 {
1730     return HAVEPRECOMP(group, nistp224);
1731 }
1732
1733 #endif