crypto/ec/ecp_nistp224.c

   1 /*
   2  * Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  28  *
  29  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  30  * and Adam Langley's public domain 64-bit C implementation of curve25519
  31  */
  32
  33 #include <openssl/opensslconf.h>
  34 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  35 NON_EMPTY_TRANSLATION_UNIT
  36 #else
  37
  38 # include <stdint.h>
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
/*
 * The wide limbs used for multiplication results in this file are 128 bits
 * wide, so refuse to build unless the compiler advertises a 16-byte __int128.
 */
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Your compiler doesn't appear to support 128-bit integer types"
# endif
  50
/* Short aliases for the fixed-width unsigned 8-bit and 64-bit types. */
typedef uint8_t u8;
typedef uint64_t u64;
  53
  54 /******************************************************************************/
  55 /*-
  56  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  57  *
  58  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  59  * using 64-bit coefficients called 'limbs',
  60  * and sometimes (for multiplication results) as
  61  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  62  * using 128-bit coefficients called 'widelimbs'.
  63  * A 4-limb representation is an 'felem';
  64  * a 7-widelimb representation is a 'widefelem'.
  65  * Even within felems, bits of adjacent limbs overlap, and we don't always
  66  * reduce the representations: we ensure that inputs to each felem
  67  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  68  * and fit into a 128-bit word without overflow. The coefficients are then
  69  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  70  * We only reduce to the unique minimal representation at the end of the
  71  * computation.
  72  */
  73
typedef uint64_t limb;          /* one 64-bit coefficient a_i */
typedef uint128_t widelimb;     /* one 128-bit coefficient b_i */

typedef limb felem[4];          /* a_0 .. a_3, weights 2^0, 2^56, 2^112, 2^168 */
typedef widelimb widefelem[7];  /* b_0 .. b_6, weights 2^0 .. 2^336 in steps of 2^56 */
  79
/*
 * Field element represented as a byte array. 28*8 = 224 bits is also the
 * group order size for the elliptic curve, and we also use this type for
 * scalars for point multiplication.
 *
 * The type itself does not fix a byte order: the curve parameter table below
 * is written most significant byte first, whereas felem_to_bin28() emits the
 * least significant byte first.
 */
typedef u8 felem_bytearray[28];
  86
/*
 * Curve parameters as 28-byte strings, most significant byte first, in the
 * order p, a, b, x, y (see the per-entry comments):
 *   p is the field prime 2^224 - 2^96 + 1,
 *   a equals p - 3,
 *   x and y are the limbs of gmul[0][1] below, i.e. the entry that table's
 *   index comment labels 1G.
 */
static const felem_bytearray nistp224_curve_params[5] = {
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
    {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
     0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
     0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
    {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
     0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
     0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
    {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
     0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
     0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
};
 104
 105 /*-
 106  * Precomputed multiples of the standard generator
 107  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 108  * (0 for the point at infinity).
 109  * For each field element, slice a_0 is word 0, etc.
 110  *
 111  * The table has 2 * 16 elements, starting with the following:
 112  * index | bits    | point
 113  * ------+---------+------------------------------
 114  *     0 | 0 0 0 0 | 0G
 115  *     1 | 0 0 0 1 | 1G
 116  *     2 | 0 0 1 0 | 2^56G
 117  *     3 | 0 0 1 1 | (2^56 + 1)G
 118  *     4 | 0 1 0 0 | 2^112G
 119  *     5 | 0 1 0 1 | (2^112 + 1)G
 120  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 121  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 122  *     8 | 1 0 0 0 | 2^168G
 123  *     9 | 1 0 0 1 | (2^168 + 1)G
 124  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 125  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 126  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 127  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 128  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 129  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 130  * followed by a copy of this with each element multiplied by 2^28.
 131  *
 132  * The reason for this is so that we can clock bits into four different
 133  * locations when doing simple scalar multiplies against the base point,
 134  * and then another four locations using the second 16 elements.
 135  */
static const felem gmul[2][16][3] = {
/*
 * gmul[0][i]: entry i is the sum of 2^(56*k)*G over the set bits k of the
 * 4-bit index i. Each entry is {X, Y, Z} with four 56-bit limbs per
 * coordinate, least significant limb first; entry 0 is the point at
 * infinity (Z = 0).
 */
{{{0, 0, 0, 0},
  {0, 0, 0, 0},
  {0, 0, 0, 0}},
 {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
  {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
  {1, 0, 0, 0}},
 {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
  {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
  {1, 0, 0, 0}},
 {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
  {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
  {1, 0, 0, 0}},
 {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
  {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
  {1, 0, 0, 0}},
 {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
  {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
  {1, 0, 0, 0}},
 {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
  {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
  {1, 0, 0, 0}},
 {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
  {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
  {1, 0, 0, 0}},
 {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
  {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
  {1, 0, 0, 0}},
 {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
  {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
  {1, 0, 0, 0}},
 {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
  {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
  {1, 0, 0, 0}},
 {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
  {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
  {1, 0, 0, 0}},
 {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
  {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
  {1, 0, 0, 0}},
 {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
  {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
  {1, 0, 0, 0}},
 {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
  {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
  {1, 0, 0, 0}},
 {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
  {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
  {1, 0, 0, 0}}},
/* gmul[1][i]: the same 16 combinations, each multiplied by 2^28 */
{{{0, 0, 0, 0},
  {0, 0, 0, 0},
  {0, 0, 0, 0}},
 {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
  {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
  {1, 0, 0, 0}},
 {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
  {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
  {1, 0, 0, 0}},
 {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
  {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
  {1, 0, 0, 0}},
 {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
  {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
  {1, 0, 0, 0}},
 {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
  {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
  {1, 0, 0, 0}},
 {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
  {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
  {1, 0, 0, 0}},
 {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
  {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
  {1, 0, 0, 0}},
 {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
  {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
  {1, 0, 0, 0}},
 {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
  {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
  {1, 0, 0, 0}},
 {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
  {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
  {1, 0, 0, 0}},
 {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
  {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
  {1, 0, 0, 0}},
 {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
  {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
  {1, 0, 0, 0}},
 {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
  {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
  {1, 0, 0, 0}},
 {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
  {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
  {1, 0, 0, 0}},
 {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
  {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
  {1, 0, 0, 0}}}
};
 234
/* Precomputation for the group generator. */
struct nistp224_pre_comp_st {
    /* Same dimensions as the static gmul table: [2][16] points of {X, Y, Z} */
    felem g_pre_comp[2][16][3];
    /* NOTE(review): presumably counts owners of this object - confirm at the users */
    CRYPTO_REF_COUNT references;
    /* NOTE(review): presumably guards updates of 'references' - confirm at the users */
    CRYPTO_RWLOCK *lock;
};
 241
/*
 * Return the EC_METHOD table for this NIST P-224 implementation.
 *
 * The table is a function-local static const object, so every call returns
 * the same address. The initializer is positional: entries must stay in
 * exactly this order. Entries named ec_GFp_nistp224_* are specific to this
 * implementation (group init, set_curve, affine coordinate getter, point
 * multiplication and its precomputation hooks); the remaining non-zero
 * entries name routines without the nistp224 prefix (ec_GFp_simple_*,
 * ec_GFp_nist_*, ec_key_simple_*, ecdh_simple_*, ecdsa_simple_*). Slots set
 * to 0 are left unset; their trailing comments name the slot.
 */
const EC_METHOD *EC_GFp_nistp224_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp224_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp224_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp224_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp224_points_mul,
        ec_GFp_nistp224_precompute_mult,
        ec_GFp_nistp224_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        ec_GFp_simple_field_inv,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key,
        ecdsa_simple_sign_setup,
        ecdsa_simple_sign_sig,
        ecdsa_simple_verify_sig,
        0, /* field_inverse_mod_ord */
        0, /* blind_coordinates */
        0, /* ladder_pre */
        0, /* ladder_step */
        0  /* ladder_post */
    };

    return &ret;
}
 307
 308 /*
 309  * Helper functions to convert field elements to/from internal representation
 310  */
 311 static void bin28_to_felem(felem out, const u8 in[28])
 312 {
 313     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 314     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 315     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 316     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 317 }
 318
 319 static void felem_to_bin28(u8 out[28], const felem in)
 320 {
 321     unsigned i;
 322     for (i = 0; i < 7; ++i) {
 323         out[i] = in[0] >> (8 * i);
 324         out[i + 7] = in[1] >> (8 * i);
 325         out[i + 14] = in[2] >> (8 * i);
 326         out[i + 21] = in[3] >> (8 * i);
 327     }
 328 }
 329
 330 /* From OpenSSL BIGNUM to internal representation */
 331 static int BN_to_felem(felem out, const BIGNUM *bn)
 332 {
 333     felem_bytearray b_out;
 334     int num_bytes;
 335
 336     if (BN_is_negative(bn)) {
 337         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 338         return 0;
 339     }
 340     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 341     if (num_bytes < 0) {
 342         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 343         return 0;
 344     }
 345     bin28_to_felem(out, b_out);
 346     return 1;
 347 }
 348
 349 /* From internal representation to OpenSSL BIGNUM */
 350 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 351 {
 352     felem_bytearray b_out;
 353     felem_to_bin28(b_out, in);
 354     return BN_lebin2bn(b_out, sizeof(b_out), out);
 355 }
 356
 357 /******************************************************************************/
 358 /*-
 359  *                              FIELD OPERATIONS
 360  *
 361  * Field operations, using the internal representation of field elements.
 362  * NB! These operations are specific to our point multiplication and cannot be
 363  * expected to be correct in general - e.g., multiplication with a large scalar
 364  * will cause an overflow.
 365  *
 366  */
 367
 368 static void felem_one(felem out)
 369 {
 370     out[0] = 1;
 371     out[1] = 0;
 372     out[2] = 0;
 373     out[3] = 0;
 374 }
 375
 376 static void felem_assign(felem out, const felem in)
 377 {
 378     out[0] = in[0];
 379     out[1] = in[1];
 380     out[2] = in[2];
 381     out[3] = in[3];
 382 }
 383
 384 /* Sum two field elements: out += in */
 385 static void felem_sum(felem out, const felem in)
 386 {
 387     out[0] += in[0];
 388     out[1] += in[1];
 389     out[2] += in[2];
 390     out[3] += in[3];
 391 }
 392
 393 /* Subtract field elements: out -= in */
 394 /* Assumes in[i] < 2^57 */
 395 static void felem_diff(felem out, const felem in)
 396 {
 397     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 398     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 399     static const limb two58m42m2 = (((limb) 1) << 58) -
 400         (((limb) 1) << 42) - (((limb) 1) << 2);
 401
 402     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 403     out[0] += two58p2;
 404     out[1] += two58m42m2;
 405     out[2] += two58m2;
 406     out[3] += two58m2;
 407
 408     out[0] -= in[0];
 409     out[1] -= in[1];
 410     out[2] -= in[2];
 411     out[3] -= in[3];
 412 }
 413
 414 /* Subtract in unreduced 128-bit mode: out -= in */
 415 /* Assumes in[i] < 2^119 */
 416 static void widefelem_diff(widefelem out, const widefelem in)
 417 {
 418     static const widelimb two120 = ((widelimb) 1) << 120;
 419     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 420         (((widelimb) 1) << 64);
 421     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 422         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 423
 424     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 425     out[0] += two120;
 426     out[1] += two120m64;
 427     out[2] += two120m64;
 428     out[3] += two120;
 429     out[4] += two120m104m64;
 430     out[5] += two120m64;
 431     out[6] += two120m64;
 432
 433     out[0] -= in[0];
 434     out[1] -= in[1];
 435     out[2] -= in[2];
 436     out[3] -= in[3];
 437     out[4] -= in[4];
 438     out[5] -= in[5];
 439     out[6] -= in[6];
 440 }
 441
 442 /* Subtract in mixed mode: out128 -= in64 */
 443 /* in[i] < 2^63 */
 444 static void felem_diff_128_64(widefelem out, const felem in)
 445 {
 446     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 447         (((widelimb) 1) << 8);
 448     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 449         (((widelimb) 1) << 8);
 450     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 451         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 452
 453     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 454     out[0] += two64p8;
 455     out[1] += two64m48m8;
 456     out[2] += two64m8;
 457     out[3] += two64m8;
 458
 459     out[0] -= in[0];
 460     out[1] -= in[1];
 461     out[2] -= in[2];
 462     out[3] -= in[3];
 463 }
 464
 465 /*
 466  * Multiply a field element by a scalar: out = out * scalar The scalars we
 467  * actually use are small, so results fit without overflow
 468  */
 469 static void felem_scalar(felem out, const limb scalar)
 470 {
 471     out[0] *= scalar;
 472     out[1] *= scalar;
 473     out[2] *= scalar;
 474     out[3] *= scalar;
 475 }
 476
 477 /*
 478  * Multiply an unreduced field element by a scalar: out = out * scalar The
 479  * scalars we actually use are small, so results fit without overflow
 480  */
 481 static void widefelem_scalar(widefelem out, const widelimb scalar)
 482 {
 483     out[0] *= scalar;
 484     out[1] *= scalar;
 485     out[2] *= scalar;
 486     out[3] *= scalar;
 487     out[4] *= scalar;
 488     out[5] *= scalar;
 489     out[6] *= scalar;
 490 }
 491
 492 /* Square a field element: out = in^2 */
 493 static void felem_square(widefelem out, const felem in)
 494 {
 495     limb tmp0, tmp1, tmp2;
 496     tmp0 = 2 * in[0];
 497     tmp1 = 2 * in[1];
 498     tmp2 = 2 * in[2];
 499     out[0] = ((widelimb) in[0]) * in[0];
 500     out[1] = ((widelimb) in[0]) * tmp1;
 501     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 502     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 503     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 504     out[5] = ((widelimb) in[3]) * tmp2;
 505     out[6] = ((widelimb) in[3]) * in[3];
 506 }
 507
 508 /* Multiply two field elements: out = in1 * in2 */
 509 static void felem_mul(widefelem out, const felem in1, const felem in2)
 510 {
 511     out[0] = ((widelimb) in1[0]) * in2[0];
 512     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 513     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 514              ((widelimb) in1[2]) * in2[0];
 515     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 516              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 517     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 518              ((widelimb) in1[3]) * in2[1];
 519     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 520     out[6] = ((widelimb) in1[3]) * in2[3];
 521 }
 522
 523 /*-
 524  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 525  * Requires in[i] < 2^126,
 526  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 527 static void felem_reduce(felem out, const widefelem in)
 528 {
 529     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 530         (((widelimb) 1) << 15);
 531     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 532         (((widelimb) 1) << 71);
 533     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 534         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 535     widelimb output[5];
 536
 537     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 538     output[0] = in[0] + two127p15;
 539     output[1] = in[1] + two127m71m55;
 540     output[2] = in[2] + two127m71;
 541     output[3] = in[3];
 542     output[4] = in[4];
 543
 544     /* Eliminate in[4], in[5], in[6] */
 545     output[4] += in[6] >> 16;
 546     output[3] += (in[6] & 0xffff) << 40;
 547     output[2] -= in[6];
 548
 549     output[3] += in[5] >> 16;
 550     output[2] += (in[5] & 0xffff) << 40;
 551     output[1] -= in[5];
 552
 553     output[2] += output[4] >> 16;
 554     output[1] += (output[4] & 0xffff) << 40;
 555     output[0] -= output[4];
 556
 557     /* Carry 2 -> 3 -> 4 */
 558     output[3] += output[2] >> 56;
 559     output[2] &= 0x00ffffffffffffff;
 560
 561     output[4] = output[3] >> 56;
 562     output[3] &= 0x00ffffffffffffff;
 563
 564     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 565
 566     /* Eliminate output[4] */
 567     output[2] += output[4] >> 16;
 568     /* output[2] < 2^56 + 2^56 = 2^57 */
 569     output[1] += (output[4] & 0xffff) << 40;
 570     output[0] -= output[4];
 571
 572     /* Carry 0 -> 1 -> 2 -> 3 */
 573     output[1] += output[0] >> 56;
 574     out[0] = output[0] & 0x00ffffffffffffff;
 575
 576     output[2] += output[1] >> 56;
 577     /* output[2] < 2^57 + 2^72 */
 578     out[1] = output[1] & 0x00ffffffffffffff;
 579     output[3] += output[2] >> 56;
 580     /* output[3] <= 2^56 + 2^16 */
 581     out[2] = output[2] & 0x00ffffffffffffff;
 582
 583     /*-
 584      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 585      * out[3] <= 2^56 + 2^16 (due to final carry),
 586      * so out < 2*p
 587      */
 588     out[3] = output[3];
 589 }
 590
 591 static void felem_square_reduce(felem out, const felem in)
 592 {
 593     widefelem tmp;
 594     felem_square(tmp, in);
 595     felem_reduce(out, tmp);
 596 }
 597
 598 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 599 {
 600     widefelem tmp;
 601     felem_mul(tmp, in1, in2);
 602     felem_reduce(out, tmp);
 603 }
 604
/*
 * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 * call felem_reduce first)
 *
 * The body is branch-free: both conditional subtractions of p are applied
 * through masks. Limbs are copied into signed 64-bit temporaries because
 * they may become negative between steps, and the masks are built with
 * '>> 63' on signed values, which must behave as an arithmetic shift.
 */
static void felem_contract(felem out, const felem in)
{
    static const int64_t two56 = ((limb) 1) << 56;
    /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
    /* if in >= p , reduce in = in - 2^224 + 2^96 - 1 */
    int64_t tmp[4], a;
    tmp[0] = in[0];
    tmp[1] = in[1];
    tmp[2] = in[2];
    tmp[3] = in[3];
    /* Case 1: a = 1 iff in >= 2^224 */
    a = (in[3] >> 56);
    /* subtract p: -1 in limb 0, +2^96 (bit 40 of limb 1), -2^224 (bit 56 of limb 3) */
    tmp[0] -= a;
    tmp[1] += a << 40;
    tmp[3] &= 0x00ffffffffffffff;
    /*
     * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
     * and the lower part is non-zero
     */
    a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
        (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
    a &= 0x00ffffffffffffff;
    /* turn a into an all-one mask (if a = 0) or an all-zero mask */
    a = (a - 1) >> 63;
    /* subtract 2^224 - 2^96 + 1 if a is all-one */
    tmp[3] &= a ^ 0xffffffffffffffff;
    tmp[2] &= a ^ 0xffffffffffffffff;
    tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
    tmp[0] -= 1 & a;

    /*
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
     * non-zero, so we only need one step
     */
    a = tmp[0] >> 63;
    tmp[0] += two56 & a;
    tmp[1] -= 1 & a;

    /* carry 1 -> 2 -> 3 */
    tmp[2] += tmp[1] >> 56;
    tmp[1] &= 0x00ffffffffffffff;

    tmp[3] += tmp[2] >> 56;
    tmp[2] &= 0x00ffffffffffffff;

    /* Now 0 <= out < p */
    out[0] = tmp[0];
    out[1] = tmp[1];
    out[2] = tmp[2];
    out[3] = tmp[3];
}
 660
 661 /*
 662  * Get negative value: out = -in
 663  * Requires in[i] < 2^63,
 664  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
 665  */
 666 static void felem_neg(felem out, const felem in)
 667 {
 668     widefelem tmp;
 669
 670     memset(tmp, 0, sizeof(tmp));
 671     felem_diff_128_64(tmp, in);
 672     felem_reduce(out, tmp);
 673 }
 674
 675 /*
 676  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 677  * elements are reduced to in < 2^225, so we only need to check three cases:
 678  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 679  */
 680 static limb felem_is_zero(const felem in)
 681 {
 682     limb zero, two224m96p1, two225m97p2;
 683
 684     zero = in[0] | in[1] | in[2] | in[3];
 685     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 686     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 687         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 688     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 689     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 690         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 691     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 692     return (zero | two224m96p1 | two225m97p2);
 693 }
 694
 695 static int felem_is_zero_int(const void *in)
 696 {
 697     return (int)(felem_is_zero(in) & ((limb) 1));
 698 }
 699
 700 /* Invert a field element */
 701 /* Computation chain copied from djb's code */
 702 static void felem_inv(felem out, const felem in)
 703 {
 704     felem ftmp, ftmp2, ftmp3, ftmp4;
 705     widefelem tmp;
 706     unsigned i;
 707
 708     felem_square(tmp, in);
 709     felem_reduce(ftmp, tmp);    /* 2 */
 710     felem_mul(tmp, in, ftmp);
 711     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 712     felem_square(tmp, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 714     felem_mul(tmp, in, ftmp);
 715     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 716     felem_square(tmp, ftmp);
 717     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 718     felem_square(tmp, ftmp2);
 719     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 720     felem_square(tmp, ftmp2);
 721     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 722     felem_mul(tmp, ftmp2, ftmp);
 723     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 724     felem_square(tmp, ftmp);
 725     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 726     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 727         felem_square(tmp, ftmp2);
 728         felem_reduce(ftmp2, tmp);
 729     }
 730     felem_mul(tmp, ftmp2, ftmp);
 731     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 732     felem_square(tmp, ftmp2);
 733     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 734     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 735         felem_square(tmp, ftmp3);
 736         felem_reduce(ftmp3, tmp);
 737     }
 738     felem_mul(tmp, ftmp3, ftmp2);
 739     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 740     felem_square(tmp, ftmp2);
 741     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 742     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 743         felem_square(tmp, ftmp3);
 744         felem_reduce(ftmp3, tmp);
 745     }
 746     felem_mul(tmp, ftmp3, ftmp2);
 747     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 748     felem_square(tmp, ftmp3);
 749     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 750     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 751         felem_square(tmp, ftmp4);
 752         felem_reduce(ftmp4, tmp);
 753     }
 754     felem_mul(tmp, ftmp3, ftmp4);
 755     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 756     felem_square(tmp, ftmp3);
 757     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 758     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 759         felem_square(tmp, ftmp4);
 760         felem_reduce(ftmp4, tmp);
 761     }
 762     felem_mul(tmp, ftmp2, ftmp4);
 763     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 764     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 765         felem_square(tmp, ftmp2);
 766         felem_reduce(ftmp2, tmp);
 767     }
 768     felem_mul(tmp, ftmp2, ftmp);
 769     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 770     felem_square(tmp, ftmp);
 771     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 772     felem_mul(tmp, ftmp, in);
 773     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 774     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 775         felem_square(tmp, ftmp);
 776         felem_reduce(ftmp, tmp);
 777     }
 778     felem_mul(tmp, ftmp, ftmp3);
 779     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 780 }
 781
 782 /*
 783  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 784  * out to itself.
 785  */
 786 static void copy_conditional(felem out, const felem in, limb icopy)
 787 {
 788     unsigned i;
 789     /*
 790      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 791      */
 792     const limb copy = -icopy;
 793     for (i = 0; i < 4; ++i) {
 794         const limb tmp = copy & (in[i] ^ out[i]);
 795         out[i] ^= tmp;
 796     }
 797 }
 798
 799 /******************************************************************************/
 800 /*-
 801  *                       ELLIPTIC CURVE POINT OPERATIONS
 802  *
 803  * Points are represented in Jacobian projective coordinates:
 804  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 805  * or to the point at infinity if Z == 0.
 806  *
 807  */
 808
/*-
 * Double an elliptic curve point:
 * (X', Y', Z') = 2 * (X, Y, Z), where
 * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^4
 * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 *
 * x_in, y_in, z_in:    Jacobian coordinates of the point to double; they are
 *                      only read.
 * x_out, y_out, z_out: receive the Jacobian coordinates of the result.
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 * Note that x_out is written before y_in and z_in are read for the last
 * time, and z_out is written before y_out, so the statement order below
 * matters for the permitted aliasing.
 *
 * The bound comments inside the body track the maximum size of each limb so
 * that every value handed to felem_reduce() stays within the range the
 * comments state.
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    widefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* working copies of x, later turned into (x - delta) and (x + delta) */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff(ftmp, delta);
    /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
    felem_sum(ftmp2, delta);
    /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
    felem_scalar(ftmp2, 3);
    /* ftmp2[i] < 3 * 2^58 < 2^60 */
    felem_mul(tmp, ftmp, ftmp2);
    /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
    felem_assign(ftmp, beta);
    felem_scalar(ftmp, 8);
    /* ftmp[i] < 8 * 2^57 = 2^60 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
    /* last use of x_in was above; from here on x_out may alias it safely */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is reused to hold gamma + delta, the amount to subtract */
    felem_sum(delta, gamma);
    /* delta[i] < 2^57 + 2^57 = 2^58 */
    felem_assign(ftmp, y_in);
    felem_sum(ftmp, z_in);
    /* ftmp[i] < 2^57 + 2^57 = 2^58 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar(beta, 4);
    /* beta[i] < 4 * 2^57 = 2^59 */
    felem_diff(beta, x_out);
    /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
    felem_mul(tmp, alpha, beta);
    /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
    felem_square(tmp2, gamma);
    /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
    widefelem_scalar(tmp2, 8);
    /* tmp2[i] < 8 * 2^116 = 2^119 */
    widefelem_diff(tmp, tmp2);
    /* tmp[i] < 2^119 + 2^120 < 2^121 */
    felem_reduce(y_out, tmp);
}
 888
 889 /*-
 890  * Add two elliptic curve points:
 891  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 892  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 893  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 894  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 895  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 896  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 897  *
 898  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 899  */
 900
 901 /*
 902  * This function is not entirely constant-time: it includes a branch for
 903  * checking whether the two input points are equal, (while not equal to the
 904  * point at infinity). This case never happens during single point
 905  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 906  */
 907 static void point_add(felem x3, felem y3, felem z3,
 908                       const felem x1, const felem y1, const felem z1,
 909                       const int mixed, const felem x2, const felem y2,
 910                       const felem z2)
 911 {
 912     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 913     widefelem tmp, tmp2;
 914     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 915
 916     if (!mixed) {
 917         /* ftmp2 = z2^2 */
 918         felem_square(tmp, z2);
 919         felem_reduce(ftmp2, tmp);
 920
 921         /* ftmp4 = z2^3 */
 922         felem_mul(tmp, ftmp2, z2);
 923         felem_reduce(ftmp4, tmp);
 924
 925         /* ftmp4 = z2^3*y1 */
 926         felem_mul(tmp2, ftmp4, y1);
 927         felem_reduce(ftmp4, tmp2);
 928
 929         /* ftmp2 = z2^2*x1 */
 930         felem_mul(tmp2, ftmp2, x1);
 931         felem_reduce(ftmp2, tmp2);
 932     } else {
 933         /*
 934          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 935          */
 936
 937         /* ftmp4 = z2^3*y1 */
 938         felem_assign(ftmp4, y1);
 939
 940         /* ftmp2 = z2^2*x1 */
 941         felem_assign(ftmp2, x1);
 942     }
 943
 944     /* ftmp = z1^2 */
 945     felem_square(tmp, z1);
 946     felem_reduce(ftmp, tmp);
 947
 948     /* ftmp3 = z1^3 */
 949     felem_mul(tmp, ftmp, z1);
 950     felem_reduce(ftmp3, tmp);
 951
 952     /* tmp = z1^3*y2 */
 953     felem_mul(tmp, ftmp3, y2);
 954     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 955
 956     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 957     felem_diff_128_64(tmp, ftmp4);
 958     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 959     felem_reduce(ftmp3, tmp);
 960
 961     /* tmp = z1^2*x2 */
 962     felem_mul(tmp, ftmp, x2);
 963     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 964
 965     /* ftmp = z1^2*x2 - z2^2*x1 */
 966     felem_diff_128_64(tmp, ftmp2);
 967     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 968     felem_reduce(ftmp, tmp);
 969
 970     /*
 971      * the formulae are incorrect if the points are equal so we check for
 972      * this and do doubling if this happens
 973      */
 974     x_equal = felem_is_zero(ftmp);
 975     y_equal = felem_is_zero(ftmp3);
 976     z1_is_zero = felem_is_zero(z1);
 977     z2_is_zero = felem_is_zero(z2);
 978     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 979     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 980         point_double(x3, y3, z3, x1, y1, z1);
 981         return;
 982     }
 983
 984     /* ftmp5 = z1*z2 */
 985     if (!mixed) {
 986         felem_mul(tmp, z1, z2);
 987         felem_reduce(ftmp5, tmp);
 988     } else {
 989         /* special case z2 = 0 is handled later */
 990         felem_assign(ftmp5, z1);
 991     }
 992
 993     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 994     felem_mul(tmp, ftmp, ftmp5);
 995     felem_reduce(z_out, tmp);
 996
 997     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 998     felem_assign(ftmp5, ftmp);
 999     felem_square(tmp, ftmp);
1000     felem_reduce(ftmp, tmp);
1001
1002     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
1003     felem_mul(tmp, ftmp, ftmp5);
1004     felem_reduce(ftmp5, tmp);
1005
1006     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1007     felem_mul(tmp, ftmp2, ftmp);
1008     felem_reduce(ftmp2, tmp);
1009
1010     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1011     felem_mul(tmp, ftmp4, ftmp5);
1012     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1013
1014     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1015     felem_square(tmp2, ftmp3);
1016     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1017
1018     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1019     felem_diff_128_64(tmp2, ftmp5);
1020     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1021
1022     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1023     felem_assign(ftmp5, ftmp2);
1024     felem_scalar(ftmp5, 2);
1025     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1026
1027     /*-
1028      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1029      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1030      */
1031     felem_diff_128_64(tmp2, ftmp5);
1032     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1033     felem_reduce(x_out, tmp2);
1034
1035     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1036     felem_diff(ftmp2, x_out);
1037     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1038
1039     /*
1040      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1041      */
1042     felem_mul(tmp2, ftmp3, ftmp2);
1043     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1044
1045     /*-
1046      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1047      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1048      */
1049     widefelem_diff(tmp2, tmp);
1050     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1051     felem_reduce(y_out, tmp2);
1052
1053     /*
1054      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1055      * the point at infinity, so we need to check for this separately
1056      */
1057
1058     /*
1059      * if point 1 is at infinity, copy point 2 to output, and vice versa
1060      */
1061     copy_conditional(x_out, x2, z1_is_zero);
1062     copy_conditional(x_out, x1, z2_is_zero);
1063     copy_conditional(y_out, y2, z1_is_zero);
1064     copy_conditional(y_out, y1, z2_is_zero);
1065     copy_conditional(z_out, z2, z1_is_zero);
1066     copy_conditional(z_out, z1, z2_is_zero);
1067     felem_assign(x3, x_out);
1068     felem_assign(y3, y_out);
1069     felem_assign(z3, z_out);
1070 }
1071
1072 /*
1073  * select_point selects the |idx|th point from a precomputation table and
1074  * copies it to out.
1075  * The pre_comp array argument should be size of |size| argument
1076  */
1077 static void select_point(const u64 idx, unsigned int size,
1078                          const felem pre_comp[][3], felem out[3])
1079 {
1080     unsigned i, j;
1081     limb *outlimbs = &out[0][0];
1082
1083     memset(out, 0, sizeof(*out) * 3);
1084     for (i = 0; i < size; i++) {
1085         const limb *inlimbs = &pre_comp[i][0][0];
1086         u64 mask = i ^ idx;
1087         mask |= mask >> 4;
1088         mask |= mask >> 2;
1089         mask |= mask >> 1;
1090         mask &= 1;
1091         mask--;
1092         for (j = 0; j < 4 * 3; j++)
1093             outlimbs[j] |= inlimbs[j] & mask;
1094     }
1095 }
1096
1097 /* get_bit returns the |i|th bit in |in| */
1098 static char get_bit(const felem_bytearray in, unsigned i)
1099 {
1100     if (i >= 224)
1101         return 0;
1102     return (in[i >> 3] >> (i & 7)) & 1;
1103 }
1104
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * scalars:    one byte array per point; bits are read through get_bit(),
 *             i.e. bit 0 is the least significant bit of byte 0.
 * num_points: number of entries in scalars[] and pre_comp[]; may be 0.
 * g_scalar:   scalar for the generator in the same byte layout, or NULL to
 *             skip the generator part.
 * mixed:      passed through to point_add() for the additions of entries
 *             taken from pre_comp[].
 * g_pre_comp: two tables of 16 generator multiples; only read when g_scalar
 *             is non-NULL.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[2][16][3])
{
    int i, skip;
    unsigned num;
    unsigned gen_mul = (g_scalar != NULL);
    /* nq is the accumulator; tmp[0..2] hold a selected point, tmp[3] -Y */
    felem nq[3], tmp[4];
    u64 bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (two in each of the last 28 rounds) and additions of
     * other points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    /* with no arbitrary points only the 28 generator rounds are needed */
    for (i = (num_points ? 220 : 27); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 27)) {
            /* first, look 28 bits upwards */
            bits = get_bit(g_scalar, i + 196) << 3;
            bits |= get_bit(g_scalar, i + 140) << 2;
            bits |= get_bit(g_scalar, i + 84) << 1;
            bits |= get_bit(g_scalar, i + 28);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[1], tmp);

            if (!skip) {
                /* value 1 below is argument for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* nq is still infinity: a plain copy replaces the addition */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }

            /* second, look at the current position */
            bits = get_bit(g_scalar, i + 168) << 3;
            bits |= get_bit(g_scalar, i + 112) << 2;
            bits |= get_bit(g_scalar, i + 56) << 1;
            bits |= get_bit(g_scalar, i);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[0], tmp);
            point_add(nq[0], nq[1], nq[2],
                      nq[0], nq[1], nq[2],
                      1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Gather the six bits i+4 .. i-1. For i == 0 the index
                 * i - 1 becomes a huge unsigned value, for which get_bit()
                 * returns 0.
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                /* turn the window into a table index and a sign flag */
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /* select the point to add or subtract */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                /* replace Y by -Y when sign is 1, without branching */
                copy_conditional(tmp[1], tmp[3], sign);

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* first contribution overall: copy instead of adding */
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1204
1205 /******************************************************************************/
1206 /*
1207  * FUNCTIONS TO MANAGE PRECOMPUTATION
1208  */
1209
1210 static NISTP224_PRE_COMP *nistp224_pre_comp_new(void)
1211 {
1212     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1213
1214     if (!ret) {
1215         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1216         return ret;
1217     }
1218
1219     ret->references = 1;
1220
1221     ret->lock = CRYPTO_THREAD_lock_new();
1222     if (ret->lock == NULL) {
1223         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1224         OPENSSL_free(ret);
1225         return NULL;
1226     }
1227     return ret;
1228 }
1229
1230 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1231 {
1232     int i;
1233     if (p != NULL)
1234         CRYPTO_UP_REF(&p->references, &i, p->lock);
1235     return p;
1236 }
1237
1238 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1239 {
1240     int i;
1241
1242     if (p == NULL)
1243         return;
1244
1245     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1246     REF_PRINT_COUNT("EC_nistp224", x);
1247     if (i > 0)
1248         return;
1249     REF_ASSERT_ISNT(i < 0);
1250
1251     CRYPTO_THREAD_lock_free(p->lock);
1252     OPENSSL_free(p);
1253 }
1254
1255 /******************************************************************************/
1256 /*
1257  * OPENSSL EC_METHOD FUNCTIONS
1258  */
1259
1260 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1261 {
1262     int ret;
1263     ret = ec_GFp_simple_group_init(group);
1264     group->a_is_minus3 = 1;
1265     return ret;
1266 }
1267
1268 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1269                                     const BIGNUM *a, const BIGNUM *b,
1270                                     BN_CTX *ctx)
1271 {
1272     int ret = 0;
1273     BIGNUM *curve_p, *curve_a, *curve_b;
1274 #ifndef FIPS_MODE
1275     BN_CTX *new_ctx = NULL;
1276
1277     if (ctx == NULL)
1278         ctx = new_ctx = BN_CTX_new();
1279 #endif
1280     if (ctx == NULL)
1281         return 0;
1282
1283     BN_CTX_start(ctx);
1284     curve_p = BN_CTX_get(ctx);
1285     curve_a = BN_CTX_get(ctx);
1286     curve_b = BN_CTX_get(ctx);
1287     if (curve_b == NULL)
1288         goto err;
1289     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1290     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1291     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1292     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1293         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1294               EC_R_WRONG_CURVE_PARAMETERS);
1295         goto err;
1296     }
1297     group->field_mod_func = BN_nist_mod_224;
1298     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1299  err:
1300     BN_CTX_end(ctx);
1301 #ifndef FIPS_MODE
1302     BN_CTX_free(new_ctx);
1303 #endif
1304     return ret;
1305 }
1306
1307 /*
1308  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1309  * (X/Z^2, Y/Z^3)
1310  */
1311 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1312                                                  const EC_POINT *point,
1313                                                  BIGNUM *x, BIGNUM *y,
1314                                                  BN_CTX *ctx)
1315 {
1316     felem z1, z2, x_in, y_in, x_out, y_out;
1317     widefelem tmp;
1318
1319     if (EC_POINT_is_at_infinity(group, point)) {
1320         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1321               EC_R_POINT_AT_INFINITY);
1322         return 0;
1323     }
1324     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1325         (!BN_to_felem(z1, point->Z)))
1326         return 0;
1327     felem_inv(z2, z1);
1328     felem_square(tmp, z2);
1329     felem_reduce(z1, tmp);
1330     felem_mul(tmp, x_in, z1);
1331     felem_reduce(x_in, tmp);
1332     felem_contract(x_out, x_in);
1333     if (x != NULL) {
1334         if (!felem_to_BN(x, x_out)) {
1335             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1336                   ERR_R_BN_LIB);
1337             return 0;
1338         }
1339     }
1340     felem_mul(tmp, z1, z2);
1341     felem_reduce(z1, tmp);
1342     felem_mul(tmp, y_in, z1);
1343     felem_reduce(y_in, tmp);
1344     felem_contract(y_out, y_in);
1345     if (y != NULL) {
1346         if (!felem_to_BN(y, y_out)) {
1347             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1348                   ERR_R_BN_LIB);
1349             return 0;
1350         }
1351     }
1352     return 1;
1353 }
1354
1355 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1356                                felem tmp_felems[ /* num+1 */ ])
1357 {
1358     /*
1359      * Runs in constant time, unless an input is the point at infinity (which
1360      * normally shouldn't happen).
1361      */
1362     ec_GFp_nistp_points_make_affine_internal(num,
1363                                              points,
1364                                              sizeof(felem),
1365                                              tmp_felems,
1366                                              (void (*)(void *))felem_one,
1367                                              felem_is_zero_int,
1368                                              (void (*)(void *, const void *))
1369                                              felem_assign,
1370                                              (void (*)(void *, const void *))
1371                                              felem_square_reduce, (void (*)
1372                                                                    (void *,
1373                                                                     const void
1374                                                                     *,
1375                                                                     const void
1376                                                                     *))
1377                                              felem_mul_reduce,
1378                                              (void (*)(void *, const void *))
1379                                              felem_inv,
1380                                              (void (*)(void *, const void *))
1381                                              felem_contract);
1382 }
1383
1384 /*
1385  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1386  * values Result is stored in r (r can equal one of the inputs).
1387  */
1388 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1389                                const BIGNUM *scalar, size_t num,
1390                                const EC_POINT *points[],
1391                                const BIGNUM *scalars[], BN_CTX *ctx)
1392 {
1393     int ret = 0;
1394     int j;
1395     unsigned i;
1396     int mixed = 0;
1397     BIGNUM *x, *y, *z, *tmp_scalar;
1398     felem_bytearray g_secret;
1399     felem_bytearray *secrets = NULL;
1400     felem (*pre_comp)[17][3] = NULL;
1401     felem *tmp_felems = NULL;
1402     int num_bytes;
1403     int have_pre_comp = 0;
1404     size_t num_points = num;
1405     felem x_in, y_in, z_in, x_out, y_out, z_out;
1406     NISTP224_PRE_COMP *pre = NULL;
1407     const felem(*g_pre_comp)[16][3] = NULL;
1408     EC_POINT *generator = NULL;
1409     const EC_POINT *p = NULL;
1410     const BIGNUM *p_scalar = NULL;
1411
1412     BN_CTX_start(ctx);
1413     x = BN_CTX_get(ctx);
1414     y = BN_CTX_get(ctx);
1415     z = BN_CTX_get(ctx);
1416     tmp_scalar = BN_CTX_get(ctx);
1417     if (tmp_scalar == NULL)
1418         goto err;
1419
1420     if (scalar != NULL) {
1421         pre = group->pre_comp.nistp224;
1422         if (pre)
1423             /* we have precomputation, try to use it */
1424             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1425         else
1426             /* try to use the standard precomputation */
1427             g_pre_comp = &gmul[0];
1428         generator = EC_POINT_new(group);
1429         if (generator == NULL)
1430             goto err;
1431         /* get the generator from precomputation */
1432         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1433             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1434             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1435             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1436             goto err;
1437         }
1438         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1439                                                       generator, x, y, z,
1440                                                       ctx))
1441             goto err;
1442         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1443             /* precomputation matches generator */
1444             have_pre_comp = 1;
1445         else
1446             /*
1447              * we don't have valid precomputation: treat the generator as a
1448              * random point
1449              */
1450             num_points = num_points + 1;
1451     }
1452
1453     if (num_points > 0) {
1454         if (num_points >= 3) {
1455             /*
1456              * unless we precompute multiples for just one or two points,
1457              * converting those into affine form is time well spent
1458              */
1459             mixed = 1;
1460         }
1461         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1462         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1463         if (mixed)
1464             tmp_felems =
1465                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1466         if ((secrets == NULL) || (pre_comp == NULL)
1467             || (mixed && (tmp_felems == NULL))) {
1468             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1469             goto err;
1470         }
1471
1472         /*
1473          * we treat NULL scalars as 0, and NULL points as points at infinity,
1474          * i.e., they contribute nothing to the linear combination
1475          */
1476         for (i = 0; i < num_points; ++i) {
1477             if (i == num) {
1478                 /* the generator */
1479                 p = EC_GROUP_get0_generator(group);
1480                 p_scalar = scalar;
1481             } else {
1482                 /* the i^th point */
1483                 p = points[i];
1484                 p_scalar = scalars[i];
1485             }
1486             if ((p_scalar != NULL) && (p != NULL)) {
1487                 /* reduce scalar to 0 <= scalar < 2^224 */
1488                 if ((BN_num_bits(p_scalar) > 224)
1489                     || (BN_is_negative(p_scalar))) {
1490                     /*
1491                      * this is an unusual input, and we don't guarantee
1492                      * constant-timeness
1493                      */
1494                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1495                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1496                         goto err;
1497                     }
1498                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1499                                                secrets[i], sizeof(secrets[i]));
1500                 } else {
1501                     num_bytes = BN_bn2lebinpad(p_scalar,
1502                                                secrets[i], sizeof(secrets[i]));
1503                 }
1504                 if (num_bytes < 0) {
1505                     ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1506                     goto err;
1507                 }
1508                 /* precompute multiples */
1509                 if ((!BN_to_felem(x_out, p->X)) ||
1510                     (!BN_to_felem(y_out, p->Y)) ||
1511                     (!BN_to_felem(z_out, p->Z)))
1512                     goto err;
1513                 felem_assign(pre_comp[i][1][0], x_out);
1514                 felem_assign(pre_comp[i][1][1], y_out);
1515                 felem_assign(pre_comp[i][1][2], z_out);
1516                 for (j = 2; j <= 16; ++j) {
1517                     if (j & 1) {
1518                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1519                                   pre_comp[i][j][2], pre_comp[i][1][0],
1520                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1521                                   pre_comp[i][j - 1][0],
1522                                   pre_comp[i][j - 1][1],
1523                                   pre_comp[i][j - 1][2]);
1524                     } else {
1525                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1526                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1527                                      pre_comp[i][j / 2][1],
1528                                      pre_comp[i][j / 2][2]);
1529                     }
1530                 }
1531             }
1532         }
1533         if (mixed)
1534             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1535     }
1536
1537     /* the scalar for the generator */
1538     if ((scalar != NULL) && (have_pre_comp)) {
1539         memset(g_secret, 0, sizeof(g_secret));
1540         /* reduce scalar to 0 <= scalar < 2^224 */
1541         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1542             /*
1543              * this is an unusual input, and we don't guarantee
1544              * constant-timeness
1545              */
1546             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1547                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1548                 goto err;
1549             }
1550             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1551         } else {
1552             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1553         }
1554         /* do the multiplication with generator precomputation */
1555         batch_mul(x_out, y_out, z_out,
1556                   (const felem_bytearray(*))secrets, num_points,
1557                   g_secret,
1558                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1559     } else {
1560         /* do the multiplication without generator precomputation */
1561         batch_mul(x_out, y_out, z_out,
1562                   (const felem_bytearray(*))secrets, num_points,
1563                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1564     }
1565     /* reduce the output to its unique minimal representation */
1566     felem_contract(x_in, x_out);
1567     felem_contract(y_in, y_out);
1568     felem_contract(z_in, z_out);
1569     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1570         (!felem_to_BN(z, z_in))) {
1571         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1572         goto err;
1573     }
1574     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1575
1576  err:
1577     BN_CTX_end(ctx);
1578     EC_POINT_free(generator);
1579     OPENSSL_free(secrets);
1580     OPENSSL_free(pre_comp);
1581     OPENSSL_free(tmp_felems);
1582     return ret;
1583 }
1584
1585 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1586 {
1587     int ret = 0;
1588     NISTP224_PRE_COMP *pre = NULL;
1589     int i, j;
1590     BIGNUM *x, *y;
1591     EC_POINT *generator = NULL;
1592     felem tmp_felems[32];
1593 #ifndef FIPS_MODE
1594     BN_CTX *new_ctx = NULL;
1595 #endif
1596
1597     /* throw away old precomputation */
1598     EC_pre_comp_free(group);
1599
1600 #ifndef FIPS_MODE
1601     if (ctx == NULL)
1602         ctx = new_ctx = BN_CTX_new();
1603 #endif
1604     if (ctx == NULL)
1605         return 0;
1606
1607     BN_CTX_start(ctx);
1608     x = BN_CTX_get(ctx);
1609     y = BN_CTX_get(ctx);
1610     if (y == NULL)
1611         goto err;
1612     /* get the generator */
1613     if (group->generator == NULL)
1614         goto err;
1615     generator = EC_POINT_new(group);
1616     if (generator == NULL)
1617         goto err;
1618     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1619     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1620     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1621         goto err;
1622     if ((pre = nistp224_pre_comp_new()) == NULL)
1623         goto err;
1624     /*
1625      * if the generator is the standard one, use built-in precomputation
1626      */
1627     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1628         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1629         goto done;
1630     }
1631     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1632         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1633         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1634         goto err;
1635     /*
1636      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1637      * 2^140*G, 2^196*G for the second one
1638      */
1639     for (i = 1; i <= 8; i <<= 1) {
1640         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1641                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1642                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1643         for (j = 0; j < 27; ++j) {
1644             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1645                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1646                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1647         }
1648         if (i == 8)
1649             break;
1650         point_double(pre->g_pre_comp[0][2 * i][0],
1651                      pre->g_pre_comp[0][2 * i][1],
1652                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1653                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1654         for (j = 0; j < 27; ++j) {
1655             point_double(pre->g_pre_comp[0][2 * i][0],
1656                          pre->g_pre_comp[0][2 * i][1],
1657                          pre->g_pre_comp[0][2 * i][2],
1658                          pre->g_pre_comp[0][2 * i][0],
1659                          pre->g_pre_comp[0][2 * i][1],
1660                          pre->g_pre_comp[0][2 * i][2]);
1661         }
1662     }
1663     for (i = 0; i < 2; i++) {
1664         /* g_pre_comp[i][0] is the point at infinity */
1665         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1666         /* the remaining multiples */
1667         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1668         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1669                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1670                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1671                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1672                   pre->g_pre_comp[i][2][2]);
1673         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1674         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1675                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1676                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1677                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1678                   pre->g_pre_comp[i][2][2]);
1679         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1680         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1681                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1682                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1683                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1684                   pre->g_pre_comp[i][4][2]);
1685         /*
1686          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1687          */
1688         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1689                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1690                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1691                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1692                   pre->g_pre_comp[i][2][2]);
1693         for (j = 1; j < 8; ++j) {
1694             /* odd multiples: add G resp. 2^28*G */
1695             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1696                       pre->g_pre_comp[i][2 * j + 1][1],
1697                       pre->g_pre_comp[i][2 * j + 1][2],
1698                       pre->g_pre_comp[i][2 * j][0],
1699                       pre->g_pre_comp[i][2 * j][1],
1700                       pre->g_pre_comp[i][2 * j][2], 0,
1701                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1702                       pre->g_pre_comp[i][1][2]);
1703         }
1704     }
1705     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1706
1707  done:
1708     SETPRECOMP(group, nistp224, pre);
1709     pre = NULL;
1710     ret = 1;
1711  err:
1712     BN_CTX_end(ctx);
1713     EC_POINT_free(generator);
1714 #ifndef FIPS_MODE
1715     BN_CTX_free(new_ctx);
1716 #endif
1717     EC_nistp224_pre_comp_free(pre);
1718     return ret;
1719 }
1720
1721 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1722 {
1723     return HAVEPRECOMP(group, nistp224);
1724 }
1725
1726 #endif