crypto/ec/ecp_nistp224.c

   1 /*
   2  * Copyright 2010-2017 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  28  *
  29  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  30  * and Adam Langley's public domain 64-bit C implementation of curve25519
  31  */
  32
  33 #include <openssl/opensslconf.h>
  34 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  35 NON_EMPTY_TRANSLATION_UNIT
  36 #else
  37
  38 # include <stdint.h>
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
/*
 * uint128_t is built on the compiler-provided __uint128_t, which is not
 * standard C. The preprocessor test below only looks at the GCC version
 * macros; it does not detect 32-bit targets, where the typedef itself fails
 * to compile.
 */
# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Need GCC 3.1 or later to define type uint128_t"
# endif
  50
/* Short aliases for the fixed-width unsigned types from <stdint.h>. */
typedef uint8_t u8;
typedef uint64_t u64;
  53
  54 /******************************************************************************/
  55 /*-
  56  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  57  *
  58  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  59  * using 64-bit coefficients called 'limbs',
  60  * and sometimes (for multiplication results) as
  61  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  62  * using 128-bit coefficients called 'widelimbs'.
  63  * A 4-limb representation is an 'felem';
  64  * a 7-widelimb representation is a 'widefelem'.
  65  * Even within felems, bits of adjacent limbs overlap, and we don't always
  66  * reduce the representations: we ensure that inputs to each felem
  67  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  68  * and fit into a 128-bit word without overflow. The coefficients are then
  69  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  70  * We only reduce to the unique minimal representation at the end of the
  71  * computation.
  72  */
  73
/* One 64-bit coefficient ('limb') of a field element. */
typedef uint64_t limb;
/* One 128-bit coefficient ('widelimb') of an unreduced product. */
typedef uint128_t widelimb;

/* Field element: four limbs a_0..a_3 weighted 2^0, 2^56, 2^112, 2^168. */
typedef limb felem[4];
/* Unreduced product: seven widelimbs b_0..b_6, b_i weighted 2^(56*i). */
typedef widelimb widefelem[7];
  79
/*
 * Field element represented as a byte array. 28*8 = 224 bits is also the
 * group order size for the elliptic curve, and we also use this type for
 * scalars for point multiplication.
 *
 * Byte order depends on the user: felem_to_bin28() writes the least
 * significant byte first, while the constants in nistp224_curve_params are
 * stored most significant byte first.
 */
typedef u8 felem_bytearray[28];
  86
/*
 * Curve constants, one 28-byte value per row, most significant byte first:
 * the field prime p (= 2^224 - 2^96 + 1), the coefficients a and b, and the
 * x and y coordinates of the standard generator.
 */
static const felem_bytearray nistp224_curve_params[5] = {
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
    {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
     0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
     0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
    {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
     0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
     0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
    {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
     0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
     0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
};
 104
 105 /*-
 106  * Precomputed multiples of the standard generator
 107  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 108  * (0 for the point at infinity).
 109  * For each field element, slice a_0 is word 0, etc.
 110  *
 111  * The table has 2 * 16 elements, starting with the following:
 112  * index | bits    | point
 113  * ------+---------+------------------------------
 114  *     0 | 0 0 0 0 | 0G
 115  *     1 | 0 0 0 1 | 1G
 116  *     2 | 0 0 1 0 | 2^56G
 117  *     3 | 0 0 1 1 | (2^56 + 1)G
 118  *     4 | 0 1 0 0 | 2^112G
 119  *     5 | 0 1 0 1 | (2^112 + 1)G
 120  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 121  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 122  *     8 | 1 0 0 0 | 2^168G
 123  *     9 | 1 0 0 1 | (2^168 + 1)G
 124  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 125  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 126  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 127  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 128  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 129  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 130  * followed by a copy of this with each element multiplied by 2^28.
 131  *
 132  * The reason for this is so that we can clock bits into four different
 133  * locations when doing simple scalar multiplies against the base point,
 134  * and then another four locations using the second 16 elements.
 135  */
/*
 * gmul[t][i] is one point as three felems {X, Y, Z}; each felem lists its
 * limbs least significant first. Entry 0 of each sub-table is the point at
 * infinity (Z = 0); every other entry has Z = 1.
 */
static const felem gmul[2][16][3] = {
/* gmul[0]: index bits select which of G, 2^56G, 2^112G, 2^168G are summed */
{{{0, 0, 0, 0},
  {0, 0, 0, 0},
  {0, 0, 0, 0}},
 /* index 1: the generator G itself */
 {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
  {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
  {1, 0, 0, 0}},
 {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
  {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
  {1, 0, 0, 0}},
 {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
  {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
  {1, 0, 0, 0}},
 {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
  {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
  {1, 0, 0, 0}},
 {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
  {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
  {1, 0, 0, 0}},
 {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
  {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
  {1, 0, 0, 0}},
 {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
  {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
  {1, 0, 0, 0}},
 {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
  {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
  {1, 0, 0, 0}},
 {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
  {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
  {1, 0, 0, 0}},
 {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
  {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
  {1, 0, 0, 0}},
 {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
  {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
  {1, 0, 0, 0}},
 {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
  {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
  {1, 0, 0, 0}},
 {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
  {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
  {1, 0, 0, 0}},
 {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
  {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
  {1, 0, 0, 0}},
 {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
  {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
  {1, 0, 0, 0}}},
/* gmul[1]: the same 16 combinations, each multiplied by 2^28 */
{{{0, 0, 0, 0},
  {0, 0, 0, 0},
  {0, 0, 0, 0}},
 {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
  {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
  {1, 0, 0, 0}},
 {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
  {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
  {1, 0, 0, 0}},
 {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
  {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
  {1, 0, 0, 0}},
 {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
  {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
  {1, 0, 0, 0}},
 {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
  {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
  {1, 0, 0, 0}},
 {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
  {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
  {1, 0, 0, 0}},
 {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
  {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
  {1, 0, 0, 0}},
 {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
  {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
  {1, 0, 0, 0}},
 {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
  {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
  {1, 0, 0, 0}},
 {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
  {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
  {1, 0, 0, 0}},
 {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
  {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
  {1, 0, 0, 0}},
 {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
  {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
  {1, 0, 0, 0}},
 {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
  {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
  {1, 0, 0, 0}},
 {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
  {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
  {1, 0, 0, 0}},
 {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
  {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
  {1, 0, 0, 0}}}
};
 234
/* Precomputation for the group generator. */
struct nistp224_pre_comp_st {
    /*
     * Same [2][16][3] dimensions as the static gmul table; presumably holds
     * the same kind of precomputed multiples - confirm where it is filled.
     */
    felem g_pre_comp[2][16][3];
    /* NOTE(review): reference count, presumably guarded by 'lock' - confirm */
    CRYPTO_REF_COUNT references;
    CRYPTO_RWLOCK *lock;
};
 241
/*
 * Return the method table for this P-224 implementation.
 *
 * The table is a function-local static constant, so every call returns the
 * same pointer. Entries are positional; the meaning of each slot is fixed
 * by the EC_METHOD declaration, which is not part of this file. Handlers
 * named ec_GFp_nistp224_* are specific to this implementation, the rest are
 * shared ec_GFp_simple_*, ec_GFp_nist_*, ec_key_simple_* and ecdh_simple_*
 * routines. Slots set to 0 are left unset; the inline comments name them.
 */
const EC_METHOD *EC_GFp_nistp224_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp224_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp224_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp224_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp224_points_mul,
        ec_GFp_nistp224_precompute_mult,
        ec_GFp_nistp224_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key
    };

    return &ret;
}
 298
 299 /*
 300  * Helper functions to convert field elements to/from internal representation
 301  */
 302 static void bin28_to_felem(felem out, const u8 in[28])
 303 {
 304     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 305     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 306     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 307     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 308 }
 309
 310 static void felem_to_bin28(u8 out[28], const felem in)
 311 {
 312     unsigned i;
 313     for (i = 0; i < 7; ++i) {
 314         out[i] = in[0] >> (8 * i);
 315         out[i + 7] = in[1] >> (8 * i);
 316         out[i + 14] = in[2] >> (8 * i);
 317         out[i + 21] = in[3] >> (8 * i);
 318     }
 319 }
 320
 321 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 322 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 323 {
 324     unsigned i;
 325     for (i = 0; i < len; ++i)
 326         out[i] = in[len - 1 - i];
 327 }
 328
 329 /* From OpenSSL BIGNUM to internal representation */
 330 static int BN_to_felem(felem out, const BIGNUM *bn)
 331 {
 332     felem_bytearray b_in;
 333     felem_bytearray b_out;
 334     unsigned num_bytes;
 335
 336     /* BN_bn2bin eats leading zeroes */
 337     memset(b_out, 0, sizeof(b_out));
 338     num_bytes = BN_num_bytes(bn);
 339     if (num_bytes > sizeof(b_out)) {
 340         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 341         return 0;
 342     }
 343     if (BN_is_negative(bn)) {
 344         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 345         return 0;
 346     }
 347     num_bytes = BN_bn2bin(bn, b_in);
 348     flip_endian(b_out, b_in, num_bytes);
 349     bin28_to_felem(out, b_out);
 350     return 1;
 351 }
 352
 353 /* From internal representation to OpenSSL BIGNUM */
 354 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 355 {
 356     felem_bytearray b_in, b_out;
 357     felem_to_bin28(b_in, in);
 358     flip_endian(b_out, b_in, sizeof(b_out));
 359     return BN_bin2bn(b_out, sizeof(b_out), out);
 360 }
 361
 362 /******************************************************************************/
 363 /*-
 364  *                              FIELD OPERATIONS
 365  *
 366  * Field operations, using the internal representation of field elements.
 367  * NB! These operations are specific to our point multiplication and cannot be
 368  * expected to be correct in general - e.g., multiplication with a large scalar
 369  * will cause an overflow.
 370  *
 371  */
 372
 373 static void felem_one(felem out)
 374 {
 375     out[0] = 1;
 376     out[1] = 0;
 377     out[2] = 0;
 378     out[3] = 0;
 379 }
 380
 381 static void felem_assign(felem out, const felem in)
 382 {
 383     out[0] = in[0];
 384     out[1] = in[1];
 385     out[2] = in[2];
 386     out[3] = in[3];
 387 }
 388
 389 /* Sum two field elements: out += in */
 390 static void felem_sum(felem out, const felem in)
 391 {
 392     out[0] += in[0];
 393     out[1] += in[1];
 394     out[2] += in[2];
 395     out[3] += in[3];
 396 }
 397
 398 /* Get negative value: out = -in */
 399 /* Assumes in[i] < 2^57 */
 400 static void felem_neg(felem out, const felem in)
 401 {
 402     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 403     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 404     static const limb two58m42m2 = (((limb) 1) << 58) -
 405         (((limb) 1) << 42) - (((limb) 1) << 2);
 406
 407     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 408     out[0] = two58p2 - in[0];
 409     out[1] = two58m42m2 - in[1];
 410     out[2] = two58m2 - in[2];
 411     out[3] = two58m2 - in[3];
 412 }
 413
 414 /* Subtract field elements: out -= in */
 415 /* Assumes in[i] < 2^57 */
 416 static void felem_diff(felem out, const felem in)
 417 {
 418     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 419     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 420     static const limb two58m42m2 = (((limb) 1) << 58) -
 421         (((limb) 1) << 42) - (((limb) 1) << 2);
 422
 423     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 424     out[0] += two58p2;
 425     out[1] += two58m42m2;
 426     out[2] += two58m2;
 427     out[3] += two58m2;
 428
 429     out[0] -= in[0];
 430     out[1] -= in[1];
 431     out[2] -= in[2];
 432     out[3] -= in[3];
 433 }
 434
 435 /* Subtract in unreduced 128-bit mode: out -= in */
 436 /* Assumes in[i] < 2^119 */
 437 static void widefelem_diff(widefelem out, const widefelem in)
 438 {
 439     static const widelimb two120 = ((widelimb) 1) << 120;
 440     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 441         (((widelimb) 1) << 64);
 442     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 443         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 444
 445     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 446     out[0] += two120;
 447     out[1] += two120m64;
 448     out[2] += two120m64;
 449     out[3] += two120;
 450     out[4] += two120m104m64;
 451     out[5] += two120m64;
 452     out[6] += two120m64;
 453
 454     out[0] -= in[0];
 455     out[1] -= in[1];
 456     out[2] -= in[2];
 457     out[3] -= in[3];
 458     out[4] -= in[4];
 459     out[5] -= in[5];
 460     out[6] -= in[6];
 461 }
 462
 463 /* Subtract in mixed mode: out128 -= in64 */
 464 /* in[i] < 2^63 */
 465 static void felem_diff_128_64(widefelem out, const felem in)
 466 {
 467     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 468         (((widelimb) 1) << 8);
 469     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 470         (((widelimb) 1) << 8);
 471     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 472         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 473
 474     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 475     out[0] += two64p8;
 476     out[1] += two64m48m8;
 477     out[2] += two64m8;
 478     out[3] += two64m8;
 479
 480     out[0] -= in[0];
 481     out[1] -= in[1];
 482     out[2] -= in[2];
 483     out[3] -= in[3];
 484 }
 485
 486 /*
 487  * Multiply a field element by a scalar: out = out * scalar The scalars we
 488  * actually use are small, so results fit without overflow
 489  */
 490 static void felem_scalar(felem out, const limb scalar)
 491 {
 492     out[0] *= scalar;
 493     out[1] *= scalar;
 494     out[2] *= scalar;
 495     out[3] *= scalar;
 496 }
 497
 498 /*
 499  * Multiply an unreduced field element by a scalar: out = out * scalar The
 500  * scalars we actually use are small, so results fit without overflow
 501  */
 502 static void widefelem_scalar(widefelem out, const widelimb scalar)
 503 {
 504     out[0] *= scalar;
 505     out[1] *= scalar;
 506     out[2] *= scalar;
 507     out[3] *= scalar;
 508     out[4] *= scalar;
 509     out[5] *= scalar;
 510     out[6] *= scalar;
 511 }
 512
 513 /* Square a field element: out = in^2 */
 514 static void felem_square(widefelem out, const felem in)
 515 {
 516     limb tmp0, tmp1, tmp2;
 517     tmp0 = 2 * in[0];
 518     tmp1 = 2 * in[1];
 519     tmp2 = 2 * in[2];
 520     out[0] = ((widelimb) in[0]) * in[0];
 521     out[1] = ((widelimb) in[0]) * tmp1;
 522     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 523     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 524     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 525     out[5] = ((widelimb) in[3]) * tmp2;
 526     out[6] = ((widelimb) in[3]) * in[3];
 527 }
 528
 529 /* Multiply two field elements: out = in1 * in2 */
 530 static void felem_mul(widefelem out, const felem in1, const felem in2)
 531 {
 532     out[0] = ((widelimb) in1[0]) * in2[0];
 533     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 534     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 535              ((widelimb) in1[2]) * in2[0];
 536     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 537              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 538     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 539              ((widelimb) in1[3]) * in2[1];
 540     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 541     out[6] = ((widelimb) in1[3]) * in2[3];
 542 }
 543
 544 /*-
 545  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 546  * Requires in[i] < 2^126,
 547  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 548 static void felem_reduce(felem out, const widefelem in)
 549 {
 550     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 551         (((widelimb) 1) << 15);
 552     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 553         (((widelimb) 1) << 71);
 554     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 555         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 556     widelimb output[5];
 557
 558     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 559     output[0] = in[0] + two127p15;
 560     output[1] = in[1] + two127m71m55;
 561     output[2] = in[2] + two127m71;
 562     output[3] = in[3];
 563     output[4] = in[4];
 564
 565     /* Eliminate in[4], in[5], in[6] */
 566     output[4] += in[6] >> 16;
 567     output[3] += (in[6] & 0xffff) << 40;
 568     output[2] -= in[6];
 569
 570     output[3] += in[5] >> 16;
 571     output[2] += (in[5] & 0xffff) << 40;
 572     output[1] -= in[5];
 573
 574     output[2] += output[4] >> 16;
 575     output[1] += (output[4] & 0xffff) << 40;
 576     output[0] -= output[4];
 577
 578     /* Carry 2 -> 3 -> 4 */
 579     output[3] += output[2] >> 56;
 580     output[2] &= 0x00ffffffffffffff;
 581
 582     output[4] = output[3] >> 56;
 583     output[3] &= 0x00ffffffffffffff;
 584
 585     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 586
 587     /* Eliminate output[4] */
 588     output[2] += output[4] >> 16;
 589     /* output[2] < 2^56 + 2^56 = 2^57 */
 590     output[1] += (output[4] & 0xffff) << 40;
 591     output[0] -= output[4];
 592
 593     /* Carry 0 -> 1 -> 2 -> 3 */
 594     output[1] += output[0] >> 56;
 595     out[0] = output[0] & 0x00ffffffffffffff;
 596
 597     output[2] += output[1] >> 56;
 598     /* output[2] < 2^57 + 2^72 */
 599     out[1] = output[1] & 0x00ffffffffffffff;
 600     output[3] += output[2] >> 56;
 601     /* output[3] <= 2^56 + 2^16 */
 602     out[2] = output[2] & 0x00ffffffffffffff;
 603
 604     /*-
 605      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 606      * out[3] <= 2^56 + 2^16 (due to final carry),
 607      * so out < 2*p
 608      */
 609     out[3] = output[3];
 610 }
 611
 612 static void felem_square_reduce(felem out, const felem in)
 613 {
 614     widefelem tmp;
 615     felem_square(tmp, in);
 616     felem_reduce(out, tmp);
 617 }
 618
 619 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 620 {
 621     widefelem tmp;
 622     felem_mul(tmp, in1, in2);
 623     felem_reduce(out, tmp);
 624 }
 625
/*
 * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 * call felem_reduce first)
 *
 * The function contains no data-dependent branches: the conditional
 * subtraction of p is applied through masks computed from the input. It
 * relies on '>>' of a negative int64_t being an arithmetic shift, which is
 * implementation-defined in C.
 */
static void felem_contract(felem out, const felem in)
{
    static const int64_t two56 = ((limb) 1) << 56;
    /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
    /* if in >= p, reduce in = in - 2^224 + 2^96 - 1 */
    int64_t tmp[4], a;
    tmp[0] = in[0];
    tmp[1] = in[1];
    tmp[2] = in[2];
    tmp[3] = in[3];
    /* Case 1: a = 1 iff in >= 2^224 */
    a = (in[3] >> 56);
    /* subtract p when a = 1: -1 at limb 0, +2^96 at limb 1, drop bit 224 */
    tmp[0] -= a;
    tmp[1] += a << 40;
    tmp[3] &= 0x00ffffffffffffff;
    /*
     * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
     * and the lower part is non-zero
     */
    a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
        (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
    a &= 0x00ffffffffffffff;
    /* turn a into an all-one mask (if a = 0) or an all-zero mask */
    a = (a - 1) >> 63;
    /* subtract 2^224 - 2^96 + 1 if a is all-one */
    tmp[3] &= a ^ 0xffffffffffffffff;
    tmp[2] &= a ^ 0xffffffffffffffff;
    tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
    tmp[0] -= 1 & a;

    /*
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
     * non-zero, so we only need one step
     */
    a = tmp[0] >> 63;
    /* a is all-one when tmp[0] < 0: borrow 2^56 from limb 1 */
    tmp[0] += two56 & a;
    tmp[1] -= 1 & a;

    /* carry 1 -> 2 -> 3 */
    tmp[2] += tmp[1] >> 56;
    tmp[1] &= 0x00ffffffffffffff;

    tmp[3] += tmp[2] >> 56;
    tmp[2] &= 0x00ffffffffffffff;

    /* Now 0 <= out < p */
    out[0] = tmp[0];
    out[1] = tmp[1];
    out[2] = tmp[2];
    out[3] = tmp[3];
}
 681
 682 /*
 683  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 684  * elements are reduced to in < 2^225, so we only need to check three cases:
 685  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 686  */
 687 static limb felem_is_zero(const felem in)
 688 {
 689     limb zero, two224m96p1, two225m97p2;
 690
 691     zero = in[0] | in[1] | in[2] | in[3];
 692     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 693     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 694         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 695     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 696     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 697         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 698     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 699     return (zero | two224m96p1 | two225m97p2);
 700 }
 701
 702 static int felem_is_zero_int(const void *in)
 703 {
 704     return (int)(felem_is_zero(in) & ((limb) 1));
 705 }
 706
 707 /* Invert a field element */
 708 /* Computation chain copied from djb's code */
 709 static void felem_inv(felem out, const felem in)
 710 {
 711     felem ftmp, ftmp2, ftmp3, ftmp4;
 712     widefelem tmp;
 713     unsigned i;
 714
 715     felem_square(tmp, in);
 716     felem_reduce(ftmp, tmp);    /* 2 */
 717     felem_mul(tmp, in, ftmp);
 718     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 719     felem_square(tmp, ftmp);
 720     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 721     felem_mul(tmp, in, ftmp);
 722     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 723     felem_square(tmp, ftmp);
 724     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 725     felem_square(tmp, ftmp2);
 726     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 727     felem_square(tmp, ftmp2);
 728     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 729     felem_mul(tmp, ftmp2, ftmp);
 730     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 731     felem_square(tmp, ftmp);
 732     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 733     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 734         felem_square(tmp, ftmp2);
 735         felem_reduce(ftmp2, tmp);
 736     }
 737     felem_mul(tmp, ftmp2, ftmp);
 738     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 739     felem_square(tmp, ftmp2);
 740     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 741     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 742         felem_square(tmp, ftmp3);
 743         felem_reduce(ftmp3, tmp);
 744     }
 745     felem_mul(tmp, ftmp3, ftmp2);
 746     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 747     felem_square(tmp, ftmp2);
 748     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 749     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 750         felem_square(tmp, ftmp3);
 751         felem_reduce(ftmp3, tmp);
 752     }
 753     felem_mul(tmp, ftmp3, ftmp2);
 754     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 755     felem_square(tmp, ftmp3);
 756     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 757     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 758         felem_square(tmp, ftmp4);
 759         felem_reduce(ftmp4, tmp);
 760     }
 761     felem_mul(tmp, ftmp3, ftmp4);
 762     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 763     felem_square(tmp, ftmp3);
 764     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 765     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 766         felem_square(tmp, ftmp4);
 767         felem_reduce(ftmp4, tmp);
 768     }
 769     felem_mul(tmp, ftmp2, ftmp4);
 770     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 771     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 772         felem_square(tmp, ftmp2);
 773         felem_reduce(ftmp2, tmp);
 774     }
 775     felem_mul(tmp, ftmp2, ftmp);
 776     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 777     felem_square(tmp, ftmp);
 778     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 779     felem_mul(tmp, ftmp, in);
 780     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 781     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 782         felem_square(tmp, ftmp);
 783         felem_reduce(ftmp, tmp);
 784     }
 785     felem_mul(tmp, ftmp, ftmp3);
 786     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 787 }
 788
 789 /*
 790  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 791  * out to itself.
 792  */
 793 static void copy_conditional(felem out, const felem in, limb icopy)
 794 {
 795     unsigned i;
 796     /*
 797      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 798      */
 799     const limb copy = -icopy;
 800     for (i = 0; i < 4; ++i) {
 801         const limb tmp = copy & (in[i] ^ out[i]);
 802         out[i] ^= tmp;
 803     }
 804 }
 805
 806 /******************************************************************************/
 807 /*-
 808  *                       ELLIPTIC CURVE POINT OPERATIONS
 809  *
 810  * Points are represented in Jacobian projective coordinates:
 811  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 812  * or to the point at infinity if Z == 0.
 813  *
 814  */
 815
/*-
 * Double an elliptic curve point:
 * (X', Y', Z') = 2 * (X, Y, Z), where
 * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^4
 * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * The body names the intermediate values delta = Z^2, gamma = Y^2,
 * beta = X * gamma and alpha = 3 * (X - delta) * (X + delta). The inline
 * bound comments track the limb magnitudes handed to each reduction.
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    widefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /*
     * Two working copies of x: they are modified in place below to form
     * (x - delta) and 3 * (x + delta)
     */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff(ftmp, delta);
    /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
    felem_sum(ftmp2, delta);
    /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
    felem_scalar(ftmp2, 3);
    /* ftmp2[i] < 3 * 2^58 < 2^60 */
    felem_mul(tmp, ftmp, ftmp2);
    /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
    felem_assign(ftmp, beta);
    felem_scalar(ftmp, 8);
    /* ftmp[i] < 8 * 2^57 = 2^60 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is overwritten with gamma + delta so one subtraction suffices */
    felem_sum(delta, gamma);
    /* delta[i] < 2^57 + 2^57 = 2^58 */
    felem_assign(ftmp, y_in);
    felem_sum(ftmp, z_in);
    /* ftmp[i] < 2^57 + 2^57 = 2^58 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar(beta, 4);
    /* beta[i] < 4 * 2^57 = 2^59 */
    felem_diff(beta, x_out);
    /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
    felem_mul(tmp, alpha, beta);
    /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
    felem_square(tmp2, gamma);
    /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
    widefelem_scalar(tmp2, 8);
    /* tmp2[i] < 8 * 2^116 = 2^119 */
    widefelem_diff(tmp, tmp2);
    /* tmp[i] < 2^119 + 2^120 < 2^121 */
    felem_reduce(y_out, tmp);
}
 895
/*-
 * Add two elliptic curve points:
 * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 *
 * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 *
 * The sum is built in local variables and only written to (x3, y3, z3) at
 * the very end, after all reads of the inputs; batch_mul relies on this by
 * passing the same buffers as (x1, y1, z1) and (x3, y3, z3).
 */

/*
 * This function is not entirely constant-time: it includes a branch for
 * checking whether the two input points are equal, (while not equal to the
 * point at infinity). This case never happens during single point
 * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const felem x2, const felem y2,
                      const felem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
    widefelem tmp, tmp2;
    limb z1_is_zero, z2_is_zero, x_equal, y_equal;

    if (!mixed) {
        /* ftmp2 = z2^2 */
        felem_square(tmp, z2);
        felem_reduce(ftmp2, tmp);

        /* ftmp4 = z2^3 */
        felem_mul(tmp, ftmp2, z2);
        felem_reduce(ftmp4, tmp);

        /* ftmp4 = z2^3*y1 */
        felem_mul(tmp2, ftmp4, y1);
        felem_reduce(ftmp4, tmp2);

        /* ftmp2 = z2^2*x1 */
        felem_mul(tmp2, ftmp2, x1);
        felem_reduce(ftmp2, tmp2);
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
         */

        /* ftmp4 = z2^3*y1 */
        felem_assign(ftmp4, y1);

        /* ftmp2 = z2^2*x1 */
        felem_assign(ftmp2, x1);
    }

    /* ftmp = z1^2 */
    felem_square(tmp, z1);
    felem_reduce(ftmp, tmp);

    /* ftmp3 = z1^3 */
    felem_mul(tmp, ftmp, z1);
    felem_reduce(ftmp3, tmp);

    /* tmp = z1^3*y2 */
    felem_mul(tmp, ftmp3, y2);
    /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

    /* ftmp3 = z1^3*y2 - z2^3*y1 */
    felem_diff_128_64(tmp, ftmp4);
    /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
    felem_reduce(ftmp3, tmp);

    /* tmp = z1^2*x2 */
    felem_mul(tmp, ftmp, x2);
    /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

    /* ftmp = z1^2*x2 - z2^2*x1 */
    felem_diff_128_64(tmp, ftmp2);
    /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
    felem_reduce(ftmp, tmp);

    /*
     * the formulae are incorrect if the points are equal so we check for
     * this and do doubling if this happens
     */
    x_equal = felem_is_zero(ftmp);
    y_equal = felem_is_zero(ftmp3);
    z1_is_zero = felem_is_zero(z1);
    z2_is_zero = felem_is_zero(z2);
    /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
    /* this is the data-dependent branch mentioned in the header comment */
    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* ftmp5 = z1*z2 */
    if (!mixed) {
        felem_mul(tmp, z1, z2);
        felem_reduce(ftmp5, tmp);
    } else {
        /* special case z2 = 0 is handled later */
        felem_assign(ftmp5, z1);
    }

    /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
    felem_mul(tmp, ftmp, ftmp5);
    felem_reduce(z_out, tmp);

    /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
    /* ftmp5 keeps the unsquared difference for the cube computed next */
    felem_assign(ftmp5, ftmp);
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);

    /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
    felem_mul(tmp, ftmp, ftmp5);
    felem_reduce(ftmp5, tmp);

    /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
    felem_mul(tmp, ftmp2, ftmp);
    felem_reduce(ftmp2, tmp);

    /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
    felem_mul(tmp, ftmp4, ftmp5);
    /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

    /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
    felem_square(tmp2, ftmp3);
    /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */

    /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
    felem_diff_128_64(tmp2, ftmp5);
    /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */

    /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
    felem_assign(ftmp5, ftmp2);
    felem_scalar(ftmp5, 2);
    /* ftmp5[i] < 2 * 2^57 = 2^58 */

    /*-
     * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
     *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
     */
    felem_diff_128_64(tmp2, ftmp5);
    /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
    felem_reduce(x_out, tmp2);

    /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
    felem_diff(ftmp2, x_out);
    /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */

    /*
     * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
     */
    felem_mul(tmp2, ftmp3, ftmp2);
    /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */

    /*-
     * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
     *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
     */
    widefelem_diff(tmp2, tmp);
    /* tmp2[i] < 2^118 + 2^120 < 2^121 */
    felem_reduce(y_out, tmp2);

    /*
     * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
     * the point at infinity, so we need to check for this separately
     */

    /*
     * if point 1 is at infinity, copy point 2 to output, and vice versa
     */
    copy_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    /* publish the result only now that the inputs are no longer needed */
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1078
1079 /*
1080  * select_point selects the |idx|th point from a precomputation table and
1081  * copies it to out.
1082  * The pre_comp array argument should be size of |size| argument
1083  */
1084 static void select_point(const u64 idx, unsigned int size,
1085                          const felem pre_comp[][3], felem out[3])
1086 {
1087     unsigned i, j;
1088     limb *outlimbs = &out[0][0];
1089
1090     memset(out, 0, sizeof(*out) * 3);
1091     for (i = 0; i < size; i++) {
1092         const limb *inlimbs = &pre_comp[i][0][0];
1093         u64 mask = i ^ idx;
1094         mask |= mask >> 4;
1095         mask |= mask >> 2;
1096         mask |= mask >> 1;
1097         mask &= 1;
1098         mask--;
1099         for (j = 0; j < 4 * 3; j++)
1100             outlimbs[j] |= inlimbs[j] & mask;
1101     }
1102 }
1103
1104 /* get_bit returns the |i|th bit in |in| */
1105 static char get_bit(const felem_bytearray in, unsigned i)
1106 {
1107     if (i >= 224)
1108         return 0;
1109     return (in[i >> 3] >> (i & 7)) & 1;
1110 }
1111
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * num_points is the number of entries in scalars[] and pre_comp[]; when it
 * is 0 only the 28 generator rounds (i = 27..0) are executed. |mixed| is
 * forwarded to point_add for the pre_comp[] additions, while additions of
 * g_pre_comp entries always pass mixed = 1.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[2][16][3])
{
    int i, skip;
    unsigned num;
    unsigned gen_mul = (g_scalar != NULL);
    /* nq is the accumulator; tmp[0..2] hold the selected point, tmp[3] -Y */
    felem nq[3], tmp[4];
    u64 bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (two in each of the last 28 rounds) and additions of
     * other points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 220 : 27); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 27)) {
            /* first, look 28 bits upwards */
            bits = get_bit(g_scalar, i + 196) << 3;
            bits |= get_bit(g_scalar, i + 140) << 2;
            bits |= get_bit(g_scalar, i + 84) << 1;
            bits |= get_bit(g_scalar, i + 28);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[1], tmp);

            if (!skip) {
                /* value 1 below is argument for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* nq is still untouched: take the selected point as is */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }

            /* second, look at the current position */
            bits = get_bit(g_scalar, i + 168) << 3;
            bits |= get_bit(g_scalar, i + 112) << 2;
            bits |= get_bit(g_scalar, i + 56) << 1;
            bits |= get_bit(g_scalar, i);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[0], tmp);
            point_add(nq[0], nq[1], nq[2],
                      nq[0], nq[1], nq[2],
                      1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * gather the six-bit window i+4 .. i-1; for i == 0 the
                 * index i - 1 becomes a huge unsigned value, which get_bit
                 * reads as 0
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /* select the point to add or subtract */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                copy_conditional(tmp[1], tmp[3], sign);

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* nq is still untouched: take the selected point as is */
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1211
1212 /******************************************************************************/
1213 /*
1214  * FUNCTIONS TO MANAGE PRECOMPUTATION
1215  */
1216
1217 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1218 {
1219     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1220
1221     if (!ret) {
1222         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1223         return ret;
1224     }
1225
1226     ret->references = 1;
1227
1228     ret->lock = CRYPTO_THREAD_lock_new();
1229     if (ret->lock == NULL) {
1230         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1231         OPENSSL_free(ret);
1232         return NULL;
1233     }
1234     return ret;
1235 }
1236
1237 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1238 {
1239     int i;
1240     if (p != NULL)
1241         CRYPTO_UP_REF(&p->references, &i, p->lock);
1242     return p;
1243 }
1244
1245 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1246 {
1247     int i;
1248
1249     if (p == NULL)
1250         return;
1251
1252     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1253     REF_PRINT_COUNT("EC_nistp224", x);
1254     if (i > 0)
1255         return;
1256     REF_ASSERT_ISNT(i < 0);
1257
1258     CRYPTO_THREAD_lock_free(p->lock);
1259     OPENSSL_free(p);
1260 }
1261
1262 /******************************************************************************/
1263 /*
1264  * OPENSSL EC_METHOD FUNCTIONS
1265  */
1266
1267 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1268 {
1269     int ret;
1270     ret = ec_GFp_simple_group_init(group);
1271     group->a_is_minus3 = 1;
1272     return ret;
1273 }
1274
1275 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1276                                     const BIGNUM *a, const BIGNUM *b,
1277                                     BN_CTX *ctx)
1278 {
1279     int ret = 0;
1280     BN_CTX *new_ctx = NULL;
1281     BIGNUM *curve_p, *curve_a, *curve_b;
1282
1283     if (ctx == NULL)
1284         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1285             return 0;
1286     BN_CTX_start(ctx);
1287     curve_p = BN_CTX_get(ctx);
1288     curve_a = BN_CTX_get(ctx);
1289     curve_b = BN_CTX_get(ctx);
1290     if (curve_b == NULL)
1291         goto err;
1292     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1293     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1294     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1295     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1296         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1297               EC_R_WRONG_CURVE_PARAMETERS);
1298         goto err;
1299     }
1300     group->field_mod_func = BN_nist_mod_224;
1301     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1302  err:
1303     BN_CTX_end(ctx);
1304     BN_CTX_free(new_ctx);
1305     return ret;
1306 }
1307
1308 /*
1309  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1310  * (X/Z^2, Y/Z^3)
1311  */
1312 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1313                                                  const EC_POINT *point,
1314                                                  BIGNUM *x, BIGNUM *y,
1315                                                  BN_CTX *ctx)
1316 {
1317     felem z1, z2, x_in, y_in, x_out, y_out;
1318     widefelem tmp;
1319
1320     if (EC_POINT_is_at_infinity(group, point)) {
1321         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1322               EC_R_POINT_AT_INFINITY);
1323         return 0;
1324     }
1325     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1326         (!BN_to_felem(z1, point->Z)))
1327         return 0;
1328     felem_inv(z2, z1);
1329     felem_square(tmp, z2);
1330     felem_reduce(z1, tmp);
1331     felem_mul(tmp, x_in, z1);
1332     felem_reduce(x_in, tmp);
1333     felem_contract(x_out, x_in);
1334     if (x != NULL) {
1335         if (!felem_to_BN(x, x_out)) {
1336             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1337                   ERR_R_BN_LIB);
1338             return 0;
1339         }
1340     }
1341     felem_mul(tmp, z1, z2);
1342     felem_reduce(z1, tmp);
1343     felem_mul(tmp, y_in, z1);
1344     felem_reduce(y_in, tmp);
1345     felem_contract(y_out, y_in);
1346     if (y != NULL) {
1347         if (!felem_to_BN(y, y_out)) {
1348             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1349                   ERR_R_BN_LIB);
1350             return 0;
1351         }
1352     }
1353     return 1;
1354 }
1355
1356 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1357                                felem tmp_felems[ /* num+1 */ ])
1358 {
1359     /*
1360      * Runs in constant time, unless an input is the point at infinity (which
1361      * normally shouldn't happen).
1362      */
1363     ec_GFp_nistp_points_make_affine_internal(num,
1364                                              points,
1365                                              sizeof(felem),
1366                                              tmp_felems,
1367                                              (void (*)(void *))felem_one,
1368                                              felem_is_zero_int,
1369                                              (void (*)(void *, const void *))
1370                                              felem_assign,
1371                                              (void (*)(void *, const void *))
1372                                              felem_square_reduce, (void (*)
1373                                                                    (void *,
1374                                                                     const void
1375                                                                     *,
1376                                                                     const void
1377                                                                     *))
1378                                              felem_mul_reduce,
1379                                              (void (*)(void *, const void *))
1380                                              felem_inv,
1381                                              (void (*)(void *, const void *))
1382                                              felem_contract);
1383 }
1384
/*
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values Result is stored in r (r can equal one of the inputs).
 *
 * |scalar| may be NULL, in which case no generator term is added. |num| is
 * the number of entries in |points| and |scalars|. If |ctx| is NULL a
 * temporary BN_CTX is created and freed before returning.
 *
 * Returns the result of EC_POINT_set_Jprojective_coordinates_GFp for the
 * computed point, or 0 if any earlier step fails.
 */
int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
                               const BIGNUM *scalar, size_t num,
                               const EC_POINT *points[],
                               const BIGNUM *scalars[], BN_CTX *ctx)
{
    int ret = 0;
    int j;
    unsigned i;
    int mixed = 0;
    BN_CTX *new_ctx = NULL;
    BIGNUM *x, *y, *z, *tmp_scalar;
    felem_bytearray g_secret;
    felem_bytearray *secrets = NULL;
    felem (*pre_comp)[17][3] = NULL;
    felem *tmp_felems = NULL;
    felem_bytearray tmp;
    unsigned num_bytes;
    int have_pre_comp = 0;
    /* grows by one below if the generator has to be treated as a point */
    size_t num_points = num;
    felem x_in, y_in, z_in, x_out, y_out, z_out;
    NISTP224_PRE_COMP *pre = NULL;
    const felem(*g_pre_comp)[16][3] = NULL;
    EC_POINT *generator = NULL;
    const EC_POINT *p = NULL;
    const BIGNUM *p_scalar = NULL;

    if (ctx == NULL)
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
            return 0;
    BN_CTX_start(ctx);
    x = BN_CTX_get(ctx);
    y = BN_CTX_get(ctx);
    z = BN_CTX_get(ctx);
    tmp_scalar = BN_CTX_get(ctx);
    if (tmp_scalar == NULL)
        goto err;

    if (scalar != NULL) {
        pre = group->pre_comp.nistp224;
        if (pre)
            /* we have precomputation, try to use it */
            g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
        else
            /* try to use the standard precomputation */
            g_pre_comp = &gmul[0];
        generator = EC_POINT_new(group);
        if (generator == NULL)
            goto err;
        /* get the generator from precomputation */
        if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
            !felem_to_BN(y, g_pre_comp[0][1][1]) ||
            !felem_to_BN(z, g_pre_comp[0][1][2])) {
            ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
            goto err;
        }
        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
                                                      generator, x, y, z,
                                                      ctx))
            goto err;
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
            /* precomputation matches generator */
            have_pre_comp = 1;
        else
            /*
             * we don't have valid precomputation: treat the generator as a
             * random point
             */
            num_points = num_points + 1;
    }

    if (num_points > 0) {
        if (num_points >= 3) {
            /*
             * unless we precompute multiples for just one or two points,
             * converting those into affine form is time well spent
             */
            mixed = 1;
        }
        /*
         * zero-filled allocations: entries that are skipped below keep an
         * all-zero scalar and an all-zero table
         */
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
        if (mixed)
            tmp_felems =
                OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
        if ((secrets == NULL) || (pre_comp == NULL)
            || (mixed && (tmp_felems == NULL))) {
            ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
            goto err;
        }

        /*
         * we treat NULL scalars as 0, and NULL points as points at infinity,
         * i.e., they contribute nothing to the linear combination
         */
        for (i = 0; i < num_points; ++i) {
            /* index num exists only when the generator was appended above */
            if (i == num)
                /* the generator */
            {
                p = EC_GROUP_get0_generator(group);
                p_scalar = scalar;
            } else
                /* the i^th point */
            {
                p = points[i];
                p_scalar = scalars[i];
            }
            if ((p_scalar != NULL) && (p != NULL)) {
                /* reduce scalar to 0 <= scalar < 2^224 */
                if ((BN_num_bits(p_scalar) > 224)
                    || (BN_is_negative(p_scalar))) {
                    /*
                     * this is an unusual input, and we don't guarantee
                     * constant-timeness
                     */
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
                        ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
                        goto err;
                    }
                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
                } else
                    num_bytes = BN_bn2bin(p_scalar, tmp);
                /* store the scalar bytes in the order get_bit indexes them */
                flip_endian(secrets[i], tmp, num_bytes);
                /* precompute multiples */
                if ((!BN_to_felem(x_out, p->X)) ||
                    (!BN_to_felem(y_out, p->Y)) ||
                    (!BN_to_felem(z_out, p->Z)))
                    goto err;
                felem_assign(pre_comp[i][1][0], x_out);
                felem_assign(pre_comp[i][1][1], y_out);
                felem_assign(pre_comp[i][1][2], z_out);
                /*
                 * fill entries 2..16: odd j as 1*P + (j-1)*P, even j by
                 * doubling (j/2)*P
                 */
                for (j = 2; j <= 16; ++j) {
                    if (j & 1) {
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
                                  pre_comp[i][j][2], pre_comp[i][1][0],
                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
                                  pre_comp[i][j - 1][0],
                                  pre_comp[i][j - 1][1],
                                  pre_comp[i][j - 1][2]);
                    } else {
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
                                     pre_comp[i][j / 2][1],
                                     pre_comp[i][j / 2][2]);
                    }
                }
            }
        }
        /* all 17 entries of every table are converted in a single call */
        if (mixed)
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
    }

    /* the scalar for the generator */
    if ((scalar != NULL) && (have_pre_comp)) {
        memset(g_secret, 0, sizeof(g_secret));
        /* reduce scalar to 0 <= scalar < 2^224 */
        if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
            /*
             * this is an unusual input, and we don't guarantee
             * constant-timeness
             */
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
                ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
                goto err;
            }
            num_bytes = BN_bn2bin(tmp_scalar, tmp);
        } else
            num_bytes = BN_bn2bin(scalar, tmp);
        flip_endian(g_secret, tmp, num_bytes);
        /* do the multiplication with generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  g_secret,
                  mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
    } else
        /* do the multiplication without generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
    /* reduce the output to its unique minimal representation */
    felem_contract(x_in, x_out);
    felem_contract(y_in, y_out);
    felem_contract(z_in, z_out);
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
        (!felem_to_BN(z, z_in))) {
        ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
        goto err;
    }
    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

 err:
    /* shared exit: every pointer released here is NULL if never allocated */
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    BN_CTX_free(new_ctx);
    OPENSSL_free(secrets);
    OPENSSL_free(pre_comp);
    OPENSSL_free(tmp_felems);
    return ret;
}
1586
1587 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1588 {
1589     int ret = 0;
1590     NISTP224_PRE_COMP *pre = NULL;
1591     int i, j;
1592     BN_CTX *new_ctx = NULL;
1593     BIGNUM *x, *y;
1594     EC_POINT *generator = NULL;
1595     felem tmp_felems[32];
1596
1597     /* throw away old precomputation */
1598     EC_pre_comp_free(group);
1599     if (ctx == NULL)
1600         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1601             return 0;
1602     BN_CTX_start(ctx);
1603     x = BN_CTX_get(ctx);
1604     y = BN_CTX_get(ctx);
1605     if (y == NULL)
1606         goto err;
1607     /* get the generator */
1608     if (group->generator == NULL)
1609         goto err;
1610     generator = EC_POINT_new(group);
1611     if (generator == NULL)
1612         goto err;
1613     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1614     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1615     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1616         goto err;
1617     if ((pre = nistp224_pre_comp_new()) == NULL)
1618         goto err;
1619     /*
1620      * if the generator is the standard one, use built-in precomputation
1621      */
1622     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1623         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1624         goto done;
1625     }
1626     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1627         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1628         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1629         goto err;
1630     /*
1631      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1632      * 2^140*G, 2^196*G for the second one
1633      */
1634     for (i = 1; i <= 8; i <<= 1) {
1635         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1636                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1637                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1638         for (j = 0; j < 27; ++j) {
1639             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1640                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1641                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1642         }
1643         if (i == 8)
1644             break;
1645         point_double(pre->g_pre_comp[0][2 * i][0],
1646                      pre->g_pre_comp[0][2 * i][1],
1647                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1648                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1649         for (j = 0; j < 27; ++j) {
1650             point_double(pre->g_pre_comp[0][2 * i][0],
1651                          pre->g_pre_comp[0][2 * i][1],
1652                          pre->g_pre_comp[0][2 * i][2],
1653                          pre->g_pre_comp[0][2 * i][0],
1654                          pre->g_pre_comp[0][2 * i][1],
1655                          pre->g_pre_comp[0][2 * i][2]);
1656         }
1657     }
1658     for (i = 0; i < 2; i++) {
1659         /* g_pre_comp[i][0] is the point at infinity */
1660         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1661         /* the remaining multiples */
1662         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1663         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1664                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1665                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1666                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1667                   pre->g_pre_comp[i][2][2]);
1668         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1669         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1670                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1671                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1672                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1673                   pre->g_pre_comp[i][2][2]);
1674         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1675         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1676                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1677                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1678                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1679                   pre->g_pre_comp[i][4][2]);
1680         /*
1681          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1682          */
1683         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1684                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1685                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1686                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1687                   pre->g_pre_comp[i][2][2]);
1688         for (j = 1; j < 8; ++j) {
1689             /* odd multiples: add G resp. 2^28*G */
1690             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1691                       pre->g_pre_comp[i][2 * j + 1][1],
1692                       pre->g_pre_comp[i][2 * j + 1][2],
1693                       pre->g_pre_comp[i][2 * j][0],
1694                       pre->g_pre_comp[i][2 * j][1],
1695                       pre->g_pre_comp[i][2 * j][2], 0,
1696                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1697                       pre->g_pre_comp[i][1][2]);
1698         }
1699     }
1700     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1701
1702  done:
1703     SETPRECOMP(group, nistp224, pre);
1704     pre = NULL;
1705     ret = 1;
1706  err:
1707     BN_CTX_end(ctx);
1708     EC_POINT_free(generator);
1709     BN_CTX_free(new_ctx);
1710     EC_nistp224_pre_comp_free(pre);
1711     return ret;
1712 }
1713
1714 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1715 {
1716     return HAVEPRECOMP(group, nistp224);
1717 }
1718
1719 #endif