crypto/ec/ecp_nistp224.c

   1 /*
   2  * Written by Emilia Kasper (Google) for the OpenSSL project.
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  22  *
  23  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  24  * and Adam Langley's public domain 64-bit C implementation of curve25519
  25  */
  26
  27 #include <openssl/opensslconf.h>
  28 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  29 NON_EMPTY_TRANSLATION_UNIT
  30 #else
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 # else
  42 #  error "Need GCC 3.1 or later to define type uint128_t"
  43 # endif
  44
  45 typedef uint8_t u8;
  46 typedef uint64_t u64;
  47 typedef int64_t s64;
  48
  49 /******************************************************************************/
  50 /*-
  51  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  52  *
  53  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  54  * using 64-bit coefficients called 'limbs',
  55  * and sometimes (for multiplication results) as
  56  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  57  * using 128-bit coefficients called 'widelimbs'.
  58  * A 4-limb representation is an 'felem';
  59  * a 7-widelimb representation is a 'widefelem'.
  60  * Even within felems, bits of adjacent limbs overlap, and we don't always
  61  * reduce the representations: we ensure that inputs to each felem
  62  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  63  * and fit into a 128-bit word without overflow. The coefficients are then
  64  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  65  * We only reduce to the unique minimal representation at the end of the
  66  * computation.
  67  */
  68
  69 typedef uint64_t limb;
  70 typedef uint128_t widelimb;
  71
  72 typedef limb felem[4];
  73 typedef widelimb widefelem[7];
  74
  75 /*
  76  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  77  * group order size for the elliptic curve, and we also use this type for
  78  * scalars for point multiplication.
  79  */
  80 typedef u8 felem_bytearray[28];
  81
  82 static const felem_bytearray nistp224_curve_params[5] = {
  83     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  84      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  85      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  86     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  88      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  89     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  90      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  91      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  92     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  93      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  94      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  95     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  96      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  97      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  98 };
  99
 100 /*-
 101  * Precomputed multiples of the standard generator
 102  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 103  * (0 for the point at infinity).
 104  * For each field element, slice a_0 is word 0, etc.
 105  *
 106  * The table has 2 * 16 elements, starting with the following:
 107  * index | bits    | point
 108  * ------+---------+------------------------------
 109  *     0 | 0 0 0 0 | 0G
 110  *     1 | 0 0 0 1 | 1G
 111  *     2 | 0 0 1 0 | 2^56G
 112  *     3 | 0 0 1 1 | (2^56 + 1)G
 113  *     4 | 0 1 0 0 | 2^112G
 114  *     5 | 0 1 0 1 | (2^112 + 1)G
 115  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 116  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 117  *     8 | 1 0 0 0 | 2^168G
 118  *     9 | 1 0 0 1 | (2^168 + 1)G
 119  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 120  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 121  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 122  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 123  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 124  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 125  * followed by a copy of this with each element multiplied by 2^28.
 126  *
 127  * The reason for this is so that we can clock bits into four different
 128  * locations when doing simple scalar multiplies against the base point,
 129  * and then another four locations using the second 16 elements.
 130  */
 131 static const felem gmul[2][16][3] = {
 132 {{{0, 0, 0, 0},
 133   {0, 0, 0, 0},
 134   {0, 0, 0, 0}},
 135  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 136   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 137   {1, 0, 0, 0}},
 138  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 139   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 140   {1, 0, 0, 0}},
 141  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 142   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 143   {1, 0, 0, 0}},
 144  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 145   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 146   {1, 0, 0, 0}},
 147  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 148   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 149   {1, 0, 0, 0}},
 150  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 151   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 152   {1, 0, 0, 0}},
 153  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 154   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 155   {1, 0, 0, 0}},
 156  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 157   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 158   {1, 0, 0, 0}},
 159  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 160   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 161   {1, 0, 0, 0}},
 162  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 163   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 164   {1, 0, 0, 0}},
 165  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 166   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 167   {1, 0, 0, 0}},
 168  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 169   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 170   {1, 0, 0, 0}},
 171  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 172   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 173   {1, 0, 0, 0}},
 174  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 175   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 176   {1, 0, 0, 0}},
 177  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 178   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 179   {1, 0, 0, 0}}},
 180 {{{0, 0, 0, 0},
 181   {0, 0, 0, 0},
 182   {0, 0, 0, 0}},
 183  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 184   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 185   {1, 0, 0, 0}},
 186  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 187   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 188   {1, 0, 0, 0}},
 189  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 190   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 191   {1, 0, 0, 0}},
 192  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 193   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 194   {1, 0, 0, 0}},
 195  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 196   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 197   {1, 0, 0, 0}},
 198  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 199   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 200   {1, 0, 0, 0}},
 201  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 202   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 203   {1, 0, 0, 0}},
 204  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 205   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 206   {1, 0, 0, 0}},
 207  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 208   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 209   {1, 0, 0, 0}},
 210  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 211   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 212   {1, 0, 0, 0}},
 213  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 214   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 215   {1, 0, 0, 0}},
 216  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 217   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 218   {1, 0, 0, 0}},
 219  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 220   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 221   {1, 0, 0, 0}},
 222  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 223   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 224   {1, 0, 0, 0}},
 225  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 226   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 227   {1, 0, 0, 0}}}
 228 };
 229
 230 /* Precomputation for the group generator. */
 231 struct nistp224_pre_comp_st {
 232     felem g_pre_comp[2][16][3];
 233     int references;
 234 };
 235
 236 const EC_METHOD *EC_GFp_nistp224_method(void)
 237 {
 238     static const EC_METHOD ret = {
 239         EC_FLAGS_DEFAULT_OCT,
 240         NID_X9_62_prime_field,
 241         ec_GFp_nistp224_group_init,
 242         ec_GFp_simple_group_finish,
 243         ec_GFp_simple_group_clear_finish,
 244         ec_GFp_nist_group_copy,
 245         ec_GFp_nistp224_group_set_curve,
 246         ec_GFp_simple_group_get_curve,
 247         ec_GFp_simple_group_get_degree,
 248         ec_group_simple_order_bits,
 249         ec_GFp_simple_group_check_discriminant,
 250         ec_GFp_simple_point_init,
 251         ec_GFp_simple_point_finish,
 252         ec_GFp_simple_point_clear_finish,
 253         ec_GFp_simple_point_copy,
 254         ec_GFp_simple_point_set_to_infinity,
 255         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 256         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 257         ec_GFp_simple_point_set_affine_coordinates,
 258         ec_GFp_nistp224_point_get_affine_coordinates,
 259         0 /* point_set_compressed_coordinates */ ,
 260         0 /* point2oct */ ,
 261         0 /* oct2point */ ,
 262         ec_GFp_simple_add,
 263         ec_GFp_simple_dbl,
 264         ec_GFp_simple_invert,
 265         ec_GFp_simple_is_at_infinity,
 266         ec_GFp_simple_is_on_curve,
 267         ec_GFp_simple_cmp,
 268         ec_GFp_simple_make_affine,
 269         ec_GFp_simple_points_make_affine,
 270         ec_GFp_nistp224_points_mul,
 271         ec_GFp_nistp224_precompute_mult,
 272         ec_GFp_nistp224_have_precompute_mult,
 273         ec_GFp_nist_field_mul,
 274         ec_GFp_nist_field_sqr,
 275         0 /* field_div */ ,
 276         0 /* field_encode */ ,
 277         0 /* field_decode */ ,
 278         0,                      /* field_set_to_one */
 279         ec_key_simple_priv2oct,
 280         ec_key_simple_oct2priv,
 281         0, /* set private */
 282         ec_key_simple_generate_key,
 283         ec_key_simple_check_key,
 284         ec_key_simple_generate_public_key,
 285         0, /* keycopy */
 286         0, /* keyfinish */
 287         ecdh_simple_compute_key
 288     };
 289
 290     return &ret;
 291 }
 292
 293 /*
 294  * Helper functions to convert field elements to/from internal representation
 295  */
 296 static void bin28_to_felem(felem out, const u8 in[28])
 297 {
 298     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 299     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 300     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 301     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 302 }
 303
 304 static void felem_to_bin28(u8 out[28], const felem in)
 305 {
 306     unsigned i;
 307     for (i = 0; i < 7; ++i) {
 308         out[i] = in[0] >> (8 * i);
 309         out[i + 7] = in[1] >> (8 * i);
 310         out[i + 14] = in[2] >> (8 * i);
 311         out[i + 21] = in[3] >> (8 * i);
 312     }
 313 }
 314
 315 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 316 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 317 {
 318     unsigned i;
 319     for (i = 0; i < len; ++i)
 320         out[i] = in[len - 1 - i];
 321 }
 322
 323 /* From OpenSSL BIGNUM to internal representation */
 324 static int BN_to_felem(felem out, const BIGNUM *bn)
 325 {
 326     felem_bytearray b_in;
 327     felem_bytearray b_out;
 328     unsigned num_bytes;
 329
 330     /* BN_bn2bin eats leading zeroes */
 331     memset(b_out, 0, sizeof(b_out));
 332     num_bytes = BN_num_bytes(bn);
 333     if (num_bytes > sizeof b_out) {
 334         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 335         return 0;
 336     }
 337     if (BN_is_negative(bn)) {
 338         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 339         return 0;
 340     }
 341     num_bytes = BN_bn2bin(bn, b_in);
 342     flip_endian(b_out, b_in, num_bytes);
 343     bin28_to_felem(out, b_out);
 344     return 1;
 345 }
 346
 347 /* From internal representation to OpenSSL BIGNUM */
 348 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 349 {
 350     felem_bytearray b_in, b_out;
 351     felem_to_bin28(b_in, in);
 352     flip_endian(b_out, b_in, sizeof b_out);
 353     return BN_bin2bn(b_out, sizeof b_out, out);
 354 }
 355
 356 /******************************************************************************/
 357 /*-
 358  *                              FIELD OPERATIONS
 359  *
 360  * Field operations, using the internal representation of field elements.
 361  * NB! These operations are specific to our point multiplication and cannot be
 362  * expected to be correct in general - e.g., multiplication with a large scalar
 363  * will cause an overflow.
 364  *
 365  */
 366
 367 static void felem_one(felem out)
 368 {
 369     out[0] = 1;
 370     out[1] = 0;
 371     out[2] = 0;
 372     out[3] = 0;
 373 }
 374
 375 static void felem_assign(felem out, const felem in)
 376 {
 377     out[0] = in[0];
 378     out[1] = in[1];
 379     out[2] = in[2];
 380     out[3] = in[3];
 381 }
 382
 383 /* Sum two field elements: out += in */
 384 static void felem_sum(felem out, const felem in)
 385 {
 386     out[0] += in[0];
 387     out[1] += in[1];
 388     out[2] += in[2];
 389     out[3] += in[3];
 390 }
 391
 392 /* Get negative value: out = -in */
 393 /* Assumes in[i] < 2^57 */
 394 static void felem_neg(felem out, const felem in)
 395 {
 396     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 397     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 398     static const limb two58m42m2 = (((limb) 1) << 58) -
 399         (((limb) 1) << 42) - (((limb) 1) << 2);
 400
 401     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 402     out[0] = two58p2 - in[0];
 403     out[1] = two58m42m2 - in[1];
 404     out[2] = two58m2 - in[2];
 405     out[3] = two58m2 - in[3];
 406 }
 407
 408 /* Subtract field elements: out -= in */
 409 /* Assumes in[i] < 2^57 */
 410 static void felem_diff(felem out, const felem in)
 411 {
 412     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 413     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 414     static const limb two58m42m2 = (((limb) 1) << 58) -
 415         (((limb) 1) << 42) - (((limb) 1) << 2);
 416
 417     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 418     out[0] += two58p2;
 419     out[1] += two58m42m2;
 420     out[2] += two58m2;
 421     out[3] += two58m2;
 422
 423     out[0] -= in[0];
 424     out[1] -= in[1];
 425     out[2] -= in[2];
 426     out[3] -= in[3];
 427 }
 428
 429 /* Subtract in unreduced 128-bit mode: out -= in */
 430 /* Assumes in[i] < 2^119 */
 431 static void widefelem_diff(widefelem out, const widefelem in)
 432 {
 433     static const widelimb two120 = ((widelimb) 1) << 120;
 434     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 435         (((widelimb) 1) << 64);
 436     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 437         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 438
 439     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 440     out[0] += two120;
 441     out[1] += two120m64;
 442     out[2] += two120m64;
 443     out[3] += two120;
 444     out[4] += two120m104m64;
 445     out[5] += two120m64;
 446     out[6] += two120m64;
 447
 448     out[0] -= in[0];
 449     out[1] -= in[1];
 450     out[2] -= in[2];
 451     out[3] -= in[3];
 452     out[4] -= in[4];
 453     out[5] -= in[5];
 454     out[6] -= in[6];
 455 }
 456
 457 /* Subtract in mixed mode: out128 -= in64 */
 458 /* in[i] < 2^63 */
 459 static void felem_diff_128_64(widefelem out, const felem in)
 460 {
 461     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 462         (((widelimb) 1) << 8);
 463     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 464         (((widelimb) 1) << 8);
 465     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 466         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 467
 468     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 469     out[0] += two64p8;
 470     out[1] += two64m48m8;
 471     out[2] += two64m8;
 472     out[3] += two64m8;
 473
 474     out[0] -= in[0];
 475     out[1] -= in[1];
 476     out[2] -= in[2];
 477     out[3] -= in[3];
 478 }
 479
 480 /*
 481  * Multiply a field element by a scalar: out = out * scalar The scalars we
 482  * actually use are small, so results fit without overflow
 483  */
 484 static void felem_scalar(felem out, const limb scalar)
 485 {
 486     out[0] *= scalar;
 487     out[1] *= scalar;
 488     out[2] *= scalar;
 489     out[3] *= scalar;
 490 }
 491
 492 /*
 493  * Multiply an unreduced field element by a scalar: out = out * scalar The
 494  * scalars we actually use are small, so results fit without overflow
 495  */
 496 static void widefelem_scalar(widefelem out, const widelimb scalar)
 497 {
 498     out[0] *= scalar;
 499     out[1] *= scalar;
 500     out[2] *= scalar;
 501     out[3] *= scalar;
 502     out[4] *= scalar;
 503     out[5] *= scalar;
 504     out[6] *= scalar;
 505 }
 506
 507 /* Square a field element: out = in^2 */
 508 static void felem_square(widefelem out, const felem in)
 509 {
 510     limb tmp0, tmp1, tmp2;
 511     tmp0 = 2 * in[0];
 512     tmp1 = 2 * in[1];
 513     tmp2 = 2 * in[2];
 514     out[0] = ((widelimb) in[0]) * in[0];
 515     out[1] = ((widelimb) in[0]) * tmp1;
 516     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 517     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 518     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 519     out[5] = ((widelimb) in[3]) * tmp2;
 520     out[6] = ((widelimb) in[3]) * in[3];
 521 }
 522
 523 /* Multiply two field elements: out = in1 * in2 */
 524 static void felem_mul(widefelem out, const felem in1, const felem in2)
 525 {
 526     out[0] = ((widelimb) in1[0]) * in2[0];
 527     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 528     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 529              ((widelimb) in1[2]) * in2[0];
 530     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 531              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 532     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 533              ((widelimb) in1[3]) * in2[1];
 534     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 535     out[6] = ((widelimb) in1[3]) * in2[3];
 536 }
 537
 538 /*-
 539  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 540  * Requires in[i] < 2^126,
 541  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 542 static void felem_reduce(felem out, const widefelem in)
 543 {
 544     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 545         (((widelimb) 1) << 15);
 546     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 547         (((widelimb) 1) << 71);
 548     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 549         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 550     widelimb output[5];
 551
 552     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 553     output[0] = in[0] + two127p15;
 554     output[1] = in[1] + two127m71m55;
 555     output[2] = in[2] + two127m71;
 556     output[3] = in[3];
 557     output[4] = in[4];
 558
 559     /* Eliminate in[4], in[5], in[6] */
 560     output[4] += in[6] >> 16;
 561     output[3] += (in[6] & 0xffff) << 40;
 562     output[2] -= in[6];
 563
 564     output[3] += in[5] >> 16;
 565     output[2] += (in[5] & 0xffff) << 40;
 566     output[1] -= in[5];
 567
 568     output[2] += output[4] >> 16;
 569     output[1] += (output[4] & 0xffff) << 40;
 570     output[0] -= output[4];
 571
 572     /* Carry 2 -> 3 -> 4 */
 573     output[3] += output[2] >> 56;
 574     output[2] &= 0x00ffffffffffffff;
 575
 576     output[4] = output[3] >> 56;
 577     output[3] &= 0x00ffffffffffffff;
 578
 579     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 580
 581     /* Eliminate output[4] */
 582     output[2] += output[4] >> 16;
 583     /* output[2] < 2^56 + 2^56 = 2^57 */
 584     output[1] += (output[4] & 0xffff) << 40;
 585     output[0] -= output[4];
 586
 587     /* Carry 0 -> 1 -> 2 -> 3 */
 588     output[1] += output[0] >> 56;
 589     out[0] = output[0] & 0x00ffffffffffffff;
 590
 591     output[2] += output[1] >> 56;
 592     /* output[2] < 2^57 + 2^72 */
 593     out[1] = output[1] & 0x00ffffffffffffff;
 594     output[3] += output[2] >> 56;
 595     /* output[3] <= 2^56 + 2^16 */
 596     out[2] = output[2] & 0x00ffffffffffffff;
 597
 598     /*-
 599      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 600      * out[3] <= 2^56 + 2^16 (due to final carry),
 601      * so out < 2*p
 602      */
 603     out[3] = output[3];
 604 }
 605
 606 static void felem_square_reduce(felem out, const felem in)
 607 {
 608     widefelem tmp;
 609     felem_square(tmp, in);
 610     felem_reduce(out, tmp);
 611 }
 612
 613 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 614 {
 615     widefelem tmp;
 616     felem_mul(tmp, in1, in2);
 617     felem_reduce(out, tmp);
 618 }
 619
 620 /*
 621  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 622  * call felem_reduce first)
 623  */
 624 static void felem_contract(felem out, const felem in)
 625 {
 626     static const int64_t two56 = ((limb) 1) << 56;
 627     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 628     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 629     int64_t tmp[4], a;
 630     tmp[0] = in[0];
 631     tmp[1] = in[1];
 632     tmp[2] = in[2];
 633     tmp[3] = in[3];
 634     /* Case 1: a = 1 iff in >= 2^224 */
 635     a = (in[3] >> 56);
 636     tmp[0] -= a;
 637     tmp[1] += a << 40;
 638     tmp[3] &= 0x00ffffffffffffff;
 639     /*
 640      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 641      * and the lower part is non-zero
 642      */
 643     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 644         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 645     a &= 0x00ffffffffffffff;
 646     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 647     a = (a - 1) >> 63;
 648     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 649     tmp[3] &= a ^ 0xffffffffffffffff;
 650     tmp[2] &= a ^ 0xffffffffffffffff;
 651     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 652     tmp[0] -= 1 & a;
 653
 654     /*
 655      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 656      * non-zero, so we only need one step
 657      */
 658     a = tmp[0] >> 63;
 659     tmp[0] += two56 & a;
 660     tmp[1] -= 1 & a;
 661
 662     /* carry 1 -> 2 -> 3 */
 663     tmp[2] += tmp[1] >> 56;
 664     tmp[1] &= 0x00ffffffffffffff;
 665
 666     tmp[3] += tmp[2] >> 56;
 667     tmp[2] &= 0x00ffffffffffffff;
 668
 669     /* Now 0 <= out < p */
 670     out[0] = tmp[0];
 671     out[1] = tmp[1];
 672     out[2] = tmp[2];
 673     out[3] = tmp[3];
 674 }
 675
 676 /*
 677  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 678  * elements are reduced to in < 2^225, so we only need to check three cases:
 679  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 680  */
 681 static limb felem_is_zero(const felem in)
 682 {
 683     limb zero, two224m96p1, two225m97p2;
 684
 685     zero = in[0] | in[1] | in[2] | in[3];
 686     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 687     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 688         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 689     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 690     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 691         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 692     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 693     return (zero | two224m96p1 | two225m97p2);
 694 }
 695
 696 static limb felem_is_zero_int(const felem in)
 697 {
 698     return (int)(felem_is_zero(in) & ((limb) 1));
 699 }
 700
 701 /* Invert a field element */
 702 /* Computation chain copied from djb's code */
 703 static void felem_inv(felem out, const felem in)
 704 {
 705     felem ftmp, ftmp2, ftmp3, ftmp4;
 706     widefelem tmp;
 707     unsigned i;
 708
 709     felem_square(tmp, in);
 710     felem_reduce(ftmp, tmp);    /* 2 */
 711     felem_mul(tmp, in, ftmp);
 712     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 713     felem_square(tmp, ftmp);
 714     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 715     felem_mul(tmp, in, ftmp);
 716     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 717     felem_square(tmp, ftmp);
 718     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 719     felem_square(tmp, ftmp2);
 720     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 721     felem_square(tmp, ftmp2);
 722     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 723     felem_mul(tmp, ftmp2, ftmp);
 724     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 725     felem_square(tmp, ftmp);
 726     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 727     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 728         felem_square(tmp, ftmp2);
 729         felem_reduce(ftmp2, tmp);
 730     }
 731     felem_mul(tmp, ftmp2, ftmp);
 732     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 733     felem_square(tmp, ftmp2);
 734     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 735     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 736         felem_square(tmp, ftmp3);
 737         felem_reduce(ftmp3, tmp);
 738     }
 739     felem_mul(tmp, ftmp3, ftmp2);
 740     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 741     felem_square(tmp, ftmp2);
 742     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 743     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 744         felem_square(tmp, ftmp3);
 745         felem_reduce(ftmp3, tmp);
 746     }
 747     felem_mul(tmp, ftmp3, ftmp2);
 748     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 749     felem_square(tmp, ftmp3);
 750     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 751     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 752         felem_square(tmp, ftmp4);
 753         felem_reduce(ftmp4, tmp);
 754     }
 755     felem_mul(tmp, ftmp3, ftmp4);
 756     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 757     felem_square(tmp, ftmp3);
 758     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 759     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 760         felem_square(tmp, ftmp4);
 761         felem_reduce(ftmp4, tmp);
 762     }
 763     felem_mul(tmp, ftmp2, ftmp4);
 764     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 765     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 766         felem_square(tmp, ftmp2);
 767         felem_reduce(ftmp2, tmp);
 768     }
 769     felem_mul(tmp, ftmp2, ftmp);
 770     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 771     felem_square(tmp, ftmp);
 772     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 773     felem_mul(tmp, ftmp, in);
 774     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 775     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 776         felem_square(tmp, ftmp);
 777         felem_reduce(ftmp, tmp);
 778     }
 779     felem_mul(tmp, ftmp, ftmp3);
 780     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 781 }
 782
 783 /*
 784  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 785  * out to itself.
 786  */
 787 static void copy_conditional(felem out, const felem in, limb icopy)
 788 {
 789     unsigned i;
 790     /*
 791      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 792      */
 793     const limb copy = -icopy;
 794     for (i = 0; i < 4; ++i) {
 795         const limb tmp = copy & (in[i] ^ out[i]);
 796         out[i] ^= tmp;
 797     }
 798 }
 799
 800 /******************************************************************************/
 801 /*-
 802  *                       ELLIPTIC CURVE POINT OPERATIONS
 803  *
 804  * Points are represented in Jacobian projective coordinates:
 805  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 806  * or to the point at infinity if Z == 0.
 807  *
 808  */
 809
 810 /*-
 811  * Double an elliptic curve point:
 812  * (X', Y', Z') = 2 * (X, Y, Z), where
 813  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 814  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 815  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 816  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 817  * while x_out == y_in is not (maybe this works, but it's not tested).
 818  */
 819 static void
 820 point_double(felem x_out, felem y_out, felem z_out,
 821              const felem x_in, const felem y_in, const felem z_in)
 822 {
 823     widefelem tmp, tmp2;
 824     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 825
 826     felem_assign(ftmp, x_in);
 827     felem_assign(ftmp2, x_in);
 828
 829     /* delta = z^2 */
 830     felem_square(tmp, z_in);
 831     felem_reduce(delta, tmp);
 832
 833     /* gamma = y^2 */
 834     felem_square(tmp, y_in);
 835     felem_reduce(gamma, tmp);
 836
 837     /* beta = x*gamma */
 838     felem_mul(tmp, x_in, gamma);
 839     felem_reduce(beta, tmp);
 840
 841     /* alpha = 3*(x-delta)*(x+delta) */
 842     felem_diff(ftmp, delta);
 843     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 844     felem_sum(ftmp2, delta);
 845     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 846     felem_scalar(ftmp2, 3);
 847     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 848     felem_mul(tmp, ftmp, ftmp2);
 849     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 850     felem_reduce(alpha, tmp);
 851
 852     /* x' = alpha^2 - 8*beta */
 853     felem_square(tmp, alpha);
 854     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 855     felem_assign(ftmp, beta);
 856     felem_scalar(ftmp, 8);
 857     /* ftmp[i] < 8 * 2^57 = 2^60 */
 858     felem_diff_128_64(tmp, ftmp);
 859     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 860     felem_reduce(x_out, tmp);
 861
 862     /* z' = (y + z)^2 - gamma - delta */
 863     felem_sum(delta, gamma);
 864     /* delta[i] < 2^57 + 2^57 = 2^58 */
 865     felem_assign(ftmp, y_in);
 866     felem_sum(ftmp, z_in);
 867     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 868     felem_square(tmp, ftmp);
 869     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 870     felem_diff_128_64(tmp, delta);
 871     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 872     felem_reduce(z_out, tmp);
 873
 874     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 875     felem_scalar(beta, 4);
 876     /* beta[i] < 4 * 2^57 = 2^59 */
 877     felem_diff(beta, x_out);
 878     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 879     felem_mul(tmp, alpha, beta);
 880     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 881     felem_square(tmp2, gamma);
 882     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 883     widefelem_scalar(tmp2, 8);
 884     /* tmp2[i] < 8 * 2^116 = 2^119 */
 885     widefelem_diff(tmp, tmp2);
 886     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 887     felem_reduce(y_out, tmp);
 888 }
 889
 890 /*-
 891  * Add two elliptic curve points:
 892  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 893  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 894  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 895  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 896  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 897  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 898  *
 899  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 900  */
 901
 902 /*
 903  * This function is not entirely constant-time: it includes a branch for
 904  * checking whether the two input points are equal, (while not equal to the
 905  * point at infinity). This case never happens during single point
 906  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 907  */
 908 static void point_add(felem x3, felem y3, felem z3,
 909                       const felem x1, const felem y1, const felem z1,
 910                       const int mixed, const felem x2, const felem y2,
 911                       const felem z2)
 912 {
 913     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 914     widefelem tmp, tmp2;
 915     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 916
 917     if (!mixed) {
 918         /* ftmp2 = z2^2 */
 919         felem_square(tmp, z2);
 920         felem_reduce(ftmp2, tmp);
 921
 922         /* ftmp4 = z2^3 */
 923         felem_mul(tmp, ftmp2, z2);
 924         felem_reduce(ftmp4, tmp);
 925
 926         /* ftmp4 = z2^3*y1 */
 927         felem_mul(tmp2, ftmp4, y1);
 928         felem_reduce(ftmp4, tmp2);
 929
 930         /* ftmp2 = z2^2*x1 */
 931         felem_mul(tmp2, ftmp2, x1);
 932         felem_reduce(ftmp2, tmp2);
 933     } else {
 934         /*
 935          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 936          */
 937
 938         /* ftmp4 = z2^3*y1 */
 939         felem_assign(ftmp4, y1);
 940
 941         /* ftmp2 = z2^2*x1 */
 942         felem_assign(ftmp2, x1);
 943     }
 944
 945     /* ftmp = z1^2 */
 946     felem_square(tmp, z1);
 947     felem_reduce(ftmp, tmp);
 948
 949     /* ftmp3 = z1^3 */
 950     felem_mul(tmp, ftmp, z1);
 951     felem_reduce(ftmp3, tmp);
 952
 953     /* tmp = z1^3*y2 */
 954     felem_mul(tmp, ftmp3, y2);
 955     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 956
 957     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 958     felem_diff_128_64(tmp, ftmp4);
 959     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 960     felem_reduce(ftmp3, tmp);
 961
 962     /* tmp = z1^2*x2 */
 963     felem_mul(tmp, ftmp, x2);
 964     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 965
 966     /* ftmp = z1^2*x2 - z2^2*x1 */
 967     felem_diff_128_64(tmp, ftmp2);
 968     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 969     felem_reduce(ftmp, tmp);
 970
 971     /*
 972      * the formulae are incorrect if the points are equal so we check for
 973      * this and do doubling if this happens
 974      */
 975     x_equal = felem_is_zero(ftmp);
 976     y_equal = felem_is_zero(ftmp3);
 977     z1_is_zero = felem_is_zero(z1);
 978     z2_is_zero = felem_is_zero(z2);
 979     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 980     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 981         point_double(x3, y3, z3, x1, y1, z1);
 982         return;
 983     }
 984
 985     /* ftmp5 = z1*z2 */
 986     if (!mixed) {
 987         felem_mul(tmp, z1, z2);
 988         felem_reduce(ftmp5, tmp);
 989     } else {
 990         /* special case z2 = 0 is handled later */
 991         felem_assign(ftmp5, z1);
 992     }
 993
 994     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 995     felem_mul(tmp, ftmp, ftmp5);
 996     felem_reduce(z_out, tmp);
 997
 998     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 999     felem_assign(ftmp5, ftmp);
1000     felem_square(tmp, ftmp);
1001     felem_reduce(ftmp, tmp);
1002
1003     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
1004     felem_mul(tmp, ftmp, ftmp5);
1005     felem_reduce(ftmp5, tmp);
1006
1007     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1008     felem_mul(tmp, ftmp2, ftmp);
1009     felem_reduce(ftmp2, tmp);
1010
1011     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1012     felem_mul(tmp, ftmp4, ftmp5);
1013     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1014
1015     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1016     felem_square(tmp2, ftmp3);
1017     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1018
1019     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1020     felem_diff_128_64(tmp2, ftmp5);
1021     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1022
1023     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1024     felem_assign(ftmp5, ftmp2);
1025     felem_scalar(ftmp5, 2);
1026     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1027
1028     /*-
1029      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1030      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1031      */
1032     felem_diff_128_64(tmp2, ftmp5);
1033     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1034     felem_reduce(x_out, tmp2);
1035
1036     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1037     felem_diff(ftmp2, x_out);
1038     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1039
1040     /*
1041      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1042      */
1043     felem_mul(tmp2, ftmp3, ftmp2);
1044     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1045
1046     /*-
1047      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1048      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1049      */
1050     widefelem_diff(tmp2, tmp);
1051     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1052     felem_reduce(y_out, tmp2);
1053
1054     /*
1055      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1056      * the point at infinity, so we need to check for this separately
1057      */
1058
1059     /*
1060      * if point 1 is at infinity, copy point 2 to output, and vice versa
1061      */
1062     copy_conditional(x_out, x2, z1_is_zero);
1063     copy_conditional(x_out, x1, z2_is_zero);
1064     copy_conditional(y_out, y2, z1_is_zero);
1065     copy_conditional(y_out, y1, z2_is_zero);
1066     copy_conditional(z_out, z2, z1_is_zero);
1067     copy_conditional(z_out, z1, z2_is_zero);
1068     felem_assign(x3, x_out);
1069     felem_assign(y3, y_out);
1070     felem_assign(z3, z_out);
1071 }
1072
1073 /*
1074  * select_point selects the |idx|th point from a precomputation table and
1075  * copies it to out.
1076  * The pre_comp array argument should be size of |size| argument
1077  */
1078 static void select_point(const u64 idx, unsigned int size,
1079                          const felem pre_comp[][3], felem out[3])
1080 {
1081     unsigned i, j;
1082     limb *outlimbs = &out[0][0];
1083
1084     memset(out, 0, sizeof(*out) * 3);
1085     for (i = 0; i < size; i++) {
1086         const limb *inlimbs = &pre_comp[i][0][0];
1087         u64 mask = i ^ idx;
1088         mask |= mask >> 4;
1089         mask |= mask >> 2;
1090         mask |= mask >> 1;
1091         mask &= 1;
1092         mask--;
1093         for (j = 0; j < 4 * 3; j++)
1094             outlimbs[j] |= inlimbs[j] & mask;
1095     }
1096 }
1097
1098 /* get_bit returns the |i|th bit in |in| */
1099 static char get_bit(const felem_bytearray in, unsigned i)
1100 {
1101     if (i >= 224)
1102         return 0;
1103     return (in[i >> 3] >> (i & 7)) & 1;
1104 }
1105
1106 /*
1107  * Interleaved point multiplication using precomputed point multiples: The
1108  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1109  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1110  * generator, using certain (large) precomputed multiples in g_pre_comp.
1111  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1112  */
1113 static void batch_mul(felem x_out, felem y_out, felem z_out,
1114                       const felem_bytearray scalars[],
1115                       const unsigned num_points, const u8 *g_scalar,
1116                       const int mixed, const felem pre_comp[][17][3],
1117                       const felem g_pre_comp[2][16][3])
1118 {
1119     int i, skip;
1120     unsigned num;
1121     unsigned gen_mul = (g_scalar != NULL);
1122     felem nq[3], tmp[4];
1123     u64 bits;
1124     u8 sign, digit;
1125
1126     /* set nq to the point at infinity */
1127     memset(nq, 0, sizeof(nq));
1128
1129     /*
1130      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1131      * of the generator (two in each of the last 28 rounds) and additions of
1132      * other points multiples (every 5th round).
1133      */
1134     skip = 1;                   /* save two point operations in the first
1135                                  * round */
1136     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1137         /* double */
1138         if (!skip)
1139             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1140
1141         /* add multiples of the generator */
1142         if (gen_mul && (i <= 27)) {
1143             /* first, look 28 bits upwards */
1144             bits = get_bit(g_scalar, i + 196) << 3;
1145             bits |= get_bit(g_scalar, i + 140) << 2;
1146             bits |= get_bit(g_scalar, i + 84) << 1;
1147             bits |= get_bit(g_scalar, i + 28);
1148             /* select the point to add, in constant time */
1149             select_point(bits, 16, g_pre_comp[1], tmp);
1150
1151             if (!skip) {
1152                 /* value 1 below is argument for "mixed" */
1153                 point_add(nq[0], nq[1], nq[2],
1154                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1155             } else {
1156                 memcpy(nq, tmp, 3 * sizeof(felem));
1157                 skip = 0;
1158             }
1159
1160             /* second, look at the current position */
1161             bits = get_bit(g_scalar, i + 168) << 3;
1162             bits |= get_bit(g_scalar, i + 112) << 2;
1163             bits |= get_bit(g_scalar, i + 56) << 1;
1164             bits |= get_bit(g_scalar, i);
1165             /* select the point to add, in constant time */
1166             select_point(bits, 16, g_pre_comp[0], tmp);
1167             point_add(nq[0], nq[1], nq[2],
1168                       nq[0], nq[1], nq[2],
1169                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1170         }
1171
1172         /* do other additions every 5 doublings */
1173         if (num_points && (i % 5 == 0)) {
1174             /* loop over all scalars */
1175             for (num = 0; num < num_points; ++num) {
1176                 bits = get_bit(scalars[num], i + 4) << 5;
1177                 bits |= get_bit(scalars[num], i + 3) << 4;
1178                 bits |= get_bit(scalars[num], i + 2) << 3;
1179                 bits |= get_bit(scalars[num], i + 1) << 2;
1180                 bits |= get_bit(scalars[num], i) << 1;
1181                 bits |= get_bit(scalars[num], i - 1);
1182                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1183
1184                 /* select the point to add or subtract */
1185                 select_point(digit, 17, pre_comp[num], tmp);
1186                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1187                                             * point */
1188                 copy_conditional(tmp[1], tmp[3], sign);
1189
1190                 if (!skip) {
1191                     point_add(nq[0], nq[1], nq[2],
1192                               nq[0], nq[1], nq[2],
1193                               mixed, tmp[0], tmp[1], tmp[2]);
1194                 } else {
1195                     memcpy(nq, tmp, 3 * sizeof(felem));
1196                     skip = 0;
1197                 }
1198             }
1199         }
1200     }
1201     felem_assign(x_out, nq[0]);
1202     felem_assign(y_out, nq[1]);
1203     felem_assign(z_out, nq[2]);
1204 }
1205
1206 /******************************************************************************/
1207 /*
1208  * FUNCTIONS TO MANAGE PRECOMPUTATION
1209  */
1210
1211 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1212 {
1213     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1214
1215     if (!ret) {
1216         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1217         return ret;
1218     }
1219     ret->references = 1;
1220     return ret;
1221 }
1222
1223 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1224 {
1225     if (p != NULL)
1226         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1227     return p;
1228 }
1229
1230 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1231 {
1232     if (p == NULL
1233         || CRYPTO_add(&p->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1234         return;
1235     OPENSSL_free(p);
1236 }
1237
1238 /******************************************************************************/
1239 /*
1240  * OPENSSL EC_METHOD FUNCTIONS
1241  */
1242
1243 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1244 {
1245     int ret;
1246     ret = ec_GFp_simple_group_init(group);
1247     group->a_is_minus3 = 1;
1248     return ret;
1249 }
1250
1251 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1252                                     const BIGNUM *a, const BIGNUM *b,
1253                                     BN_CTX *ctx)
1254 {
1255     int ret = 0;
1256     BN_CTX *new_ctx = NULL;
1257     BIGNUM *curve_p, *curve_a, *curve_b;
1258
1259     if (ctx == NULL)
1260         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1261             return 0;
1262     BN_CTX_start(ctx);
1263     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1264         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1265         ((curve_b = BN_CTX_get(ctx)) == NULL))
1266         goto err;
1267     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1268     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1269     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1270     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1271         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1272               EC_R_WRONG_CURVE_PARAMETERS);
1273         goto err;
1274     }
1275     group->field_mod_func = BN_nist_mod_224;
1276     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1277  err:
1278     BN_CTX_end(ctx);
1279     BN_CTX_free(new_ctx);
1280     return ret;
1281 }
1282
1283 /*
1284  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1285  * (X/Z^2, Y/Z^3)
1286  */
1287 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1288                                                  const EC_POINT *point,
1289                                                  BIGNUM *x, BIGNUM *y,
1290                                                  BN_CTX *ctx)
1291 {
1292     felem z1, z2, x_in, y_in, x_out, y_out;
1293     widefelem tmp;
1294
1295     if (EC_POINT_is_at_infinity(group, point)) {
1296         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1297               EC_R_POINT_AT_INFINITY);
1298         return 0;
1299     }
1300     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1301         (!BN_to_felem(z1, point->Z)))
1302         return 0;
1303     felem_inv(z2, z1);
1304     felem_square(tmp, z2);
1305     felem_reduce(z1, tmp);
1306     felem_mul(tmp, x_in, z1);
1307     felem_reduce(x_in, tmp);
1308     felem_contract(x_out, x_in);
1309     if (x != NULL) {
1310         if (!felem_to_BN(x, x_out)) {
1311             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1312                   ERR_R_BN_LIB);
1313             return 0;
1314         }
1315     }
1316     felem_mul(tmp, z1, z2);
1317     felem_reduce(z1, tmp);
1318     felem_mul(tmp, y_in, z1);
1319     felem_reduce(y_in, tmp);
1320     felem_contract(y_out, y_in);
1321     if (y != NULL) {
1322         if (!felem_to_BN(y, y_out)) {
1323             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1324                   ERR_R_BN_LIB);
1325             return 0;
1326         }
1327     }
1328     return 1;
1329 }
1330
1331 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1332                                felem tmp_felems[ /* num+1 */ ])
1333 {
1334     /*
1335      * Runs in constant time, unless an input is the point at infinity (which
1336      * normally shouldn't happen).
1337      */
1338     ec_GFp_nistp_points_make_affine_internal(num,
1339                                              points,
1340                                              sizeof(felem),
1341                                              tmp_felems,
1342                                              (void (*)(void *))felem_one,
1343                                              (int (*)(const void *))
1344                                              felem_is_zero_int,
1345                                              (void (*)(void *, const void *))
1346                                              felem_assign,
1347                                              (void (*)(void *, const void *))
1348                                              felem_square_reduce, (void (*)
1349                                                                    (void *,
1350                                                                     const void
1351                                                                     *,
1352                                                                     const void
1353                                                                     *))
1354                                              felem_mul_reduce,
1355                                              (void (*)(void *, const void *))
1356                                              felem_inv,
1357                                              (void (*)(void *, const void *))
1358                                              felem_contract);
1359 }
1360
1361 /*
1362  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1363  * values Result is stored in r (r can equal one of the inputs).
1364  */
1365 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1366                                const BIGNUM *scalar, size_t num,
1367                                const EC_POINT *points[],
1368                                const BIGNUM *scalars[], BN_CTX *ctx)
1369 {
1370     int ret = 0;
1371     int j;
1372     unsigned i;
1373     int mixed = 0;
1374     BN_CTX *new_ctx = NULL;
1375     BIGNUM *x, *y, *z, *tmp_scalar;
1376     felem_bytearray g_secret;
1377     felem_bytearray *secrets = NULL;
1378     felem (*pre_comp)[17][3] = NULL;
1379     felem *tmp_felems = NULL;
1380     felem_bytearray tmp;
1381     unsigned num_bytes;
1382     int have_pre_comp = 0;
1383     size_t num_points = num;
1384     felem x_in, y_in, z_in, x_out, y_out, z_out;
1385     NISTP224_PRE_COMP *pre = NULL;
1386     const felem(*g_pre_comp)[16][3] = NULL;
1387     EC_POINT *generator = NULL;
1388     const EC_POINT *p = NULL;
1389     const BIGNUM *p_scalar = NULL;
1390
1391     if (ctx == NULL)
1392         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1393             return 0;
1394     BN_CTX_start(ctx);
1395     if (((x = BN_CTX_get(ctx)) == NULL) ||
1396         ((y = BN_CTX_get(ctx)) == NULL) ||
1397         ((z = BN_CTX_get(ctx)) == NULL) ||
1398         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1399         goto err;
1400
1401     if (scalar != NULL) {
1402         pre = group->pre_comp.nistp224;
1403         if (pre)
1404             /* we have precomputation, try to use it */
1405             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1406         else
1407             /* try to use the standard precomputation */
1408             g_pre_comp = &gmul[0];
1409         generator = EC_POINT_new(group);
1410         if (generator == NULL)
1411             goto err;
1412         /* get the generator from precomputation */
1413         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1414             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1415             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1416             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1417             goto err;
1418         }
1419         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1420                                                       generator, x, y, z,
1421                                                       ctx))
1422             goto err;
1423         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1424             /* precomputation matches generator */
1425             have_pre_comp = 1;
1426         else
1427             /*
1428              * we don't have valid precomputation: treat the generator as a
1429              * random point
1430              */
1431             num_points = num_points + 1;
1432     }
1433
1434     if (num_points > 0) {
1435         if (num_points >= 3) {
1436             /*
1437              * unless we precompute multiples for just one or two points,
1438              * converting those into affine form is time well spent
1439              */
1440             mixed = 1;
1441         }
1442         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1443         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1444         if (mixed)
1445             tmp_felems =
1446                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1447         if ((secrets == NULL) || (pre_comp == NULL)
1448             || (mixed && (tmp_felems == NULL))) {
1449             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1450             goto err;
1451         }
1452
1453         /*
1454          * we treat NULL scalars as 0, and NULL points as points at infinity,
1455          * i.e., they contribute nothing to the linear combination
1456          */
1457         for (i = 0; i < num_points; ++i) {
1458             if (i == num)
1459                 /* the generator */
1460             {
1461                 p = EC_GROUP_get0_generator(group);
1462                 p_scalar = scalar;
1463             } else
1464                 /* the i^th point */
1465             {
1466                 p = points[i];
1467                 p_scalar = scalars[i];
1468             }
1469             if ((p_scalar != NULL) && (p != NULL)) {
1470                 /* reduce scalar to 0 <= scalar < 2^224 */
1471                 if ((BN_num_bits(p_scalar) > 224)
1472                     || (BN_is_negative(p_scalar))) {
1473                     /*
1474                      * this is an unusual input, and we don't guarantee
1475                      * constant-timeness
1476                      */
1477                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1478                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1479                         goto err;
1480                     }
1481                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1482                 } else
1483                     num_bytes = BN_bn2bin(p_scalar, tmp);
1484                 flip_endian(secrets[i], tmp, num_bytes);
1485                 /* precompute multiples */
1486                 if ((!BN_to_felem(x_out, p->X)) ||
1487                     (!BN_to_felem(y_out, p->Y)) ||
1488                     (!BN_to_felem(z_out, p->Z)))
1489                     goto err;
1490                 felem_assign(pre_comp[i][1][0], x_out);
1491                 felem_assign(pre_comp[i][1][1], y_out);
1492                 felem_assign(pre_comp[i][1][2], z_out);
1493                 for (j = 2; j <= 16; ++j) {
1494                     if (j & 1) {
1495                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1496                                   pre_comp[i][j][2], pre_comp[i][1][0],
1497                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1498                                   pre_comp[i][j - 1][0],
1499                                   pre_comp[i][j - 1][1],
1500                                   pre_comp[i][j - 1][2]);
1501                     } else {
1502                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1503                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1504                                      pre_comp[i][j / 2][1],
1505                                      pre_comp[i][j / 2][2]);
1506                     }
1507                 }
1508             }
1509         }
1510         if (mixed)
1511             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1512     }
1513
1514     /* the scalar for the generator */
1515     if ((scalar != NULL) && (have_pre_comp)) {
1516         memset(g_secret, 0, sizeof(g_secret));
1517         /* reduce scalar to 0 <= scalar < 2^224 */
1518         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1519             /*
1520              * this is an unusual input, and we don't guarantee
1521              * constant-timeness
1522              */
1523             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1524                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1525                 goto err;
1526             }
1527             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1528         } else
1529             num_bytes = BN_bn2bin(scalar, tmp);
1530         flip_endian(g_secret, tmp, num_bytes);
1531         /* do the multiplication with generator precomputation */
1532         batch_mul(x_out, y_out, z_out,
1533                   (const felem_bytearray(*))secrets, num_points,
1534                   g_secret,
1535                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1536     } else
1537         /* do the multiplication without generator precomputation */
1538         batch_mul(x_out, y_out, z_out,
1539                   (const felem_bytearray(*))secrets, num_points,
1540                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1541     /* reduce the output to its unique minimal representation */
1542     felem_contract(x_in, x_out);
1543     felem_contract(y_in, y_out);
1544     felem_contract(z_in, z_out);
1545     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1546         (!felem_to_BN(z, z_in))) {
1547         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1548         goto err;
1549     }
1550     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1551
1552  err:
1553     BN_CTX_end(ctx);
1554     EC_POINT_free(generator);
1555     BN_CTX_free(new_ctx);
1556     OPENSSL_free(secrets);
1557     OPENSSL_free(pre_comp);
1558     OPENSSL_free(tmp_felems);
1559     return ret;
1560 }
1561
1562 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1563 {
1564     int ret = 0;
1565     NISTP224_PRE_COMP *pre = NULL;
1566     int i, j;
1567     BN_CTX *new_ctx = NULL;
1568     BIGNUM *x, *y;
1569     EC_POINT *generator = NULL;
1570     felem tmp_felems[32];
1571
1572     /* throw away old precomputation */
1573     EC_pre_comp_free(group);
1574     if (ctx == NULL)
1575         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1576             return 0;
1577     BN_CTX_start(ctx);
1578     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1579         goto err;
1580     /* get the generator */
1581     if (group->generator == NULL)
1582         goto err;
1583     generator = EC_POINT_new(group);
1584     if (generator == NULL)
1585         goto err;
1586     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1587     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1588     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1589         goto err;
1590     if ((pre = nistp224_pre_comp_new()) == NULL)
1591         goto err;
1592     /*
1593      * if the generator is the standard one, use built-in precomputation
1594      */
1595     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1596         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1597         goto done;
1598     }
1599     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1600         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1601         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1602         goto err;
1603     /*
1604      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1605      * 2^140*G, 2^196*G for the second one
1606      */
1607     for (i = 1; i <= 8; i <<= 1) {
1608         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1609                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1610                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1611         for (j = 0; j < 27; ++j) {
1612             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1613                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1614                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1615         }
1616         if (i == 8)
1617             break;
1618         point_double(pre->g_pre_comp[0][2 * i][0],
1619                      pre->g_pre_comp[0][2 * i][1],
1620                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1621                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1622         for (j = 0; j < 27; ++j) {
1623             point_double(pre->g_pre_comp[0][2 * i][0],
1624                          pre->g_pre_comp[0][2 * i][1],
1625                          pre->g_pre_comp[0][2 * i][2],
1626                          pre->g_pre_comp[0][2 * i][0],
1627                          pre->g_pre_comp[0][2 * i][1],
1628                          pre->g_pre_comp[0][2 * i][2]);
1629         }
1630     }
1631     for (i = 0; i < 2; i++) {
1632         /* g_pre_comp[i][0] is the point at infinity */
1633         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1634         /* the remaining multiples */
1635         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1636         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1637                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1638                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1639                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1640                   pre->g_pre_comp[i][2][2]);
1641         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1642         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1643                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1644                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1645                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1646                   pre->g_pre_comp[i][2][2]);
1647         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1648         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1649                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1650                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1651                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1652                   pre->g_pre_comp[i][4][2]);
1653         /*
1654          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1655          */
1656         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1657                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1658                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1659                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1660                   pre->g_pre_comp[i][2][2]);
1661         for (j = 1; j < 8; ++j) {
1662             /* odd multiples: add G resp. 2^28*G */
1663             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1664                       pre->g_pre_comp[i][2 * j + 1][1],
1665                       pre->g_pre_comp[i][2 * j + 1][2],
1666                       pre->g_pre_comp[i][2 * j][0],
1667                       pre->g_pre_comp[i][2 * j][1],
1668                       pre->g_pre_comp[i][2 * j][2], 0,
1669                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1670                       pre->g_pre_comp[i][1][2]);
1671         }
1672     }
1673     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1674
1675  done:
1676     SETPRECOMP(group, nistp224, pre);
1677     pre = NULL;
1678     ret = 1;
1679  err:
1680     BN_CTX_end(ctx);
1681     EC_POINT_free(generator);
1682     BN_CTX_free(new_ctx);
1683     EC_nistp224_pre_comp_free(pre);
1684     return ret;
1685 }
1686
1687 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1688 {
1689     return HAVEPRECOMP(group, nistp224);
1690 }
1691
1692 #endif