crypto/ec/ecp_nistp224.c

   1 /*
   2  * Written by Emilia Kasper (Google) for the OpenSSL project.
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  22  *
  23  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  24  * and Adam Langley's public domain 64-bit C implementation of curve25519
  25  */
  26
  27 #include <openssl/opensslconf.h>
  28 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  29 NON_EMPTY_TRANSLATION_UNIT
  30 #else
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 # else
  42 #  error "Need GCC 3.1 or later to define type uint128_t"
  43 # endif
  44
  45 typedef uint8_t u8;
  46 typedef uint64_t u64;
  47 typedef int64_t s64;
  48
  49 /******************************************************************************/
  50 /*-
  51  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  52  *
  53  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  54  * using 64-bit coefficients called 'limbs',
  55  * and sometimes (for multiplication results) as
  56  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  57  * using 128-bit coefficients called 'widelimbs'.
  58  * A 4-limb representation is an 'felem';
  59  * a 7-widelimb representation is a 'widefelem'.
  60  * Even within felems, bits of adjacent limbs overlap, and we don't always
  61  * reduce the representations: we ensure that inputs to each felem
  62  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  63  * and fit into a 128-bit word without overflow. The coefficients are then
  64  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  65  * We only reduce to the unique minimal representation at the end of the
  66  * computation.
  67  */
  68
  69 typedef uint64_t limb;
  70 typedef uint128_t widelimb;
  71
  72 typedef limb felem[4];
  73 typedef widelimb widefelem[7];
  74
  75 /*
  76  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  77  * group order size for the elliptic curve, and we also use this type for
  78  * scalars for point multiplication.
  79  */
  80 typedef u8 felem_bytearray[28];
  81
  82 static const felem_bytearray nistp224_curve_params[5] = {
  83     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  84      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  85      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  86     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  88      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  89     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  90      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  91      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  92     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  93      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  94      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  95     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  96      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  97      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  98 };
  99
 100 /*-
 101  * Precomputed multiples of the standard generator
 102  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 103  * (0 for the point at infinity).
 104  * For each field element, slice a_0 is word 0, etc.
 105  *
 106  * The table has 2 * 16 elements, starting with the following:
 107  * index | bits    | point
 108  * ------+---------+------------------------------
 109  *     0 | 0 0 0 0 | 0G
 110  *     1 | 0 0 0 1 | 1G
 111  *     2 | 0 0 1 0 | 2^56G
 112  *     3 | 0 0 1 1 | (2^56 + 1)G
 113  *     4 | 0 1 0 0 | 2^112G
 114  *     5 | 0 1 0 1 | (2^112 + 1)G
 115  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 116  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 117  *     8 | 1 0 0 0 | 2^168G
 118  *     9 | 1 0 0 1 | (2^168 + 1)G
 119  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 120  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 121  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 122  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 123  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 124  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 125  * followed by a copy of this with each element multiplied by 2^28.
 126  *
 127  * The reason for this is so that we can clock bits into four different
 128  * locations when doing simple scalar multiplies against the base point,
 129  * and then another four locations using the second 16 elements.
 130  */
 131 static const felem gmul[2][16][3] = {
 132 {{{0, 0, 0, 0},
 133   {0, 0, 0, 0},
 134   {0, 0, 0, 0}},
 135  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 136   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 137   {1, 0, 0, 0}},
 138  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 139   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 140   {1, 0, 0, 0}},
 141  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 142   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 143   {1, 0, 0, 0}},
 144  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 145   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 146   {1, 0, 0, 0}},
 147  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 148   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 149   {1, 0, 0, 0}},
 150  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 151   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 152   {1, 0, 0, 0}},
 153  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 154   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 155   {1, 0, 0, 0}},
 156  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 157   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 158   {1, 0, 0, 0}},
 159  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 160   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 161   {1, 0, 0, 0}},
 162  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 163   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 164   {1, 0, 0, 0}},
 165  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 166   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 167   {1, 0, 0, 0}},
 168  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 169   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 170   {1, 0, 0, 0}},
 171  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 172   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 173   {1, 0, 0, 0}},
 174  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 175   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 176   {1, 0, 0, 0}},
 177  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 178   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 179   {1, 0, 0, 0}}},
 180 {{{0, 0, 0, 0},
 181   {0, 0, 0, 0},
 182   {0, 0, 0, 0}},
 183  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 184   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 185   {1, 0, 0, 0}},
 186  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 187   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 188   {1, 0, 0, 0}},
 189  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 190   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 191   {1, 0, 0, 0}},
 192  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 193   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 194   {1, 0, 0, 0}},
 195  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 196   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 197   {1, 0, 0, 0}},
 198  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 199   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 200   {1, 0, 0, 0}},
 201  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 202   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 203   {1, 0, 0, 0}},
 204  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 205   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 206   {1, 0, 0, 0}},
 207  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 208   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 209   {1, 0, 0, 0}},
 210  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 211   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 212   {1, 0, 0, 0}},
 213  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 214   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 215   {1, 0, 0, 0}},
 216  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 217   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 218   {1, 0, 0, 0}},
 219  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 220   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 221   {1, 0, 0, 0}},
 222  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 223   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 224   {1, 0, 0, 0}},
 225  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 226   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 227   {1, 0, 0, 0}}}
 228 };
 229
 230 /* Precomputation for the group generator. */
 231 struct nistp224_pre_comp_st {
 232     felem g_pre_comp[2][16][3];
 233     int references;
 234 };
 235
 236 const EC_METHOD *EC_GFp_nistp224_method(void)
 237 {
 238     static const EC_METHOD ret = {
 239         EC_FLAGS_DEFAULT_OCT,
 240         NID_X9_62_prime_field,
 241         ec_GFp_nistp224_group_init,
 242         ec_GFp_simple_group_finish,
 243         ec_GFp_simple_group_clear_finish,
 244         ec_GFp_nist_group_copy,
 245         ec_GFp_nistp224_group_set_curve,
 246         ec_GFp_simple_group_get_curve,
 247         ec_GFp_simple_group_get_degree,
 248         ec_GFp_simple_group_check_discriminant,
 249         ec_GFp_simple_point_init,
 250         ec_GFp_simple_point_finish,
 251         ec_GFp_simple_point_clear_finish,
 252         ec_GFp_simple_point_copy,
 253         ec_GFp_simple_point_set_to_infinity,
 254         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 255         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 256         ec_GFp_simple_point_set_affine_coordinates,
 257         ec_GFp_nistp224_point_get_affine_coordinates,
 258         0 /* point_set_compressed_coordinates */ ,
 259         0 /* point2oct */ ,
 260         0 /* oct2point */ ,
 261         ec_GFp_simple_add,
 262         ec_GFp_simple_dbl,
 263         ec_GFp_simple_invert,
 264         ec_GFp_simple_is_at_infinity,
 265         ec_GFp_simple_is_on_curve,
 266         ec_GFp_simple_cmp,
 267         ec_GFp_simple_make_affine,
 268         ec_GFp_simple_points_make_affine,
 269         ec_GFp_nistp224_points_mul,
 270         ec_GFp_nistp224_precompute_mult,
 271         ec_GFp_nistp224_have_precompute_mult,
 272         ec_GFp_nist_field_mul,
 273         ec_GFp_nist_field_sqr,
 274         0 /* field_div */ ,
 275         0 /* field_encode */ ,
 276         0 /* field_decode */ ,
 277         0                       /* field_set_to_one */
 278     };
 279
 280     return &ret;
 281 }
 282
 283 /*
 284  * Helper functions to convert field elements to/from internal representation
 285  */
 286 static void bin28_to_felem(felem out, const u8 in[28])
 287 {
 288     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 289     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 290     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 291     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 292 }
 293
 294 static void felem_to_bin28(u8 out[28], const felem in)
 295 {
 296     unsigned i;
 297     for (i = 0; i < 7; ++i) {
 298         out[i] = in[0] >> (8 * i);
 299         out[i + 7] = in[1] >> (8 * i);
 300         out[i + 14] = in[2] >> (8 * i);
 301         out[i + 21] = in[3] >> (8 * i);
 302     }
 303 }
 304
 305 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 306 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 307 {
 308     unsigned i;
 309     for (i = 0; i < len; ++i)
 310         out[i] = in[len - 1 - i];
 311 }
 312
 313 /* From OpenSSL BIGNUM to internal representation */
 314 static int BN_to_felem(felem out, const BIGNUM *bn)
 315 {
 316     felem_bytearray b_in;
 317     felem_bytearray b_out;
 318     unsigned num_bytes;
 319
 320     /* BN_bn2bin eats leading zeroes */
 321     memset(b_out, 0, sizeof(b_out));
 322     num_bytes = BN_num_bytes(bn);
 323     if (num_bytes > sizeof b_out) {
 324         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 325         return 0;
 326     }
 327     if (BN_is_negative(bn)) {
 328         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 329         return 0;
 330     }
 331     num_bytes = BN_bn2bin(bn, b_in);
 332     flip_endian(b_out, b_in, num_bytes);
 333     bin28_to_felem(out, b_out);
 334     return 1;
 335 }
 336
 337 /* From internal representation to OpenSSL BIGNUM */
 338 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 339 {
 340     felem_bytearray b_in, b_out;
 341     felem_to_bin28(b_in, in);
 342     flip_endian(b_out, b_in, sizeof b_out);
 343     return BN_bin2bn(b_out, sizeof b_out, out);
 344 }
 345
 346 /******************************************************************************/
 347 /*-
 348  *                              FIELD OPERATIONS
 349  *
 350  * Field operations, using the internal representation of field elements.
 351  * NB! These operations are specific to our point multiplication and cannot be
 352  * expected to be correct in general - e.g., multiplication with a large scalar
 353  * will cause an overflow.
 354  *
 355  */
 356
 357 static void felem_one(felem out)
 358 {
 359     out[0] = 1;
 360     out[1] = 0;
 361     out[2] = 0;
 362     out[3] = 0;
 363 }
 364
 365 static void felem_assign(felem out, const felem in)
 366 {
 367     out[0] = in[0];
 368     out[1] = in[1];
 369     out[2] = in[2];
 370     out[3] = in[3];
 371 }
 372
 373 /* Sum two field elements: out += in */
 374 static void felem_sum(felem out, const felem in)
 375 {
 376     out[0] += in[0];
 377     out[1] += in[1];
 378     out[2] += in[2];
 379     out[3] += in[3];
 380 }
 381
 382 /* Get negative value: out = -in */
 383 /* Assumes in[i] < 2^57 */
 384 static void felem_neg(felem out, const felem in)
 385 {
 386     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 387     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 388     static const limb two58m42m2 = (((limb) 1) << 58) -
 389         (((limb) 1) << 42) - (((limb) 1) << 2);
 390
 391     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 392     out[0] = two58p2 - in[0];
 393     out[1] = two58m42m2 - in[1];
 394     out[2] = two58m2 - in[2];
 395     out[3] = two58m2 - in[3];
 396 }
 397
 398 /* Subtract field elements: out -= in */
 399 /* Assumes in[i] < 2^57 */
 400 static void felem_diff(felem out, const felem in)
 401 {
 402     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 403     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 404     static const limb two58m42m2 = (((limb) 1) << 58) -
 405         (((limb) 1) << 42) - (((limb) 1) << 2);
 406
 407     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 408     out[0] += two58p2;
 409     out[1] += two58m42m2;
 410     out[2] += two58m2;
 411     out[3] += two58m2;
 412
 413     out[0] -= in[0];
 414     out[1] -= in[1];
 415     out[2] -= in[2];
 416     out[3] -= in[3];
 417 }
 418
 419 /* Subtract in unreduced 128-bit mode: out -= in */
 420 /* Assumes in[i] < 2^119 */
 421 static void widefelem_diff(widefelem out, const widefelem in)
 422 {
 423     static const widelimb two120 = ((widelimb) 1) << 120;
 424     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 425         (((widelimb) 1) << 64);
 426     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 427         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 428
 429     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 430     out[0] += two120;
 431     out[1] += two120m64;
 432     out[2] += two120m64;
 433     out[3] += two120;
 434     out[4] += two120m104m64;
 435     out[5] += two120m64;
 436     out[6] += two120m64;
 437
 438     out[0] -= in[0];
 439     out[1] -= in[1];
 440     out[2] -= in[2];
 441     out[3] -= in[3];
 442     out[4] -= in[4];
 443     out[5] -= in[5];
 444     out[6] -= in[6];
 445 }
 446
 447 /* Subtract in mixed mode: out128 -= in64 */
 448 /* in[i] < 2^63 */
 449 static void felem_diff_128_64(widefelem out, const felem in)
 450 {
 451     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 452         (((widelimb) 1) << 8);
 453     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 454         (((widelimb) 1) << 8);
 455     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 456         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 457
 458     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 459     out[0] += two64p8;
 460     out[1] += two64m48m8;
 461     out[2] += two64m8;
 462     out[3] += two64m8;
 463
 464     out[0] -= in[0];
 465     out[1] -= in[1];
 466     out[2] -= in[2];
 467     out[3] -= in[3];
 468 }
 469
 470 /*
 471  * Multiply a field element by a scalar: out = out * scalar The scalars we
 472  * actually use are small, so results fit without overflow
 473  */
 474 static void felem_scalar(felem out, const limb scalar)
 475 {
 476     out[0] *= scalar;
 477     out[1] *= scalar;
 478     out[2] *= scalar;
 479     out[3] *= scalar;
 480 }
 481
 482 /*
 483  * Multiply an unreduced field element by a scalar: out = out * scalar The
 484  * scalars we actually use are small, so results fit without overflow
 485  */
 486 static void widefelem_scalar(widefelem out, const widelimb scalar)
 487 {
 488     out[0] *= scalar;
 489     out[1] *= scalar;
 490     out[2] *= scalar;
 491     out[3] *= scalar;
 492     out[4] *= scalar;
 493     out[5] *= scalar;
 494     out[6] *= scalar;
 495 }
 496
 497 /* Square a field element: out = in^2 */
 498 static void felem_square(widefelem out, const felem in)
 499 {
 500     limb tmp0, tmp1, tmp2;
 501     tmp0 = 2 * in[0];
 502     tmp1 = 2 * in[1];
 503     tmp2 = 2 * in[2];
 504     out[0] = ((widelimb) in[0]) * in[0];
 505     out[1] = ((widelimb) in[0]) * tmp1;
 506     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 507     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 508     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 509     out[5] = ((widelimb) in[3]) * tmp2;
 510     out[6] = ((widelimb) in[3]) * in[3];
 511 }
 512
 513 /* Multiply two field elements: out = in1 * in2 */
 514 static void felem_mul(widefelem out, const felem in1, const felem in2)
 515 {
 516     out[0] = ((widelimb) in1[0]) * in2[0];
 517     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 518     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 519              ((widelimb) in1[2]) * in2[0];
 520     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 521              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 522     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 523              ((widelimb) in1[3]) * in2[1];
 524     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 525     out[6] = ((widelimb) in1[3]) * in2[3];
 526 }
 527
 528 /*-
 529  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 530  * Requires in[i] < 2^126,
 531  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 532 static void felem_reduce(felem out, const widefelem in)
 533 {
 534     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 535         (((widelimb) 1) << 15);
 536     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 537         (((widelimb) 1) << 71);
 538     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 539         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 540     widelimb output[5];
 541
 542     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 543     output[0] = in[0] + two127p15;
 544     output[1] = in[1] + two127m71m55;
 545     output[2] = in[2] + two127m71;
 546     output[3] = in[3];
 547     output[4] = in[4];
 548
 549     /* Eliminate in[4], in[5], in[6] */
 550     output[4] += in[6] >> 16;
 551     output[3] += (in[6] & 0xffff) << 40;
 552     output[2] -= in[6];
 553
 554     output[3] += in[5] >> 16;
 555     output[2] += (in[5] & 0xffff) << 40;
 556     output[1] -= in[5];
 557
 558     output[2] += output[4] >> 16;
 559     output[1] += (output[4] & 0xffff) << 40;
 560     output[0] -= output[4];
 561
 562     /* Carry 2 -> 3 -> 4 */
 563     output[3] += output[2] >> 56;
 564     output[2] &= 0x00ffffffffffffff;
 565
 566     output[4] = output[3] >> 56;
 567     output[3] &= 0x00ffffffffffffff;
 568
 569     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 570
 571     /* Eliminate output[4] */
 572     output[2] += output[4] >> 16;
 573     /* output[2] < 2^56 + 2^56 = 2^57 */
 574     output[1] += (output[4] & 0xffff) << 40;
 575     output[0] -= output[4];
 576
 577     /* Carry 0 -> 1 -> 2 -> 3 */
 578     output[1] += output[0] >> 56;
 579     out[0] = output[0] & 0x00ffffffffffffff;
 580
 581     output[2] += output[1] >> 56;
 582     /* output[2] < 2^57 + 2^72 */
 583     out[1] = output[1] & 0x00ffffffffffffff;
 584     output[3] += output[2] >> 56;
 585     /* output[3] <= 2^56 + 2^16 */
 586     out[2] = output[2] & 0x00ffffffffffffff;
 587
 588     /*-
 589      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 590      * out[3] <= 2^56 + 2^16 (due to final carry),
 591      * so out < 2*p
 592      */
 593     out[3] = output[3];
 594 }
 595
 596 static void felem_square_reduce(felem out, const felem in)
 597 {
 598     widefelem tmp;
 599     felem_square(tmp, in);
 600     felem_reduce(out, tmp);
 601 }
 602
 603 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 604 {
 605     widefelem tmp;
 606     felem_mul(tmp, in1, in2);
 607     felem_reduce(out, tmp);
 608 }
 609
 610 /*
 611  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 612  * call felem_reduce first)
 613  */
 614 static void felem_contract(felem out, const felem in)
 615 {
 616     static const int64_t two56 = ((limb) 1) << 56;
 617     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 618     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 619     int64_t tmp[4], a;
 620     tmp[0] = in[0];
 621     tmp[1] = in[1];
 622     tmp[2] = in[2];
 623     tmp[3] = in[3];
 624     /* Case 1: a = 1 iff in >= 2^224 */
 625     a = (in[3] >> 56);
 626     tmp[0] -= a;
 627     tmp[1] += a << 40;
 628     tmp[3] &= 0x00ffffffffffffff;
 629     /*
 630      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 631      * and the lower part is non-zero
 632      */
 633     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 634         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 635     a &= 0x00ffffffffffffff;
 636     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 637     a = (a - 1) >> 63;
 638     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 639     tmp[3] &= a ^ 0xffffffffffffffff;
 640     tmp[2] &= a ^ 0xffffffffffffffff;
 641     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 642     tmp[0] -= 1 & a;
 643
 644     /*
 645      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 646      * non-zero, so we only need one step
 647      */
 648     a = tmp[0] >> 63;
 649     tmp[0] += two56 & a;
 650     tmp[1] -= 1 & a;
 651
 652     /* carry 1 -> 2 -> 3 */
 653     tmp[2] += tmp[1] >> 56;
 654     tmp[1] &= 0x00ffffffffffffff;
 655
 656     tmp[3] += tmp[2] >> 56;
 657     tmp[2] &= 0x00ffffffffffffff;
 658
 659     /* Now 0 <= out < p */
 660     out[0] = tmp[0];
 661     out[1] = tmp[1];
 662     out[2] = tmp[2];
 663     out[3] = tmp[3];
 664 }
 665
 666 /*
 667  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 668  * elements are reduced to in < 2^225, so we only need to check three cases:
 669  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 670  */
 671 static limb felem_is_zero(const felem in)
 672 {
 673     limb zero, two224m96p1, two225m97p2;
 674
 675     zero = in[0] | in[1] | in[2] | in[3];
 676     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 677     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 678         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 679     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 680     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 681         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 682     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 683     return (zero | two224m96p1 | two225m97p2);
 684 }
 685
 686 static limb felem_is_zero_int(const felem in)
 687 {
 688     return (int)(felem_is_zero(in) & ((limb) 1));
 689 }
 690
 691 /* Invert a field element */
 692 /* Computation chain copied from djb's code */
 693 static void felem_inv(felem out, const felem in)
 694 {
 695     felem ftmp, ftmp2, ftmp3, ftmp4;
 696     widefelem tmp;
 697     unsigned i;
 698
 699     felem_square(tmp, in);
 700     felem_reduce(ftmp, tmp);    /* 2 */
 701     felem_mul(tmp, in, ftmp);
 702     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 703     felem_square(tmp, ftmp);
 704     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 705     felem_mul(tmp, in, ftmp);
 706     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 707     felem_square(tmp, ftmp);
 708     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 709     felem_square(tmp, ftmp2);
 710     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 711     felem_square(tmp, ftmp2);
 712     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 713     felem_mul(tmp, ftmp2, ftmp);
 714     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 715     felem_square(tmp, ftmp);
 716     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 717     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 718         felem_square(tmp, ftmp2);
 719         felem_reduce(ftmp2, tmp);
 720     }
 721     felem_mul(tmp, ftmp2, ftmp);
 722     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 723     felem_square(tmp, ftmp2);
 724     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 725     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 726         felem_square(tmp, ftmp3);
 727         felem_reduce(ftmp3, tmp);
 728     }
 729     felem_mul(tmp, ftmp3, ftmp2);
 730     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 731     felem_square(tmp, ftmp2);
 732     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 733     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 734         felem_square(tmp, ftmp3);
 735         felem_reduce(ftmp3, tmp);
 736     }
 737     felem_mul(tmp, ftmp3, ftmp2);
 738     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 739     felem_square(tmp, ftmp3);
 740     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 741     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 742         felem_square(tmp, ftmp4);
 743         felem_reduce(ftmp4, tmp);
 744     }
 745     felem_mul(tmp, ftmp3, ftmp4);
 746     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 747     felem_square(tmp, ftmp3);
 748     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 749     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 750         felem_square(tmp, ftmp4);
 751         felem_reduce(ftmp4, tmp);
 752     }
 753     felem_mul(tmp, ftmp2, ftmp4);
 754     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 755     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 756         felem_square(tmp, ftmp2);
 757         felem_reduce(ftmp2, tmp);
 758     }
 759     felem_mul(tmp, ftmp2, ftmp);
 760     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 761     felem_square(tmp, ftmp);
 762     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 763     felem_mul(tmp, ftmp, in);
 764     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 765     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 766         felem_square(tmp, ftmp);
 767         felem_reduce(ftmp, tmp);
 768     }
 769     felem_mul(tmp, ftmp, ftmp3);
 770     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 771 }
 772
 773 /*
 774  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 775  * out to itself.
 776  */
 777 static void copy_conditional(felem out, const felem in, limb icopy)
 778 {
 779     unsigned i;
 780     /*
 781      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 782      */
 783     const limb copy = -icopy;
 784     for (i = 0; i < 4; ++i) {
 785         const limb tmp = copy & (in[i] ^ out[i]);
 786         out[i] ^= tmp;
 787     }
 788 }
 789
 790 /******************************************************************************/
 791 /*-
 792  *                       ELLIPTIC CURVE POINT OPERATIONS
 793  *
 794  * Points are represented in Jacobian projective coordinates:
 795  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 796  * or to the point at infinity if Z == 0.
 797  *
 798  */
 799
 800 /*-
 801  * Double an elliptic curve point:
 802  * (X', Y', Z') = 2 * (X, Y, Z), where
 803  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 804  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 805  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 806  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 807  * while x_out == y_in is not (maybe this works, but it's not tested).
 808  */
 809 static void
 810 point_double(felem x_out, felem y_out, felem z_out,
 811              const felem x_in, const felem y_in, const felem z_in)
 812 {
 813     widefelem tmp, tmp2;
 814     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 815
 816     felem_assign(ftmp, x_in);
 817     felem_assign(ftmp2, x_in);
 818
 819     /* delta = z^2 */
 820     felem_square(tmp, z_in);
 821     felem_reduce(delta, tmp);
 822
 823     /* gamma = y^2 */
 824     felem_square(tmp, y_in);
 825     felem_reduce(gamma, tmp);
 826
 827     /* beta = x*gamma */
 828     felem_mul(tmp, x_in, gamma);
 829     felem_reduce(beta, tmp);
 830
 831     /* alpha = 3*(x-delta)*(x+delta) */
 832     felem_diff(ftmp, delta);
 833     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 834     felem_sum(ftmp2, delta);
 835     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 836     felem_scalar(ftmp2, 3);
 837     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 838     felem_mul(tmp, ftmp, ftmp2);
 839     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 840     felem_reduce(alpha, tmp);
 841
 842     /* x' = alpha^2 - 8*beta */
 843     felem_square(tmp, alpha);
 844     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 845     felem_assign(ftmp, beta);
 846     felem_scalar(ftmp, 8);
 847     /* ftmp[i] < 8 * 2^57 = 2^60 */
 848     felem_diff_128_64(tmp, ftmp);
 849     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 850     felem_reduce(x_out, tmp);
 851
 852     /* z' = (y + z)^2 - gamma - delta */
 853     felem_sum(delta, gamma);
 854     /* delta[i] < 2^57 + 2^57 = 2^58 */
 855     felem_assign(ftmp, y_in);
 856     felem_sum(ftmp, z_in);
 857     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 858     felem_square(tmp, ftmp);
 859     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 860     felem_diff_128_64(tmp, delta);
 861     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 862     felem_reduce(z_out, tmp);
 863
 864     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 865     felem_scalar(beta, 4);
 866     /* beta[i] < 4 * 2^57 = 2^59 */
 867     felem_diff(beta, x_out);
 868     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 869     felem_mul(tmp, alpha, beta);
 870     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 871     felem_square(tmp2, gamma);
 872     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 873     widefelem_scalar(tmp2, 8);
 874     /* tmp2[i] < 8 * 2^116 = 2^119 */
 875     widefelem_diff(tmp, tmp2);
 876     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 877     felem_reduce(y_out, tmp);
 878 }
 879
 880 /*-
 881  * Add two elliptic curve points:
 882  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 883  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 884  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 885  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 886  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 887  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 888  *
 889  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 890  */
 891
 892 /*
 893  * This function is not entirely constant-time: it includes a branch for
 894  * checking whether the two input points are equal, (while not equal to the
 895  * point at infinity). This case never happens during single point
 896  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 897  */
 898 static void point_add(felem x3, felem y3, felem z3,
 899                       const felem x1, const felem y1, const felem z1,
 900                       const int mixed, const felem x2, const felem y2,
 901                       const felem z2)
 902 {
 903     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 904     widefelem tmp, tmp2;
 905     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 906
 907     if (!mixed) {
 908         /* ftmp2 = z2^2 */
 909         felem_square(tmp, z2);
 910         felem_reduce(ftmp2, tmp);
 911
 912         /* ftmp4 = z2^3 */
 913         felem_mul(tmp, ftmp2, z2);
 914         felem_reduce(ftmp4, tmp);
 915
 916         /* ftmp4 = z2^3*y1 */
 917         felem_mul(tmp2, ftmp4, y1);
 918         felem_reduce(ftmp4, tmp2);
 919
 920         /* ftmp2 = z2^2*x1 */
 921         felem_mul(tmp2, ftmp2, x1);
 922         felem_reduce(ftmp2, tmp2);
 923     } else {
 924         /*
 925          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 926          */
 927
 928         /* ftmp4 = z2^3*y1 */
 929         felem_assign(ftmp4, y1);
 930
 931         /* ftmp2 = z2^2*x1 */
 932         felem_assign(ftmp2, x1);
 933     }
 934
 935     /* ftmp = z1^2 */
 936     felem_square(tmp, z1);
 937     felem_reduce(ftmp, tmp);
 938
 939     /* ftmp3 = z1^3 */
 940     felem_mul(tmp, ftmp, z1);
 941     felem_reduce(ftmp3, tmp);
 942
 943     /* tmp = z1^3*y2 */
 944     felem_mul(tmp, ftmp3, y2);
 945     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 946
 947     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 948     felem_diff_128_64(tmp, ftmp4);
 949     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 950     felem_reduce(ftmp3, tmp);
 951
 952     /* tmp = z1^2*x2 */
 953     felem_mul(tmp, ftmp, x2);
 954     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 955
 956     /* ftmp = z1^2*x2 - z2^2*x1 */
 957     felem_diff_128_64(tmp, ftmp2);
 958     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 959     felem_reduce(ftmp, tmp);
 960
 961     /*
 962      * the formulae are incorrect if the points are equal so we check for
 963      * this and do doubling if this happens
 964      */
 965     x_equal = felem_is_zero(ftmp);
 966     y_equal = felem_is_zero(ftmp3);
 967     z1_is_zero = felem_is_zero(z1);
 968     z2_is_zero = felem_is_zero(z2);
 969     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 970     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 971         point_double(x3, y3, z3, x1, y1, z1);
 972         return;
 973     }
 974
 975     /* ftmp5 = z1*z2 */
 976     if (!mixed) {
 977         felem_mul(tmp, z1, z2);
 978         felem_reduce(ftmp5, tmp);
 979     } else {
 980         /* special case z2 = 0 is handled later */
 981         felem_assign(ftmp5, z1);
 982     }
 983
 984     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 985     felem_mul(tmp, ftmp, ftmp5);
 986     felem_reduce(z_out, tmp);
 987
 988     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 989     felem_assign(ftmp5, ftmp);
 990     felem_square(tmp, ftmp);
 991     felem_reduce(ftmp, tmp);
 992
 993     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
 994     felem_mul(tmp, ftmp, ftmp5);
 995     felem_reduce(ftmp5, tmp);
 996
 997     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
 998     felem_mul(tmp, ftmp2, ftmp);
 999     felem_reduce(ftmp2, tmp);
1000
1001     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1002     felem_mul(tmp, ftmp4, ftmp5);
1003     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1004
1005     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1006     felem_square(tmp2, ftmp3);
1007     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1008
1009     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1010     felem_diff_128_64(tmp2, ftmp5);
1011     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1012
1013     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1014     felem_assign(ftmp5, ftmp2);
1015     felem_scalar(ftmp5, 2);
1016     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1017
1018     /*-
1019      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1020      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1021      */
1022     felem_diff_128_64(tmp2, ftmp5);
1023     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1024     felem_reduce(x_out, tmp2);
1025
1026     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1027     felem_diff(ftmp2, x_out);
1028     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1029
1030     /*
1031      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1032      */
1033     felem_mul(tmp2, ftmp3, ftmp2);
1034     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1035
1036     /*-
1037      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1038      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1039      */
1040     widefelem_diff(tmp2, tmp);
1041     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1042     felem_reduce(y_out, tmp2);
1043
1044     /*
1045      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1046      * the point at infinity, so we need to check for this separately
1047      */
1048
1049     /*
1050      * if point 1 is at infinity, copy point 2 to output, and vice versa
1051      */
1052     copy_conditional(x_out, x2, z1_is_zero);
1053     copy_conditional(x_out, x1, z2_is_zero);
1054     copy_conditional(y_out, y2, z1_is_zero);
1055     copy_conditional(y_out, y1, z2_is_zero);
1056     copy_conditional(z_out, z2, z1_is_zero);
1057     copy_conditional(z_out, z1, z2_is_zero);
1058     felem_assign(x3, x_out);
1059     felem_assign(y3, y_out);
1060     felem_assign(z3, z_out);
1061 }
1062
1063 /*
1064  * select_point selects the |idx|th point from a precomputation table and
1065  * copies it to out.
1066  * The pre_comp array argument should be size of |size| argument
1067  */
1068 static void select_point(const u64 idx, unsigned int size,
1069                          const felem pre_comp[][3], felem out[3])
1070 {
1071     unsigned i, j;
1072     limb *outlimbs = &out[0][0];
1073
1074     memset(out, 0, sizeof(*out) * 3);
1075     for (i = 0; i < size; i++) {
1076         const limb *inlimbs = &pre_comp[i][0][0];
1077         u64 mask = i ^ idx;
1078         mask |= mask >> 4;
1079         mask |= mask >> 2;
1080         mask |= mask >> 1;
1081         mask &= 1;
1082         mask--;
1083         for (j = 0; j < 4 * 3; j++)
1084             outlimbs[j] |= inlimbs[j] & mask;
1085     }
1086 }
1087
1088 /* get_bit returns the |i|th bit in |in| */
1089 static char get_bit(const felem_bytearray in, unsigned i)
1090 {
1091     if (i >= 224)
1092         return 0;
1093     return (in[i >> 3] >> (i & 7)) & 1;
1094 }
1095
1096 /*
1097  * Interleaved point multiplication using precomputed point multiples: The
1098  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1099  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1100  * generator, using certain (large) precomputed multiples in g_pre_comp.
1101  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1102  */
1103 static void batch_mul(felem x_out, felem y_out, felem z_out,
1104                       const felem_bytearray scalars[],
1105                       const unsigned num_points, const u8 *g_scalar,
1106                       const int mixed, const felem pre_comp[][17][3],
1107                       const felem g_pre_comp[2][16][3])
1108 {
1109     int i, skip;
1110     unsigned num;
1111     unsigned gen_mul = (g_scalar != NULL);
1112     felem nq[3], tmp[4];
1113     u64 bits;
1114     u8 sign, digit;
1115
1116     /* set nq to the point at infinity */
1117     memset(nq, 0, sizeof(nq));
1118
1119     /*
1120      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1121      * of the generator (two in each of the last 28 rounds) and additions of
1122      * other points multiples (every 5th round).
1123      */
1124     skip = 1;                   /* save two point operations in the first
1125                                  * round */
1126     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1127         /* double */
1128         if (!skip)
1129             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1130
1131         /* add multiples of the generator */
1132         if (gen_mul && (i <= 27)) {
1133             /* first, look 28 bits upwards */
1134             bits = get_bit(g_scalar, i + 196) << 3;
1135             bits |= get_bit(g_scalar, i + 140) << 2;
1136             bits |= get_bit(g_scalar, i + 84) << 1;
1137             bits |= get_bit(g_scalar, i + 28);
1138             /* select the point to add, in constant time */
1139             select_point(bits, 16, g_pre_comp[1], tmp);
1140
1141             if (!skip) {
1142                 /* value 1 below is argument for "mixed" */
1143                 point_add(nq[0], nq[1], nq[2],
1144                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1145             } else {
1146                 memcpy(nq, tmp, 3 * sizeof(felem));
1147                 skip = 0;
1148             }
1149
1150             /* second, look at the current position */
1151             bits = get_bit(g_scalar, i + 168) << 3;
1152             bits |= get_bit(g_scalar, i + 112) << 2;
1153             bits |= get_bit(g_scalar, i + 56) << 1;
1154             bits |= get_bit(g_scalar, i);
1155             /* select the point to add, in constant time */
1156             select_point(bits, 16, g_pre_comp[0], tmp);
1157             point_add(nq[0], nq[1], nq[2],
1158                       nq[0], nq[1], nq[2],
1159                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1160         }
1161
1162         /* do other additions every 5 doublings */
1163         if (num_points && (i % 5 == 0)) {
1164             /* loop over all scalars */
1165             for (num = 0; num < num_points; ++num) {
1166                 bits = get_bit(scalars[num], i + 4) << 5;
1167                 bits |= get_bit(scalars[num], i + 3) << 4;
1168                 bits |= get_bit(scalars[num], i + 2) << 3;
1169                 bits |= get_bit(scalars[num], i + 1) << 2;
1170                 bits |= get_bit(scalars[num], i) << 1;
1171                 bits |= get_bit(scalars[num], i - 1);
1172                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1173
1174                 /* select the point to add or subtract */
1175                 select_point(digit, 17, pre_comp[num], tmp);
1176                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1177                                             * point */
1178                 copy_conditional(tmp[1], tmp[3], sign);
1179
1180                 if (!skip) {
1181                     point_add(nq[0], nq[1], nq[2],
1182                               nq[0], nq[1], nq[2],
1183                               mixed, tmp[0], tmp[1], tmp[2]);
1184                 } else {
1185                     memcpy(nq, tmp, 3 * sizeof(felem));
1186                     skip = 0;
1187                 }
1188             }
1189         }
1190     }
1191     felem_assign(x_out, nq[0]);
1192     felem_assign(y_out, nq[1]);
1193     felem_assign(z_out, nq[2]);
1194 }
1195
1196 /******************************************************************************/
1197 /*
1198  * FUNCTIONS TO MANAGE PRECOMPUTATION
1199  */
1200
1201 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1202 {
1203     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1204
1205     if (!ret) {
1206         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1207         return ret;
1208     }
1209     ret->references = 1;
1210     return ret;
1211 }
1212
1213 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1214 {
1215     if (p != NULL)
1216         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1217     return p;
1218 }
1219
1220 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1221 {
1222     if (p == NULL
1223         || CRYPTO_add(&p->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1224         return;
1225     OPENSSL_free(p);
1226 }
1227
1228 /******************************************************************************/
1229 /*
1230  * OPENSSL EC_METHOD FUNCTIONS
1231  */
1232
1233 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1234 {
1235     int ret;
1236     ret = ec_GFp_simple_group_init(group);
1237     group->a_is_minus3 = 1;
1238     return ret;
1239 }
1240
1241 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1242                                     const BIGNUM *a, const BIGNUM *b,
1243                                     BN_CTX *ctx)
1244 {
1245     int ret = 0;
1246     BN_CTX *new_ctx = NULL;
1247     BIGNUM *curve_p, *curve_a, *curve_b;
1248
1249     if (ctx == NULL)
1250         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1251             return 0;
1252     BN_CTX_start(ctx);
1253     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1254         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1255         ((curve_b = BN_CTX_get(ctx)) == NULL))
1256         goto err;
1257     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1258     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1259     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1260     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1261         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1262               EC_R_WRONG_CURVE_PARAMETERS);
1263         goto err;
1264     }
1265     group->field_mod_func = BN_nist_mod_224;
1266     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1267  err:
1268     BN_CTX_end(ctx);
1269     BN_CTX_free(new_ctx);
1270     return ret;
1271 }
1272
1273 /*
1274  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1275  * (X/Z^2, Y/Z^3)
1276  */
1277 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1278                                                  const EC_POINT *point,
1279                                                  BIGNUM *x, BIGNUM *y,
1280                                                  BN_CTX *ctx)
1281 {
1282     felem z1, z2, x_in, y_in, x_out, y_out;
1283     widefelem tmp;
1284
1285     if (EC_POINT_is_at_infinity(group, point)) {
1286         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1287               EC_R_POINT_AT_INFINITY);
1288         return 0;
1289     }
1290     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1291         (!BN_to_felem(z1, point->Z)))
1292         return 0;
1293     felem_inv(z2, z1);
1294     felem_square(tmp, z2);
1295     felem_reduce(z1, tmp);
1296     felem_mul(tmp, x_in, z1);
1297     felem_reduce(x_in, tmp);
1298     felem_contract(x_out, x_in);
1299     if (x != NULL) {
1300         if (!felem_to_BN(x, x_out)) {
1301             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1302                   ERR_R_BN_LIB);
1303             return 0;
1304         }
1305     }
1306     felem_mul(tmp, z1, z2);
1307     felem_reduce(z1, tmp);
1308     felem_mul(tmp, y_in, z1);
1309     felem_reduce(y_in, tmp);
1310     felem_contract(y_out, y_in);
1311     if (y != NULL) {
1312         if (!felem_to_BN(y, y_out)) {
1313             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1314                   ERR_R_BN_LIB);
1315             return 0;
1316         }
1317     }
1318     return 1;
1319 }
1320
1321 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1322                                felem tmp_felems[ /* num+1 */ ])
1323 {
1324     /*
1325      * Runs in constant time, unless an input is the point at infinity (which
1326      * normally shouldn't happen).
1327      */
1328     ec_GFp_nistp_points_make_affine_internal(num,
1329                                              points,
1330                                              sizeof(felem),
1331                                              tmp_felems,
1332                                              (void (*)(void *))felem_one,
1333                                              (int (*)(const void *))
1334                                              felem_is_zero_int,
1335                                              (void (*)(void *, const void *))
1336                                              felem_assign,
1337                                              (void (*)(void *, const void *))
1338                                              felem_square_reduce, (void (*)
1339                                                                    (void *,
1340                                                                     const void
1341                                                                     *,
1342                                                                     const void
1343                                                                     *))
1344                                              felem_mul_reduce,
1345                                              (void (*)(void *, const void *))
1346                                              felem_inv,
1347                                              (void (*)(void *, const void *))
1348                                              felem_contract);
1349 }
1350
1351 /*
1352  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1353  * values Result is stored in r (r can equal one of the inputs).
1354  */
1355 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1356                                const BIGNUM *scalar, size_t num,
1357                                const EC_POINT *points[],
1358                                const BIGNUM *scalars[], BN_CTX *ctx)
1359 {
1360     int ret = 0;
1361     int j;
1362     unsigned i;
1363     int mixed = 0;
1364     BN_CTX *new_ctx = NULL;
1365     BIGNUM *x, *y, *z, *tmp_scalar;
1366     felem_bytearray g_secret;
1367     felem_bytearray *secrets = NULL;
1368     felem (*pre_comp)[17][3] = NULL;
1369     felem *tmp_felems = NULL;
1370     felem_bytearray tmp;
1371     unsigned num_bytes;
1372     int have_pre_comp = 0;
1373     size_t num_points = num;
1374     felem x_in, y_in, z_in, x_out, y_out, z_out;
1375     NISTP224_PRE_COMP *pre = NULL;
1376     const felem(*g_pre_comp)[16][3] = NULL;
1377     EC_POINT *generator = NULL;
1378     const EC_POINT *p = NULL;
1379     const BIGNUM *p_scalar = NULL;
1380
1381     if (ctx == NULL)
1382         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1383             return 0;
1384     BN_CTX_start(ctx);
1385     if (((x = BN_CTX_get(ctx)) == NULL) ||
1386         ((y = BN_CTX_get(ctx)) == NULL) ||
1387         ((z = BN_CTX_get(ctx)) == NULL) ||
1388         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1389         goto err;
1390
1391     if (scalar != NULL) {
1392         pre = group->pre_comp.nistp224;
1393         if (pre)
1394             /* we have precomputation, try to use it */
1395             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1396         else
1397             /* try to use the standard precomputation */
1398             g_pre_comp = &gmul[0];
1399         generator = EC_POINT_new(group);
1400         if (generator == NULL)
1401             goto err;
1402         /* get the generator from precomputation */
1403         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1404             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1405             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1406             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1407             goto err;
1408         }
1409         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1410                                                       generator, x, y, z,
1411                                                       ctx))
1412             goto err;
1413         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1414             /* precomputation matches generator */
1415             have_pre_comp = 1;
1416         else
1417             /*
1418              * we don't have valid precomputation: treat the generator as a
1419              * random point
1420              */
1421             num_points = num_points + 1;
1422     }
1423
1424     if (num_points > 0) {
1425         if (num_points >= 3) {
1426             /*
1427              * unless we precompute multiples for just one or two points,
1428              * converting those into affine form is time well spent
1429              */
1430             mixed = 1;
1431         }
1432         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1433         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1434         if (mixed)
1435             tmp_felems =
1436                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1437         if ((secrets == NULL) || (pre_comp == NULL)
1438             || (mixed && (tmp_felems == NULL))) {
1439             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1440             goto err;
1441         }
1442
1443         /*
1444          * we treat NULL scalars as 0, and NULL points as points at infinity,
1445          * i.e., they contribute nothing to the linear combination
1446          */
1447         for (i = 0; i < num_points; ++i) {
1448             if (i == num)
1449                 /* the generator */
1450             {
1451                 p = EC_GROUP_get0_generator(group);
1452                 p_scalar = scalar;
1453             } else
1454                 /* the i^th point */
1455             {
1456                 p = points[i];
1457                 p_scalar = scalars[i];
1458             }
1459             if ((p_scalar != NULL) && (p != NULL)) {
1460                 /* reduce scalar to 0 <= scalar < 2^224 */
1461                 if ((BN_num_bits(p_scalar) > 224)
1462                     || (BN_is_negative(p_scalar))) {
1463                     /*
1464                      * this is an unusual input, and we don't guarantee
1465                      * constant-timeness
1466                      */
1467                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1468                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1469                         goto err;
1470                     }
1471                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1472                 } else
1473                     num_bytes = BN_bn2bin(p_scalar, tmp);
1474                 flip_endian(secrets[i], tmp, num_bytes);
1475                 /* precompute multiples */
1476                 if ((!BN_to_felem(x_out, p->X)) ||
1477                     (!BN_to_felem(y_out, p->Y)) ||
1478                     (!BN_to_felem(z_out, p->Z)))
1479                     goto err;
1480                 felem_assign(pre_comp[i][1][0], x_out);
1481                 felem_assign(pre_comp[i][1][1], y_out);
1482                 felem_assign(pre_comp[i][1][2], z_out);
1483                 for (j = 2; j <= 16; ++j) {
1484                     if (j & 1) {
1485                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1486                                   pre_comp[i][j][2], pre_comp[i][1][0],
1487                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1488                                   pre_comp[i][j - 1][0],
1489                                   pre_comp[i][j - 1][1],
1490                                   pre_comp[i][j - 1][2]);
1491                     } else {
1492                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1493                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1494                                      pre_comp[i][j / 2][1],
1495                                      pre_comp[i][j / 2][2]);
1496                     }
1497                 }
1498             }
1499         }
1500         if (mixed)
1501             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1502     }
1503
1504     /* the scalar for the generator */
1505     if ((scalar != NULL) && (have_pre_comp)) {
1506         memset(g_secret, 0, sizeof(g_secret));
1507         /* reduce scalar to 0 <= scalar < 2^224 */
1508         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1509             /*
1510              * this is an unusual input, and we don't guarantee
1511              * constant-timeness
1512              */
1513             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1514                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1515                 goto err;
1516             }
1517             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1518         } else
1519             num_bytes = BN_bn2bin(scalar, tmp);
1520         flip_endian(g_secret, tmp, num_bytes);
1521         /* do the multiplication with generator precomputation */
1522         batch_mul(x_out, y_out, z_out,
1523                   (const felem_bytearray(*))secrets, num_points,
1524                   g_secret,
1525                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1526     } else
1527         /* do the multiplication without generator precomputation */
1528         batch_mul(x_out, y_out, z_out,
1529                   (const felem_bytearray(*))secrets, num_points,
1530                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1531     /* reduce the output to its unique minimal representation */
1532     felem_contract(x_in, x_out);
1533     felem_contract(y_in, y_out);
1534     felem_contract(z_in, z_out);
1535     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1536         (!felem_to_BN(z, z_in))) {
1537         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1538         goto err;
1539     }
1540     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1541
1542  err:
1543     BN_CTX_end(ctx);
1544     EC_POINT_free(generator);
1545     BN_CTX_free(new_ctx);
1546     OPENSSL_free(secrets);
1547     OPENSSL_free(pre_comp);
1548     OPENSSL_free(tmp_felems);
1549     return ret;
1550 }
1551
1552 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1553 {
1554     int ret = 0;
1555     NISTP224_PRE_COMP *pre = NULL;
1556     int i, j;
1557     BN_CTX *new_ctx = NULL;
1558     BIGNUM *x, *y;
1559     EC_POINT *generator = NULL;
1560     felem tmp_felems[32];
1561
1562     /* throw away old precomputation */
1563     EC_pre_comp_free(group);
1564     if (ctx == NULL)
1565         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1566             return 0;
1567     BN_CTX_start(ctx);
1568     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1569         goto err;
1570     /* get the generator */
1571     if (group->generator == NULL)
1572         goto err;
1573     generator = EC_POINT_new(group);
1574     if (generator == NULL)
1575         goto err;
1576     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1577     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1578     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1579         goto err;
1580     if ((pre = nistp224_pre_comp_new()) == NULL)
1581         goto err;
1582     /*
1583      * if the generator is the standard one, use built-in precomputation
1584      */
1585     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1586         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1587         goto done;
1588     }
1589     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1590         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1591         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1592         goto err;
1593     /*
1594      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1595      * 2^140*G, 2^196*G for the second one
1596      */
1597     for (i = 1; i <= 8; i <<= 1) {
1598         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1599                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1600                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1601         for (j = 0; j < 27; ++j) {
1602             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1603                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1604                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1605         }
1606         if (i == 8)
1607             break;
1608         point_double(pre->g_pre_comp[0][2 * i][0],
1609                      pre->g_pre_comp[0][2 * i][1],
1610                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1611                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1612         for (j = 0; j < 27; ++j) {
1613             point_double(pre->g_pre_comp[0][2 * i][0],
1614                          pre->g_pre_comp[0][2 * i][1],
1615                          pre->g_pre_comp[0][2 * i][2],
1616                          pre->g_pre_comp[0][2 * i][0],
1617                          pre->g_pre_comp[0][2 * i][1],
1618                          pre->g_pre_comp[0][2 * i][2]);
1619         }
1620     }
1621     for (i = 0; i < 2; i++) {
1622         /* g_pre_comp[i][0] is the point at infinity */
1623         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1624         /* the remaining multiples */
1625         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1626         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1627                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1628                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1629                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1630                   pre->g_pre_comp[i][2][2]);
1631         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1632         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1633                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1634                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1635                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1636                   pre->g_pre_comp[i][2][2]);
1637         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1638         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1639                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1640                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1641                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1642                   pre->g_pre_comp[i][4][2]);
1643         /*
1644          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1645          */
1646         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1647                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1648                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1649                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1650                   pre->g_pre_comp[i][2][2]);
1651         for (j = 1; j < 8; ++j) {
1652             /* odd multiples: add G resp. 2^28*G */
1653             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1654                       pre->g_pre_comp[i][2 * j + 1][1],
1655                       pre->g_pre_comp[i][2 * j + 1][2],
1656                       pre->g_pre_comp[i][2 * j][0],
1657                       pre->g_pre_comp[i][2 * j][1],
1658                       pre->g_pre_comp[i][2 * j][2], 0,
1659                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1660                       pre->g_pre_comp[i][1][2]);
1661         }
1662     }
1663     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1664
1665  done:
1666     SETPRECOMP(group, nistp224, pre);
1667     pre = NULL;
1668     ret = 1;
1669  err:
1670     BN_CTX_end(ctx);
1671     EC_POINT_free(generator);
1672     BN_CTX_free(new_ctx);
1673     EC_nistp224_pre_comp_free(pre);
1674     return ret;
1675 }
1676
1677 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1678 {
1679     return HAVEPRECOMP(group, nistp224);
1680 }
1681
1682 #endif