crypto/ec/ecp_nistp224.c

   1 /* crypto/ec/ecp_nistp224.c */
   2 /*
   3  * Written by Emilia Kasper (Google) for the OpenSSL project.
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  23  *
  24  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  25  * and Adam Langley's public domain 64-bit C implementation of curve25519
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  30
  31 # include <stdint.h>
  32 # include <string.h>
  33 # include <openssl/err.h>
  34 # include "ec_lcl.h"
  35
  36 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  37   /* even with gcc, the typedef won't work for 32-bit platforms */
  38 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  39                                  * platforms */
  40 # else
  41 #  error "Need GCC 3.1 or later to define type uint128_t"
  42 # endif
  43
  44 typedef uint8_t u8;
  45 typedef uint64_t u64;
  46 typedef int64_t s64;
  47
  48 /******************************************************************************/
  49 /*-
  50  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  51  *
  52  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  53  * using 64-bit coefficients called 'limbs',
  54  * and sometimes (for multiplication results) as
  55  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  56  * using 128-bit coefficients called 'widelimbs'.
  57  * A 4-limb representation is an 'felem';
  58  * a 7-widelimb representation is a 'widefelem'.
  59  * Even within felems, bits of adjacent limbs overlap, and we don't always
  60  * reduce the representations: we ensure that inputs to each felem
  61  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  62  * and fit into a 128-bit word without overflow. The coefficients are then
  63  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  64  * We only reduce to the unique minimal representation at the end of the
  65  * computation.
  66  */
  67
  68 typedef uint64_t limb;
  69 typedef uint128_t widelimb;
  70
  71 typedef limb felem[4];
  72 typedef widelimb widefelem[7];
  73
  74 /*
  75  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  76  * group order size for the elliptic curve, and we also use this type for
  77  * scalars for point multiplication.
  78  */
  79 typedef u8 felem_bytearray[28];
  80
  81 static const felem_bytearray nistp224_curve_params[5] = {
  82     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  83      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  84      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  85     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  86      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  88     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  89      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  90      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  91     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  92      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  93      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  94     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  95      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  96      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  97 };
  98
  99 /*-
 100  * Precomputed multiples of the standard generator
 101  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 102  * (0 for the point at infinity).
 103  * For each field element, slice a_0 is word 0, etc.
 104  *
 105  * The table has 2 * 16 elements, starting with the following:
 106  * index | bits    | point
 107  * ------+---------+------------------------------
 108  *     0 | 0 0 0 0 | 0G
 109  *     1 | 0 0 0 1 | 1G
 110  *     2 | 0 0 1 0 | 2^56G
 111  *     3 | 0 0 1 1 | (2^56 + 1)G
 112  *     4 | 0 1 0 0 | 2^112G
 113  *     5 | 0 1 0 1 | (2^112 + 1)G
 114  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 115  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 116  *     8 | 1 0 0 0 | 2^168G
 117  *     9 | 1 0 0 1 | (2^168 + 1)G
 118  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 119  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 120  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 121  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 122  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 123  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 124  * followed by a copy of this with each element multiplied by 2^28.
 125  *
 126  * The reason for this is so that we can clock bits into four different
 127  * locations when doing simple scalar multiplies against the base point,
 128  * and then another four locations using the second 16 elements.
 129  */
 130 static const felem gmul[2][16][3] = {
 131 {{{0, 0, 0, 0},
 132   {0, 0, 0, 0},
 133   {0, 0, 0, 0}},
 134  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 135   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 136   {1, 0, 0, 0}},
 137  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 138   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 139   {1, 0, 0, 0}},
 140  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 141   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 142   {1, 0, 0, 0}},
 143  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 144   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 145   {1, 0, 0, 0}},
 146  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 147   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 148   {1, 0, 0, 0}},
 149  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 150   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 151   {1, 0, 0, 0}},
 152  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 153   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 154   {1, 0, 0, 0}},
 155  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 156   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 157   {1, 0, 0, 0}},
 158  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 159   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 160   {1, 0, 0, 0}},
 161  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 162   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 163   {1, 0, 0, 0}},
 164  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 165   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 166   {1, 0, 0, 0}},
 167  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 168   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 169   {1, 0, 0, 0}},
 170  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 171   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 172   {1, 0, 0, 0}},
 173  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 174   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 175   {1, 0, 0, 0}},
 176  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 177   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 178   {1, 0, 0, 0}}},
 179 {{{0, 0, 0, 0},
 180   {0, 0, 0, 0},
 181   {0, 0, 0, 0}},
 182  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 183   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 184   {1, 0, 0, 0}},
 185  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 186   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 187   {1, 0, 0, 0}},
 188  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 189   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 190   {1, 0, 0, 0}},
 191  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 192   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 193   {1, 0, 0, 0}},
 194  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 195   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 196   {1, 0, 0, 0}},
 197  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 198   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 199   {1, 0, 0, 0}},
 200  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 201   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 202   {1, 0, 0, 0}},
 203  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 204   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 205   {1, 0, 0, 0}},
 206  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 207   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 208   {1, 0, 0, 0}},
 209  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 210   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 211   {1, 0, 0, 0}},
 212  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 213   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 214   {1, 0, 0, 0}},
 215  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 216   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 217   {1, 0, 0, 0}},
 218  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 219   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 220   {1, 0, 0, 0}},
 221  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 222   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 223   {1, 0, 0, 0}},
 224  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 225   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 226   {1, 0, 0, 0}}}
 227 };
 228
 229 /* Precomputation for the group generator. */
 230 typedef struct {
 231     felem g_pre_comp[2][16][3];
 232     int references;
 233 } NISTP224_PRE_COMP;
 234
 235 const EC_METHOD *EC_GFp_nistp224_method(void)
 236 {
 237     static const EC_METHOD ret = {
 238         EC_FLAGS_DEFAULT_OCT,
 239         NID_X9_62_prime_field,
 240         ec_GFp_nistp224_group_init,
 241         ec_GFp_simple_group_finish,
 242         ec_GFp_simple_group_clear_finish,
 243         ec_GFp_nist_group_copy,
 244         ec_GFp_nistp224_group_set_curve,
 245         ec_GFp_simple_group_get_curve,
 246         ec_GFp_simple_group_get_degree,
 247         ec_GFp_simple_group_check_discriminant,
 248         ec_GFp_simple_point_init,
 249         ec_GFp_simple_point_finish,
 250         ec_GFp_simple_point_clear_finish,
 251         ec_GFp_simple_point_copy,
 252         ec_GFp_simple_point_set_to_infinity,
 253         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 254         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 255         ec_GFp_simple_point_set_affine_coordinates,
 256         ec_GFp_nistp224_point_get_affine_coordinates,
 257         0 /* point_set_compressed_coordinates */ ,
 258         0 /* point2oct */ ,
 259         0 /* oct2point */ ,
 260         ec_GFp_simple_add,
 261         ec_GFp_simple_dbl,
 262         ec_GFp_simple_invert,
 263         ec_GFp_simple_is_at_infinity,
 264         ec_GFp_simple_is_on_curve,
 265         ec_GFp_simple_cmp,
 266         ec_GFp_simple_make_affine,
 267         ec_GFp_simple_points_make_affine,
 268         ec_GFp_nistp224_points_mul,
 269         ec_GFp_nistp224_precompute_mult,
 270         ec_GFp_nistp224_have_precompute_mult,
 271         ec_GFp_nist_field_mul,
 272         ec_GFp_nist_field_sqr,
 273         0 /* field_div */ ,
 274         0 /* field_encode */ ,
 275         0 /* field_decode */ ,
 276         0                       /* field_set_to_one */
 277     };
 278
 279     return &ret;
 280 }
 281
 282 /*
 283  * Helper functions to convert field elements to/from internal representation
 284  */
 285 static void bin28_to_felem(felem out, const u8 in[28])
 286 {
 287     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 288     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 289     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 290     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 291 }
 292
 293 static void felem_to_bin28(u8 out[28], const felem in)
 294 {
 295     unsigned i;
 296     for (i = 0; i < 7; ++i) {
 297         out[i] = in[0] >> (8 * i);
 298         out[i + 7] = in[1] >> (8 * i);
 299         out[i + 14] = in[2] >> (8 * i);
 300         out[i + 21] = in[3] >> (8 * i);
 301     }
 302 }
 303
 304 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 305 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 306 {
 307     unsigned i;
 308     for (i = 0; i < len; ++i)
 309         out[i] = in[len - 1 - i];
 310 }
 311
 312 /* From OpenSSL BIGNUM to internal representation */
 313 static int BN_to_felem(felem out, const BIGNUM *bn)
 314 {
 315     felem_bytearray b_in;
 316     felem_bytearray b_out;
 317     unsigned num_bytes;
 318
 319     /* BN_bn2bin eats leading zeroes */
 320     memset(b_out, 0, sizeof b_out);
 321     num_bytes = BN_num_bytes(bn);
 322     if (num_bytes > sizeof b_out) {
 323         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 324         return 0;
 325     }
 326     if (BN_is_negative(bn)) {
 327         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 328         return 0;
 329     }
 330     num_bytes = BN_bn2bin(bn, b_in);
 331     flip_endian(b_out, b_in, num_bytes);
 332     bin28_to_felem(out, b_out);
 333     return 1;
 334 }
 335
 336 /* From internal representation to OpenSSL BIGNUM */
 337 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 338 {
 339     felem_bytearray b_in, b_out;
 340     felem_to_bin28(b_in, in);
 341     flip_endian(b_out, b_in, sizeof b_out);
 342     return BN_bin2bn(b_out, sizeof b_out, out);
 343 }
 344
 345 /******************************************************************************/
 346 /*-
 347  *                              FIELD OPERATIONS
 348  *
 349  * Field operations, using the internal representation of field elements.
 350  * NB! These operations are specific to our point multiplication and cannot be
 351  * expected to be correct in general - e.g., multiplication with a large scalar
 352  * will cause an overflow.
 353  *
 354  */
 355
 356 static void felem_one(felem out)
 357 {
 358     out[0] = 1;
 359     out[1] = 0;
 360     out[2] = 0;
 361     out[3] = 0;
 362 }
 363
 364 static void felem_assign(felem out, const felem in)
 365 {
 366     out[0] = in[0];
 367     out[1] = in[1];
 368     out[2] = in[2];
 369     out[3] = in[3];
 370 }
 371
 372 /* Sum two field elements: out += in */
 373 static void felem_sum(felem out, const felem in)
 374 {
 375     out[0] += in[0];
 376     out[1] += in[1];
 377     out[2] += in[2];
 378     out[3] += in[3];
 379 }
 380
 381 /* Get negative value: out = -in */
 382 /* Assumes in[i] < 2^57 */
 383 static void felem_neg(felem out, const felem in)
 384 {
 385     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 386     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 387     static const limb two58m42m2 = (((limb) 1) << 58) -
 388         (((limb) 1) << 42) - (((limb) 1) << 2);
 389
 390     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 391     out[0] = two58p2 - in[0];
 392     out[1] = two58m42m2 - in[1];
 393     out[2] = two58m2 - in[2];
 394     out[3] = two58m2 - in[3];
 395 }
 396
 397 /* Subtract field elements: out -= in */
 398 /* Assumes in[i] < 2^57 */
 399 static void felem_diff(felem out, const felem in)
 400 {
 401     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 402     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 403     static const limb two58m42m2 = (((limb) 1) << 58) -
 404         (((limb) 1) << 42) - (((limb) 1) << 2);
 405
 406     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 407     out[0] += two58p2;
 408     out[1] += two58m42m2;
 409     out[2] += two58m2;
 410     out[3] += two58m2;
 411
 412     out[0] -= in[0];
 413     out[1] -= in[1];
 414     out[2] -= in[2];
 415     out[3] -= in[3];
 416 }
 417
 418 /* Subtract in unreduced 128-bit mode: out -= in */
 419 /* Assumes in[i] < 2^119 */
 420 static void widefelem_diff(widefelem out, const widefelem in)
 421 {
 422     static const widelimb two120 = ((widelimb) 1) << 120;
 423     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 424         (((widelimb) 1) << 64);
 425     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 426         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 427
 428     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 429     out[0] += two120;
 430     out[1] += two120m64;
 431     out[2] += two120m64;
 432     out[3] += two120;
 433     out[4] += two120m104m64;
 434     out[5] += two120m64;
 435     out[6] += two120m64;
 436
 437     out[0] -= in[0];
 438     out[1] -= in[1];
 439     out[2] -= in[2];
 440     out[3] -= in[3];
 441     out[4] -= in[4];
 442     out[5] -= in[5];
 443     out[6] -= in[6];
 444 }
 445
 446 /* Subtract in mixed mode: out128 -= in64 */
 447 /* in[i] < 2^63 */
 448 static void felem_diff_128_64(widefelem out, const felem in)
 449 {
 450     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 451         (((widelimb) 1) << 8);
 452     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 453         (((widelimb) 1) << 8);
 454     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 455         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 456
 457     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 458     out[0] += two64p8;
 459     out[1] += two64m48m8;
 460     out[2] += two64m8;
 461     out[3] += two64m8;
 462
 463     out[0] -= in[0];
 464     out[1] -= in[1];
 465     out[2] -= in[2];
 466     out[3] -= in[3];
 467 }
 468
 469 /*
 470  * Multiply a field element by a scalar: out = out * scalar The scalars we
 471  * actually use are small, so results fit without overflow
 472  */
 473 static void felem_scalar(felem out, const limb scalar)
 474 {
 475     out[0] *= scalar;
 476     out[1] *= scalar;
 477     out[2] *= scalar;
 478     out[3] *= scalar;
 479 }
 480
 481 /*
 482  * Multiply an unreduced field element by a scalar: out = out * scalar The
 483  * scalars we actually use are small, so results fit without overflow
 484  */
 485 static void widefelem_scalar(widefelem out, const widelimb scalar)
 486 {
 487     out[0] *= scalar;
 488     out[1] *= scalar;
 489     out[2] *= scalar;
 490     out[3] *= scalar;
 491     out[4] *= scalar;
 492     out[5] *= scalar;
 493     out[6] *= scalar;
 494 }
 495
 496 /* Square a field element: out = in^2 */
 497 static void felem_square(widefelem out, const felem in)
 498 {
 499     limb tmp0, tmp1, tmp2;
 500     tmp0 = 2 * in[0];
 501     tmp1 = 2 * in[1];
 502     tmp2 = 2 * in[2];
 503     out[0] = ((widelimb) in[0]) * in[0];
 504     out[1] = ((widelimb) in[0]) * tmp1;
 505     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 506     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 507     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 508     out[5] = ((widelimb) in[3]) * tmp2;
 509     out[6] = ((widelimb) in[3]) * in[3];
 510 }
 511
 512 /* Multiply two field elements: out = in1 * in2 */
 513 static void felem_mul(widefelem out, const felem in1, const felem in2)
 514 {
 515     out[0] = ((widelimb) in1[0]) * in2[0];
 516     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 517     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 518              ((widelimb) in1[2]) * in2[0];
 519     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 520              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 521     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 522              ((widelimb) in1[3]) * in2[1];
 523     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 524     out[6] = ((widelimb) in1[3]) * in2[3];
 525 }
 526
 527 /*-
 528  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 529  * Requires in[i] < 2^126,
 530  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 531 static void felem_reduce(felem out, const widefelem in)
 532 {
 533     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 534         (((widelimb) 1) << 15);
 535     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 536         (((widelimb) 1) << 71);
 537     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 538         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 539     widelimb output[5];
 540
 541     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 542     output[0] = in[0] + two127p15;
 543     output[1] = in[1] + two127m71m55;
 544     output[2] = in[2] + two127m71;
 545     output[3] = in[3];
 546     output[4] = in[4];
 547
 548     /* Eliminate in[4], in[5], in[6] */
 549     output[4] += in[6] >> 16;
 550     output[3] += (in[6] & 0xffff) << 40;
 551     output[2] -= in[6];
 552
 553     output[3] += in[5] >> 16;
 554     output[2] += (in[5] & 0xffff) << 40;
 555     output[1] -= in[5];
 556
 557     output[2] += output[4] >> 16;
 558     output[1] += (output[4] & 0xffff) << 40;
 559     output[0] -= output[4];
 560
 561     /* Carry 2 -> 3 -> 4 */
 562     output[3] += output[2] >> 56;
 563     output[2] &= 0x00ffffffffffffff;
 564
 565     output[4] = output[3] >> 56;
 566     output[3] &= 0x00ffffffffffffff;
 567
 568     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 569
 570     /* Eliminate output[4] */
 571     output[2] += output[4] >> 16;
 572     /* output[2] < 2^56 + 2^56 = 2^57 */
 573     output[1] += (output[4] & 0xffff) << 40;
 574     output[0] -= output[4];
 575
 576     /* Carry 0 -> 1 -> 2 -> 3 */
 577     output[1] += output[0] >> 56;
 578     out[0] = output[0] & 0x00ffffffffffffff;
 579
 580     output[2] += output[1] >> 56;
 581     /* output[2] < 2^57 + 2^72 */
 582     out[1] = output[1] & 0x00ffffffffffffff;
 583     output[3] += output[2] >> 56;
 584     /* output[3] <= 2^56 + 2^16 */
 585     out[2] = output[2] & 0x00ffffffffffffff;
 586
 587     /*-
 588      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 589      * out[3] <= 2^56 + 2^16 (due to final carry),
 590      * so out < 2*p
 591      */
 592     out[3] = output[3];
 593 }
 594
 595 static void felem_square_reduce(felem out, const felem in)
 596 {
 597     widefelem tmp;
 598     felem_square(tmp, in);
 599     felem_reduce(out, tmp);
 600 }
 601
 602 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 603 {
 604     widefelem tmp;
 605     felem_mul(tmp, in1, in2);
 606     felem_reduce(out, tmp);
 607 }
 608
 609 /*
 610  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 611  * call felem_reduce first)
 612  */
 613 static void felem_contract(felem out, const felem in)
 614 {
 615     static const int64_t two56 = ((limb) 1) << 56;
 616     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 617     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 618     int64_t tmp[4], a;
 619     tmp[0] = in[0];
 620     tmp[1] = in[1];
 621     tmp[2] = in[2];
 622     tmp[3] = in[3];
 623     /* Case 1: a = 1 iff in >= 2^224 */
 624     a = (in[3] >> 56);
 625     tmp[0] -= a;
 626     tmp[1] += a << 40;
 627     tmp[3] &= 0x00ffffffffffffff;
 628     /*
 629      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 630      * and the lower part is non-zero
 631      */
 632     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 633         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 634     a &= 0x00ffffffffffffff;
 635     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 636     a = (a - 1) >> 63;
 637     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 638     tmp[3] &= a ^ 0xffffffffffffffff;
 639     tmp[2] &= a ^ 0xffffffffffffffff;
 640     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 641     tmp[0] -= 1 & a;
 642
 643     /*
 644      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 645      * non-zero, so we only need one step
 646      */
 647     a = tmp[0] >> 63;
 648     tmp[0] += two56 & a;
 649     tmp[1] -= 1 & a;
 650
 651     /* carry 1 -> 2 -> 3 */
 652     tmp[2] += tmp[1] >> 56;
 653     tmp[1] &= 0x00ffffffffffffff;
 654
 655     tmp[3] += tmp[2] >> 56;
 656     tmp[2] &= 0x00ffffffffffffff;
 657
 658     /* Now 0 <= out < p */
 659     out[0] = tmp[0];
 660     out[1] = tmp[1];
 661     out[2] = tmp[2];
 662     out[3] = tmp[3];
 663 }
 664
 665 /*
 666  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 667  * elements are reduced to in < 2^225, so we only need to check three cases:
 668  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 669  */
 670 static limb felem_is_zero(const felem in)
 671 {
 672     limb zero, two224m96p1, two225m97p2;
 673
 674     zero = in[0] | in[1] | in[2] | in[3];
 675     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 676     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 677         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 678     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 679     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 680         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 681     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 682     return (zero | two224m96p1 | two225m97p2);
 683 }
 684
 685 static limb felem_is_zero_int(const felem in)
 686 {
 687     return (int)(felem_is_zero(in) & ((limb) 1));
 688 }
 689
 690 /* Invert a field element */
 691 /* Computation chain copied from djb's code */
 692 static void felem_inv(felem out, const felem in)
 693 {
 694     felem ftmp, ftmp2, ftmp3, ftmp4;
 695     widefelem tmp;
 696     unsigned i;
 697
 698     felem_square(tmp, in);
 699     felem_reduce(ftmp, tmp);    /* 2 */
 700     felem_mul(tmp, in, ftmp);
 701     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 702     felem_square(tmp, ftmp);
 703     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 704     felem_mul(tmp, in, ftmp);
 705     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 706     felem_square(tmp, ftmp);
 707     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 708     felem_square(tmp, ftmp2);
 709     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 712     felem_mul(tmp, ftmp2, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 714     felem_square(tmp, ftmp);
 715     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 716     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 717         felem_square(tmp, ftmp2);
 718         felem_reduce(ftmp2, tmp);
 719     }
 720     felem_mul(tmp, ftmp2, ftmp);
 721     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 722     felem_square(tmp, ftmp2);
 723     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 724     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 725         felem_square(tmp, ftmp3);
 726         felem_reduce(ftmp3, tmp);
 727     }
 728     felem_mul(tmp, ftmp3, ftmp2);
 729     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 730     felem_square(tmp, ftmp2);
 731     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 732     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 733         felem_square(tmp, ftmp3);
 734         felem_reduce(ftmp3, tmp);
 735     }
 736     felem_mul(tmp, ftmp3, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 738     felem_square(tmp, ftmp3);
 739     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 740     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 741         felem_square(tmp, ftmp4);
 742         felem_reduce(ftmp4, tmp);
 743     }
 744     felem_mul(tmp, ftmp3, ftmp4);
 745     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 746     felem_square(tmp, ftmp3);
 747     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 748     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 749         felem_square(tmp, ftmp4);
 750         felem_reduce(ftmp4, tmp);
 751     }
 752     felem_mul(tmp, ftmp2, ftmp4);
 753     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 754     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 755         felem_square(tmp, ftmp2);
 756         felem_reduce(ftmp2, tmp);
 757     }
 758     felem_mul(tmp, ftmp2, ftmp);
 759     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 760     felem_square(tmp, ftmp);
 761     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 762     felem_mul(tmp, ftmp, in);
 763     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 764     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 765         felem_square(tmp, ftmp);
 766         felem_reduce(ftmp, tmp);
 767     }
 768     felem_mul(tmp, ftmp, ftmp3);
 769     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 770 }
 771
 772 /*
 773  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 774  * out to itself.
 775  */
 776 static void copy_conditional(felem out, const felem in, limb icopy)
 777 {
 778     unsigned i;
 779     /*
 780      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 781      */
 782     const limb copy = -icopy;
 783     for (i = 0; i < 4; ++i) {
 784         const limb tmp = copy & (in[i] ^ out[i]);
 785         out[i] ^= tmp;
 786     }
 787 }
 788
 789 /******************************************************************************/
 790 /*-
 791  *                       ELLIPTIC CURVE POINT OPERATIONS
 792  *
 793  * Points are represented in Jacobian projective coordinates:
 794  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 795  * or to the point at infinity if Z == 0.
 796  *
 797  */
 798
 799 /*-
 800  * Double an elliptic curve point:
 801  * (X', Y', Z') = 2 * (X, Y, Z), where
 802  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 803  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 804  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 805  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 806  * while x_out == y_in is not (maybe this works, but it's not tested).
 807  */
 808 static void
 809 point_double(felem x_out, felem y_out, felem z_out,
 810              const felem x_in, const felem y_in, const felem z_in)
 811 {
 812     widefelem tmp, tmp2;
 813     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 814
 815     felem_assign(ftmp, x_in);
 816     felem_assign(ftmp2, x_in);
 817
 818     /* delta = z^2 */
 819     felem_square(tmp, z_in);
 820     felem_reduce(delta, tmp);
 821
 822     /* gamma = y^2 */
 823     felem_square(tmp, y_in);
 824     felem_reduce(gamma, tmp);
 825
 826     /* beta = x*gamma */
 827     felem_mul(tmp, x_in, gamma);
 828     felem_reduce(beta, tmp);
 829
 830     /* alpha = 3*(x-delta)*(x+delta) */
 831     felem_diff(ftmp, delta);
 832     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 833     felem_sum(ftmp2, delta);
 834     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 835     felem_scalar(ftmp2, 3);
 836     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 837     felem_mul(tmp, ftmp, ftmp2);
 838     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 839     felem_reduce(alpha, tmp);
 840
 841     /* x' = alpha^2 - 8*beta */
 842     felem_square(tmp, alpha);
 843     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 844     felem_assign(ftmp, beta);
 845     felem_scalar(ftmp, 8);
 846     /* ftmp[i] < 8 * 2^57 = 2^60 */
 847     felem_diff_128_64(tmp, ftmp);
 848     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 849     felem_reduce(x_out, tmp);
 850
 851     /* z' = (y + z)^2 - gamma - delta */
 852     felem_sum(delta, gamma);
 853     /* delta[i] < 2^57 + 2^57 = 2^58 */
 854     felem_assign(ftmp, y_in);
 855     felem_sum(ftmp, z_in);
 856     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 857     felem_square(tmp, ftmp);
 858     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 859     felem_diff_128_64(tmp, delta);
 860     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 861     felem_reduce(z_out, tmp);
 862
 863     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 864     felem_scalar(beta, 4);
 865     /* beta[i] < 4 * 2^57 = 2^59 */
 866     felem_diff(beta, x_out);
 867     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 868     felem_mul(tmp, alpha, beta);
 869     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 870     felem_square(tmp2, gamma);
 871     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 872     widefelem_scalar(tmp2, 8);
 873     /* tmp2[i] < 8 * 2^116 = 2^119 */
 874     widefelem_diff(tmp, tmp2);
 875     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 876     felem_reduce(y_out, tmp);
 877 }
 878
 879 /*-
 880  * Add two elliptic curve points:
 881  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 882  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 883  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 884  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 885  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 886  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 887  *
 888  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 889  */
 890
 891 /*
 892  * This function is not entirely constant-time: it includes a branch for
 893  * checking whether the two input points are equal, (while not equal to the
 894  * point at infinity). This case never happens during single point
 895  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 896  */
 897 static void point_add(felem x3, felem y3, felem z3,
 898                       const felem x1, const felem y1, const felem z1,
 899                       const int mixed, const felem x2, const felem y2,
 900                       const felem z2)
 901 {
 902     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 903     widefelem tmp, tmp2;
 904     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 905
 906     if (!mixed) {
 907         /* ftmp2 = z2^2 */
 908         felem_square(tmp, z2);
 909         felem_reduce(ftmp2, tmp);
 910
 911         /* ftmp4 = z2^3 */
 912         felem_mul(tmp, ftmp2, z2);
 913         felem_reduce(ftmp4, tmp);
 914
 915         /* ftmp4 = z2^3*y1 */
 916         felem_mul(tmp2, ftmp4, y1);
 917         felem_reduce(ftmp4, tmp2);
 918
 919         /* ftmp2 = z2^2*x1 */
 920         felem_mul(tmp2, ftmp2, x1);
 921         felem_reduce(ftmp2, tmp2);
 922     } else {
 923         /*
 924          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 925          */
 926
 927         /* ftmp4 = z2^3*y1 */
 928         felem_assign(ftmp4, y1);
 929
 930         /* ftmp2 = z2^2*x1 */
 931         felem_assign(ftmp2, x1);
 932     }
 933
 934     /* ftmp = z1^2 */
 935     felem_square(tmp, z1);
 936     felem_reduce(ftmp, tmp);
 937
 938     /* ftmp3 = z1^3 */
 939     felem_mul(tmp, ftmp, z1);
 940     felem_reduce(ftmp3, tmp);
 941
 942     /* tmp = z1^3*y2 */
 943     felem_mul(tmp, ftmp3, y2);
 944     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 945
 946     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 947     felem_diff_128_64(tmp, ftmp4);
 948     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 949     felem_reduce(ftmp3, tmp);
 950
 951     /* tmp = z1^2*x2 */
 952     felem_mul(tmp, ftmp, x2);
 953     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 954
 955     /* ftmp = z1^2*x2 - z2^2*x1 */
 956     felem_diff_128_64(tmp, ftmp2);
 957     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 958     felem_reduce(ftmp, tmp);
 959
 960     /*
 961      * the formulae are incorrect if the points are equal so we check for
 962      * this and do doubling if this happens
 963      */
 964     x_equal = felem_is_zero(ftmp);
 965     y_equal = felem_is_zero(ftmp3);
 966     z1_is_zero = felem_is_zero(z1);
 967     z2_is_zero = felem_is_zero(z2);
 968     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 969     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 970         point_double(x3, y3, z3, x1, y1, z1);
 971         return;
 972     }
 973
 974     /* ftmp5 = z1*z2 */
 975     if (!mixed) {
 976         felem_mul(tmp, z1, z2);
 977         felem_reduce(ftmp5, tmp);
 978     } else {
 979         /* special case z2 = 0 is handled later */
 980         felem_assign(ftmp5, z1);
 981     }
 982
 983     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 984     felem_mul(tmp, ftmp, ftmp5);
 985     felem_reduce(z_out, tmp);
 986
 987     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 988     felem_assign(ftmp5, ftmp);
 989     felem_square(tmp, ftmp);
 990     felem_reduce(ftmp, tmp);
 991
 992     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
 993     felem_mul(tmp, ftmp, ftmp5);
 994     felem_reduce(ftmp5, tmp);
 995
 996     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
 997     felem_mul(tmp, ftmp2, ftmp);
 998     felem_reduce(ftmp2, tmp);
 999
1000     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1001     felem_mul(tmp, ftmp4, ftmp5);
1002     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1003
1004     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1005     felem_square(tmp2, ftmp3);
1006     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1007
1008     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1009     felem_diff_128_64(tmp2, ftmp5);
1010     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1011
1012     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1013     felem_assign(ftmp5, ftmp2);
1014     felem_scalar(ftmp5, 2);
1015     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1016
1017     /*-
1018      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1019      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1020      */
1021     felem_diff_128_64(tmp2, ftmp5);
1022     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1023     felem_reduce(x_out, tmp2);
1024
1025     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1026     felem_diff(ftmp2, x_out);
1027     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1028
1029     /*
1030      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1031      */
1032     felem_mul(tmp2, ftmp3, ftmp2);
1033     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1034
1035     /*-
1036      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1037      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1038      */
1039     widefelem_diff(tmp2, tmp);
1040     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1041     felem_reduce(y_out, tmp2);
1042
1043     /*
1044      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1045      * the point at infinity, so we need to check for this separately
1046      */
1047
1048     /*
1049      * if point 1 is at infinity, copy point 2 to output, and vice versa
1050      */
1051     copy_conditional(x_out, x2, z1_is_zero);
1052     copy_conditional(x_out, x1, z2_is_zero);
1053     copy_conditional(y_out, y2, z1_is_zero);
1054     copy_conditional(y_out, y1, z2_is_zero);
1055     copy_conditional(z_out, z2, z1_is_zero);
1056     copy_conditional(z_out, z1, z2_is_zero);
1057     felem_assign(x3, x_out);
1058     felem_assign(y3, y_out);
1059     felem_assign(z3, z_out);
1060 }
1061
1062 /*
1063  * select_point selects the |idx|th point from a precomputation table and
1064  * copies it to out.
1065  * The pre_comp array argument should be size of |size| argument
1066  */
1067 static void select_point(const u64 idx, unsigned int size,
1068                          const felem pre_comp[][3], felem out[3])
1069 {
1070     unsigned i, j;
1071     limb *outlimbs = &out[0][0];
1072     memset(outlimbs, 0, 3 * sizeof(felem));
1073
1074     for (i = 0; i < size; i++) {
1075         const limb *inlimbs = &pre_comp[i][0][0];
1076         u64 mask = i ^ idx;
1077         mask |= mask >> 4;
1078         mask |= mask >> 2;
1079         mask |= mask >> 1;
1080         mask &= 1;
1081         mask--;
1082         for (j = 0; j < 4 * 3; j++)
1083             outlimbs[j] |= inlimbs[j] & mask;
1084     }
1085 }
1086
1087 /* get_bit returns the |i|th bit in |in| */
1088 static char get_bit(const felem_bytearray in, unsigned i)
1089 {
1090     if (i >= 224)
1091         return 0;
1092     return (in[i >> 3] >> (i & 7)) & 1;
1093 }
1094
1095 /*
1096  * Interleaved point multiplication using precomputed point multiples: The
1097  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1098  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1099  * generator, using certain (large) precomputed multiples in g_pre_comp.
1100  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1101  */
1102 static void batch_mul(felem x_out, felem y_out, felem z_out,
1103                       const felem_bytearray scalars[],
1104                       const unsigned num_points, const u8 *g_scalar,
1105                       const int mixed, const felem pre_comp[][17][3],
1106                       const felem g_pre_comp[2][16][3])
1107 {
1108     int i, skip;
1109     unsigned num;
1110     unsigned gen_mul = (g_scalar != NULL);
1111     felem nq[3], tmp[4];
1112     u64 bits;
1113     u8 sign, digit;
1114
1115     /* set nq to the point at infinity */
1116     memset(nq, 0, 3 * sizeof(felem));
1117
1118     /*
1119      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1120      * of the generator (two in each of the last 28 rounds) and additions of
1121      * other points multiples (every 5th round).
1122      */
1123     skip = 1;                   /* save two point operations in the first
1124                                  * round */
1125     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1126         /* double */
1127         if (!skip)
1128             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1129
1130         /* add multiples of the generator */
1131         if (gen_mul && (i <= 27)) {
1132             /* first, look 28 bits upwards */
1133             bits = get_bit(g_scalar, i + 196) << 3;
1134             bits |= get_bit(g_scalar, i + 140) << 2;
1135             bits |= get_bit(g_scalar, i + 84) << 1;
1136             bits |= get_bit(g_scalar, i + 28);
1137             /* select the point to add, in constant time */
1138             select_point(bits, 16, g_pre_comp[1], tmp);
1139
1140             if (!skip) {
1141                 /* value 1 below is argument for "mixed" */
1142                 point_add(nq[0], nq[1], nq[2],
1143                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1144             } else {
1145                 memcpy(nq, tmp, 3 * sizeof(felem));
1146                 skip = 0;
1147             }
1148
1149             /* second, look at the current position */
1150             bits = get_bit(g_scalar, i + 168) << 3;
1151             bits |= get_bit(g_scalar, i + 112) << 2;
1152             bits |= get_bit(g_scalar, i + 56) << 1;
1153             bits |= get_bit(g_scalar, i);
1154             /* select the point to add, in constant time */
1155             select_point(bits, 16, g_pre_comp[0], tmp);
1156             point_add(nq[0], nq[1], nq[2],
1157                       nq[0], nq[1], nq[2],
1158                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1159         }
1160
1161         /* do other additions every 5 doublings */
1162         if (num_points && (i % 5 == 0)) {
1163             /* loop over all scalars */
1164             for (num = 0; num < num_points; ++num) {
1165                 bits = get_bit(scalars[num], i + 4) << 5;
1166                 bits |= get_bit(scalars[num], i + 3) << 4;
1167                 bits |= get_bit(scalars[num], i + 2) << 3;
1168                 bits |= get_bit(scalars[num], i + 1) << 2;
1169                 bits |= get_bit(scalars[num], i) << 1;
1170                 bits |= get_bit(scalars[num], i - 1);
1171                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1172
1173                 /* select the point to add or subtract */
1174                 select_point(digit, 17, pre_comp[num], tmp);
1175                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1176                                             * point */
1177                 copy_conditional(tmp[1], tmp[3], sign);
1178
1179                 if (!skip) {
1180                     point_add(nq[0], nq[1], nq[2],
1181                               nq[0], nq[1], nq[2],
1182                               mixed, tmp[0], tmp[1], tmp[2]);
1183                 } else {
1184                     memcpy(nq, tmp, 3 * sizeof(felem));
1185                     skip = 0;
1186                 }
1187             }
1188         }
1189     }
1190     felem_assign(x_out, nq[0]);
1191     felem_assign(y_out, nq[1]);
1192     felem_assign(z_out, nq[2]);
1193 }
1194
1195 /******************************************************************************/
1196 /*
1197  * FUNCTIONS TO MANAGE PRECOMPUTATION
1198  */
1199
1200 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1201 {
1202     NISTP224_PRE_COMP *ret = NULL;
1203     ret = OPENSSL_malloc(sizeof *ret);
1204     if (!ret) {
1205         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1206         return ret;
1207     }
1208     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1209     ret->references = 1;
1210     return ret;
1211 }
1212
1213 static void *nistp224_pre_comp_dup(void *src_)
1214 {
1215     NISTP224_PRE_COMP *src = src_;
1216
1217     /* no need to actually copy, these objects never change! */
1218     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1219
1220     return src_;
1221 }
1222
1223 static void nistp224_pre_comp_free(void *pre_)
1224 {
1225     int i;
1226     NISTP224_PRE_COMP *pre = pre_;
1227
1228     if (!pre)
1229         return;
1230
1231     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1232     if (i > 0)
1233         return;
1234
1235     OPENSSL_free(pre);
1236 }
1237
1238 static void nistp224_pre_comp_clear_free(void *pre_)
1239 {
1240     int i;
1241     NISTP224_PRE_COMP *pre = pre_;
1242
1243     if (!pre)
1244         return;
1245
1246     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1247     if (i > 0)
1248         return;
1249
1250     OPENSSL_cleanse(pre, sizeof *pre);
1251     OPENSSL_free(pre);
1252 }
1253
1254 /******************************************************************************/
1255 /*
1256  * OPENSSL EC_METHOD FUNCTIONS
1257  */
1258
1259 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1260 {
1261     int ret;
1262     ret = ec_GFp_simple_group_init(group);
1263     group->a_is_minus3 = 1;
1264     return ret;
1265 }
1266
1267 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1268                                     const BIGNUM *a, const BIGNUM *b,
1269                                     BN_CTX *ctx)
1270 {
1271     int ret = 0;
1272     BN_CTX *new_ctx = NULL;
1273     BIGNUM *curve_p, *curve_a, *curve_b;
1274
1275     if (ctx == NULL)
1276         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1277             return 0;
1278     BN_CTX_start(ctx);
1279     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1280         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1281         ((curve_b = BN_CTX_get(ctx)) == NULL))
1282         goto err;
1283     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1284     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1285     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1286     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1287         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1288               EC_R_WRONG_CURVE_PARAMETERS);
1289         goto err;
1290     }
1291     group->field_mod_func = BN_nist_mod_224;
1292     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1293  err:
1294     BN_CTX_end(ctx);
1295     if (new_ctx != NULL)
1296         BN_CTX_free(new_ctx);
1297     return ret;
1298 }
1299
1300 /*
1301  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1302  * (X/Z^2, Y/Z^3)
1303  */
1304 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1305                                                  const EC_POINT *point,
1306                                                  BIGNUM *x, BIGNUM *y,
1307                                                  BN_CTX *ctx)
1308 {
1309     felem z1, z2, x_in, y_in, x_out, y_out;
1310     widefelem tmp;
1311
1312     if (EC_POINT_is_at_infinity(group, point)) {
1313         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1314               EC_R_POINT_AT_INFINITY);
1315         return 0;
1316     }
1317     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1318         (!BN_to_felem(z1, point->Z)))
1319         return 0;
1320     felem_inv(z2, z1);
1321     felem_square(tmp, z2);
1322     felem_reduce(z1, tmp);
1323     felem_mul(tmp, x_in, z1);
1324     felem_reduce(x_in, tmp);
1325     felem_contract(x_out, x_in);
1326     if (x != NULL) {
1327         if (!felem_to_BN(x, x_out)) {
1328             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1329                   ERR_R_BN_LIB);
1330             return 0;
1331         }
1332     }
1333     felem_mul(tmp, z1, z2);
1334     felem_reduce(z1, tmp);
1335     felem_mul(tmp, y_in, z1);
1336     felem_reduce(y_in, tmp);
1337     felem_contract(y_out, y_in);
1338     if (y != NULL) {
1339         if (!felem_to_BN(y, y_out)) {
1340             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1341                   ERR_R_BN_LIB);
1342             return 0;
1343         }
1344     }
1345     return 1;
1346 }
1347
1348 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1349                                felem tmp_felems[ /* num+1 */ ])
1350 {
1351     /*
1352      * Runs in constant time, unless an input is the point at infinity (which
1353      * normally shouldn't happen).
1354      */
1355     ec_GFp_nistp_points_make_affine_internal(num,
1356                                              points,
1357                                              sizeof(felem),
1358                                              tmp_felems,
1359                                              (void (*)(void *))felem_one,
1360                                              (int (*)(const void *))
1361                                              felem_is_zero_int,
1362                                              (void (*)(void *, const void *))
1363                                              felem_assign,
1364                                              (void (*)(void *, const void *))
1365                                              felem_square_reduce, (void (*)
1366                                                                    (void *,
1367                                                                     const void
1368                                                                     *,
1369                                                                     const void
1370                                                                     *))
1371                                              felem_mul_reduce,
1372                                              (void (*)(void *, const void *))
1373                                              felem_inv,
1374                                              (void (*)(void *, const void *))
1375                                              felem_contract);
1376 }
1377
1378 /*
1379  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1380  * values Result is stored in r (r can equal one of the inputs).
1381  */
1382 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1383                                const BIGNUM *scalar, size_t num,
1384                                const EC_POINT *points[],
1385                                const BIGNUM *scalars[], BN_CTX *ctx)
1386 {
1387     int ret = 0;
1388     int j;
1389     unsigned i;
1390     int mixed = 0;
1391     BN_CTX *new_ctx = NULL;
1392     BIGNUM *x, *y, *z, *tmp_scalar;
1393     felem_bytearray g_secret;
1394     felem_bytearray *secrets = NULL;
1395     felem(*pre_comp)[17][3] = NULL;
1396     felem *tmp_felems = NULL;
1397     felem_bytearray tmp;
1398     unsigned num_bytes;
1399     int have_pre_comp = 0;
1400     size_t num_points = num;
1401     felem x_in, y_in, z_in, x_out, y_out, z_out;
1402     NISTP224_PRE_COMP *pre = NULL;
1403     const felem(*g_pre_comp)[16][3] = NULL;
1404     EC_POINT *generator = NULL;
1405     const EC_POINT *p = NULL;
1406     const BIGNUM *p_scalar = NULL;
1407
1408     if (ctx == NULL)
1409         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1410             return 0;
1411     BN_CTX_start(ctx);
1412     if (((x = BN_CTX_get(ctx)) == NULL) ||
1413         ((y = BN_CTX_get(ctx)) == NULL) ||
1414         ((z = BN_CTX_get(ctx)) == NULL) ||
1415         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1416         goto err;
1417
1418     if (scalar != NULL) {
1419         pre = EC_EX_DATA_get_data(group->extra_data,
1420                                   nistp224_pre_comp_dup,
1421                                   nistp224_pre_comp_free,
1422                                   nistp224_pre_comp_clear_free);
1423         if (pre)
1424             /* we have precomputation, try to use it */
1425             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1426         else
1427             /* try to use the standard precomputation */
1428             g_pre_comp = &gmul[0];
1429         generator = EC_POINT_new(group);
1430         if (generator == NULL)
1431             goto err;
1432         /* get the generator from precomputation */
1433         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1434             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1435             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1436             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1437             goto err;
1438         }
1439         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1440                                                       generator, x, y, z,
1441                                                       ctx))
1442             goto err;
1443         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1444             /* precomputation matches generator */
1445             have_pre_comp = 1;
1446         else
1447             /*
1448              * we don't have valid precomputation: treat the generator as a
1449              * random point
1450              */
1451             num_points = num_points + 1;
1452     }
1453
1454     if (num_points > 0) {
1455         if (num_points >= 3) {
1456             /*
1457              * unless we precompute multiples for just one or two points,
1458              * converting those into affine form is time well spent
1459              */
1460             mixed = 1;
1461         }
1462         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1463         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1464         if (mixed)
1465             tmp_felems =
1466                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1467         if ((secrets == NULL) || (pre_comp == NULL)
1468             || (mixed && (tmp_felems == NULL))) {
1469             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1470             goto err;
1471         }
1472
1473         /*
1474          * we treat NULL scalars as 0, and NULL points as points at infinity,
1475          * i.e., they contribute nothing to the linear combination
1476          */
1477         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1478         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1479         for (i = 0; i < num_points; ++i) {
1480             if (i == num)
1481                 /* the generator */
1482             {
1483                 p = EC_GROUP_get0_generator(group);
1484                 p_scalar = scalar;
1485             } else
1486                 /* the i^th point */
1487             {
1488                 p = points[i];
1489                 p_scalar = scalars[i];
1490             }
1491             if ((p_scalar != NULL) && (p != NULL)) {
1492                 /* reduce scalar to 0 <= scalar < 2^224 */
1493                 if ((BN_num_bits(p_scalar) > 224)
1494                     || (BN_is_negative(p_scalar))) {
1495                     /*
1496                      * this is an unusual input, and we don't guarantee
1497                      * constant-timeness
1498                      */
1499                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1500                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1501                         goto err;
1502                     }
1503                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1504                 } else
1505                     num_bytes = BN_bn2bin(p_scalar, tmp);
1506                 flip_endian(secrets[i], tmp, num_bytes);
1507                 /* precompute multiples */
1508                 if ((!BN_to_felem(x_out, p->X)) ||
1509                     (!BN_to_felem(y_out, p->Y)) ||
1510                     (!BN_to_felem(z_out, p->Z)))
1511                     goto err;
1512                 felem_assign(pre_comp[i][1][0], x_out);
1513                 felem_assign(pre_comp[i][1][1], y_out);
1514                 felem_assign(pre_comp[i][1][2], z_out);
1515                 for (j = 2; j <= 16; ++j) {
1516                     if (j & 1) {
1517                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1518                                   pre_comp[i][j][2], pre_comp[i][1][0],
1519                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1520                                   pre_comp[i][j - 1][0],
1521                                   pre_comp[i][j - 1][1],
1522                                   pre_comp[i][j - 1][2]);
1523                     } else {
1524                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1525                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1526                                      pre_comp[i][j / 2][1],
1527                                      pre_comp[i][j / 2][2]);
1528                     }
1529                 }
1530             }
1531         }
1532         if (mixed)
1533             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1534     }
1535
1536     /* the scalar for the generator */
1537     if ((scalar != NULL) && (have_pre_comp)) {
1538         memset(g_secret, 0, sizeof g_secret);
1539         /* reduce scalar to 0 <= scalar < 2^224 */
1540         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1541             /*
1542              * this is an unusual input, and we don't guarantee
1543              * constant-timeness
1544              */
1545             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1546                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1547                 goto err;
1548             }
1549             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1550         } else
1551             num_bytes = BN_bn2bin(scalar, tmp);
1552         flip_endian(g_secret, tmp, num_bytes);
1553         /* do the multiplication with generator precomputation */
1554         batch_mul(x_out, y_out, z_out,
1555                   (const felem_bytearray(*))secrets, num_points,
1556                   g_secret,
1557                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1558     } else
1559         /* do the multiplication without generator precomputation */
1560         batch_mul(x_out, y_out, z_out,
1561                   (const felem_bytearray(*))secrets, num_points,
1562                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1563     /* reduce the output to its unique minimal representation */
1564     felem_contract(x_in, x_out);
1565     felem_contract(y_in, y_out);
1566     felem_contract(z_in, z_out);
1567     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1568         (!felem_to_BN(z, z_in))) {
1569         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1570         goto err;
1571     }
1572     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1573
1574  err:
1575     BN_CTX_end(ctx);
1576     EC_POINT_free(generator);
1577     if (new_ctx != NULL)
1578         BN_CTX_free(new_ctx);
1579     if (secrets != NULL)
1580         OPENSSL_free(secrets);
1581     if (pre_comp != NULL)
1582         OPENSSL_free(pre_comp);
1583     if (tmp_felems != NULL)
1584         OPENSSL_free(tmp_felems);
1585     return ret;
1586 }
1587
1588 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1589 {
1590     int ret = 0;
1591     NISTP224_PRE_COMP *pre = NULL;
1592     int i, j;
1593     BN_CTX *new_ctx = NULL;
1594     BIGNUM *x, *y;
1595     EC_POINT *generator = NULL;
1596     felem tmp_felems[32];
1597
1598     /* throw away old precomputation */
1599     EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup,
1600                          nistp224_pre_comp_free,
1601                          nistp224_pre_comp_clear_free);
1602     if (ctx == NULL)
1603         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1604             return 0;
1605     BN_CTX_start(ctx);
1606     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1607         goto err;
1608     /* get the generator */
1609     if (group->generator == NULL)
1610         goto err;
1611     generator = EC_POINT_new(group);
1612     if (generator == NULL)
1613         goto err;
1614     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1615     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1616     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1617         goto err;
1618     if ((pre = nistp224_pre_comp_new()) == NULL)
1619         goto err;
1620     /*
1621      * if the generator is the standard one, use built-in precomputation
1622      */
1623     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1624         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1625         ret = 1;
1626         goto err;
1627     }
1628     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1629         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1630         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1631         goto err;
1632     /*
1633      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1634      * 2^140*G, 2^196*G for the second one
1635      */
1636     for (i = 1; i <= 8; i <<= 1) {
1637         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1638                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1639                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1640         for (j = 0; j < 27; ++j) {
1641             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1642                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1643                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1644         }
1645         if (i == 8)
1646             break;
1647         point_double(pre->g_pre_comp[0][2 * i][0],
1648                      pre->g_pre_comp[0][2 * i][1],
1649                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1650                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1651         for (j = 0; j < 27; ++j) {
1652             point_double(pre->g_pre_comp[0][2 * i][0],
1653                          pre->g_pre_comp[0][2 * i][1],
1654                          pre->g_pre_comp[0][2 * i][2],
1655                          pre->g_pre_comp[0][2 * i][0],
1656                          pre->g_pre_comp[0][2 * i][1],
1657                          pre->g_pre_comp[0][2 * i][2]);
1658         }
1659     }
1660     for (i = 0; i < 2; i++) {
1661         /* g_pre_comp[i][0] is the point at infinity */
1662         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1663         /* the remaining multiples */
1664         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1665         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1666                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1667                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1668                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1669                   pre->g_pre_comp[i][2][2]);
1670         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1671         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1672                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1673                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1674                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1675                   pre->g_pre_comp[i][2][2]);
1676         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1677         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1678                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1679                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1680                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1681                   pre->g_pre_comp[i][4][2]);
1682         /*
1683          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1684          */
1685         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1686                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1687                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1688                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1689                   pre->g_pre_comp[i][2][2]);
1690         for (j = 1; j < 8; ++j) {
1691             /* odd multiples: add G resp. 2^28*G */
1692             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1693                       pre->g_pre_comp[i][2 * j + 1][1],
1694                       pre->g_pre_comp[i][2 * j + 1][2],
1695                       pre->g_pre_comp[i][2 * j][0],
1696                       pre->g_pre_comp[i][2 * j][1],
1697                       pre->g_pre_comp[i][2 * j][2], 0,
1698                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1699                       pre->g_pre_comp[i][1][2]);
1700         }
1701     }
1702     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1703
1704     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
1705                              nistp224_pre_comp_free,
1706                              nistp224_pre_comp_clear_free))
1707         goto err;
1708     ret = 1;
1709     pre = NULL;
1710  err:
1711     BN_CTX_end(ctx);
1712     EC_POINT_free(generator);
1713     if (new_ctx != NULL)
1714         BN_CTX_free(new_ctx);
1715     nistp224_pre_comp_free(pre);
1716     return ret;
1717 }
1718
1719 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1720 {
1721     if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup,
1722                             nistp224_pre_comp_free,
1723                             nistp224_pre_comp_clear_free)
1724         != NULL)
1725         return 1;
1726     else
1727         return 0;
1728 }
1729
1730 #else
1731 static void *dummy = &dummy;
1732 #endif