crypto/ec/ecp_nistp224.c

   1 /* crypto/ec/ecp_nistp224.c */
   2 /*
   3  * Written by Emilia Kasper (Google) for the OpenSSL project.
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  23  *
  24  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  25  * and Adam Langley's public domain 64-bit C implementation of curve25519
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  30
  31 # include <stdint.h>
  32 # include <string.h>
  33 # include <openssl/err.h>
  34 # include "ec_lcl.h"
  35
  36 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  37   /* even with gcc, the typedef won't work for 32-bit platforms */
  38 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  39                                  * platforms */
  40 # else
  41 #  error "Need GCC 3.1 or later to define type uint128_t"
  42 # endif
  43
  44 typedef uint8_t u8;
  45 typedef uint64_t u64;
  46 typedef int64_t s64;
  47
  48 /******************************************************************************/
  49 /*-
  50  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  51  *
  52  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  53  * using 64-bit coefficients called 'limbs',
  54  * and sometimes (for multiplication results) as
  55  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  56  * using 128-bit coefficients called 'widelimbs'.
  57  * A 4-limb representation is an 'felem';
  58  * a 7-widelimb representation is a 'widefelem'.
  59  * Even within felems, bits of adjacent limbs overlap, and we don't always
  60  * reduce the representations: we ensure that inputs to each felem
  61  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  62  * and fit into a 128-bit word without overflow. The coefficients are then
  63  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  64  * We only reduce to the unique minimal representation at the end of the
  65  * computation.
  66  */
  67
  68 typedef uint64_t limb;
  69 typedef uint128_t widelimb;
  70
  71 typedef limb felem[4];
  72 typedef widelimb widefelem[7];
  73
  74 /*
  75  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  76  * group order size for the elliptic curve, and we also use this type for
  77  * scalars for point multiplication.
  78  */
  79 typedef u8 felem_bytearray[28];
  80
  81 static const felem_bytearray nistp224_curve_params[5] = {
  82     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  83      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  84      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  85     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  86      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  88     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  89      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  90      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  91     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  92      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  93      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  94     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  95      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  96      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  97 };
  98
  99 /*-
 100  * Precomputed multiples of the standard generator
 101  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 102  * (0 for the point at infinity).
 103  * For each field element, slice a_0 is word 0, etc.
 104  *
 105  * The table has 2 * 16 elements, starting with the following:
 106  * index | bits    | point
 107  * ------+---------+------------------------------
 108  *     0 | 0 0 0 0 | 0G
 109  *     1 | 0 0 0 1 | 1G
 110  *     2 | 0 0 1 0 | 2^56G
 111  *     3 | 0 0 1 1 | (2^56 + 1)G
 112  *     4 | 0 1 0 0 | 2^112G
 113  *     5 | 0 1 0 1 | (2^112 + 1)G
 114  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 115  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 116  *     8 | 1 0 0 0 | 2^168G
 117  *     9 | 1 0 0 1 | (2^168 + 1)G
 118  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 119  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 120  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 121  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 122  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 123  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 124  * followed by a copy of this with each element multiplied by 2^28.
 125  *
 126  * The reason for this is so that we can clock bits into four different
 127  * locations when doing simple scalar multiplies against the base point,
 128  * and then another four locations using the second 16 elements.
 129  */
 130 static const felem gmul[2][16][3] = {
 131 {{{0, 0, 0, 0},
 132   {0, 0, 0, 0},
 133   {0, 0, 0, 0}},
 134  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 135   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 136   {1, 0, 0, 0}},
 137  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 138   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 139   {1, 0, 0, 0}},
 140  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 141   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 142   {1, 0, 0, 0}},
 143  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 144   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 145   {1, 0, 0, 0}},
 146  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 147   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 148   {1, 0, 0, 0}},
 149  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 150   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 151   {1, 0, 0, 0}},
 152  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 153   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 154   {1, 0, 0, 0}},
 155  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 156   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 157   {1, 0, 0, 0}},
 158  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 159   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 160   {1, 0, 0, 0}},
 161  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 162   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 163   {1, 0, 0, 0}},
 164  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 165   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 166   {1, 0, 0, 0}},
 167  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 168   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 169   {1, 0, 0, 0}},
 170  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 171   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 172   {1, 0, 0, 0}},
 173  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 174   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 175   {1, 0, 0, 0}},
 176  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 177   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 178   {1, 0, 0, 0}}},
 179 {{{0, 0, 0, 0},
 180   {0, 0, 0, 0},
 181   {0, 0, 0, 0}},
 182  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 183   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 184   {1, 0, 0, 0}},
 185  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 186   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 187   {1, 0, 0, 0}},
 188  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 189   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 190   {1, 0, 0, 0}},
 191  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 192   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 193   {1, 0, 0, 0}},
 194  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 195   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 196   {1, 0, 0, 0}},
 197  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 198   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 199   {1, 0, 0, 0}},
 200  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 201   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 202   {1, 0, 0, 0}},
 203  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 204   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 205   {1, 0, 0, 0}},
 206  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 207   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 208   {1, 0, 0, 0}},
 209  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 210   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 211   {1, 0, 0, 0}},
 212  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 213   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 214   {1, 0, 0, 0}},
 215  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 216   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 217   {1, 0, 0, 0}},
 218  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 219   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 220   {1, 0, 0, 0}},
 221  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 222   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 223   {1, 0, 0, 0}},
 224  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 225   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 226   {1, 0, 0, 0}}}
 227 };
 228
 229 /* Precomputation for the group generator. */
 230 typedef struct {
 231     felem g_pre_comp[2][16][3];
 232     int references;
 233 } NISTP224_PRE_COMP;
 234
 235 const EC_METHOD *EC_GFp_nistp224_method(void)
 236 {
 237     static const EC_METHOD ret = {
 238         EC_FLAGS_DEFAULT_OCT,
 239         NID_X9_62_prime_field,
 240         ec_GFp_nistp224_group_init,
 241         ec_GFp_simple_group_finish,
 242         ec_GFp_simple_group_clear_finish,
 243         ec_GFp_nist_group_copy,
 244         ec_GFp_nistp224_group_set_curve,
 245         ec_GFp_simple_group_get_curve,
 246         ec_GFp_simple_group_get_degree,
 247         ec_GFp_simple_group_check_discriminant,
 248         ec_GFp_simple_point_init,
 249         ec_GFp_simple_point_finish,
 250         ec_GFp_simple_point_clear_finish,
 251         ec_GFp_simple_point_copy,
 252         ec_GFp_simple_point_set_to_infinity,
 253         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 254         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 255         ec_GFp_simple_point_set_affine_coordinates,
 256         ec_GFp_nistp224_point_get_affine_coordinates,
 257         0 /* point_set_compressed_coordinates */ ,
 258         0 /* point2oct */ ,
 259         0 /* oct2point */ ,
 260         ec_GFp_simple_add,
 261         ec_GFp_simple_dbl,
 262         ec_GFp_simple_invert,
 263         ec_GFp_simple_is_at_infinity,
 264         ec_GFp_simple_is_on_curve,
 265         ec_GFp_simple_cmp,
 266         ec_GFp_simple_make_affine,
 267         ec_GFp_simple_points_make_affine,
 268         ec_GFp_nistp224_points_mul,
 269         ec_GFp_nistp224_precompute_mult,
 270         ec_GFp_nistp224_have_precompute_mult,
 271         ec_GFp_nist_field_mul,
 272         ec_GFp_nist_field_sqr,
 273         0 /* field_div */ ,
 274         0 /* field_encode */ ,
 275         0 /* field_decode */ ,
 276         0                       /* field_set_to_one */
 277     };
 278
 279     return &ret;
 280 }
 281
 282 /*
 283  * Helper functions to convert field elements to/from internal representation
 284  */
 285 static void bin28_to_felem(felem out, const u8 in[28])
 286 {
 287     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 288     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 289     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 290     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 291 }
 292
 293 static void felem_to_bin28(u8 out[28], const felem in)
 294 {
 295     unsigned i;
 296     for (i = 0; i < 7; ++i) {
 297         out[i] = in[0] >> (8 * i);
 298         out[i + 7] = in[1] >> (8 * i);
 299         out[i + 14] = in[2] >> (8 * i);
 300         out[i + 21] = in[3] >> (8 * i);
 301     }
 302 }
 303
 304 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 305 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 306 {
 307     unsigned i;
 308     for (i = 0; i < len; ++i)
 309         out[i] = in[len - 1 - i];
 310 }
 311
 312 /* From OpenSSL BIGNUM to internal representation */
 313 static int BN_to_felem(felem out, const BIGNUM *bn)
 314 {
 315     felem_bytearray b_in;
 316     felem_bytearray b_out;
 317     unsigned num_bytes;
 318
 319     /* BN_bn2bin eats leading zeroes */
 320     memset(b_out, 0, sizeof b_out);
 321     num_bytes = BN_num_bytes(bn);
 322     if (num_bytes > sizeof b_out) {
 323         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 324         return 0;
 325     }
 326     if (BN_is_negative(bn)) {
 327         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 328         return 0;
 329     }
 330     num_bytes = BN_bn2bin(bn, b_in);
 331     flip_endian(b_out, b_in, num_bytes);
 332     bin28_to_felem(out, b_out);
 333     return 1;
 334 }
 335
 336 /* From internal representation to OpenSSL BIGNUM */
 337 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 338 {
 339     felem_bytearray b_in, b_out;
 340     felem_to_bin28(b_in, in);
 341     flip_endian(b_out, b_in, sizeof b_out);
 342     return BN_bin2bn(b_out, sizeof b_out, out);
 343 }
 344
 345 /******************************************************************************/
 346 /*-
 347  *                              FIELD OPERATIONS
 348  *
 349  * Field operations, using the internal representation of field elements.
 350  * NB! These operations are specific to our point multiplication and cannot be
 351  * expected to be correct in general - e.g., multiplication with a large scalar
 352  * will cause an overflow.
 353  *
 354  */
 355
 356 static void felem_one(felem out)
 357 {
 358     out[0] = 1;
 359     out[1] = 0;
 360     out[2] = 0;
 361     out[3] = 0;
 362 }
 363
 364 static void felem_assign(felem out, const felem in)
 365 {
 366     out[0] = in[0];
 367     out[1] = in[1];
 368     out[2] = in[2];
 369     out[3] = in[3];
 370 }
 371
 372 /* Sum two field elements: out += in */
 373 static void felem_sum(felem out, const felem in)
 374 {
 375     out[0] += in[0];
 376     out[1] += in[1];
 377     out[2] += in[2];
 378     out[3] += in[3];
 379 }
 380
 381 /* Get negative value: out = -in */
 382 /* Assumes in[i] < 2^57 */
 383 static void felem_neg(felem out, const felem in)
 384 {
 385     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 386     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 387     static const limb two58m42m2 = (((limb) 1) << 58) -
 388         (((limb) 1) << 42) - (((limb) 1) << 2);
 389
 390     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 391     out[0] = two58p2 - in[0];
 392     out[1] = two58m42m2 - in[1];
 393     out[2] = two58m2 - in[2];
 394     out[3] = two58m2 - in[3];
 395 }
 396
 397 /* Subtract field elements: out -= in */
 398 /* Assumes in[i] < 2^57 */
 399 static void felem_diff(felem out, const felem in)
 400 {
 401     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 402     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 403     static const limb two58m42m2 = (((limb) 1) << 58) -
 404         (((limb) 1) << 42) - (((limb) 1) << 2);
 405
 406     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 407     out[0] += two58p2;
 408     out[1] += two58m42m2;
 409     out[2] += two58m2;
 410     out[3] += two58m2;
 411
 412     out[0] -= in[0];
 413     out[1] -= in[1];
 414     out[2] -= in[2];
 415     out[3] -= in[3];
 416 }
 417
 418 /* Subtract in unreduced 128-bit mode: out -= in */
 419 /* Assumes in[i] < 2^119 */
 420 static void widefelem_diff(widefelem out, const widefelem in)
 421 {
 422     static const widelimb two120 = ((widelimb) 1) << 120;
 423     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 424         (((widelimb) 1) << 64);
 425     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 426         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 427
 428     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 429     out[0] += two120;
 430     out[1] += two120m64;
 431     out[2] += two120m64;
 432     out[3] += two120;
 433     out[4] += two120m104m64;
 434     out[5] += two120m64;
 435     out[6] += two120m64;
 436
 437     out[0] -= in[0];
 438     out[1] -= in[1];
 439     out[2] -= in[2];
 440     out[3] -= in[3];
 441     out[4] -= in[4];
 442     out[5] -= in[5];
 443     out[6] -= in[6];
 444 }
 445
 446 /* Subtract in mixed mode: out128 -= in64 */
 447 /* in[i] < 2^63 */
 448 static void felem_diff_128_64(widefelem out, const felem in)
 449 {
 450     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 451         (((widelimb) 1) << 8);
 452     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 453         (((widelimb) 1) << 8);
 454     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 455         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 456
 457     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 458     out[0] += two64p8;
 459     out[1] += two64m48m8;
 460     out[2] += two64m8;
 461     out[3] += two64m8;
 462
 463     out[0] -= in[0];
 464     out[1] -= in[1];
 465     out[2] -= in[2];
 466     out[3] -= in[3];
 467 }
 468
 469 /*
 470  * Multiply a field element by a scalar: out = out * scalar The scalars we
 471  * actually use are small, so results fit without overflow
 472  */
 473 static void felem_scalar(felem out, const limb scalar)
 474 {
 475     out[0] *= scalar;
 476     out[1] *= scalar;
 477     out[2] *= scalar;
 478     out[3] *= scalar;
 479 }
 480
 481 /*
 482  * Multiply an unreduced field element by a scalar: out = out * scalar The
 483  * scalars we actually use are small, so results fit without overflow
 484  */
 485 static void widefelem_scalar(widefelem out, const widelimb scalar)
 486 {
 487     out[0] *= scalar;
 488     out[1] *= scalar;
 489     out[2] *= scalar;
 490     out[3] *= scalar;
 491     out[4] *= scalar;
 492     out[5] *= scalar;
 493     out[6] *= scalar;
 494 }
 495
 496 /* Square a field element: out = in^2 */
 497 static void felem_square(widefelem out, const felem in)
 498 {
 499     limb tmp0, tmp1, tmp2;
 500     tmp0 = 2 * in[0];
 501     tmp1 = 2 * in[1];
 502     tmp2 = 2 * in[2];
 503     out[0] = ((widelimb) in[0]) * in[0];
 504     out[1] = ((widelimb) in[0]) * tmp1;
 505     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 506     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 507     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 508     out[5] = ((widelimb) in[3]) * tmp2;
 509     out[6] = ((widelimb) in[3]) * in[3];
 510 }
 511
 512 /* Multiply two field elements: out = in1 * in2 */
 513 static void felem_mul(widefelem out, const felem in1, const felem in2)
 514 {
 515     out[0] = ((widelimb) in1[0]) * in2[0];
 516     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 517     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 518              ((widelimb) in1[2]) * in2[0];
 519     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 520              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 521     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 522              ((widelimb) in1[3]) * in2[1];
 523     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 524     out[6] = ((widelimb) in1[3]) * in2[3];
 525 }
 526
 527 /*-
 528  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 529  * Requires in[i] < 2^126,
 530  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 531 static void felem_reduce(felem out, const widefelem in)
 532 {
 533     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 534         (((widelimb) 1) << 15);
 535     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 536         (((widelimb) 1) << 71);
 537     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 538         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 539     widelimb output[5];
 540
 541     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 542     output[0] = in[0] + two127p15;
 543     output[1] = in[1] + two127m71m55;
 544     output[2] = in[2] + two127m71;
 545     output[3] = in[3];
 546     output[4] = in[4];
 547
 548     /* Eliminate in[4], in[5], in[6] */
 549     output[4] += in[6] >> 16;
 550     output[3] += (in[6] & 0xffff) << 40;
 551     output[2] -= in[6];
 552
 553     output[3] += in[5] >> 16;
 554     output[2] += (in[5] & 0xffff) << 40;
 555     output[1] -= in[5];
 556
 557     output[2] += output[4] >> 16;
 558     output[1] += (output[4] & 0xffff) << 40;
 559     output[0] -= output[4];
 560
 561     /* Carry 2 -> 3 -> 4 */
 562     output[3] += output[2] >> 56;
 563     output[2] &= 0x00ffffffffffffff;
 564
 565     output[4] = output[3] >> 56;
 566     output[3] &= 0x00ffffffffffffff;
 567
 568     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 569
 570     /* Eliminate output[4] */
 571     output[2] += output[4] >> 16;
 572     /* output[2] < 2^56 + 2^56 = 2^57 */
 573     output[1] += (output[4] & 0xffff) << 40;
 574     output[0] -= output[4];
 575
 576     /* Carry 0 -> 1 -> 2 -> 3 */
 577     output[1] += output[0] >> 56;
 578     out[0] = output[0] & 0x00ffffffffffffff;
 579
 580     output[2] += output[1] >> 56;
 581     /* output[2] < 2^57 + 2^72 */
 582     out[1] = output[1] & 0x00ffffffffffffff;
 583     output[3] += output[2] >> 56;
 584     /* output[3] <= 2^56 + 2^16 */
 585     out[2] = output[2] & 0x00ffffffffffffff;
 586
 587     /*-
 588      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 589      * out[3] <= 2^56 + 2^16 (due to final carry),
 590      * so out < 2*p
 591      */
 592     out[3] = output[3];
 593 }
 594
 595 static void felem_square_reduce(felem out, const felem in)
 596 {
 597     widefelem tmp;
 598     felem_square(tmp, in);
 599     felem_reduce(out, tmp);
 600 }
 601
 602 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 603 {
 604     widefelem tmp;
 605     felem_mul(tmp, in1, in2);
 606     felem_reduce(out, tmp);
 607 }
 608
 609 /*
 610  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 611  * call felem_reduce first)
 612  */
 613 static void felem_contract(felem out, const felem in)
 614 {
 615     static const int64_t two56 = ((limb) 1) << 56;
 616     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 617     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 618     int64_t tmp[4], a;
 619     tmp[0] = in[0];
 620     tmp[1] = in[1];
 621     tmp[2] = in[2];
 622     tmp[3] = in[3];
 623     /* Case 1: a = 1 iff in >= 2^224 */
 624     a = (in[3] >> 56);
 625     tmp[0] -= a;
 626     tmp[1] += a << 40;
 627     tmp[3] &= 0x00ffffffffffffff;
 628     /*
 629      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 630      * and the lower part is non-zero
 631      */
 632     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 633         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 634     a &= 0x00ffffffffffffff;
 635     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 636     a = (a - 1) >> 63;
 637     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 638     tmp[3] &= a ^ 0xffffffffffffffff;
 639     tmp[2] &= a ^ 0xffffffffffffffff;
 640     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 641     tmp[0] -= 1 & a;
 642
 643     /*
 644      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 645      * non-zero, so we only need one step
 646      */
 647     a = tmp[0] >> 63;
 648     tmp[0] += two56 & a;
 649     tmp[1] -= 1 & a;
 650
 651     /* carry 1 -> 2 -> 3 */
 652     tmp[2] += tmp[1] >> 56;
 653     tmp[1] &= 0x00ffffffffffffff;
 654
 655     tmp[3] += tmp[2] >> 56;
 656     tmp[2] &= 0x00ffffffffffffff;
 657
 658     /* Now 0 <= out < p */
 659     out[0] = tmp[0];
 660     out[1] = tmp[1];
 661     out[2] = tmp[2];
 662     out[3] = tmp[3];
 663 }
 664
 665 /*
 666  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 667  * elements are reduced to in < 2^225, so we only need to check three cases:
 668  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 669  */
 670 static limb felem_is_zero(const felem in)
 671 {
 672     limb zero, two224m96p1, two225m97p2;
 673
 674     zero = in[0] | in[1] | in[2] | in[3];
 675     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 676     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 677         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 678     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 679     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 680         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 681     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 682     return (zero | two224m96p1 | two225m97p2);
 683 }
 684
 685 static limb felem_is_zero_int(const felem in)
 686 {
 687     return (int)(felem_is_zero(in) & ((limb) 1));
 688 }
 689
 690 /* Invert a field element */
 691 /* Computation chain copied from djb's code */
 692 static void felem_inv(felem out, const felem in)
 693 {
 694     felem ftmp, ftmp2, ftmp3, ftmp4;
 695     widefelem tmp;
 696     unsigned i;
 697
 698     felem_square(tmp, in);
 699     felem_reduce(ftmp, tmp);    /* 2 */
 700     felem_mul(tmp, in, ftmp);
 701     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 702     felem_square(tmp, ftmp);
 703     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 704     felem_mul(tmp, in, ftmp);
 705     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 706     felem_square(tmp, ftmp);
 707     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 708     felem_square(tmp, ftmp2);
 709     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 712     felem_mul(tmp, ftmp2, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 714     felem_square(tmp, ftmp);
 715     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 716     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 717         felem_square(tmp, ftmp2);
 718         felem_reduce(ftmp2, tmp);
 719     }
 720     felem_mul(tmp, ftmp2, ftmp);
 721     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 722     felem_square(tmp, ftmp2);
 723     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 724     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 725         felem_square(tmp, ftmp3);
 726         felem_reduce(ftmp3, tmp);
 727     }
 728     felem_mul(tmp, ftmp3, ftmp2);
 729     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 730     felem_square(tmp, ftmp2);
 731     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 732     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 733         felem_square(tmp, ftmp3);
 734         felem_reduce(ftmp3, tmp);
 735     }
 736     felem_mul(tmp, ftmp3, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 738     felem_square(tmp, ftmp3);
 739     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 740     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 741         felem_square(tmp, ftmp4);
 742         felem_reduce(ftmp4, tmp);
 743     }
 744     felem_mul(tmp, ftmp3, ftmp4);
 745     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 746     felem_square(tmp, ftmp3);
 747     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 748     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 749         felem_square(tmp, ftmp4);
 750         felem_reduce(ftmp4, tmp);
 751     }
 752     felem_mul(tmp, ftmp2, ftmp4);
 753     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 754     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 755         felem_square(tmp, ftmp2);
 756         felem_reduce(ftmp2, tmp);
 757     }
 758     felem_mul(tmp, ftmp2, ftmp);
 759     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 760     felem_square(tmp, ftmp);
 761     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 762     felem_mul(tmp, ftmp, in);
 763     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 764     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 765         felem_square(tmp, ftmp);
 766         felem_reduce(ftmp, tmp);
 767     }
 768     felem_mul(tmp, ftmp, ftmp3);
 769     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 770 }
 771
 772 /*
 773  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 774  * out to itself.
 775  */
 776 static void copy_conditional(felem out, const felem in, limb icopy)
 777 {
 778     unsigned i;
 779     /*
 780      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 781      */
 782     const limb copy = -icopy;
 783     for (i = 0; i < 4; ++i) {
 784         const limb tmp = copy & (in[i] ^ out[i]);
 785         out[i] ^= tmp;
 786     }
 787 }
 788
 789 /******************************************************************************/
 790 /*-
 791  *                       ELLIPTIC CURVE POINT OPERATIONS
 792  *
 793  * Points are represented in Jacobian projective coordinates:
 794  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 795  * or to the point at infinity if Z == 0.
 796  *
 797  */
 798
 799 /*-
 800  * Double an elliptic curve point:
 801  * (X', Y', Z') = 2 * (X, Y, Z), where
 802  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 803  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 804  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 805  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 806  * while x_out == y_in is not (maybe this works, but it's not tested).
 807  */
 808 static void
 809 point_double(felem x_out, felem y_out, felem z_out,
 810              const felem x_in, const felem y_in, const felem z_in)
 811 {
 812     widefelem tmp, tmp2;
 813     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 814
 815     felem_assign(ftmp, x_in);
 816     felem_assign(ftmp2, x_in);
 817
 818     /* delta = z^2 */
 819     felem_square(tmp, z_in);
 820     felem_reduce(delta, tmp);
 821
 822     /* gamma = y^2 */
 823     felem_square(tmp, y_in);
 824     felem_reduce(gamma, tmp);
 825
 826     /* beta = x*gamma */
 827     felem_mul(tmp, x_in, gamma);
 828     felem_reduce(beta, tmp);
 829
 830     /* alpha = 3*(x-delta)*(x+delta) */
 831     felem_diff(ftmp, delta);
 832     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 833     felem_sum(ftmp2, delta);
 834     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 835     felem_scalar(ftmp2, 3);
 836     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 837     felem_mul(tmp, ftmp, ftmp2);
 838     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 839     felem_reduce(alpha, tmp);
 840
 841     /* x' = alpha^2 - 8*beta */
 842     felem_square(tmp, alpha);
 843     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 844     felem_assign(ftmp, beta);
 845     felem_scalar(ftmp, 8);
 846     /* ftmp[i] < 8 * 2^57 = 2^60 */
 847     felem_diff_128_64(tmp, ftmp);
 848     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 849     felem_reduce(x_out, tmp);
 850
 851     /* z' = (y + z)^2 - gamma - delta */
 852     felem_sum(delta, gamma);
 853     /* delta[i] < 2^57 + 2^57 = 2^58 */
 854     felem_assign(ftmp, y_in);
 855     felem_sum(ftmp, z_in);
 856     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 857     felem_square(tmp, ftmp);
 858     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 859     felem_diff_128_64(tmp, delta);
 860     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 861     felem_reduce(z_out, tmp);
 862
 863     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 864     felem_scalar(beta, 4);
 865     /* beta[i] < 4 * 2^57 = 2^59 */
 866     felem_diff(beta, x_out);
 867     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 868     felem_mul(tmp, alpha, beta);
 869     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 870     felem_square(tmp2, gamma);
 871     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 872     widefelem_scalar(tmp2, 8);
 873     /* tmp2[i] < 8 * 2^116 = 2^119 */
 874     widefelem_diff(tmp, tmp2);
 875     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 876     felem_reduce(y_out, tmp);
 877 }
 878
 879 /*-
 880  * Add two elliptic curve points:
 881  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 882  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 883  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 884  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 885  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 886  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 887  *
 888  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 889  */
 890
 891 /*
 892  * This function is not entirely constant-time: it includes a branch for
 893  * checking whether the two input points are equal, (while not equal to the
 894  * point at infinity). This case never happens during single point
 895  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 896  */
 897 static void point_add(felem x3, felem y3, felem z3,
 898                       const felem x1, const felem y1, const felem z1,
 899                       const int mixed, const felem x2, const felem y2,
 900                       const felem z2)
 901 {
 902     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 903     widefelem tmp, tmp2;
 904     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 905
 906     if (!mixed) {
 907         /* ftmp2 = z2^2 */
 908         felem_square(tmp, z2);
 909         felem_reduce(ftmp2, tmp);
 910
 911         /* ftmp4 = z2^3 */
 912         felem_mul(tmp, ftmp2, z2);
 913         felem_reduce(ftmp4, tmp);
 914
 915         /* ftmp4 = z2^3*y1 */
 916         felem_mul(tmp2, ftmp4, y1);
 917         felem_reduce(ftmp4, tmp2);
 918
 919         /* ftmp2 = z2^2*x1 */
 920         felem_mul(tmp2, ftmp2, x1);
 921         felem_reduce(ftmp2, tmp2);
 922     } else {
 923         /*
 924          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 925          */
 926
 927         /* ftmp4 = z2^3*y1 */
 928         felem_assign(ftmp4, y1);
 929
 930         /* ftmp2 = z2^2*x1 */
 931         felem_assign(ftmp2, x1);
 932     }
 933
 934     /* ftmp = z1^2 */
 935     felem_square(tmp, z1);
 936     felem_reduce(ftmp, tmp);
 937
 938     /* ftmp3 = z1^3 */
 939     felem_mul(tmp, ftmp, z1);
 940     felem_reduce(ftmp3, tmp);
 941
 942     /* tmp = z1^3*y2 */
 943     felem_mul(tmp, ftmp3, y2);
 944     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 945
 946     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 947     felem_diff_128_64(tmp, ftmp4);
 948     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 949     felem_reduce(ftmp3, tmp);
 950
 951     /* tmp = z1^2*x2 */
 952     felem_mul(tmp, ftmp, x2);
 953     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 954
 955     /* ftmp = z1^2*x2 - z2^2*x1 */
 956     felem_diff_128_64(tmp, ftmp2);
 957     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 958     felem_reduce(ftmp, tmp);
 959
 960     /*
 961      * the formulae are incorrect if the points are equal so we check for
 962      * this and do doubling if this happens
 963      */
 964     x_equal = felem_is_zero(ftmp);
 965     y_equal = felem_is_zero(ftmp3);
 966     z1_is_zero = felem_is_zero(z1);
 967     z2_is_zero = felem_is_zero(z2);
 968     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 969     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 970         point_double(x3, y3, z3, x1, y1, z1);
 971         return;
 972     }
 973
 974     /* ftmp5 = z1*z2 */
 975     if (!mixed) {
 976         felem_mul(tmp, z1, z2);
 977         felem_reduce(ftmp5, tmp);
 978     } else {
 979         /* special case z2 = 0 is handled later */
 980         felem_assign(ftmp5, z1);
 981     }
 982
 983     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 984     felem_mul(tmp, ftmp, ftmp5);
 985     felem_reduce(z_out, tmp);
 986
 987     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 988     felem_assign(ftmp5, ftmp);
 989     felem_square(tmp, ftmp);
 990     felem_reduce(ftmp, tmp);
 991
 992     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
 993     felem_mul(tmp, ftmp, ftmp5);
 994     felem_reduce(ftmp5, tmp);
 995
 996     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
 997     felem_mul(tmp, ftmp2, ftmp);
 998     felem_reduce(ftmp2, tmp);
 999
1000     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1001     felem_mul(tmp, ftmp4, ftmp5);
1002     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1003
1004     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1005     felem_square(tmp2, ftmp3);
1006     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1007
1008     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1009     felem_diff_128_64(tmp2, ftmp5);
1010     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1011
1012     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1013     felem_assign(ftmp5, ftmp2);
1014     felem_scalar(ftmp5, 2);
1015     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1016
1017     /*-
1018      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1019      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1020      */
1021     felem_diff_128_64(tmp2, ftmp5);
1022     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1023     felem_reduce(x_out, tmp2);
1024
1025     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1026     felem_diff(ftmp2, x_out);
1027     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1028
1029     /*
1030      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1031      */
1032     felem_mul(tmp2, ftmp3, ftmp2);
1033     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1034
1035     /*-
1036      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1037      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1038      */
1039     widefelem_diff(tmp2, tmp);
1040     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1041     felem_reduce(y_out, tmp2);
1042
1043     /*
1044      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1045      * the point at infinity, so we need to check for this separately
1046      */
1047
1048     /*
1049      * if point 1 is at infinity, copy point 2 to output, and vice versa
1050      */
1051     copy_conditional(x_out, x2, z1_is_zero);
1052     copy_conditional(x_out, x1, z2_is_zero);
1053     copy_conditional(y_out, y2, z1_is_zero);
1054     copy_conditional(y_out, y1, z2_is_zero);
1055     copy_conditional(z_out, z2, z1_is_zero);
1056     copy_conditional(z_out, z1, z2_is_zero);
1057     felem_assign(x3, x_out);
1058     felem_assign(y3, y_out);
1059     felem_assign(z3, z_out);
1060 }
1061
1062 /*
1063  * select_point selects the |idx|th point from a precomputation table and
1064  * copies it to out.
1065  * The pre_comp array argument should be size of |size| argument
1066  */
1067 static void select_point(const u64 idx, unsigned int size,
1068                          const felem pre_comp[][3], felem out[3])
1069 {
1070     unsigned i, j;
1071     limb *outlimbs = &out[0][0];
1072     memset(outlimbs, 0, 3 * sizeof(felem));
1073
1074     for (i = 0; i < size; i++) {
1075         const limb *inlimbs = &pre_comp[i][0][0];
1076         u64 mask = i ^ idx;
1077         mask |= mask >> 4;
1078         mask |= mask >> 2;
1079         mask |= mask >> 1;
1080         mask &= 1;
1081         mask--;
1082         for (j = 0; j < 4 * 3; j++)
1083             outlimbs[j] |= inlimbs[j] & mask;
1084     }
1085 }
1086
1087 /* get_bit returns the |i|th bit in |in| */
1088 static char get_bit(const felem_bytearray in, unsigned i)
1089 {
1090     if (i >= 224)
1091         return 0;
1092     return (in[i >> 3] >> (i & 7)) & 1;
1093 }
1094
1095 /*
1096  * Interleaved point multiplication using precomputed point multiples: The
1097  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1098  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1099  * generator, using certain (large) precomputed multiples in g_pre_comp.
1100  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1101  */
1102 static void batch_mul(felem x_out, felem y_out, felem z_out,
1103                       const felem_bytearray scalars[],
1104                       const unsigned num_points, const u8 *g_scalar,
1105                       const int mixed, const felem pre_comp[][17][3],
1106                       const felem g_pre_comp[2][16][3])
1107 {
1108     int i, skip;
1109     unsigned num;
1110     unsigned gen_mul = (g_scalar != NULL);
1111     felem nq[3], tmp[4];
1112     u64 bits;
1113     u8 sign, digit;
1114
1115     /* set nq to the point at infinity */
1116     memset(nq, 0, 3 * sizeof(felem));
1117
1118     /*
1119      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1120      * of the generator (two in each of the last 28 rounds) and additions of
1121      * other points multiples (every 5th round).
1122      */
1123     skip = 1;                   /* save two point operations in the first
1124                                  * round */
1125     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1126         /* double */
1127         if (!skip)
1128             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1129
1130         /* add multiples of the generator */
1131         if (gen_mul && (i <= 27)) {
1132             /* first, look 28 bits upwards */
1133             bits = get_bit(g_scalar, i + 196) << 3;
1134             bits |= get_bit(g_scalar, i + 140) << 2;
1135             bits |= get_bit(g_scalar, i + 84) << 1;
1136             bits |= get_bit(g_scalar, i + 28);
1137             /* select the point to add, in constant time */
1138             select_point(bits, 16, g_pre_comp[1], tmp);
1139
1140             if (!skip) {
1141                 /* value 1 below is argument for "mixed" */
1142                 point_add(nq[0], nq[1], nq[2],
1143                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1144             } else {
1145                 memcpy(nq, tmp, 3 * sizeof(felem));
1146                 skip = 0;
1147             }
1148
1149             /* second, look at the current position */
1150             bits = get_bit(g_scalar, i + 168) << 3;
1151             bits |= get_bit(g_scalar, i + 112) << 2;
1152             bits |= get_bit(g_scalar, i + 56) << 1;
1153             bits |= get_bit(g_scalar, i);
1154             /* select the point to add, in constant time */
1155             select_point(bits, 16, g_pre_comp[0], tmp);
1156             point_add(nq[0], nq[1], nq[2],
1157                       nq[0], nq[1], nq[2],
1158                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1159         }
1160
1161         /* do other additions every 5 doublings */
1162         if (num_points && (i % 5 == 0)) {
1163             /* loop over all scalars */
1164             for (num = 0; num < num_points; ++num) {
1165                 bits = get_bit(scalars[num], i + 4) << 5;
1166                 bits |= get_bit(scalars[num], i + 3) << 4;
1167                 bits |= get_bit(scalars[num], i + 2) << 3;
1168                 bits |= get_bit(scalars[num], i + 1) << 2;
1169                 bits |= get_bit(scalars[num], i) << 1;
1170                 bits |= get_bit(scalars[num], i - 1);
1171                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1172
1173                 /* select the point to add or subtract */
1174                 select_point(digit, 17, pre_comp[num], tmp);
1175                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1176                                             * point */
1177                 copy_conditional(tmp[1], tmp[3], sign);
1178
1179                 if (!skip) {
1180                     point_add(nq[0], nq[1], nq[2],
1181                               nq[0], nq[1], nq[2],
1182                               mixed, tmp[0], tmp[1], tmp[2]);
1183                 } else {
1184                     memcpy(nq, tmp, 3 * sizeof(felem));
1185                     skip = 0;
1186                 }
1187             }
1188         }
1189     }
1190     felem_assign(x_out, nq[0]);
1191     felem_assign(y_out, nq[1]);
1192     felem_assign(z_out, nq[2]);
1193 }
1194
1195 /******************************************************************************/
1196 /*
1197  * FUNCTIONS TO MANAGE PRECOMPUTATION
1198  */
1199
1200 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1201 {
1202     NISTP224_PRE_COMP *ret = NULL;
1203     ret = OPENSSL_malloc(sizeof *ret);
1204     if (!ret) {
1205         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1206         return ret;
1207     }
1208     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1209     ret->references = 1;
1210     return ret;
1211 }
1212
1213 static void *nistp224_pre_comp_dup(void *src_)
1214 {
1215     NISTP224_PRE_COMP *src = src_;
1216
1217     /* no need to actually copy, these objects never change! */
1218     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1219
1220     return src_;
1221 }
1222
1223 static void nistp224_pre_comp_free(void *pre_)
1224 {
1225     int i;
1226     NISTP224_PRE_COMP *pre = pre_;
1227
1228     if (!pre)
1229         return;
1230
1231     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1232     if (i > 0)
1233         return;
1234
1235     OPENSSL_free(pre);
1236 }
1237
1238 static void nistp224_pre_comp_clear_free(void *pre_)
1239 {
1240     int i;
1241     NISTP224_PRE_COMP *pre = pre_;
1242
1243     if (!pre)
1244         return;
1245
1246     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1247     if (i > 0)
1248         return;
1249
1250     OPENSSL_clear_free(pre, sizeof *pre);
1251 }
1252
1253 /******************************************************************************/
1254 /*
1255  * OPENSSL EC_METHOD FUNCTIONS
1256  */
1257
1258 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1259 {
1260     int ret;
1261     ret = ec_GFp_simple_group_init(group);
1262     group->a_is_minus3 = 1;
1263     return ret;
1264 }
1265
1266 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1267                                     const BIGNUM *a, const BIGNUM *b,
1268                                     BN_CTX *ctx)
1269 {
1270     int ret = 0;
1271     BN_CTX *new_ctx = NULL;
1272     BIGNUM *curve_p, *curve_a, *curve_b;
1273
1274     if (ctx == NULL)
1275         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1276             return 0;
1277     BN_CTX_start(ctx);
1278     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1279         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1280         ((curve_b = BN_CTX_get(ctx)) == NULL))
1281         goto err;
1282     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1283     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1284     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1285     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1286         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1287               EC_R_WRONG_CURVE_PARAMETERS);
1288         goto err;
1289     }
1290     group->field_mod_func = BN_nist_mod_224;
1291     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1292  err:
1293     BN_CTX_end(ctx);
1294     if (new_ctx != NULL)
1295         BN_CTX_free(new_ctx);
1296     return ret;
1297 }
1298
1299 /*
1300  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1301  * (X/Z^2, Y/Z^3)
1302  */
1303 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1304                                                  const EC_POINT *point,
1305                                                  BIGNUM *x, BIGNUM *y,
1306                                                  BN_CTX *ctx)
1307 {
1308     felem z1, z2, x_in, y_in, x_out, y_out;
1309     widefelem tmp;
1310
1311     if (EC_POINT_is_at_infinity(group, point)) {
1312         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1313               EC_R_POINT_AT_INFINITY);
1314         return 0;
1315     }
1316     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1317         (!BN_to_felem(z1, point->Z)))
1318         return 0;
1319     felem_inv(z2, z1);
1320     felem_square(tmp, z2);
1321     felem_reduce(z1, tmp);
1322     felem_mul(tmp, x_in, z1);
1323     felem_reduce(x_in, tmp);
1324     felem_contract(x_out, x_in);
1325     if (x != NULL) {
1326         if (!felem_to_BN(x, x_out)) {
1327             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1328                   ERR_R_BN_LIB);
1329             return 0;
1330         }
1331     }
1332     felem_mul(tmp, z1, z2);
1333     felem_reduce(z1, tmp);
1334     felem_mul(tmp, y_in, z1);
1335     felem_reduce(y_in, tmp);
1336     felem_contract(y_out, y_in);
1337     if (y != NULL) {
1338         if (!felem_to_BN(y, y_out)) {
1339             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1340                   ERR_R_BN_LIB);
1341             return 0;
1342         }
1343     }
1344     return 1;
1345 }
1346
1347 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1348                                felem tmp_felems[ /* num+1 */ ])
1349 {
1350     /*
1351      * Runs in constant time, unless an input is the point at infinity (which
1352      * normally shouldn't happen).
1353      */
1354     ec_GFp_nistp_points_make_affine_internal(num,
1355                                              points,
1356                                              sizeof(felem),
1357                                              tmp_felems,
1358                                              (void (*)(void *))felem_one,
1359                                              (int (*)(const void *))
1360                                              felem_is_zero_int,
1361                                              (void (*)(void *, const void *))
1362                                              felem_assign,
1363                                              (void (*)(void *, const void *))
1364                                              felem_square_reduce, (void (*)
1365                                                                    (void *,
1366                                                                     const void
1367                                                                     *,
1368                                                                     const void
1369                                                                     *))
1370                                              felem_mul_reduce,
1371                                              (void (*)(void *, const void *))
1372                                              felem_inv,
1373                                              (void (*)(void *, const void *))
1374                                              felem_contract);
1375 }
1376
1377 /*
1378  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1379  * values Result is stored in r (r can equal one of the inputs).
1380  */
1381 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1382                                const BIGNUM *scalar, size_t num,
1383                                const EC_POINT *points[],
1384                                const BIGNUM *scalars[], BN_CTX *ctx)
1385 {
1386     int ret = 0;
1387     int j;
1388     unsigned i;
1389     int mixed = 0;
1390     BN_CTX *new_ctx = NULL;
1391     BIGNUM *x, *y, *z, *tmp_scalar;
1392     felem_bytearray g_secret;
1393     felem_bytearray *secrets = NULL;
1394     felem(*pre_comp)[17][3] = NULL;
1395     felem *tmp_felems = NULL;
1396     felem_bytearray tmp;
1397     unsigned num_bytes;
1398     int have_pre_comp = 0;
1399     size_t num_points = num;
1400     felem x_in, y_in, z_in, x_out, y_out, z_out;
1401     NISTP224_PRE_COMP *pre = NULL;
1402     const felem(*g_pre_comp)[16][3] = NULL;
1403     EC_POINT *generator = NULL;
1404     const EC_POINT *p = NULL;
1405     const BIGNUM *p_scalar = NULL;
1406
1407     if (ctx == NULL)
1408         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1409             return 0;
1410     BN_CTX_start(ctx);
1411     if (((x = BN_CTX_get(ctx)) == NULL) ||
1412         ((y = BN_CTX_get(ctx)) == NULL) ||
1413         ((z = BN_CTX_get(ctx)) == NULL) ||
1414         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1415         goto err;
1416
1417     if (scalar != NULL) {
1418         pre = EC_EX_DATA_get_data(group->extra_data,
1419                                   nistp224_pre_comp_dup,
1420                                   nistp224_pre_comp_free,
1421                                   nistp224_pre_comp_clear_free);
1422         if (pre)
1423             /* we have precomputation, try to use it */
1424             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1425         else
1426             /* try to use the standard precomputation */
1427             g_pre_comp = &gmul[0];
1428         generator = EC_POINT_new(group);
1429         if (generator == NULL)
1430             goto err;
1431         /* get the generator from precomputation */
1432         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1433             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1434             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1435             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1436             goto err;
1437         }
1438         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1439                                                       generator, x, y, z,
1440                                                       ctx))
1441             goto err;
1442         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1443             /* precomputation matches generator */
1444             have_pre_comp = 1;
1445         else
1446             /*
1447              * we don't have valid precomputation: treat the generator as a
1448              * random point
1449              */
1450             num_points = num_points + 1;
1451     }
1452
1453     if (num_points > 0) {
1454         if (num_points >= 3) {
1455             /*
1456              * unless we precompute multiples for just one or two points,
1457              * converting those into affine form is time well spent
1458              */
1459             mixed = 1;
1460         }
1461         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1462         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1463         if (mixed)
1464             tmp_felems =
1465                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1466         if ((secrets == NULL) || (pre_comp == NULL)
1467             || (mixed && (tmp_felems == NULL))) {
1468             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1469             goto err;
1470         }
1471
1472         /*
1473          * we treat NULL scalars as 0, and NULL points as points at infinity,
1474          * i.e., they contribute nothing to the linear combination
1475          */
1476         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1477         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1478         for (i = 0; i < num_points; ++i) {
1479             if (i == num)
1480                 /* the generator */
1481             {
1482                 p = EC_GROUP_get0_generator(group);
1483                 p_scalar = scalar;
1484             } else
1485                 /* the i^th point */
1486             {
1487                 p = points[i];
1488                 p_scalar = scalars[i];
1489             }
1490             if ((p_scalar != NULL) && (p != NULL)) {
1491                 /* reduce scalar to 0 <= scalar < 2^224 */
1492                 if ((BN_num_bits(p_scalar) > 224)
1493                     || (BN_is_negative(p_scalar))) {
1494                     /*
1495                      * this is an unusual input, and we don't guarantee
1496                      * constant-timeness
1497                      */
1498                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1499                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1500                         goto err;
1501                     }
1502                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1503                 } else
1504                     num_bytes = BN_bn2bin(p_scalar, tmp);
1505                 flip_endian(secrets[i], tmp, num_bytes);
1506                 /* precompute multiples */
1507                 if ((!BN_to_felem(x_out, p->X)) ||
1508                     (!BN_to_felem(y_out, p->Y)) ||
1509                     (!BN_to_felem(z_out, p->Z)))
1510                     goto err;
1511                 felem_assign(pre_comp[i][1][0], x_out);
1512                 felem_assign(pre_comp[i][1][1], y_out);
1513                 felem_assign(pre_comp[i][1][2], z_out);
1514                 for (j = 2; j <= 16; ++j) {
1515                     if (j & 1) {
1516                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1517                                   pre_comp[i][j][2], pre_comp[i][1][0],
1518                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1519                                   pre_comp[i][j - 1][0],
1520                                   pre_comp[i][j - 1][1],
1521                                   pre_comp[i][j - 1][2]);
1522                     } else {
1523                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1524                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1525                                      pre_comp[i][j / 2][1],
1526                                      pre_comp[i][j / 2][2]);
1527                     }
1528                 }
1529             }
1530         }
1531         if (mixed)
1532             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1533     }
1534
1535     /* the scalar for the generator */
1536     if ((scalar != NULL) && (have_pre_comp)) {
1537         memset(g_secret, 0, sizeof g_secret);
1538         /* reduce scalar to 0 <= scalar < 2^224 */
1539         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1540             /*
1541              * this is an unusual input, and we don't guarantee
1542              * constant-timeness
1543              */
1544             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1545                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1546                 goto err;
1547             }
1548             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1549         } else
1550             num_bytes = BN_bn2bin(scalar, tmp);
1551         flip_endian(g_secret, tmp, num_bytes);
1552         /* do the multiplication with generator precomputation */
1553         batch_mul(x_out, y_out, z_out,
1554                   (const felem_bytearray(*))secrets, num_points,
1555                   g_secret,
1556                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1557     } else
1558         /* do the multiplication without generator precomputation */
1559         batch_mul(x_out, y_out, z_out,
1560                   (const felem_bytearray(*))secrets, num_points,
1561                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1562     /* reduce the output to its unique minimal representation */
1563     felem_contract(x_in, x_out);
1564     felem_contract(y_in, y_out);
1565     felem_contract(z_in, z_out);
1566     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1567         (!felem_to_BN(z, z_in))) {
1568         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1569         goto err;
1570     }
1571     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1572
1573  err:
1574     BN_CTX_end(ctx);
1575     EC_POINT_free(generator);
1576     if (new_ctx != NULL)
1577         BN_CTX_free(new_ctx);
1578     if (secrets != NULL)
1579         OPENSSL_free(secrets);
1580     if (pre_comp != NULL)
1581         OPENSSL_free(pre_comp);
1582     if (tmp_felems != NULL)
1583         OPENSSL_free(tmp_felems);
1584     return ret;
1585 }
1586
1587 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1588 {
1589     int ret = 0;
1590     NISTP224_PRE_COMP *pre = NULL;
1591     int i, j;
1592     BN_CTX *new_ctx = NULL;
1593     BIGNUM *x, *y;
1594     EC_POINT *generator = NULL;
1595     felem tmp_felems[32];
1596
1597     /* throw away old precomputation */
1598     EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup,
1599                          nistp224_pre_comp_free,
1600                          nistp224_pre_comp_clear_free);
1601     if (ctx == NULL)
1602         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1603             return 0;
1604     BN_CTX_start(ctx);
1605     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1606         goto err;
1607     /* get the generator */
1608     if (group->generator == NULL)
1609         goto err;
1610     generator = EC_POINT_new(group);
1611     if (generator == NULL)
1612         goto err;
1613     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1614     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1615     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1616         goto err;
1617     if ((pre = nistp224_pre_comp_new()) == NULL)
1618         goto err;
1619     /*
1620      * if the generator is the standard one, use built-in precomputation
1621      */
1622     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1623         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1624         ret = 1;
1625         goto err;
1626     }
1627     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1628         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1629         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1630         goto err;
1631     /*
1632      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1633      * 2^140*G, 2^196*G for the second one
1634      */
1635     for (i = 1; i <= 8; i <<= 1) {
1636         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1637                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1638                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1639         for (j = 0; j < 27; ++j) {
1640             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1641                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1642                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1643         }
1644         if (i == 8)
1645             break;
1646         point_double(pre->g_pre_comp[0][2 * i][0],
1647                      pre->g_pre_comp[0][2 * i][1],
1648                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1649                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1650         for (j = 0; j < 27; ++j) {
1651             point_double(pre->g_pre_comp[0][2 * i][0],
1652                          pre->g_pre_comp[0][2 * i][1],
1653                          pre->g_pre_comp[0][2 * i][2],
1654                          pre->g_pre_comp[0][2 * i][0],
1655                          pre->g_pre_comp[0][2 * i][1],
1656                          pre->g_pre_comp[0][2 * i][2]);
1657         }
1658     }
1659     for (i = 0; i < 2; i++) {
1660         /* g_pre_comp[i][0] is the point at infinity */
1661         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1662         /* the remaining multiples */
1663         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1664         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1665                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1666                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1667                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1668                   pre->g_pre_comp[i][2][2]);
1669         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1670         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1671                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1672                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1673                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1674                   pre->g_pre_comp[i][2][2]);
1675         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1676         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1677                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1678                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1679                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1680                   pre->g_pre_comp[i][4][2]);
1681         /*
1682          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1683          */
1684         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1685                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1686                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1687                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1688                   pre->g_pre_comp[i][2][2]);
1689         for (j = 1; j < 8; ++j) {
1690             /* odd multiples: add G resp. 2^28*G */
1691             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1692                       pre->g_pre_comp[i][2 * j + 1][1],
1693                       pre->g_pre_comp[i][2 * j + 1][2],
1694                       pre->g_pre_comp[i][2 * j][0],
1695                       pre->g_pre_comp[i][2 * j][1],
1696                       pre->g_pre_comp[i][2 * j][2], 0,
1697                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1698                       pre->g_pre_comp[i][1][2]);
1699         }
1700     }
1701     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1702
1703     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
1704                              nistp224_pre_comp_free,
1705                              nistp224_pre_comp_clear_free))
1706         goto err;
1707     ret = 1;
1708     pre = NULL;
1709  err:
1710     BN_CTX_end(ctx);
1711     EC_POINT_free(generator);
1712     if (new_ctx != NULL)
1713         BN_CTX_free(new_ctx);
1714     nistp224_pre_comp_free(pre);
1715     return ret;
1716 }
1717
1718 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1719 {
1720     if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup,
1721                             nistp224_pre_comp_free,
1722                             nistp224_pre_comp_clear_free)
1723         != NULL)
1724         return 1;
1725     else
1726         return 0;
1727 }
1728
1729 #else
1730 static void *dummy = &dummy;
1731 #endif