crypto/ec/ecp_nistp224.c

   1 /* crypto/ec/ecp_nistp224.c */
   2 /*
   3  * Written by Emilia Kasper (Google) for the OpenSSL project.
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  23  *
  24  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  25  * and Adam Langley's public domain 64-bit C implementation of curve25519
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  30
  31 # include <stdint.h>
  32 # include <string.h>
  33 # include <openssl/err.h>
  34 # include "ec_lcl.h"
  35
  36 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  37   /* even with gcc, the typedef won't work for 32-bit platforms */
  38 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  39                                  * platforms */
  40 # else
  41 #  error "Need GCC 3.1 or later to define type uint128_t"
  42 # endif
  43
  44 typedef uint8_t u8;
  45 typedef uint64_t u64;
  46 typedef int64_t s64;
  47
  48 /******************************************************************************/
  49 /*-
  50  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  51  *
  52  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  53  * using 64-bit coefficients called 'limbs',
  54  * and sometimes (for multiplication results) as
  55  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  56  * using 128-bit coefficients called 'widelimbs'.
  57  * A 4-limb representation is an 'felem';
  58  * a 7-widelimb representation is a 'widefelem'.
  59  * Even within felems, bits of adjacent limbs overlap, and we don't always
  60  * reduce the representations: we ensure that inputs to each felem
  61  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  62  * and fit into a 128-bit word without overflow. The coefficients are then
  63  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  64  * We only reduce to the unique minimal representation at the end of the
  65  * computation.
  66  */
  67
  68 typedef uint64_t limb;
  69 typedef uint128_t widelimb;
  70
  71 typedef limb felem[4];
  72 typedef widelimb widefelem[7];
  73
  74 /*
  75  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  76  * group order size for the elliptic curve, and we also use this type for
  77  * scalars for point multiplication.
  78  */
  79 typedef u8 felem_bytearray[28];
  80
  81 static const felem_bytearray nistp224_curve_params[5] = {
  82     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  83      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  84      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  85     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  86      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  88     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  89      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  90      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  91     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  92      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  93      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  94     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  95      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  96      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  97 };
  98
  99 /*-
 100  * Precomputed multiples of the standard generator
 101  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 102  * (0 for the point at infinity).
 103  * For each field element, slice a_0 is word 0, etc.
 104  *
 105  * The table has 2 * 16 elements, starting with the following:
 106  * index | bits    | point
 107  * ------+---------+------------------------------
 108  *     0 | 0 0 0 0 | 0G
 109  *     1 | 0 0 0 1 | 1G
 110  *     2 | 0 0 1 0 | 2^56G
 111  *     3 | 0 0 1 1 | (2^56 + 1)G
 112  *     4 | 0 1 0 0 | 2^112G
 113  *     5 | 0 1 0 1 | (2^112 + 1)G
 114  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 115  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 116  *     8 | 1 0 0 0 | 2^168G
 117  *     9 | 1 0 0 1 | (2^168 + 1)G
 118  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 119  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 120  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 121  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 122  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 123  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 124  * followed by a copy of this with each element multiplied by 2^28.
 125  *
 126  * The reason for this is so that we can clock bits into four different
 127  * locations when doing simple scalar multiplies against the base point,
 128  * and then another four locations using the second 16 elements.
 129  */
 130 static const felem gmul[2][16][3] = {
 131 {{{0, 0, 0, 0},
 132   {0, 0, 0, 0},
 133   {0, 0, 0, 0}},
 134  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 135   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 136   {1, 0, 0, 0}},
 137  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 138   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 139   {1, 0, 0, 0}},
 140  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 141   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 142   {1, 0, 0, 0}},
 143  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 144   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 145   {1, 0, 0, 0}},
 146  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 147   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 148   {1, 0, 0, 0}},
 149  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 150   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 151   {1, 0, 0, 0}},
 152  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 153   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 154   {1, 0, 0, 0}},
 155  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 156   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 157   {1, 0, 0, 0}},
 158  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 159   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 160   {1, 0, 0, 0}},
 161  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 162   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 163   {1, 0, 0, 0}},
 164  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 165   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 166   {1, 0, 0, 0}},
 167  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 168   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 169   {1, 0, 0, 0}},
 170  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 171   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 172   {1, 0, 0, 0}},
 173  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 174   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 175   {1, 0, 0, 0}},
 176  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 177   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 178   {1, 0, 0, 0}}},
 179 {{{0, 0, 0, 0},
 180   {0, 0, 0, 0},
 181   {0, 0, 0, 0}},
 182  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 183   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 184   {1, 0, 0, 0}},
 185  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 186   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 187   {1, 0, 0, 0}},
 188  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 189   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 190   {1, 0, 0, 0}},
 191  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 192   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 193   {1, 0, 0, 0}},
 194  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 195   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 196   {1, 0, 0, 0}},
 197  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 198   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 199   {1, 0, 0, 0}},
 200  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 201   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 202   {1, 0, 0, 0}},
 203  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 204   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 205   {1, 0, 0, 0}},
 206  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 207   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 208   {1, 0, 0, 0}},
 209  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 210   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 211   {1, 0, 0, 0}},
 212  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 213   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 214   {1, 0, 0, 0}},
 215  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 216   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 217   {1, 0, 0, 0}},
 218  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 219   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 220   {1, 0, 0, 0}},
 221  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 222   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 223   {1, 0, 0, 0}},
 224  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 225   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 226   {1, 0, 0, 0}}}
 227 };
 228
 229 /* Precomputation for the group generator. */
 230 typedef struct {
 231     felem g_pre_comp[2][16][3];
 232     int references;
 233 } NISTP224_PRE_COMP;
 234
 235 const EC_METHOD *EC_GFp_nistp224_method(void)
 236 {
 237     static const EC_METHOD ret = {
 238         EC_FLAGS_DEFAULT_OCT,
 239         NID_X9_62_prime_field,
 240         ec_GFp_nistp224_group_init,
 241         ec_GFp_simple_group_finish,
 242         ec_GFp_simple_group_clear_finish,
 243         ec_GFp_nist_group_copy,
 244         ec_GFp_nistp224_group_set_curve,
 245         ec_GFp_simple_group_get_curve,
 246         ec_GFp_simple_group_get_degree,
 247         ec_GFp_simple_group_check_discriminant,
 248         ec_GFp_simple_point_init,
 249         ec_GFp_simple_point_finish,
 250         ec_GFp_simple_point_clear_finish,
 251         ec_GFp_simple_point_copy,
 252         ec_GFp_simple_point_set_to_infinity,
 253         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 254         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 255         ec_GFp_simple_point_set_affine_coordinates,
 256         ec_GFp_nistp224_point_get_affine_coordinates,
 257         0 /* point_set_compressed_coordinates */ ,
 258         0 /* point2oct */ ,
 259         0 /* oct2point */ ,
 260         ec_GFp_simple_add,
 261         ec_GFp_simple_dbl,
 262         ec_GFp_simple_invert,
 263         ec_GFp_simple_is_at_infinity,
 264         ec_GFp_simple_is_on_curve,
 265         ec_GFp_simple_cmp,
 266         ec_GFp_simple_make_affine,
 267         ec_GFp_simple_points_make_affine,
 268         ec_GFp_nistp224_points_mul,
 269         ec_GFp_nistp224_precompute_mult,
 270         ec_GFp_nistp224_have_precompute_mult,
 271         ec_GFp_nist_field_mul,
 272         ec_GFp_nist_field_sqr,
 273         0 /* field_div */ ,
 274         0 /* field_encode */ ,
 275         0 /* field_decode */ ,
 276         0                       /* field_set_to_one */
 277     };
 278
 279     return &ret;
 280 }
 281
 282 /*
 283  * Helper functions to convert field elements to/from internal representation
 284  */
 285 static void bin28_to_felem(felem out, const u8 in[28])
 286 {
 287     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 288     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 289     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 290     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 291 }
 292
 293 static void felem_to_bin28(u8 out[28], const felem in)
 294 {
 295     unsigned i;
 296     for (i = 0; i < 7; ++i) {
 297         out[i] = in[0] >> (8 * i);
 298         out[i + 7] = in[1] >> (8 * i);
 299         out[i + 14] = in[2] >> (8 * i);
 300         out[i + 21] = in[3] >> (8 * i);
 301     }
 302 }
 303
 304 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 305 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 306 {
 307     unsigned i;
 308     for (i = 0; i < len; ++i)
 309         out[i] = in[len - 1 - i];
 310 }
 311
 312 /* From OpenSSL BIGNUM to internal representation */
 313 static int BN_to_felem(felem out, const BIGNUM *bn)
 314 {
 315     felem_bytearray b_in;
 316     felem_bytearray b_out;
 317     unsigned num_bytes;
 318
 319     /* BN_bn2bin eats leading zeroes */
 320     memset(b_out, 0, sizeof(b_out));
 321     num_bytes = BN_num_bytes(bn);
 322     if (num_bytes > sizeof b_out) {
 323         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 324         return 0;
 325     }
 326     if (BN_is_negative(bn)) {
 327         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 328         return 0;
 329     }
 330     num_bytes = BN_bn2bin(bn, b_in);
 331     flip_endian(b_out, b_in, num_bytes);
 332     bin28_to_felem(out, b_out);
 333     return 1;
 334 }
 335
 336 /* From internal representation to OpenSSL BIGNUM */
 337 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 338 {
 339     felem_bytearray b_in, b_out;
 340     felem_to_bin28(b_in, in);
 341     flip_endian(b_out, b_in, sizeof b_out);
 342     return BN_bin2bn(b_out, sizeof b_out, out);
 343 }
 344
 345 /******************************************************************************/
 346 /*-
 347  *                              FIELD OPERATIONS
 348  *
 349  * Field operations, using the internal representation of field elements.
 350  * NB! These operations are specific to our point multiplication and cannot be
 351  * expected to be correct in general - e.g., multiplication with a large scalar
 352  * will cause an overflow.
 353  *
 354  */
 355
 356 static void felem_one(felem out)
 357 {
 358     out[0] = 1;
 359     out[1] = 0;
 360     out[2] = 0;
 361     out[3] = 0;
 362 }
 363
 364 static void felem_assign(felem out, const felem in)
 365 {
 366     out[0] = in[0];
 367     out[1] = in[1];
 368     out[2] = in[2];
 369     out[3] = in[3];
 370 }
 371
 372 /* Sum two field elements: out += in */
 373 static void felem_sum(felem out, const felem in)
 374 {
 375     out[0] += in[0];
 376     out[1] += in[1];
 377     out[2] += in[2];
 378     out[3] += in[3];
 379 }
 380
 381 /* Get negative value: out = -in */
 382 /* Assumes in[i] < 2^57 */
 383 static void felem_neg(felem out, const felem in)
 384 {
 385     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 386     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 387     static const limb two58m42m2 = (((limb) 1) << 58) -
 388         (((limb) 1) << 42) - (((limb) 1) << 2);
 389
 390     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 391     out[0] = two58p2 - in[0];
 392     out[1] = two58m42m2 - in[1];
 393     out[2] = two58m2 - in[2];
 394     out[3] = two58m2 - in[3];
 395 }
 396
 397 /* Subtract field elements: out -= in */
 398 /* Assumes in[i] < 2^57 */
 399 static void felem_diff(felem out, const felem in)
 400 {
 401     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 402     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 403     static const limb two58m42m2 = (((limb) 1) << 58) -
 404         (((limb) 1) << 42) - (((limb) 1) << 2);
 405
 406     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 407     out[0] += two58p2;
 408     out[1] += two58m42m2;
 409     out[2] += two58m2;
 410     out[3] += two58m2;
 411
 412     out[0] -= in[0];
 413     out[1] -= in[1];
 414     out[2] -= in[2];
 415     out[3] -= in[3];
 416 }
 417
 418 /* Subtract in unreduced 128-bit mode: out -= in */
 419 /* Assumes in[i] < 2^119 */
 420 static void widefelem_diff(widefelem out, const widefelem in)
 421 {
 422     static const widelimb two120 = ((widelimb) 1) << 120;
 423     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 424         (((widelimb) 1) << 64);
 425     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 426         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 427
 428     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 429     out[0] += two120;
 430     out[1] += two120m64;
 431     out[2] += two120m64;
 432     out[3] += two120;
 433     out[4] += two120m104m64;
 434     out[5] += two120m64;
 435     out[6] += two120m64;
 436
 437     out[0] -= in[0];
 438     out[1] -= in[1];
 439     out[2] -= in[2];
 440     out[3] -= in[3];
 441     out[4] -= in[4];
 442     out[5] -= in[5];
 443     out[6] -= in[6];
 444 }
 445
 446 /* Subtract in mixed mode: out128 -= in64 */
 447 /* in[i] < 2^63 */
 448 static void felem_diff_128_64(widefelem out, const felem in)
 449 {
 450     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 451         (((widelimb) 1) << 8);
 452     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 453         (((widelimb) 1) << 8);
 454     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 455         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 456
 457     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 458     out[0] += two64p8;
 459     out[1] += two64m48m8;
 460     out[2] += two64m8;
 461     out[3] += two64m8;
 462
 463     out[0] -= in[0];
 464     out[1] -= in[1];
 465     out[2] -= in[2];
 466     out[3] -= in[3];
 467 }
 468
 469 /*
 470  * Multiply a field element by a scalar: out = out * scalar The scalars we
 471  * actually use are small, so results fit without overflow
 472  */
 473 static void felem_scalar(felem out, const limb scalar)
 474 {
 475     out[0] *= scalar;
 476     out[1] *= scalar;
 477     out[2] *= scalar;
 478     out[3] *= scalar;
 479 }
 480
 481 /*
 482  * Multiply an unreduced field element by a scalar: out = out * scalar The
 483  * scalars we actually use are small, so results fit without overflow
 484  */
 485 static void widefelem_scalar(widefelem out, const widelimb scalar)
 486 {
 487     out[0] *= scalar;
 488     out[1] *= scalar;
 489     out[2] *= scalar;
 490     out[3] *= scalar;
 491     out[4] *= scalar;
 492     out[5] *= scalar;
 493     out[6] *= scalar;
 494 }
 495
 496 /* Square a field element: out = in^2 */
 497 static void felem_square(widefelem out, const felem in)
 498 {
 499     limb tmp0, tmp1, tmp2;
 500     tmp0 = 2 * in[0];
 501     tmp1 = 2 * in[1];
 502     tmp2 = 2 * in[2];
 503     out[0] = ((widelimb) in[0]) * in[0];
 504     out[1] = ((widelimb) in[0]) * tmp1;
 505     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 506     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 507     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 508     out[5] = ((widelimb) in[3]) * tmp2;
 509     out[6] = ((widelimb) in[3]) * in[3];
 510 }
 511
 512 /* Multiply two field elements: out = in1 * in2 */
 513 static void felem_mul(widefelem out, const felem in1, const felem in2)
 514 {
 515     out[0] = ((widelimb) in1[0]) * in2[0];
 516     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 517     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 518              ((widelimb) in1[2]) * in2[0];
 519     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 520              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 521     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 522              ((widelimb) in1[3]) * in2[1];
 523     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 524     out[6] = ((widelimb) in1[3]) * in2[3];
 525 }
 526
 527 /*-
 528  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 529  * Requires in[i] < 2^126,
 530  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 531 static void felem_reduce(felem out, const widefelem in)
 532 {
 533     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 534         (((widelimb) 1) << 15);
 535     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 536         (((widelimb) 1) << 71);
 537     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 538         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 539     widelimb output[5];
 540
 541     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 542     output[0] = in[0] + two127p15;
 543     output[1] = in[1] + two127m71m55;
 544     output[2] = in[2] + two127m71;
 545     output[3] = in[3];
 546     output[4] = in[4];
 547
 548     /* Eliminate in[4], in[5], in[6] */
 549     output[4] += in[6] >> 16;
 550     output[3] += (in[6] & 0xffff) << 40;
 551     output[2] -= in[6];
 552
 553     output[3] += in[5] >> 16;
 554     output[2] += (in[5] & 0xffff) << 40;
 555     output[1] -= in[5];
 556
 557     output[2] += output[4] >> 16;
 558     output[1] += (output[4] & 0xffff) << 40;
 559     output[0] -= output[4];
 560
 561     /* Carry 2 -> 3 -> 4 */
 562     output[3] += output[2] >> 56;
 563     output[2] &= 0x00ffffffffffffff;
 564
 565     output[4] = output[3] >> 56;
 566     output[3] &= 0x00ffffffffffffff;
 567
 568     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 569
 570     /* Eliminate output[4] */
 571     output[2] += output[4] >> 16;
 572     /* output[2] < 2^56 + 2^56 = 2^57 */
 573     output[1] += (output[4] & 0xffff) << 40;
 574     output[0] -= output[4];
 575
 576     /* Carry 0 -> 1 -> 2 -> 3 */
 577     output[1] += output[0] >> 56;
 578     out[0] = output[0] & 0x00ffffffffffffff;
 579
 580     output[2] += output[1] >> 56;
 581     /* output[2] < 2^57 + 2^72 */
 582     out[1] = output[1] & 0x00ffffffffffffff;
 583     output[3] += output[2] >> 56;
 584     /* output[3] <= 2^56 + 2^16 */
 585     out[2] = output[2] & 0x00ffffffffffffff;
 586
 587     /*-
 588      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 589      * out[3] <= 2^56 + 2^16 (due to final carry),
 590      * so out < 2*p
 591      */
 592     out[3] = output[3];
 593 }
 594
 595 static void felem_square_reduce(felem out, const felem in)
 596 {
 597     widefelem tmp;
 598     felem_square(tmp, in);
 599     felem_reduce(out, tmp);
 600 }
 601
 602 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 603 {
 604     widefelem tmp;
 605     felem_mul(tmp, in1, in2);
 606     felem_reduce(out, tmp);
 607 }
 608
 609 /*
 610  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 611  * call felem_reduce first)
 612  */
 613 static void felem_contract(felem out, const felem in)
 614 {
 615     static const int64_t two56 = ((limb) 1) << 56;
 616     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 617     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 618     int64_t tmp[4], a;
 619     tmp[0] = in[0];
 620     tmp[1] = in[1];
 621     tmp[2] = in[2];
 622     tmp[3] = in[3];
 623     /* Case 1: a = 1 iff in >= 2^224 */
 624     a = (in[3] >> 56);
 625     tmp[0] -= a;
 626     tmp[1] += a << 40;
 627     tmp[3] &= 0x00ffffffffffffff;
 628     /*
 629      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 630      * and the lower part is non-zero
 631      */
 632     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 633         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 634     a &= 0x00ffffffffffffff;
 635     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 636     a = (a - 1) >> 63;
 637     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 638     tmp[3] &= a ^ 0xffffffffffffffff;
 639     tmp[2] &= a ^ 0xffffffffffffffff;
 640     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 641     tmp[0] -= 1 & a;
 642
 643     /*
 644      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 645      * non-zero, so we only need one step
 646      */
 647     a = tmp[0] >> 63;
 648     tmp[0] += two56 & a;
 649     tmp[1] -= 1 & a;
 650
 651     /* carry 1 -> 2 -> 3 */
 652     tmp[2] += tmp[1] >> 56;
 653     tmp[1] &= 0x00ffffffffffffff;
 654
 655     tmp[3] += tmp[2] >> 56;
 656     tmp[2] &= 0x00ffffffffffffff;
 657
 658     /* Now 0 <= out < p */
 659     out[0] = tmp[0];
 660     out[1] = tmp[1];
 661     out[2] = tmp[2];
 662     out[3] = tmp[3];
 663 }
 664
 665 /*
 666  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 667  * elements are reduced to in < 2^225, so we only need to check three cases:
 668  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 669  */
 670 static limb felem_is_zero(const felem in)
 671 {
 672     limb zero, two224m96p1, two225m97p2;
 673
 674     zero = in[0] | in[1] | in[2] | in[3];
 675     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 676     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 677         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 678     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 679     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 680         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 681     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 682     return (zero | two224m96p1 | two225m97p2);
 683 }
 684
 685 static limb felem_is_zero_int(const felem in)
 686 {
 687     return (int)(felem_is_zero(in) & ((limb) 1));
 688 }
 689
 690 /* Invert a field element */
 691 /* Computation chain copied from djb's code */
 692 static void felem_inv(felem out, const felem in)
 693 {
 694     felem ftmp, ftmp2, ftmp3, ftmp4;
 695     widefelem tmp;
 696     unsigned i;
 697
 698     felem_square(tmp, in);
 699     felem_reduce(ftmp, tmp);    /* 2 */
 700     felem_mul(tmp, in, ftmp);
 701     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 702     felem_square(tmp, ftmp);
 703     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 704     felem_mul(tmp, in, ftmp);
 705     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 706     felem_square(tmp, ftmp);
 707     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 708     felem_square(tmp, ftmp2);
 709     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 712     felem_mul(tmp, ftmp2, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 714     felem_square(tmp, ftmp);
 715     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 716     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 717         felem_square(tmp, ftmp2);
 718         felem_reduce(ftmp2, tmp);
 719     }
 720     felem_mul(tmp, ftmp2, ftmp);
 721     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 722     felem_square(tmp, ftmp2);
 723     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 724     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 725         felem_square(tmp, ftmp3);
 726         felem_reduce(ftmp3, tmp);
 727     }
 728     felem_mul(tmp, ftmp3, ftmp2);
 729     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 730     felem_square(tmp, ftmp2);
 731     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 732     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 733         felem_square(tmp, ftmp3);
 734         felem_reduce(ftmp3, tmp);
 735     }
 736     felem_mul(tmp, ftmp3, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 738     felem_square(tmp, ftmp3);
 739     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 740     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 741         felem_square(tmp, ftmp4);
 742         felem_reduce(ftmp4, tmp);
 743     }
 744     felem_mul(tmp, ftmp3, ftmp4);
 745     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 746     felem_square(tmp, ftmp3);
 747     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 748     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 749         felem_square(tmp, ftmp4);
 750         felem_reduce(ftmp4, tmp);
 751     }
 752     felem_mul(tmp, ftmp2, ftmp4);
 753     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 754     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 755         felem_square(tmp, ftmp2);
 756         felem_reduce(ftmp2, tmp);
 757     }
 758     felem_mul(tmp, ftmp2, ftmp);
 759     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 760     felem_square(tmp, ftmp);
 761     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 762     felem_mul(tmp, ftmp, in);
 763     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 764     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 765         felem_square(tmp, ftmp);
 766         felem_reduce(ftmp, tmp);
 767     }
 768     felem_mul(tmp, ftmp, ftmp3);
 769     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 770 }
 771
 772 /*
 773  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 774  * out to itself.
 775  */
 776 static void copy_conditional(felem out, const felem in, limb icopy)
 777 {
 778     unsigned i;
 779     /*
 780      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 781      */
 782     const limb copy = -icopy;
 783     for (i = 0; i < 4; ++i) {
 784         const limb tmp = copy & (in[i] ^ out[i]);
 785         out[i] ^= tmp;
 786     }
 787 }
 788
 789 /******************************************************************************/
 790 /*-
 791  *                       ELLIPTIC CURVE POINT OPERATIONS
 792  *
 793  * Points are represented in Jacobian projective coordinates:
 794  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 795  * or to the point at infinity if Z == 0.
 796  *
 797  */
 798
 799 /*-
 800  * Double an elliptic curve point:
 801  * (X', Y', Z') = 2 * (X, Y, Z), where
 802  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 803  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 804  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 805  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 806  * while x_out == y_in is not (maybe this works, but it's not tested).
 807  */
 808 static void
 809 point_double(felem x_out, felem y_out, felem z_out,
 810              const felem x_in, const felem y_in, const felem z_in)
 811 {
 812     widefelem tmp, tmp2;
 813     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 814
 815     felem_assign(ftmp, x_in);
 816     felem_assign(ftmp2, x_in);
 817
 818     /* delta = z^2 */
 819     felem_square(tmp, z_in);
 820     felem_reduce(delta, tmp);
 821
 822     /* gamma = y^2 */
 823     felem_square(tmp, y_in);
 824     felem_reduce(gamma, tmp);
 825
 826     /* beta = x*gamma */
 827     felem_mul(tmp, x_in, gamma);
 828     felem_reduce(beta, tmp);
 829
 830     /* alpha = 3*(x-delta)*(x+delta) */
 831     felem_diff(ftmp, delta);
 832     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 833     felem_sum(ftmp2, delta);
 834     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 835     felem_scalar(ftmp2, 3);
 836     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 837     felem_mul(tmp, ftmp, ftmp2);
 838     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 839     felem_reduce(alpha, tmp);
 840
 841     /* x' = alpha^2 - 8*beta */
 842     felem_square(tmp, alpha);
 843     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 844     felem_assign(ftmp, beta);
 845     felem_scalar(ftmp, 8);
 846     /* ftmp[i] < 8 * 2^57 = 2^60 */
 847     felem_diff_128_64(tmp, ftmp);
 848     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 849     felem_reduce(x_out, tmp);
 850
 851     /* z' = (y + z)^2 - gamma - delta */
 852     felem_sum(delta, gamma);
 853     /* delta[i] < 2^57 + 2^57 = 2^58 */
 854     felem_assign(ftmp, y_in);
 855     felem_sum(ftmp, z_in);
 856     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 857     felem_square(tmp, ftmp);
 858     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 859     felem_diff_128_64(tmp, delta);
 860     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 861     felem_reduce(z_out, tmp);
 862
 863     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 864     felem_scalar(beta, 4);
 865     /* beta[i] < 4 * 2^57 = 2^59 */
 866     felem_diff(beta, x_out);
 867     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 868     felem_mul(tmp, alpha, beta);
 869     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 870     felem_square(tmp2, gamma);
 871     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 872     widefelem_scalar(tmp2, 8);
 873     /* tmp2[i] < 8 * 2^116 = 2^119 */
 874     widefelem_diff(tmp, tmp2);
 875     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 876     felem_reduce(y_out, tmp);
 877 }
 878
 879 /*-
 880  * Add two elliptic curve points:
 881  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 882  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 883  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 884  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 885  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 886  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 887  *
 888  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 889  */
 890
 891 /*
 892  * This function is not entirely constant-time: it includes a branch for
 893  * checking whether the two input points are equal, (while not equal to the
 894  * point at infinity). This case never happens during single point
 895  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 896  */
 897 static void point_add(felem x3, felem y3, felem z3,
 898                       const felem x1, const felem y1, const felem z1,
 899                       const int mixed, const felem x2, const felem y2,
 900                       const felem z2)
 901 {
 902     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 903     widefelem tmp, tmp2;
 904     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 905
 906     if (!mixed) {
 907         /* ftmp2 = z2^2 */
 908         felem_square(tmp, z2);
 909         felem_reduce(ftmp2, tmp);
 910
 911         /* ftmp4 = z2^3 */
 912         felem_mul(tmp, ftmp2, z2);
 913         felem_reduce(ftmp4, tmp);
 914
 915         /* ftmp4 = z2^3*y1 */
 916         felem_mul(tmp2, ftmp4, y1);
 917         felem_reduce(ftmp4, tmp2);
 918
 919         /* ftmp2 = z2^2*x1 */
 920         felem_mul(tmp2, ftmp2, x1);
 921         felem_reduce(ftmp2, tmp2);
 922     } else {
 923         /*
 924          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 925          */
 926
 927         /* ftmp4 = z2^3*y1 */
 928         felem_assign(ftmp4, y1);
 929
 930         /* ftmp2 = z2^2*x1 */
 931         felem_assign(ftmp2, x1);
 932     }
 933
 934     /* ftmp = z1^2 */
 935     felem_square(tmp, z1);
 936     felem_reduce(ftmp, tmp);
 937
 938     /* ftmp3 = z1^3 */
 939     felem_mul(tmp, ftmp, z1);
 940     felem_reduce(ftmp3, tmp);
 941
 942     /* tmp = z1^3*y2 */
 943     felem_mul(tmp, ftmp3, y2);
 944     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 945
 946     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 947     felem_diff_128_64(tmp, ftmp4);
 948     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 949     felem_reduce(ftmp3, tmp);
 950
 951     /* tmp = z1^2*x2 */
 952     felem_mul(tmp, ftmp, x2);
 953     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 954
 955     /* ftmp = z1^2*x2 - z2^2*x1 */
 956     felem_diff_128_64(tmp, ftmp2);
 957     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 958     felem_reduce(ftmp, tmp);
 959
 960     /*
 961      * the formulae are incorrect if the points are equal so we check for
 962      * this and do doubling if this happens
 963      */
 964     x_equal = felem_is_zero(ftmp);
 965     y_equal = felem_is_zero(ftmp3);
 966     z1_is_zero = felem_is_zero(z1);
 967     z2_is_zero = felem_is_zero(z2);
 968     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 969     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 970         point_double(x3, y3, z3, x1, y1, z1);
 971         return;
 972     }
 973
 974     /* ftmp5 = z1*z2 */
 975     if (!mixed) {
 976         felem_mul(tmp, z1, z2);
 977         felem_reduce(ftmp5, tmp);
 978     } else {
 979         /* special case z2 = 0 is handled later */
 980         felem_assign(ftmp5, z1);
 981     }
 982
 983     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 984     felem_mul(tmp, ftmp, ftmp5);
 985     felem_reduce(z_out, tmp);
 986
 987     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 988     felem_assign(ftmp5, ftmp);
 989     felem_square(tmp, ftmp);
 990     felem_reduce(ftmp, tmp);
 991
 992     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
 993     felem_mul(tmp, ftmp, ftmp5);
 994     felem_reduce(ftmp5, tmp);
 995
 996     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
 997     felem_mul(tmp, ftmp2, ftmp);
 998     felem_reduce(ftmp2, tmp);
 999
1000     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1001     felem_mul(tmp, ftmp4, ftmp5);
1002     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1003
1004     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1005     felem_square(tmp2, ftmp3);
1006     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1007
1008     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1009     felem_diff_128_64(tmp2, ftmp5);
1010     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1011
1012     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1013     felem_assign(ftmp5, ftmp2);
1014     felem_scalar(ftmp5, 2);
1015     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1016
1017     /*-
1018      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1019      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1020      */
1021     felem_diff_128_64(tmp2, ftmp5);
1022     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1023     felem_reduce(x_out, tmp2);
1024
1025     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1026     felem_diff(ftmp2, x_out);
1027     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1028
1029     /*
1030      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1031      */
1032     felem_mul(tmp2, ftmp3, ftmp2);
1033     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1034
1035     /*-
1036      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1037      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1038      */
1039     widefelem_diff(tmp2, tmp);
1040     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1041     felem_reduce(y_out, tmp2);
1042
1043     /*
1044      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1045      * the point at infinity, so we need to check for this separately
1046      */
1047
1048     /*
1049      * if point 1 is at infinity, copy point 2 to output, and vice versa
1050      */
1051     copy_conditional(x_out, x2, z1_is_zero);
1052     copy_conditional(x_out, x1, z2_is_zero);
1053     copy_conditional(y_out, y2, z1_is_zero);
1054     copy_conditional(y_out, y1, z2_is_zero);
1055     copy_conditional(z_out, z2, z1_is_zero);
1056     copy_conditional(z_out, z1, z2_is_zero);
1057     felem_assign(x3, x_out);
1058     felem_assign(y3, y_out);
1059     felem_assign(z3, z_out);
1060 }
1061
1062 /*
1063  * select_point selects the |idx|th point from a precomputation table and
1064  * copies it to out.
1065  * The pre_comp array argument should be size of |size| argument
1066  */
1067 static void select_point(const u64 idx, unsigned int size,
1068                          const felem pre_comp[][3], felem out[3])
1069 {
1070     unsigned i, j;
1071     limb *outlimbs = &out[0][0];
1072
1073     memset(out 0, sizeof(out));
1074     for (i = 0; i < size; i++) {
1075         const limb *inlimbs = &pre_comp[i][0][0];
1076         u64 mask = i ^ idx;
1077         mask |= mask >> 4;
1078         mask |= mask >> 2;
1079         mask |= mask >> 1;
1080         mask &= 1;
1081         mask--;
1082         for (j = 0; j < 4 * 3; j++)
1083             outlimbs[j] |= inlimbs[j] & mask;
1084     }
1085 }
1086
1087 /* get_bit returns the |i|th bit in |in| */
1088 static char get_bit(const felem_bytearray in, unsigned i)
1089 {
1090     if (i >= 224)
1091         return 0;
1092     return (in[i >> 3] >> (i & 7)) & 1;
1093 }
1094
1095 /*
1096  * Interleaved point multiplication using precomputed point multiples: The
1097  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1098  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1099  * generator, using certain (large) precomputed multiples in g_pre_comp.
1100  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1101  */
1102 static void batch_mul(felem x_out, felem y_out, felem z_out,
1103                       const felem_bytearray scalars[],
1104                       const unsigned num_points, const u8 *g_scalar,
1105                       const int mixed, const felem pre_comp[][17][3],
1106                       const felem g_pre_comp[2][16][3])
1107 {
1108     int i, skip;
1109     unsigned num;
1110     unsigned gen_mul = (g_scalar != NULL);
1111     felem nq[3], tmp[4];
1112     u64 bits;
1113     u8 sign, digit;
1114
1115     /* set nq to the point at infinity */
1116     memset(nq, 0, sizeof(nq));
1117
1118     /*
1119      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1120      * of the generator (two in each of the last 28 rounds) and additions of
1121      * other points multiples (every 5th round).
1122      */
1123     skip = 1;                   /* save two point operations in the first
1124                                  * round */
1125     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1126         /* double */
1127         if (!skip)
1128             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1129
1130         /* add multiples of the generator */
1131         if (gen_mul && (i <= 27)) {
1132             /* first, look 28 bits upwards */
1133             bits = get_bit(g_scalar, i + 196) << 3;
1134             bits |= get_bit(g_scalar, i + 140) << 2;
1135             bits |= get_bit(g_scalar, i + 84) << 1;
1136             bits |= get_bit(g_scalar, i + 28);
1137             /* select the point to add, in constant time */
1138             select_point(bits, 16, g_pre_comp[1], tmp);
1139
1140             if (!skip) {
1141                 /* value 1 below is argument for "mixed" */
1142                 point_add(nq[0], nq[1], nq[2],
1143                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1144             } else {
1145                 memcpy(nq, tmp, 3 * sizeof(felem));
1146                 skip = 0;
1147             }
1148
1149             /* second, look at the current position */
1150             bits = get_bit(g_scalar, i + 168) << 3;
1151             bits |= get_bit(g_scalar, i + 112) << 2;
1152             bits |= get_bit(g_scalar, i + 56) << 1;
1153             bits |= get_bit(g_scalar, i);
1154             /* select the point to add, in constant time */
1155             select_point(bits, 16, g_pre_comp[0], tmp);
1156             point_add(nq[0], nq[1], nq[2],
1157                       nq[0], nq[1], nq[2],
1158                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1159         }
1160
1161         /* do other additions every 5 doublings */
1162         if (num_points && (i % 5 == 0)) {
1163             /* loop over all scalars */
1164             for (num = 0; num < num_points; ++num) {
1165                 bits = get_bit(scalars[num], i + 4) << 5;
1166                 bits |= get_bit(scalars[num], i + 3) << 4;
1167                 bits |= get_bit(scalars[num], i + 2) << 3;
1168                 bits |= get_bit(scalars[num], i + 1) << 2;
1169                 bits |= get_bit(scalars[num], i) << 1;
1170                 bits |= get_bit(scalars[num], i - 1);
1171                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1172
1173                 /* select the point to add or subtract */
1174                 select_point(digit, 17, pre_comp[num], tmp);
1175                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1176                                             * point */
1177                 copy_conditional(tmp[1], tmp[3], sign);
1178
1179                 if (!skip) {
1180                     point_add(nq[0], nq[1], nq[2],
1181                               nq[0], nq[1], nq[2],
1182                               mixed, tmp[0], tmp[1], tmp[2]);
1183                 } else {
1184                     memcpy(nq, tmp, 3 * sizeof(felem));
1185                     skip = 0;
1186                 }
1187             }
1188         }
1189     }
1190     felem_assign(x_out, nq[0]);
1191     felem_assign(y_out, nq[1]);
1192     felem_assign(z_out, nq[2]);
1193 }
1194
1195 /******************************************************************************/
1196 /*
1197  * FUNCTIONS TO MANAGE PRECOMPUTATION
1198  */
1199
1200 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1201 {
1202     NISTP224_PRE_COMP *ret = NULL;
1203     ret = OPENSSL_malloc(sizeof(*ret));
1204     if (!ret) {
1205         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1206         return ret;
1207     }
1208     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1209     ret->references = 1;
1210     return ret;
1211 }
1212
1213 static void *nistp224_pre_comp_dup(void *src_)
1214 {
1215     NISTP224_PRE_COMP *src = src_;
1216
1217     /* no need to actually copy, these objects never change! */
1218     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1219
1220     return src_;
1221 }
1222
1223 static void nistp224_pre_comp_free(void *pre_)
1224 {
1225     int i;
1226     NISTP224_PRE_COMP *pre = pre_;
1227
1228     if (!pre)
1229         return;
1230
1231     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1232     if (i > 0)
1233         return;
1234
1235     OPENSSL_free(pre);
1236 }
1237
1238 static void nistp224_pre_comp_clear_free(void *pre_)
1239 {
1240     int i;
1241     NISTP224_PRE_COMP *pre = pre_;
1242
1243     if (!pre)
1244         return;
1245
1246     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1247     if (i > 0)
1248         return;
1249
1250     OPENSSL_clear_free(pre, sizeof(*pre));
1251 }
1252
1253 /******************************************************************************/
1254 /*
1255  * OPENSSL EC_METHOD FUNCTIONS
1256  */
1257
1258 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1259 {
1260     int ret;
1261     ret = ec_GFp_simple_group_init(group);
1262     group->a_is_minus3 = 1;
1263     return ret;
1264 }
1265
1266 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1267                                     const BIGNUM *a, const BIGNUM *b,
1268                                     BN_CTX *ctx)
1269 {
1270     int ret = 0;
1271     BN_CTX *new_ctx = NULL;
1272     BIGNUM *curve_p, *curve_a, *curve_b;
1273
1274     if (ctx == NULL)
1275         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1276             return 0;
1277     BN_CTX_start(ctx);
1278     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1279         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1280         ((curve_b = BN_CTX_get(ctx)) == NULL))
1281         goto err;
1282     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1283     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1284     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1285     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1286         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1287               EC_R_WRONG_CURVE_PARAMETERS);
1288         goto err;
1289     }
1290     group->field_mod_func = BN_nist_mod_224;
1291     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1292  err:
1293     BN_CTX_end(ctx);
1294     BN_CTX_free(new_ctx);
1295     return ret;
1296 }
1297
1298 /*
1299  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1300  * (X/Z^2, Y/Z^3)
1301  */
1302 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1303                                                  const EC_POINT *point,
1304                                                  BIGNUM *x, BIGNUM *y,
1305                                                  BN_CTX *ctx)
1306 {
1307     felem z1, z2, x_in, y_in, x_out, y_out;
1308     widefelem tmp;
1309
1310     if (EC_POINT_is_at_infinity(group, point)) {
1311         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1312               EC_R_POINT_AT_INFINITY);
1313         return 0;
1314     }
1315     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1316         (!BN_to_felem(z1, point->Z)))
1317         return 0;
1318     felem_inv(z2, z1);
1319     felem_square(tmp, z2);
1320     felem_reduce(z1, tmp);
1321     felem_mul(tmp, x_in, z1);
1322     felem_reduce(x_in, tmp);
1323     felem_contract(x_out, x_in);
1324     if (x != NULL) {
1325         if (!felem_to_BN(x, x_out)) {
1326             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1327                   ERR_R_BN_LIB);
1328             return 0;
1329         }
1330     }
1331     felem_mul(tmp, z1, z2);
1332     felem_reduce(z1, tmp);
1333     felem_mul(tmp, y_in, z1);
1334     felem_reduce(y_in, tmp);
1335     felem_contract(y_out, y_in);
1336     if (y != NULL) {
1337         if (!felem_to_BN(y, y_out)) {
1338             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1339                   ERR_R_BN_LIB);
1340             return 0;
1341         }
1342     }
1343     return 1;
1344 }
1345
1346 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1347                                felem tmp_felems[ /* num+1 */ ])
1348 {
1349     /*
1350      * Runs in constant time, unless an input is the point at infinity (which
1351      * normally shouldn't happen).
1352      */
1353     ec_GFp_nistp_points_make_affine_internal(num,
1354                                              points,
1355                                              sizeof(felem),
1356                                              tmp_felems,
1357                                              (void (*)(void *))felem_one,
1358                                              (int (*)(const void *))
1359                                              felem_is_zero_int,
1360                                              (void (*)(void *, const void *))
1361                                              felem_assign,
1362                                              (void (*)(void *, const void *))
1363                                              felem_square_reduce, (void (*)
1364                                                                    (void *,
1365                                                                     const void
1366                                                                     *,
1367                                                                     const void
1368                                                                     *))
1369                                              felem_mul_reduce,
1370                                              (void (*)(void *, const void *))
1371                                              felem_inv,
1372                                              (void (*)(void *, const void *))
1373                                              felem_contract);
1374 }
1375
1376 /*
1377  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1378  * values Result is stored in r (r can equal one of the inputs).
1379  */
1380 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1381                                const BIGNUM *scalar, size_t num,
1382                                const EC_POINT *points[],
1383                                const BIGNUM *scalars[], BN_CTX *ctx)
1384 {
1385     int ret = 0;
1386     int j;
1387     unsigned i;
1388     int mixed = 0;
1389     BN_CTX *new_ctx = NULL;
1390     BIGNUM *x, *y, *z, *tmp_scalar;
1391     felem_bytearray g_secret;
1392     felem_bytearray *secrets = NULL;
1393     felem (*pre_comp)[17][3] = NULL;
1394     felem *tmp_felems = NULL;
1395     felem_bytearray tmp;
1396     unsigned num_bytes;
1397     int have_pre_comp = 0;
1398     size_t num_points = num;
1399     felem x_in, y_in, z_in, x_out, y_out, z_out;
1400     NISTP224_PRE_COMP *pre = NULL;
1401     const felem(*g_pre_comp)[16][3] = NULL;
1402     EC_POINT *generator = NULL;
1403     const EC_POINT *p = NULL;
1404     const BIGNUM *p_scalar = NULL;
1405
1406     if (ctx == NULL)
1407         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1408             return 0;
1409     BN_CTX_start(ctx);
1410     if (((x = BN_CTX_get(ctx)) == NULL) ||
1411         ((y = BN_CTX_get(ctx)) == NULL) ||
1412         ((z = BN_CTX_get(ctx)) == NULL) ||
1413         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1414         goto err;
1415
1416     if (scalar != NULL) {
1417         pre = EC_EX_DATA_get_data(group->extra_data,
1418                                   nistp224_pre_comp_dup,
1419                                   nistp224_pre_comp_free,
1420                                   nistp224_pre_comp_clear_free);
1421         if (pre)
1422             /* we have precomputation, try to use it */
1423             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1424         else
1425             /* try to use the standard precomputation */
1426             g_pre_comp = &gmul[0];
1427         generator = EC_POINT_new(group);
1428         if (generator == NULL)
1429             goto err;
1430         /* get the generator from precomputation */
1431         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1432             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1433             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1434             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1435             goto err;
1436         }
1437         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1438                                                       generator, x, y, z,
1439                                                       ctx))
1440             goto err;
1441         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1442             /* precomputation matches generator */
1443             have_pre_comp = 1;
1444         else
1445             /*
1446              * we don't have valid precomputation: treat the generator as a
1447              * random point
1448              */
1449             num_points = num_points + 1;
1450     }
1451
1452     if (num_points > 0) {
1453         if (num_points >= 3) {
1454             /*
1455              * unless we precompute multiples for just one or two points,
1456              * converting those into affine form is time well spent
1457              */
1458             mixed = 1;
1459         }
1460         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
1461         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
1462         if (mixed)
1463             tmp_felems =
1464                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1465         if ((secrets == NULL) || (pre_comp == NULL)
1466             || (mixed && (tmp_felems == NULL))) {
1467             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1468             goto err;
1469         }
1470
1471         /*
1472          * we treat NULL scalars as 0, and NULL points as points at infinity,
1473          * i.e., they contribute nothing to the linear combination
1474          */
1475         memset(secrets, 0, sizeof(*secrets) * num_points);
1476         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
1477         for (i = 0; i < num_points; ++i) {
1478             if (i == num)
1479                 /* the generator */
1480             {
1481                 p = EC_GROUP_get0_generator(group);
1482                 p_scalar = scalar;
1483             } else
1484                 /* the i^th point */
1485             {
1486                 p = points[i];
1487                 p_scalar = scalars[i];
1488             }
1489             if ((p_scalar != NULL) && (p != NULL)) {
1490                 /* reduce scalar to 0 <= scalar < 2^224 */
1491                 if ((BN_num_bits(p_scalar) > 224)
1492                     || (BN_is_negative(p_scalar))) {
1493                     /*
1494                      * this is an unusual input, and we don't guarantee
1495                      * constant-timeness
1496                      */
1497                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1498                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1499                         goto err;
1500                     }
1501                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1502                 } else
1503                     num_bytes = BN_bn2bin(p_scalar, tmp);
1504                 flip_endian(secrets[i], tmp, num_bytes);
1505                 /* precompute multiples */
1506                 if ((!BN_to_felem(x_out, p->X)) ||
1507                     (!BN_to_felem(y_out, p->Y)) ||
1508                     (!BN_to_felem(z_out, p->Z)))
1509                     goto err;
1510                 felem_assign(pre_comp[i][1][0], x_out);
1511                 felem_assign(pre_comp[i][1][1], y_out);
1512                 felem_assign(pre_comp[i][1][2], z_out);
1513                 for (j = 2; j <= 16; ++j) {
1514                     if (j & 1) {
1515                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1516                                   pre_comp[i][j][2], pre_comp[i][1][0],
1517                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1518                                   pre_comp[i][j - 1][0],
1519                                   pre_comp[i][j - 1][1],
1520                                   pre_comp[i][j - 1][2]);
1521                     } else {
1522                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1523                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1524                                      pre_comp[i][j / 2][1],
1525                                      pre_comp[i][j / 2][2]);
1526                     }
1527                 }
1528             }
1529         }
1530         if (mixed)
1531             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1532     }
1533
1534     /* the scalar for the generator */
1535     if ((scalar != NULL) && (have_pre_comp)) {
1536         memset(g_secret, 0, sizeof(g_secret));
1537         /* reduce scalar to 0 <= scalar < 2^224 */
1538         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1539             /*
1540              * this is an unusual input, and we don't guarantee
1541              * constant-timeness
1542              */
1543             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1544                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1545                 goto err;
1546             }
1547             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1548         } else
1549             num_bytes = BN_bn2bin(scalar, tmp);
1550         flip_endian(g_secret, tmp, num_bytes);
1551         /* do the multiplication with generator precomputation */
1552         batch_mul(x_out, y_out, z_out,
1553                   (const felem_bytearray(*))secrets, num_points,
1554                   g_secret,
1555                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1556     } else
1557         /* do the multiplication without generator precomputation */
1558         batch_mul(x_out, y_out, z_out,
1559                   (const felem_bytearray(*))secrets, num_points,
1560                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1561     /* reduce the output to its unique minimal representation */
1562     felem_contract(x_in, x_out);
1563     felem_contract(y_in, y_out);
1564     felem_contract(z_in, z_out);
1565     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1566         (!felem_to_BN(z, z_in))) {
1567         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1568         goto err;
1569     }
1570     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1571
1572  err:
1573     BN_CTX_end(ctx);
1574     EC_POINT_free(generator);
1575     BN_CTX_free(new_ctx);
1576     OPENSSL_free(secrets);
1577     OPENSSL_free(pre_comp);
1578     OPENSSL_free(tmp_felems);
1579     return ret;
1580 }
1581
1582 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1583 {
1584     int ret = 0;
1585     NISTP224_PRE_COMP *pre = NULL;
1586     int i, j;
1587     BN_CTX *new_ctx = NULL;
1588     BIGNUM *x, *y;
1589     EC_POINT *generator = NULL;
1590     felem tmp_felems[32];
1591
1592     /* throw away old precomputation */
1593     EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup,
1594                          nistp224_pre_comp_free,
1595                          nistp224_pre_comp_clear_free);
1596     if (ctx == NULL)
1597         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1598             return 0;
1599     BN_CTX_start(ctx);
1600     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1601         goto err;
1602     /* get the generator */
1603     if (group->generator == NULL)
1604         goto err;
1605     generator = EC_POINT_new(group);
1606     if (generator == NULL)
1607         goto err;
1608     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1609     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1610     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1611         goto err;
1612     if ((pre = nistp224_pre_comp_new()) == NULL)
1613         goto err;
1614     /*
1615      * if the generator is the standard one, use built-in precomputation
1616      */
1617     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1618         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1619         ret = 1;
1620         goto err;
1621     }
1622     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1623         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1624         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1625         goto err;
1626     /*
1627      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1628      * 2^140*G, 2^196*G for the second one
1629      */
1630     for (i = 1; i <= 8; i <<= 1) {
1631         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1632                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1633                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1634         for (j = 0; j < 27; ++j) {
1635             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1636                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1637                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1638         }
1639         if (i == 8)
1640             break;
1641         point_double(pre->g_pre_comp[0][2 * i][0],
1642                      pre->g_pre_comp[0][2 * i][1],
1643                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1644                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1645         for (j = 0; j < 27; ++j) {
1646             point_double(pre->g_pre_comp[0][2 * i][0],
1647                          pre->g_pre_comp[0][2 * i][1],
1648                          pre->g_pre_comp[0][2 * i][2],
1649                          pre->g_pre_comp[0][2 * i][0],
1650                          pre->g_pre_comp[0][2 * i][1],
1651                          pre->g_pre_comp[0][2 * i][2]);
1652         }
1653     }
1654     for (i = 0; i < 2; i++) {
1655         /* g_pre_comp[i][0] is the point at infinity */
1656         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1657         /* the remaining multiples */
1658         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1659         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1660                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1661                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1662                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1663                   pre->g_pre_comp[i][2][2]);
1664         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1665         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1666                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1667                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1668                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1669                   pre->g_pre_comp[i][2][2]);
1670         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1671         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1672                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1673                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1674                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1675                   pre->g_pre_comp[i][4][2]);
1676         /*
1677          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1678          */
1679         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1680                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1681                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1682                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1683                   pre->g_pre_comp[i][2][2]);
1684         for (j = 1; j < 8; ++j) {
1685             /* odd multiples: add G resp. 2^28*G */
1686             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1687                       pre->g_pre_comp[i][2 * j + 1][1],
1688                       pre->g_pre_comp[i][2 * j + 1][2],
1689                       pre->g_pre_comp[i][2 * j][0],
1690                       pre->g_pre_comp[i][2 * j][1],
1691                       pre->g_pre_comp[i][2 * j][2], 0,
1692                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1693                       pre->g_pre_comp[i][1][2]);
1694         }
1695     }
1696     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1697
1698     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
1699                              nistp224_pre_comp_free,
1700                              nistp224_pre_comp_clear_free))
1701         goto err;
1702     ret = 1;
1703     pre = NULL;
1704  err:
1705     BN_CTX_end(ctx);
1706     EC_POINT_free(generator);
1707     BN_CTX_free(new_ctx);
1708     nistp224_pre_comp_free(pre);
1709     return ret;
1710 }
1711
1712 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1713 {
1714     if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup,
1715                             nistp224_pre_comp_free,
1716                             nistp224_pre_comp_clear_free)
1717         != NULL)
1718         return 1;
1719     else
1720         return 0;
1721 }
1722
1723 #else
1724 static void *dummy = &dummy;
1725 #endif