crypto/ec/ecp_nistp224.c

   1 /* crypto/ec/ecp_nistp224.c */
   2 /*
   3  * Written by Emilia Kasper (Google) for the OpenSSL project.
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  23  *
  24  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  25  * and Adam Langley's public domain 64-bit C implementation of curve25519
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  30
  31 # include <stdint.h>
  32 # include <string.h>
  33 # include <openssl/err.h>
  34 # include "ec_lcl.h"
  35
  36 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  37   /* even with gcc, the typedef won't work for 32-bit platforms */
  38 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  39                                  * platforms */
  40 # else
  41 #  error "Need GCC 3.1 or later to define type uint128_t"
  42 # endif
  43
  44 typedef uint8_t u8;
  45 typedef uint64_t u64;
  46 typedef int64_t s64;
  47
  48 /******************************************************************************/
  49 /*-
  50  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  51  *
  52  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  53  * using 64-bit coefficients called 'limbs',
  54  * and sometimes (for multiplication results) as
  55  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  56  * using 128-bit coefficients called 'widelimbs'.
  57  * A 4-limb representation is an 'felem';
  58  * a 7-widelimb representation is a 'widefelem'.
  59  * Even within felems, bits of adjacent limbs overlap, and we don't always
  60  * reduce the representations: we ensure that inputs to each felem
  61  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  62  * and fit into a 128-bit word without overflow. The coefficients are then
  63  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  64  * We only reduce to the unique minimal representation at the end of the
  65  * computation.
  66  */
  67
  68 typedef uint64_t limb;
  69 typedef uint128_t widelimb;
  70
  71 typedef limb felem[4];
  72 typedef widelimb widefelem[7];
  73
  74 /*
  75  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  76  * group order size for the elliptic curve, and we also use this type for
  77  * scalars for point multiplication.
  78  */
  79 typedef u8 felem_bytearray[28];
  80
  81 static const felem_bytearray nistp224_curve_params[5] = {
  82     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  83      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  84      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  85     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  86      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  88     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  89      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  90      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  91     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  92      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  93      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  94     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  95      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  96      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  97 };
  98
  99 /*-
 100  * Precomputed multiples of the standard generator
 101  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 102  * (0 for the point at infinity).
 103  * For each field element, slice a_0 is word 0, etc.
 104  *
 105  * The table has 2 * 16 elements, starting with the following:
 106  * index | bits    | point
 107  * ------+---------+------------------------------
 108  *     0 | 0 0 0 0 | 0G
 109  *     1 | 0 0 0 1 | 1G
 110  *     2 | 0 0 1 0 | 2^56G
 111  *     3 | 0 0 1 1 | (2^56 + 1)G
 112  *     4 | 0 1 0 0 | 2^112G
 113  *     5 | 0 1 0 1 | (2^112 + 1)G
 114  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 115  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 116  *     8 | 1 0 0 0 | 2^168G
 117  *     9 | 1 0 0 1 | (2^168 + 1)G
 118  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 119  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 120  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 121  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 122  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 123  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 124  * followed by a copy of this with each element multiplied by 2^28.
 125  *
 126  * The reason for this is so that we can clock bits into four different
 127  * locations when doing simple scalar multiplies against the base point,
 128  * and then another four locations using the second 16 elements.
 129  */
 130 static const felem gmul[2][16][3] = {
 131 {{{0, 0, 0, 0},
 132   {0, 0, 0, 0},
 133   {0, 0, 0, 0}},
 134  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 135   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 136   {1, 0, 0, 0}},
 137  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 138   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 139   {1, 0, 0, 0}},
 140  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 141   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 142   {1, 0, 0, 0}},
 143  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 144   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 145   {1, 0, 0, 0}},
 146  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 147   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 148   {1, 0, 0, 0}},
 149  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 150   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 151   {1, 0, 0, 0}},
 152  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 153   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 154   {1, 0, 0, 0}},
 155  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 156   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 157   {1, 0, 0, 0}},
 158  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 159   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 160   {1, 0, 0, 0}},
 161  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 162   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 163   {1, 0, 0, 0}},
 164  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 165   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 166   {1, 0, 0, 0}},
 167  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 168   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 169   {1, 0, 0, 0}},
 170  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 171   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 172   {1, 0, 0, 0}},
 173  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 174   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 175   {1, 0, 0, 0}},
 176  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 177   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 178   {1, 0, 0, 0}}},
 179 {{{0, 0, 0, 0},
 180   {0, 0, 0, 0},
 181   {0, 0, 0, 0}},
 182  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 183   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 184   {1, 0, 0, 0}},
 185  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 186   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 187   {1, 0, 0, 0}},
 188  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 189   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 190   {1, 0, 0, 0}},
 191  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 192   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 193   {1, 0, 0, 0}},
 194  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 195   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 196   {1, 0, 0, 0}},
 197  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 198   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 199   {1, 0, 0, 0}},
 200  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 201   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 202   {1, 0, 0, 0}},
 203  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 204   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 205   {1, 0, 0, 0}},
 206  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 207   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 208   {1, 0, 0, 0}},
 209  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 210   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 211   {1, 0, 0, 0}},
 212  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 213   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 214   {1, 0, 0, 0}},
 215  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 216   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 217   {1, 0, 0, 0}},
 218  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 219   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 220   {1, 0, 0, 0}},
 221  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 222   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 223   {1, 0, 0, 0}},
 224  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 225   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 226   {1, 0, 0, 0}}}
 227 };
 228
 229 /* Precomputation for the group generator. */
 230 typedef struct {
 231     felem g_pre_comp[2][16][3];
 232     int references;
 233 } NISTP224_PRE_COMP;
 234
 235 const EC_METHOD *EC_GFp_nistp224_method(void)
 236 {
 237     static const EC_METHOD ret = {
 238         EC_FLAGS_DEFAULT_OCT,
 239         NID_X9_62_prime_field,
 240         ec_GFp_nistp224_group_init,
 241         ec_GFp_simple_group_finish,
 242         ec_GFp_simple_group_clear_finish,
 243         ec_GFp_nist_group_copy,
 244         ec_GFp_nistp224_group_set_curve,
 245         ec_GFp_simple_group_get_curve,
 246         ec_GFp_simple_group_get_degree,
 247         ec_GFp_simple_group_check_discriminant,
 248         ec_GFp_simple_point_init,
 249         ec_GFp_simple_point_finish,
 250         ec_GFp_simple_point_clear_finish,
 251         ec_GFp_simple_point_copy,
 252         ec_GFp_simple_point_set_to_infinity,
 253         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 254         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 255         ec_GFp_simple_point_set_affine_coordinates,
 256         ec_GFp_nistp224_point_get_affine_coordinates,
 257         0 /* point_set_compressed_coordinates */ ,
 258         0 /* point2oct */ ,
 259         0 /* oct2point */ ,
 260         ec_GFp_simple_add,
 261         ec_GFp_simple_dbl,
 262         ec_GFp_simple_invert,
 263         ec_GFp_simple_is_at_infinity,
 264         ec_GFp_simple_is_on_curve,
 265         ec_GFp_simple_cmp,
 266         ec_GFp_simple_make_affine,
 267         ec_GFp_simple_points_make_affine,
 268         ec_GFp_nistp224_points_mul,
 269         ec_GFp_nistp224_precompute_mult,
 270         ec_GFp_nistp224_have_precompute_mult,
 271         ec_GFp_nist_field_mul,
 272         ec_GFp_nist_field_sqr,
 273         0 /* field_div */ ,
 274         0 /* field_encode */ ,
 275         0 /* field_decode */ ,
 276         0                       /* field_set_to_one */
 277     };
 278
 279     return &ret;
 280 }
 281
 282 /*
 283  * Helper functions to convert field elements to/from internal representation
 284  */
 285 static void bin28_to_felem(felem out, const u8 in[28])
 286 {
 287     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 288     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 289     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 290     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 291 }
 292
 293 static void felem_to_bin28(u8 out[28], const felem in)
 294 {
 295     unsigned i;
 296     for (i = 0; i < 7; ++i) {
 297         out[i] = in[0] >> (8 * i);
 298         out[i + 7] = in[1] >> (8 * i);
 299         out[i + 14] = in[2] >> (8 * i);
 300         out[i + 21] = in[3] >> (8 * i);
 301     }
 302 }
 303
 304 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 305 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 306 {
 307     unsigned i;
 308     for (i = 0; i < len; ++i)
 309         out[i] = in[len - 1 - i];
 310 }
 311
 312 /* From OpenSSL BIGNUM to internal representation */
 313 static int BN_to_felem(felem out, const BIGNUM *bn)
 314 {
 315     felem_bytearray b_in;
 316     felem_bytearray b_out;
 317     unsigned num_bytes;
 318
 319     /* BN_bn2bin eats leading zeroes */
 320     memset(b_out, 0, sizeof(b_out));
 321     num_bytes = BN_num_bytes(bn);
 322     if (num_bytes > sizeof b_out) {
 323         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 324         return 0;
 325     }
 326     if (BN_is_negative(bn)) {
 327         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 328         return 0;
 329     }
 330     num_bytes = BN_bn2bin(bn, b_in);
 331     flip_endian(b_out, b_in, num_bytes);
 332     bin28_to_felem(out, b_out);
 333     return 1;
 334 }
 335
 336 /* From internal representation to OpenSSL BIGNUM */
 337 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 338 {
 339     felem_bytearray b_in, b_out;
 340     felem_to_bin28(b_in, in);
 341     flip_endian(b_out, b_in, sizeof b_out);
 342     return BN_bin2bn(b_out, sizeof b_out, out);
 343 }
 344
 345 /******************************************************************************/
 346 /*-
 347  *                              FIELD OPERATIONS
 348  *
 349  * Field operations, using the internal representation of field elements.
 350  * NB! These operations are specific to our point multiplication and cannot be
 351  * expected to be correct in general - e.g., multiplication with a large scalar
 352  * will cause an overflow.
 353  *
 354  */
 355
 356 static void felem_one(felem out)
 357 {
 358     out[0] = 1;
 359     out[1] = 0;
 360     out[2] = 0;
 361     out[3] = 0;
 362 }
 363
 364 static void felem_assign(felem out, const felem in)
 365 {
 366     out[0] = in[0];
 367     out[1] = in[1];
 368     out[2] = in[2];
 369     out[3] = in[3];
 370 }
 371
 372 /* Sum two field elements: out += in */
 373 static void felem_sum(felem out, const felem in)
 374 {
 375     out[0] += in[0];
 376     out[1] += in[1];
 377     out[2] += in[2];
 378     out[3] += in[3];
 379 }
 380
 381 /* Get negative value: out = -in */
 382 /* Assumes in[i] < 2^57 */
 383 static void felem_neg(felem out, const felem in)
 384 {
 385     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 386     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 387     static const limb two58m42m2 = (((limb) 1) << 58) -
 388         (((limb) 1) << 42) - (((limb) 1) << 2);
 389
 390     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 391     out[0] = two58p2 - in[0];
 392     out[1] = two58m42m2 - in[1];
 393     out[2] = two58m2 - in[2];
 394     out[3] = two58m2 - in[3];
 395 }
 396
 397 /* Subtract field elements: out -= in */
 398 /* Assumes in[i] < 2^57 */
 399 static void felem_diff(felem out, const felem in)
 400 {
 401     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 402     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 403     static const limb two58m42m2 = (((limb) 1) << 58) -
 404         (((limb) 1) << 42) - (((limb) 1) << 2);
 405
 406     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 407     out[0] += two58p2;
 408     out[1] += two58m42m2;
 409     out[2] += two58m2;
 410     out[3] += two58m2;
 411
 412     out[0] -= in[0];
 413     out[1] -= in[1];
 414     out[2] -= in[2];
 415     out[3] -= in[3];
 416 }
 417
 418 /* Subtract in unreduced 128-bit mode: out -= in */
 419 /* Assumes in[i] < 2^119 */
 420 static void widefelem_diff(widefelem out, const widefelem in)
 421 {
 422     static const widelimb two120 = ((widelimb) 1) << 120;
 423     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 424         (((widelimb) 1) << 64);
 425     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 426         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 427
 428     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 429     out[0] += two120;
 430     out[1] += two120m64;
 431     out[2] += two120m64;
 432     out[3] += two120;
 433     out[4] += two120m104m64;
 434     out[5] += two120m64;
 435     out[6] += two120m64;
 436
 437     out[0] -= in[0];
 438     out[1] -= in[1];
 439     out[2] -= in[2];
 440     out[3] -= in[3];
 441     out[4] -= in[4];
 442     out[5] -= in[5];
 443     out[6] -= in[6];
 444 }
 445
 446 /* Subtract in mixed mode: out128 -= in64 */
 447 /* in[i] < 2^63 */
 448 static void felem_diff_128_64(widefelem out, const felem in)
 449 {
 450     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 451         (((widelimb) 1) << 8);
 452     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 453         (((widelimb) 1) << 8);
 454     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 455         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 456
 457     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 458     out[0] += two64p8;
 459     out[1] += two64m48m8;
 460     out[2] += two64m8;
 461     out[3] += two64m8;
 462
 463     out[0] -= in[0];
 464     out[1] -= in[1];
 465     out[2] -= in[2];
 466     out[3] -= in[3];
 467 }
 468
 469 /*
 470  * Multiply a field element by a scalar: out = out * scalar The scalars we
 471  * actually use are small, so results fit without overflow
 472  */
 473 static void felem_scalar(felem out, const limb scalar)
 474 {
 475     out[0] *= scalar;
 476     out[1] *= scalar;
 477     out[2] *= scalar;
 478     out[3] *= scalar;
 479 }
 480
 481 /*
 482  * Multiply an unreduced field element by a scalar: out = out * scalar The
 483  * scalars we actually use are small, so results fit without overflow
 484  */
 485 static void widefelem_scalar(widefelem out, const widelimb scalar)
 486 {
 487     out[0] *= scalar;
 488     out[1] *= scalar;
 489     out[2] *= scalar;
 490     out[3] *= scalar;
 491     out[4] *= scalar;
 492     out[5] *= scalar;
 493     out[6] *= scalar;
 494 }
 495
 496 /* Square a field element: out = in^2 */
 497 static void felem_square(widefelem out, const felem in)
 498 {
 499     limb tmp0, tmp1, tmp2;
 500     tmp0 = 2 * in[0];
 501     tmp1 = 2 * in[1];
 502     tmp2 = 2 * in[2];
 503     out[0] = ((widelimb) in[0]) * in[0];
 504     out[1] = ((widelimb) in[0]) * tmp1;
 505     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 506     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 507     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 508     out[5] = ((widelimb) in[3]) * tmp2;
 509     out[6] = ((widelimb) in[3]) * in[3];
 510 }
 511
 512 /* Multiply two field elements: out = in1 * in2 */
 513 static void felem_mul(widefelem out, const felem in1, const felem in2)
 514 {
 515     out[0] = ((widelimb) in1[0]) * in2[0];
 516     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 517     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 518              ((widelimb) in1[2]) * in2[0];
 519     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 520              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 521     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 522              ((widelimb) in1[3]) * in2[1];
 523     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 524     out[6] = ((widelimb) in1[3]) * in2[3];
 525 }
 526
 527 /*-
 528  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 529  * Requires in[i] < 2^126,
 530  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 531 static void felem_reduce(felem out, const widefelem in)
 532 {
 533     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 534         (((widelimb) 1) << 15);
 535     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 536         (((widelimb) 1) << 71);
 537     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 538         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 539     widelimb output[5];
 540
 541     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 542     output[0] = in[0] + two127p15;
 543     output[1] = in[1] + two127m71m55;
 544     output[2] = in[2] + two127m71;
 545     output[3] = in[3];
 546     output[4] = in[4];
 547
 548     /* Eliminate in[4], in[5], in[6] */
 549     output[4] += in[6] >> 16;
 550     output[3] += (in[6] & 0xffff) << 40;
 551     output[2] -= in[6];
 552
 553     output[3] += in[5] >> 16;
 554     output[2] += (in[5] & 0xffff) << 40;
 555     output[1] -= in[5];
 556
 557     output[2] += output[4] >> 16;
 558     output[1] += (output[4] & 0xffff) << 40;
 559     output[0] -= output[4];
 560
 561     /* Carry 2 -> 3 -> 4 */
 562     output[3] += output[2] >> 56;
 563     output[2] &= 0x00ffffffffffffff;
 564
 565     output[4] = output[3] >> 56;
 566     output[3] &= 0x00ffffffffffffff;
 567
 568     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 569
 570     /* Eliminate output[4] */
 571     output[2] += output[4] >> 16;
 572     /* output[2] < 2^56 + 2^56 = 2^57 */
 573     output[1] += (output[4] & 0xffff) << 40;
 574     output[0] -= output[4];
 575
 576     /* Carry 0 -> 1 -> 2 -> 3 */
 577     output[1] += output[0] >> 56;
 578     out[0] = output[0] & 0x00ffffffffffffff;
 579
 580     output[2] += output[1] >> 56;
 581     /* output[2] < 2^57 + 2^72 */
 582     out[1] = output[1] & 0x00ffffffffffffff;
 583     output[3] += output[2] >> 56;
 584     /* output[3] <= 2^56 + 2^16 */
 585     out[2] = output[2] & 0x00ffffffffffffff;
 586
 587     /*-
 588      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 589      * out[3] <= 2^56 + 2^16 (due to final carry),
 590      * so out < 2*p
 591      */
 592     out[3] = output[3];
 593 }
 594
 595 static void felem_square_reduce(felem out, const felem in)
 596 {
 597     widefelem tmp;
 598     felem_square(tmp, in);
 599     felem_reduce(out, tmp);
 600 }
 601
 602 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 603 {
 604     widefelem tmp;
 605     felem_mul(tmp, in1, in2);
 606     felem_reduce(out, tmp);
 607 }
 608
 609 /*
 610  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 611  * call felem_reduce first)
 612  */
 613 static void felem_contract(felem out, const felem in)
 614 {
 615     static const int64_t two56 = ((limb) 1) << 56;
 616     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 617     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 618     int64_t tmp[4], a;
 619     tmp[0] = in[0];
 620     tmp[1] = in[1];
 621     tmp[2] = in[2];
 622     tmp[3] = in[3];
 623     /* Case 1: a = 1 iff in >= 2^224 */
 624     a = (in[3] >> 56);
 625     tmp[0] -= a;
 626     tmp[1] += a << 40;
 627     tmp[3] &= 0x00ffffffffffffff;
 628     /*
 629      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 630      * and the lower part is non-zero
 631      */
 632     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 633         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 634     a &= 0x00ffffffffffffff;
 635     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 636     a = (a - 1) >> 63;
 637     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 638     tmp[3] &= a ^ 0xffffffffffffffff;
 639     tmp[2] &= a ^ 0xffffffffffffffff;
 640     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 641     tmp[0] -= 1 & a;
 642
 643     /*
 644      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 645      * non-zero, so we only need one step
 646      */
 647     a = tmp[0] >> 63;
 648     tmp[0] += two56 & a;
 649     tmp[1] -= 1 & a;
 650
 651     /* carry 1 -> 2 -> 3 */
 652     tmp[2] += tmp[1] >> 56;
 653     tmp[1] &= 0x00ffffffffffffff;
 654
 655     tmp[3] += tmp[2] >> 56;
 656     tmp[2] &= 0x00ffffffffffffff;
 657
 658     /* Now 0 <= out < p */
 659     out[0] = tmp[0];
 660     out[1] = tmp[1];
 661     out[2] = tmp[2];
 662     out[3] = tmp[3];
 663 }
 664
 665 /*
 666  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 667  * elements are reduced to in < 2^225, so we only need to check three cases:
 668  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 669  */
 670 static limb felem_is_zero(const felem in)
 671 {
 672     limb zero, two224m96p1, two225m97p2;
 673
 674     zero = in[0] | in[1] | in[2] | in[3];
 675     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 676     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 677         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 678     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 679     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 680         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 681     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 682     return (zero | two224m96p1 | two225m97p2);
 683 }
 684
 685 static limb felem_is_zero_int(const felem in)
 686 {
 687     return (int)(felem_is_zero(in) & ((limb) 1));
 688 }
 689
 690 /* Invert a field element */
 691 /* Computation chain copied from djb's code */
 692 static void felem_inv(felem out, const felem in)
 693 {
 694     felem ftmp, ftmp2, ftmp3, ftmp4;
 695     widefelem tmp;
 696     unsigned i;
 697
 698     felem_square(tmp, in);
 699     felem_reduce(ftmp, tmp);    /* 2 */
 700     felem_mul(tmp, in, ftmp);
 701     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 702     felem_square(tmp, ftmp);
 703     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 704     felem_mul(tmp, in, ftmp);
 705     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 706     felem_square(tmp, ftmp);
 707     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 708     felem_square(tmp, ftmp2);
 709     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 712     felem_mul(tmp, ftmp2, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 714     felem_square(tmp, ftmp);
 715     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 716     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 717         felem_square(tmp, ftmp2);
 718         felem_reduce(ftmp2, tmp);
 719     }
 720     felem_mul(tmp, ftmp2, ftmp);
 721     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 722     felem_square(tmp, ftmp2);
 723     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 724     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 725         felem_square(tmp, ftmp3);
 726         felem_reduce(ftmp3, tmp);
 727     }
 728     felem_mul(tmp, ftmp3, ftmp2);
 729     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 730     felem_square(tmp, ftmp2);
 731     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 732     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 733         felem_square(tmp, ftmp3);
 734         felem_reduce(ftmp3, tmp);
 735     }
 736     felem_mul(tmp, ftmp3, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 738     felem_square(tmp, ftmp3);
 739     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 740     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 741         felem_square(tmp, ftmp4);
 742         felem_reduce(ftmp4, tmp);
 743     }
 744     felem_mul(tmp, ftmp3, ftmp4);
 745     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 746     felem_square(tmp, ftmp3);
 747     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 748     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 749         felem_square(tmp, ftmp4);
 750         felem_reduce(ftmp4, tmp);
 751     }
 752     felem_mul(tmp, ftmp2, ftmp4);
 753     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 754     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 755         felem_square(tmp, ftmp2);
 756         felem_reduce(ftmp2, tmp);
 757     }
 758     felem_mul(tmp, ftmp2, ftmp);
 759     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 760     felem_square(tmp, ftmp);
 761     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 762     felem_mul(tmp, ftmp, in);
 763     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 764     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 765         felem_square(tmp, ftmp);
 766         felem_reduce(ftmp, tmp);
 767     }
 768     felem_mul(tmp, ftmp, ftmp3);
 769     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 770 }
 771
 772 /*
 773  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 774  * out to itself.
 775  */
 776 static void copy_conditional(felem out, const felem in, limb icopy)
 777 {
 778     unsigned i;
 779     /*
 780      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 781      */
 782     const limb copy = -icopy;
 783     for (i = 0; i < 4; ++i) {
 784         const limb tmp = copy & (in[i] ^ out[i]);
 785         out[i] ^= tmp;
 786     }
 787 }
 788
 789 /******************************************************************************/
 790 /*-
 791  *                       ELLIPTIC CURVE POINT OPERATIONS
 792  *
 793  * Points are represented in Jacobian projective coordinates:
 794  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 795  * or to the point at infinity if Z == 0.
 796  *
 797  */
 798
 799 /*-
 800  * Double an elliptic curve point:
 801  * (X', Y', Z') = 2 * (X, Y, Z), where
 802  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 803  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 804  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 805  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 806  * while x_out == y_in is not (maybe this works, but it's not tested).
 807  */
 808 static void
 809 point_double(felem x_out, felem y_out, felem z_out,
 810              const felem x_in, const felem y_in, const felem z_in)
 811 {
 812     widefelem tmp, tmp2;
 813     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 814
 815     felem_assign(ftmp, x_in);
 816     felem_assign(ftmp2, x_in);
 817
 818     /* delta = z^2 */
 819     felem_square(tmp, z_in);
 820     felem_reduce(delta, tmp);
 821
 822     /* gamma = y^2 */
 823     felem_square(tmp, y_in);
 824     felem_reduce(gamma, tmp);
 825
 826     /* beta = x*gamma */
 827     felem_mul(tmp, x_in, gamma);
 828     felem_reduce(beta, tmp);
 829
 830     /* alpha = 3*(x-delta)*(x+delta) */
 831     felem_diff(ftmp, delta);
 832     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 833     felem_sum(ftmp2, delta);
 834     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 835     felem_scalar(ftmp2, 3);
 836     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 837     felem_mul(tmp, ftmp, ftmp2);
 838     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 839     felem_reduce(alpha, tmp);
 840
 841     /* x' = alpha^2 - 8*beta */
 842     felem_square(tmp, alpha);
 843     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 844     felem_assign(ftmp, beta);
 845     felem_scalar(ftmp, 8);
 846     /* ftmp[i] < 8 * 2^57 = 2^60 */
 847     felem_diff_128_64(tmp, ftmp);
 848     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 849     felem_reduce(x_out, tmp);
 850
 851     /* z' = (y + z)^2 - gamma - delta */
 852     felem_sum(delta, gamma);
 853     /* delta[i] < 2^57 + 2^57 = 2^58 */
 854     felem_assign(ftmp, y_in);
 855     felem_sum(ftmp, z_in);
 856     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 857     felem_square(tmp, ftmp);
 858     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 859     felem_diff_128_64(tmp, delta);
 860     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 861     felem_reduce(z_out, tmp);
 862
 863     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 864     felem_scalar(beta, 4);
 865     /* beta[i] < 4 * 2^57 = 2^59 */
 866     felem_diff(beta, x_out);
 867     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 868     felem_mul(tmp, alpha, beta);
 869     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 870     felem_square(tmp2, gamma);
 871     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 872     widefelem_scalar(tmp2, 8);
 873     /* tmp2[i] < 8 * 2^116 = 2^119 */
 874     widefelem_diff(tmp, tmp2);
 875     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 876     felem_reduce(y_out, tmp);
 877 }
 878
 879 /*-
 880  * Add two elliptic curve points:
 881  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 882  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 883  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 884  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 885  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 886  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 887  *
 888  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 889  */
 890
 891 /*
 892  * This function is not entirely constant-time: it includes a branch for
 893  * checking whether the two input points are equal, (while not equal to the
 894  * point at infinity). This case never happens during single point
 895  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 896  */
 897 static void point_add(felem x3, felem y3, felem z3,
 898                       const felem x1, const felem y1, const felem z1,
 899                       const int mixed, const felem x2, const felem y2,
 900                       const felem z2)
 901 {
 902     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 903     widefelem tmp, tmp2;
 904     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 905
 906     if (!mixed) {
 907         /* ftmp2 = z2^2 */
 908         felem_square(tmp, z2);
 909         felem_reduce(ftmp2, tmp);
 910
 911         /* ftmp4 = z2^3 */
 912         felem_mul(tmp, ftmp2, z2);
 913         felem_reduce(ftmp4, tmp);
 914
 915         /* ftmp4 = z2^3*y1 */
 916         felem_mul(tmp2, ftmp4, y1);
 917         felem_reduce(ftmp4, tmp2);
 918
 919         /* ftmp2 = z2^2*x1 */
 920         felem_mul(tmp2, ftmp2, x1);
 921         felem_reduce(ftmp2, tmp2);
 922     } else {
 923         /*
 924          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 925          */
 926
 927         /* ftmp4 = z2^3*y1 */
 928         felem_assign(ftmp4, y1);
 929
 930         /* ftmp2 = z2^2*x1 */
 931         felem_assign(ftmp2, x1);
 932     }
 933
 934     /* ftmp = z1^2 */
 935     felem_square(tmp, z1);
 936     felem_reduce(ftmp, tmp);
 937
 938     /* ftmp3 = z1^3 */
 939     felem_mul(tmp, ftmp, z1);
 940     felem_reduce(ftmp3, tmp);
 941
 942     /* tmp = z1^3*y2 */
 943     felem_mul(tmp, ftmp3, y2);
 944     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 945
 946     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 947     felem_diff_128_64(tmp, ftmp4);
 948     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 949     felem_reduce(ftmp3, tmp);
 950
 951     /* tmp = z1^2*x2 */
 952     felem_mul(tmp, ftmp, x2);
 953     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 954
 955     /* ftmp = z1^2*x2 - z2^2*x1 */
 956     felem_diff_128_64(tmp, ftmp2);
 957     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 958     felem_reduce(ftmp, tmp);
 959
 960     /*
 961      * the formulae are incorrect if the points are equal so we check for
 962      * this and do doubling if this happens
 963      */
 964     x_equal = felem_is_zero(ftmp);
 965     y_equal = felem_is_zero(ftmp3);
 966     z1_is_zero = felem_is_zero(z1);
 967     z2_is_zero = felem_is_zero(z2);
 968     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 969     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 970         point_double(x3, y3, z3, x1, y1, z1);
 971         return;
 972     }
 973
 974     /* ftmp5 = z1*z2 */
 975     if (!mixed) {
 976         felem_mul(tmp, z1, z2);
 977         felem_reduce(ftmp5, tmp);
 978     } else {
 979         /* special case z2 = 0 is handled later */
 980         felem_assign(ftmp5, z1);
 981     }
 982
 983     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 984     felem_mul(tmp, ftmp, ftmp5);
 985     felem_reduce(z_out, tmp);
 986
 987     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 988     felem_assign(ftmp5, ftmp);
 989     felem_square(tmp, ftmp);
 990     felem_reduce(ftmp, tmp);
 991
 992     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
 993     felem_mul(tmp, ftmp, ftmp5);
 994     felem_reduce(ftmp5, tmp);
 995
 996     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
 997     felem_mul(tmp, ftmp2, ftmp);
 998     felem_reduce(ftmp2, tmp);
 999
1000     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1001     felem_mul(tmp, ftmp4, ftmp5);
1002     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1003
1004     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1005     felem_square(tmp2, ftmp3);
1006     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1007
1008     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1009     felem_diff_128_64(tmp2, ftmp5);
1010     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1011
1012     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1013     felem_assign(ftmp5, ftmp2);
1014     felem_scalar(ftmp5, 2);
1015     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1016
1017     /*-
1018      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1019      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1020      */
1021     felem_diff_128_64(tmp2, ftmp5);
1022     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1023     felem_reduce(x_out, tmp2);
1024
1025     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1026     felem_diff(ftmp2, x_out);
1027     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1028
1029     /*
1030      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1031      */
1032     felem_mul(tmp2, ftmp3, ftmp2);
1033     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1034
1035     /*-
1036      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1037      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1038      */
1039     widefelem_diff(tmp2, tmp);
1040     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1041     felem_reduce(y_out, tmp2);
1042
1043     /*
1044      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1045      * the point at infinity, so we need to check for this separately
1046      */
1047
1048     /*
1049      * if point 1 is at infinity, copy point 2 to output, and vice versa
1050      */
1051     copy_conditional(x_out, x2, z1_is_zero);
1052     copy_conditional(x_out, x1, z2_is_zero);
1053     copy_conditional(y_out, y2, z1_is_zero);
1054     copy_conditional(y_out, y1, z2_is_zero);
1055     copy_conditional(z_out, z2, z1_is_zero);
1056     copy_conditional(z_out, z1, z2_is_zero);
1057     felem_assign(x3, x_out);
1058     felem_assign(y3, y_out);
1059     felem_assign(z3, z_out);
1060 }
1061
1062 /*
1063  * select_point selects the |idx|th point from a precomputation table and
1064  * copies it to out.
1065  * The pre_comp array argument should be size of |size| argument
1066  */
1067 static void select_point(const u64 idx, unsigned int size,
1068                          const felem pre_comp[][3], felem out[3])
1069 {
1070     unsigned i, j;
1071     limb *outlimbs = &out[0][0];
1072
1073     memset(out, 0, sizeof(*out) * 3);
1074     for (i = 0; i < size; i++) {
1075         const limb *inlimbs = &pre_comp[i][0][0];
1076         u64 mask = i ^ idx;
1077         mask |= mask >> 4;
1078         mask |= mask >> 2;
1079         mask |= mask >> 1;
1080         mask &= 1;
1081         mask--;
1082         for (j = 0; j < 4 * 3; j++)
1083             outlimbs[j] |= inlimbs[j] & mask;
1084     }
1085 }
1086
1087 /* get_bit returns the |i|th bit in |in| */
1088 static char get_bit(const felem_bytearray in, unsigned i)
1089 {
1090     if (i >= 224)
1091         return 0;
1092     return (in[i >> 3] >> (i & 7)) & 1;
1093 }
1094
1095 /*
1096  * Interleaved point multiplication using precomputed point multiples: The
1097  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1098  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1099  * generator, using certain (large) precomputed multiples in g_pre_comp.
1100  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1101  */
1102 static void batch_mul(felem x_out, felem y_out, felem z_out,
1103                       const felem_bytearray scalars[],
1104                       const unsigned num_points, const u8 *g_scalar,
1105                       const int mixed, const felem pre_comp[][17][3],
1106                       const felem g_pre_comp[2][16][3])
1107 {
1108     int i, skip;
1109     unsigned num;
1110     unsigned gen_mul = (g_scalar != NULL);
1111     felem nq[3], tmp[4];
1112     u64 bits;
1113     u8 sign, digit;
1114
1115     /* set nq to the point at infinity */
1116     memset(nq, 0, sizeof(nq));
1117
1118     /*
1119      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1120      * of the generator (two in each of the last 28 rounds) and additions of
1121      * other points multiples (every 5th round).
1122      */
1123     skip = 1;                   /* save two point operations in the first
1124                                  * round */
1125     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1126         /* double */
1127         if (!skip)
1128             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1129
1130         /* add multiples of the generator */
1131         if (gen_mul && (i <= 27)) {
1132             /* first, look 28 bits upwards */
1133             bits = get_bit(g_scalar, i + 196) << 3;
1134             bits |= get_bit(g_scalar, i + 140) << 2;
1135             bits |= get_bit(g_scalar, i + 84) << 1;
1136             bits |= get_bit(g_scalar, i + 28);
1137             /* select the point to add, in constant time */
1138             select_point(bits, 16, g_pre_comp[1], tmp);
1139
1140             if (!skip) {
1141                 /* value 1 below is argument for "mixed" */
1142                 point_add(nq[0], nq[1], nq[2],
1143                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1144             } else {
1145                 memcpy(nq, tmp, 3 * sizeof(felem));
1146                 skip = 0;
1147             }
1148
1149             /* second, look at the current position */
1150             bits = get_bit(g_scalar, i + 168) << 3;
1151             bits |= get_bit(g_scalar, i + 112) << 2;
1152             bits |= get_bit(g_scalar, i + 56) << 1;
1153             bits |= get_bit(g_scalar, i);
1154             /* select the point to add, in constant time */
1155             select_point(bits, 16, g_pre_comp[0], tmp);
1156             point_add(nq[0], nq[1], nq[2],
1157                       nq[0], nq[1], nq[2],
1158                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1159         }
1160
1161         /* do other additions every 5 doublings */
1162         if (num_points && (i % 5 == 0)) {
1163             /* loop over all scalars */
1164             for (num = 0; num < num_points; ++num) {
1165                 bits = get_bit(scalars[num], i + 4) << 5;
1166                 bits |= get_bit(scalars[num], i + 3) << 4;
1167                 bits |= get_bit(scalars[num], i + 2) << 3;
1168                 bits |= get_bit(scalars[num], i + 1) << 2;
1169                 bits |= get_bit(scalars[num], i) << 1;
1170                 bits |= get_bit(scalars[num], i - 1);
1171                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1172
1173                 /* select the point to add or subtract */
1174                 select_point(digit, 17, pre_comp[num], tmp);
1175                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1176                                             * point */
1177                 copy_conditional(tmp[1], tmp[3], sign);
1178
1179                 if (!skip) {
1180                     point_add(nq[0], nq[1], nq[2],
1181                               nq[0], nq[1], nq[2],
1182                               mixed, tmp[0], tmp[1], tmp[2]);
1183                 } else {
1184                     memcpy(nq, tmp, 3 * sizeof(felem));
1185                     skip = 0;
1186                 }
1187             }
1188         }
1189     }
1190     felem_assign(x_out, nq[0]);
1191     felem_assign(y_out, nq[1]);
1192     felem_assign(z_out, nq[2]);
1193 }
1194
1195 /******************************************************************************/
1196 /*
1197  * FUNCTIONS TO MANAGE PRECOMPUTATION
1198  */
1199
1200 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1201 {
1202     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1203
1204     if (!ret) {
1205         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1206         return ret;
1207     }
1208     ret->references = 1;
1209     return ret;
1210 }
1211
1212 static void *nistp224_pre_comp_dup(void *src_)
1213 {
1214     NISTP224_PRE_COMP *src = src_;
1215
1216     /* no need to actually copy, these objects never change! */
1217     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1218
1219     return src_;
1220 }
1221
1222 static void nistp224_pre_comp_free(void *pre_)
1223 {
1224     int i;
1225     NISTP224_PRE_COMP *pre = pre_;
1226
1227     if (!pre)
1228         return;
1229
1230     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1231     if (i > 0)
1232         return;
1233
1234     OPENSSL_free(pre);
1235 }
1236
1237 static void nistp224_pre_comp_clear_free(void *pre_)
1238 {
1239     int i;
1240     NISTP224_PRE_COMP *pre = pre_;
1241
1242     if (!pre)
1243         return;
1244
1245     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1246     if (i > 0)
1247         return;
1248
1249     OPENSSL_clear_free(pre, sizeof(*pre));
1250 }
1251
1252 /******************************************************************************/
1253 /*
1254  * OPENSSL EC_METHOD FUNCTIONS
1255  */
1256
1257 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1258 {
1259     int ret;
1260     ret = ec_GFp_simple_group_init(group);
1261     group->a_is_minus3 = 1;
1262     return ret;
1263 }
1264
1265 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1266                                     const BIGNUM *a, const BIGNUM *b,
1267                                     BN_CTX *ctx)
1268 {
1269     int ret = 0;
1270     BN_CTX *new_ctx = NULL;
1271     BIGNUM *curve_p, *curve_a, *curve_b;
1272
1273     if (ctx == NULL)
1274         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1275             return 0;
1276     BN_CTX_start(ctx);
1277     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1278         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1279         ((curve_b = BN_CTX_get(ctx)) == NULL))
1280         goto err;
1281     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1282     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1283     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1284     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1285         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1286               EC_R_WRONG_CURVE_PARAMETERS);
1287         goto err;
1288     }
1289     group->field_mod_func = BN_nist_mod_224;
1290     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1291  err:
1292     BN_CTX_end(ctx);
1293     BN_CTX_free(new_ctx);
1294     return ret;
1295 }
1296
1297 /*
1298  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1299  * (X/Z^2, Y/Z^3)
1300  */
1301 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1302                                                  const EC_POINT *point,
1303                                                  BIGNUM *x, BIGNUM *y,
1304                                                  BN_CTX *ctx)
1305 {
1306     felem z1, z2, x_in, y_in, x_out, y_out;
1307     widefelem tmp;
1308
1309     if (EC_POINT_is_at_infinity(group, point)) {
1310         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1311               EC_R_POINT_AT_INFINITY);
1312         return 0;
1313     }
1314     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1315         (!BN_to_felem(z1, point->Z)))
1316         return 0;
1317     felem_inv(z2, z1);
1318     felem_square(tmp, z2);
1319     felem_reduce(z1, tmp);
1320     felem_mul(tmp, x_in, z1);
1321     felem_reduce(x_in, tmp);
1322     felem_contract(x_out, x_in);
1323     if (x != NULL) {
1324         if (!felem_to_BN(x, x_out)) {
1325             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1326                   ERR_R_BN_LIB);
1327             return 0;
1328         }
1329     }
1330     felem_mul(tmp, z1, z2);
1331     felem_reduce(z1, tmp);
1332     felem_mul(tmp, y_in, z1);
1333     felem_reduce(y_in, tmp);
1334     felem_contract(y_out, y_in);
1335     if (y != NULL) {
1336         if (!felem_to_BN(y, y_out)) {
1337             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1338                   ERR_R_BN_LIB);
1339             return 0;
1340         }
1341     }
1342     return 1;
1343 }
1344
1345 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1346                                felem tmp_felems[ /* num+1 */ ])
1347 {
1348     /*
1349      * Runs in constant time, unless an input is the point at infinity (which
1350      * normally shouldn't happen).
1351      */
1352     ec_GFp_nistp_points_make_affine_internal(num,
1353                                              points,
1354                                              sizeof(felem),
1355                                              tmp_felems,
1356                                              (void (*)(void *))felem_one,
1357                                              (int (*)(const void *))
1358                                              felem_is_zero_int,
1359                                              (void (*)(void *, const void *))
1360                                              felem_assign,
1361                                              (void (*)(void *, const void *))
1362                                              felem_square_reduce, (void (*)
1363                                                                    (void *,
1364                                                                     const void
1365                                                                     *,
1366                                                                     const void
1367                                                                     *))
1368                                              felem_mul_reduce,
1369                                              (void (*)(void *, const void *))
1370                                              felem_inv,
1371                                              (void (*)(void *, const void *))
1372                                              felem_contract);
1373 }
1374
1375 /*
1376  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1377  * values Result is stored in r (r can equal one of the inputs).
1378  */
1379 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1380                                const BIGNUM *scalar, size_t num,
1381                                const EC_POINT *points[],
1382                                const BIGNUM *scalars[], BN_CTX *ctx)
1383 {
1384     int ret = 0;
1385     int j;
1386     unsigned i;
1387     int mixed = 0;
1388     BN_CTX *new_ctx = NULL;
1389     BIGNUM *x, *y, *z, *tmp_scalar;
1390     felem_bytearray g_secret;
1391     felem_bytearray *secrets = NULL;
1392     felem (*pre_comp)[17][3] = NULL;
1393     felem *tmp_felems = NULL;
1394     felem_bytearray tmp;
1395     unsigned num_bytes;
1396     int have_pre_comp = 0;
1397     size_t num_points = num;
1398     felem x_in, y_in, z_in, x_out, y_out, z_out;
1399     NISTP224_PRE_COMP *pre = NULL;
1400     const felem(*g_pre_comp)[16][3] = NULL;
1401     EC_POINT *generator = NULL;
1402     const EC_POINT *p = NULL;
1403     const BIGNUM *p_scalar = NULL;
1404
1405     if (ctx == NULL)
1406         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1407             return 0;
1408     BN_CTX_start(ctx);
1409     if (((x = BN_CTX_get(ctx)) == NULL) ||
1410         ((y = BN_CTX_get(ctx)) == NULL) ||
1411         ((z = BN_CTX_get(ctx)) == NULL) ||
1412         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1413         goto err;
1414
1415     if (scalar != NULL) {
1416         pre = EC_EX_DATA_get_data(group->extra_data,
1417                                   nistp224_pre_comp_dup,
1418                                   nistp224_pre_comp_free,
1419                                   nistp224_pre_comp_clear_free);
1420         if (pre)
1421             /* we have precomputation, try to use it */
1422             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1423         else
1424             /* try to use the standard precomputation */
1425             g_pre_comp = &gmul[0];
1426         generator = EC_POINT_new(group);
1427         if (generator == NULL)
1428             goto err;
1429         /* get the generator from precomputation */
1430         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1431             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1432             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1433             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1434             goto err;
1435         }
1436         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1437                                                       generator, x, y, z,
1438                                                       ctx))
1439             goto err;
1440         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1441             /* precomputation matches generator */
1442             have_pre_comp = 1;
1443         else
1444             /*
1445              * we don't have valid precomputation: treat the generator as a
1446              * random point
1447              */
1448             num_points = num_points + 1;
1449     }
1450
1451     if (num_points > 0) {
1452         if (num_points >= 3) {
1453             /*
1454              * unless we precompute multiples for just one or two points,
1455              * converting those into affine form is time well spent
1456              */
1457             mixed = 1;
1458         }
1459         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1460         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1461         if (mixed)
1462             tmp_felems =
1463                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1464         if ((secrets == NULL) || (pre_comp == NULL)
1465             || (mixed && (tmp_felems == NULL))) {
1466             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1467             goto err;
1468         }
1469
1470         /*
1471          * we treat NULL scalars as 0, and NULL points as points at infinity,
1472          * i.e., they contribute nothing to the linear combination
1473          */
1474         for (i = 0; i < num_points; ++i) {
1475             if (i == num)
1476                 /* the generator */
1477             {
1478                 p = EC_GROUP_get0_generator(group);
1479                 p_scalar = scalar;
1480             } else
1481                 /* the i^th point */
1482             {
1483                 p = points[i];
1484                 p_scalar = scalars[i];
1485             }
1486             if ((p_scalar != NULL) && (p != NULL)) {
1487                 /* reduce scalar to 0 <= scalar < 2^224 */
1488                 if ((BN_num_bits(p_scalar) > 224)
1489                     || (BN_is_negative(p_scalar))) {
1490                     /*
1491                      * this is an unusual input, and we don't guarantee
1492                      * constant-timeness
1493                      */
1494                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1495                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1496                         goto err;
1497                     }
1498                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1499                 } else
1500                     num_bytes = BN_bn2bin(p_scalar, tmp);
1501                 flip_endian(secrets[i], tmp, num_bytes);
1502                 /* precompute multiples */
1503                 if ((!BN_to_felem(x_out, p->X)) ||
1504                     (!BN_to_felem(y_out, p->Y)) ||
1505                     (!BN_to_felem(z_out, p->Z)))
1506                     goto err;
1507                 felem_assign(pre_comp[i][1][0], x_out);
1508                 felem_assign(pre_comp[i][1][1], y_out);
1509                 felem_assign(pre_comp[i][1][2], z_out);
1510                 for (j = 2; j <= 16; ++j) {
1511                     if (j & 1) {
1512                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1513                                   pre_comp[i][j][2], pre_comp[i][1][0],
1514                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1515                                   pre_comp[i][j - 1][0],
1516                                   pre_comp[i][j - 1][1],
1517                                   pre_comp[i][j - 1][2]);
1518                     } else {
1519                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1520                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1521                                      pre_comp[i][j / 2][1],
1522                                      pre_comp[i][j / 2][2]);
1523                     }
1524                 }
1525             }
1526         }
1527         if (mixed)
1528             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1529     }
1530
1531     /* the scalar for the generator */
1532     if ((scalar != NULL) && (have_pre_comp)) {
1533         memset(g_secret, 0, sizeof(g_secret));
1534         /* reduce scalar to 0 <= scalar < 2^224 */
1535         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1536             /*
1537              * this is an unusual input, and we don't guarantee
1538              * constant-timeness
1539              */
1540             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1541                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1542                 goto err;
1543             }
1544             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1545         } else
1546             num_bytes = BN_bn2bin(scalar, tmp);
1547         flip_endian(g_secret, tmp, num_bytes);
1548         /* do the multiplication with generator precomputation */
1549         batch_mul(x_out, y_out, z_out,
1550                   (const felem_bytearray(*))secrets, num_points,
1551                   g_secret,
1552                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1553     } else
1554         /* do the multiplication without generator precomputation */
1555         batch_mul(x_out, y_out, z_out,
1556                   (const felem_bytearray(*))secrets, num_points,
1557                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1558     /* reduce the output to its unique minimal representation */
1559     felem_contract(x_in, x_out);
1560     felem_contract(y_in, y_out);
1561     felem_contract(z_in, z_out);
1562     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1563         (!felem_to_BN(z, z_in))) {
1564         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1565         goto err;
1566     }
1567     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1568
1569  err:
1570     BN_CTX_end(ctx);
1571     EC_POINT_free(generator);
1572     BN_CTX_free(new_ctx);
1573     OPENSSL_free(secrets);
1574     OPENSSL_free(pre_comp);
1575     OPENSSL_free(tmp_felems);
1576     return ret;
1577 }
1578
1579 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1580 {
1581     int ret = 0;
1582     NISTP224_PRE_COMP *pre = NULL;
1583     int i, j;
1584     BN_CTX *new_ctx = NULL;
1585     BIGNUM *x, *y;
1586     EC_POINT *generator = NULL;
1587     felem tmp_felems[32];
1588
1589     /* throw away old precomputation */
1590     EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup,
1591                          nistp224_pre_comp_free,
1592                          nistp224_pre_comp_clear_free);
1593     if (ctx == NULL)
1594         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1595             return 0;
1596     BN_CTX_start(ctx);
1597     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1598         goto err;
1599     /* get the generator */
1600     if (group->generator == NULL)
1601         goto err;
1602     generator = EC_POINT_new(group);
1603     if (generator == NULL)
1604         goto err;
1605     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1606     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1607     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1608         goto err;
1609     if ((pre = nistp224_pre_comp_new()) == NULL)
1610         goto err;
1611     /*
1612      * if the generator is the standard one, use built-in precomputation
1613      */
1614     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1615         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1616         ret = 1;
1617         goto err;
1618     }
1619     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1620         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1621         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1622         goto err;
1623     /*
1624      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1625      * 2^140*G, 2^196*G for the second one
1626      */
1627     for (i = 1; i <= 8; i <<= 1) {
1628         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1629                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1630                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1631         for (j = 0; j < 27; ++j) {
1632             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1633                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1634                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1635         }
1636         if (i == 8)
1637             break;
1638         point_double(pre->g_pre_comp[0][2 * i][0],
1639                      pre->g_pre_comp[0][2 * i][1],
1640                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1641                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1642         for (j = 0; j < 27; ++j) {
1643             point_double(pre->g_pre_comp[0][2 * i][0],
1644                          pre->g_pre_comp[0][2 * i][1],
1645                          pre->g_pre_comp[0][2 * i][2],
1646                          pre->g_pre_comp[0][2 * i][0],
1647                          pre->g_pre_comp[0][2 * i][1],
1648                          pre->g_pre_comp[0][2 * i][2]);
1649         }
1650     }
1651     for (i = 0; i < 2; i++) {
1652         /* g_pre_comp[i][0] is the point at infinity */
1653         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1654         /* the remaining multiples */
1655         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1656         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1657                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1658                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1659                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1660                   pre->g_pre_comp[i][2][2]);
1661         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1662         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1663                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1664                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1665                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1666                   pre->g_pre_comp[i][2][2]);
1667         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1668         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1669                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1670                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1671                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1672                   pre->g_pre_comp[i][4][2]);
1673         /*
1674          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1675          */
1676         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1677                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1678                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1679                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1680                   pre->g_pre_comp[i][2][2]);
1681         for (j = 1; j < 8; ++j) {
1682             /* odd multiples: add G resp. 2^28*G */
1683             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1684                       pre->g_pre_comp[i][2 * j + 1][1],
1685                       pre->g_pre_comp[i][2 * j + 1][2],
1686                       pre->g_pre_comp[i][2 * j][0],
1687                       pre->g_pre_comp[i][2 * j][1],
1688                       pre->g_pre_comp[i][2 * j][2], 0,
1689                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1690                       pre->g_pre_comp[i][1][2]);
1691         }
1692     }
1693     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1694
1695     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
1696                              nistp224_pre_comp_free,
1697                              nistp224_pre_comp_clear_free))
1698         goto err;
1699     ret = 1;
1700     pre = NULL;
1701  err:
1702     BN_CTX_end(ctx);
1703     EC_POINT_free(generator);
1704     BN_CTX_free(new_ctx);
1705     nistp224_pre_comp_free(pre);
1706     return ret;
1707 }
1708
1709 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1710 {
1711     if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup,
1712                             nistp224_pre_comp_free,
1713                             nistp224_pre_comp_clear_free)
1714         != NULL)
1715         return 1;
1716     else
1717         return 0;
1718 }
1719
1720 #else
1721 static void *dummy = &dummy;
1722 #endif