crypto/ec/ecp_nistp224.c

   1 /*
   2  * Written by Emilia Kasper (Google) for the OpenSSL project.
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  22  *
  23  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  24  * and Adam Langley's public domain 64-bit C implementation of curve25519
  25  */
  26
  27 #include <openssl/opensslconf.h>
  28 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  29 NON_EMPTY_TRANSLATION_UNIT
  30 #else
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 # else
  42 #  error "Need GCC 3.1 or later to define type uint128_t"
  43 # endif
  44
  45 typedef uint8_t u8;
  46 typedef uint64_t u64;
  47 typedef int64_t s64;
  48
  49 /******************************************************************************/
  50 /*-
  51  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  52  *
  53  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  54  * using 64-bit coefficients called 'limbs',
  55  * and sometimes (for multiplication results) as
  56  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  57  * using 128-bit coefficients called 'widelimbs'.
  58  * A 4-limb representation is an 'felem';
  59  * a 7-widelimb representation is a 'widefelem'.
  60  * Even within felems, bits of adjacent limbs overlap, and we don't always
  61  * reduce the representations: we ensure that inputs to each felem
  62  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  63  * and fit into a 128-bit word without overflow. The coefficients are then
  64  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  65  * We only reduce to the unique minimal representation at the end of the
  66  * computation.
  67  */
  68
  69 typedef uint64_t limb;
  70 typedef uint128_t widelimb;
  71
  72 typedef limb felem[4];
  73 typedef widelimb widefelem[7];
  74
  75 /*
  76  * Field element represented as a byte arrary. 28*8 = 224 bits is also the
  77  * group order size for the elliptic curve, and we also use this type for
  78  * scalars for point multiplication.
  79  */
  80 typedef u8 felem_bytearray[28];
  81
  82 static const felem_bytearray nistp224_curve_params[5] = {
  83     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  84      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  85      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  86     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  87      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  88      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  89     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  90      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  91      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  92     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  93      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
  94      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
  95     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
  96      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
  97      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
  98 };
  99
 100 /*-
 101  * Precomputed multiples of the standard generator
 102  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 103  * (0 for the point at infinity).
 104  * For each field element, slice a_0 is word 0, etc.
 105  *
 106  * The table has 2 * 16 elements, starting with the following:
 107  * index | bits    | point
 108  * ------+---------+------------------------------
 109  *     0 | 0 0 0 0 | 0G
 110  *     1 | 0 0 0 1 | 1G
 111  *     2 | 0 0 1 0 | 2^56G
 112  *     3 | 0 0 1 1 | (2^56 + 1)G
 113  *     4 | 0 1 0 0 | 2^112G
 114  *     5 | 0 1 0 1 | (2^112 + 1)G
 115  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 116  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 117  *     8 | 1 0 0 0 | 2^168G
 118  *     9 | 1 0 0 1 | (2^168 + 1)G
 119  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 120  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 121  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 122  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 123  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 124  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 125  * followed by a copy of this with each element multiplied by 2^28.
 126  *
 127  * The reason for this is so that we can clock bits into four different
 128  * locations when doing simple scalar multiplies against the base point,
 129  * and then another four locations using the second 16 elements.
 130  */
 131 static const felem gmul[2][16][3] = {
 132 {{{0, 0, 0, 0},
 133   {0, 0, 0, 0},
 134   {0, 0, 0, 0}},
 135  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 136   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 137   {1, 0, 0, 0}},
 138  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 139   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 140   {1, 0, 0, 0}},
 141  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 142   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 143   {1, 0, 0, 0}},
 144  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 145   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 146   {1, 0, 0, 0}},
 147  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 148   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 149   {1, 0, 0, 0}},
 150  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 151   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 152   {1, 0, 0, 0}},
 153  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 154   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 155   {1, 0, 0, 0}},
 156  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 157   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 158   {1, 0, 0, 0}},
 159  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 160   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 161   {1, 0, 0, 0}},
 162  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 163   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 164   {1, 0, 0, 0}},
 165  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 166   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 167   {1, 0, 0, 0}},
 168  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 169   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 170   {1, 0, 0, 0}},
 171  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 172   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 173   {1, 0, 0, 0}},
 174  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 175   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 176   {1, 0, 0, 0}},
 177  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 178   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 179   {1, 0, 0, 0}}},
 180 {{{0, 0, 0, 0},
 181   {0, 0, 0, 0},
 182   {0, 0, 0, 0}},
 183  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 184   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 185   {1, 0, 0, 0}},
 186  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 187   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 188   {1, 0, 0, 0}},
 189  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 190   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 191   {1, 0, 0, 0}},
 192  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 193   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 194   {1, 0, 0, 0}},
 195  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 196   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 197   {1, 0, 0, 0}},
 198  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 199   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 200   {1, 0, 0, 0}},
 201  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 202   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 203   {1, 0, 0, 0}},
 204  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 205   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 206   {1, 0, 0, 0}},
 207  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 208   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 209   {1, 0, 0, 0}},
 210  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 211   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 212   {1, 0, 0, 0}},
 213  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 214   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 215   {1, 0, 0, 0}},
 216  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 217   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 218   {1, 0, 0, 0}},
 219  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 220   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 221   {1, 0, 0, 0}},
 222  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 223   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 224   {1, 0, 0, 0}},
 225  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 226   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 227   {1, 0, 0, 0}}}
 228 };
 229
 230 /* Precomputation for the group generator. */
 231 struct nistp224_pre_comp_st {
 232     felem g_pre_comp[2][16][3];
 233     int references;
 234 };
 235
 236 const EC_METHOD *EC_GFp_nistp224_method(void)
 237 {
 238     static const EC_METHOD ret = {
 239         EC_FLAGS_DEFAULT_OCT,
 240         NID_X9_62_prime_field,
 241         ec_GFp_nistp224_group_init,
 242         ec_GFp_simple_group_finish,
 243         ec_GFp_simple_group_clear_finish,
 244         ec_GFp_nist_group_copy,
 245         ec_GFp_nistp224_group_set_curve,
 246         ec_GFp_simple_group_get_curve,
 247         ec_GFp_simple_group_get_degree,
 248         0, /* group_order_bits */
 249         ec_GFp_simple_group_check_discriminant,
 250         ec_GFp_simple_point_init,
 251         ec_GFp_simple_point_finish,
 252         ec_GFp_simple_point_clear_finish,
 253         ec_GFp_simple_point_copy,
 254         ec_GFp_simple_point_set_to_infinity,
 255         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 256         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 257         ec_GFp_simple_point_set_affine_coordinates,
 258         ec_GFp_nistp224_point_get_affine_coordinates,
 259         0 /* point_set_compressed_coordinates */ ,
 260         0 /* point2oct */ ,
 261         0 /* oct2point */ ,
 262         ec_GFp_simple_add,
 263         ec_GFp_simple_dbl,
 264         ec_GFp_simple_invert,
 265         ec_GFp_simple_is_at_infinity,
 266         ec_GFp_simple_is_on_curve,
 267         ec_GFp_simple_cmp,
 268         ec_GFp_simple_make_affine,
 269         ec_GFp_simple_points_make_affine,
 270         ec_GFp_nistp224_points_mul,
 271         ec_GFp_nistp224_precompute_mult,
 272         ec_GFp_nistp224_have_precompute_mult,
 273         ec_GFp_nist_field_mul,
 274         ec_GFp_nist_field_sqr,
 275         0 /* field_div */ ,
 276         0 /* field_encode */ ,
 277         0 /* field_decode */ ,
 278         0                       /* field_set_to_one */
 279     };
 280
 281     return &ret;
 282 }
 283
 284 /*
 285  * Helper functions to convert field elements to/from internal representation
 286  */
 287 static void bin28_to_felem(felem out, const u8 in[28])
 288 {
 289     out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
 290     out[1] = (*((const uint64_t *)(in + 7))) & 0x00ffffffffffffff;
 291     out[2] = (*((const uint64_t *)(in + 14))) & 0x00ffffffffffffff;
 292     out[3] = (*((const uint64_t *)(in+20))) >> 8;
 293 }
 294
 295 static void felem_to_bin28(u8 out[28], const felem in)
 296 {
 297     unsigned i;
 298     for (i = 0; i < 7; ++i) {
 299         out[i] = in[0] >> (8 * i);
 300         out[i + 7] = in[1] >> (8 * i);
 301         out[i + 14] = in[2] >> (8 * i);
 302         out[i + 21] = in[3] >> (8 * i);
 303     }
 304 }
 305
 306 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 307 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 308 {
 309     unsigned i;
 310     for (i = 0; i < len; ++i)
 311         out[i] = in[len - 1 - i];
 312 }
 313
 314 /* From OpenSSL BIGNUM to internal representation */
 315 static int BN_to_felem(felem out, const BIGNUM *bn)
 316 {
 317     felem_bytearray b_in;
 318     felem_bytearray b_out;
 319     unsigned num_bytes;
 320
 321     /* BN_bn2bin eats leading zeroes */
 322     memset(b_out, 0, sizeof(b_out));
 323     num_bytes = BN_num_bytes(bn);
 324     if (num_bytes > sizeof b_out) {
 325         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 326         return 0;
 327     }
 328     if (BN_is_negative(bn)) {
 329         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 330         return 0;
 331     }
 332     num_bytes = BN_bn2bin(bn, b_in);
 333     flip_endian(b_out, b_in, num_bytes);
 334     bin28_to_felem(out, b_out);
 335     return 1;
 336 }
 337
 338 /* From internal representation to OpenSSL BIGNUM */
 339 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 340 {
 341     felem_bytearray b_in, b_out;
 342     felem_to_bin28(b_in, in);
 343     flip_endian(b_out, b_in, sizeof b_out);
 344     return BN_bin2bn(b_out, sizeof b_out, out);
 345 }
 346
 347 /******************************************************************************/
 348 /*-
 349  *                              FIELD OPERATIONS
 350  *
 351  * Field operations, using the internal representation of field elements.
 352  * NB! These operations are specific to our point multiplication and cannot be
 353  * expected to be correct in general - e.g., multiplication with a large scalar
 354  * will cause an overflow.
 355  *
 356  */
 357
 358 static void felem_one(felem out)
 359 {
 360     out[0] = 1;
 361     out[1] = 0;
 362     out[2] = 0;
 363     out[3] = 0;
 364 }
 365
 366 static void felem_assign(felem out, const felem in)
 367 {
 368     out[0] = in[0];
 369     out[1] = in[1];
 370     out[2] = in[2];
 371     out[3] = in[3];
 372 }
 373
 374 /* Sum two field elements: out += in */
 375 static void felem_sum(felem out, const felem in)
 376 {
 377     out[0] += in[0];
 378     out[1] += in[1];
 379     out[2] += in[2];
 380     out[3] += in[3];
 381 }
 382
 383 /* Get negative value: out = -in */
 384 /* Assumes in[i] < 2^57 */
 385 static void felem_neg(felem out, const felem in)
 386 {
 387     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 388     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 389     static const limb two58m42m2 = (((limb) 1) << 58) -
 390         (((limb) 1) << 42) - (((limb) 1) << 2);
 391
 392     /* Set to 0 mod 2^224-2^96+1 to ensure out > in */
 393     out[0] = two58p2 - in[0];
 394     out[1] = two58m42m2 - in[1];
 395     out[2] = two58m2 - in[2];
 396     out[3] = two58m2 - in[3];
 397 }
 398
 399 /* Subtract field elements: out -= in */
 400 /* Assumes in[i] < 2^57 */
 401 static void felem_diff(felem out, const felem in)
 402 {
 403     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 404     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 405     static const limb two58m42m2 = (((limb) 1) << 58) -
 406         (((limb) 1) << 42) - (((limb) 1) << 2);
 407
 408     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 409     out[0] += two58p2;
 410     out[1] += two58m42m2;
 411     out[2] += two58m2;
 412     out[3] += two58m2;
 413
 414     out[0] -= in[0];
 415     out[1] -= in[1];
 416     out[2] -= in[2];
 417     out[3] -= in[3];
 418 }
 419
 420 /* Subtract in unreduced 128-bit mode: out -= in */
 421 /* Assumes in[i] < 2^119 */
 422 static void widefelem_diff(widefelem out, const widefelem in)
 423 {
 424     static const widelimb two120 = ((widelimb) 1) << 120;
 425     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 426         (((widelimb) 1) << 64);
 427     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 428         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 429
 430     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 431     out[0] += two120;
 432     out[1] += two120m64;
 433     out[2] += two120m64;
 434     out[3] += two120;
 435     out[4] += two120m104m64;
 436     out[5] += two120m64;
 437     out[6] += two120m64;
 438
 439     out[0] -= in[0];
 440     out[1] -= in[1];
 441     out[2] -= in[2];
 442     out[3] -= in[3];
 443     out[4] -= in[4];
 444     out[5] -= in[5];
 445     out[6] -= in[6];
 446 }
 447
 448 /* Subtract in mixed mode: out128 -= in64 */
 449 /* in[i] < 2^63 */
 450 static void felem_diff_128_64(widefelem out, const felem in)
 451 {
 452     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 453         (((widelimb) 1) << 8);
 454     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 455         (((widelimb) 1) << 8);
 456     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 457         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 458
 459     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 460     out[0] += two64p8;
 461     out[1] += two64m48m8;
 462     out[2] += two64m8;
 463     out[3] += two64m8;
 464
 465     out[0] -= in[0];
 466     out[1] -= in[1];
 467     out[2] -= in[2];
 468     out[3] -= in[3];
 469 }
 470
 471 /*
 472  * Multiply a field element by a scalar: out = out * scalar The scalars we
 473  * actually use are small, so results fit without overflow
 474  */
 475 static void felem_scalar(felem out, const limb scalar)
 476 {
 477     out[0] *= scalar;
 478     out[1] *= scalar;
 479     out[2] *= scalar;
 480     out[3] *= scalar;
 481 }
 482
 483 /*
 484  * Multiply an unreduced field element by a scalar: out = out * scalar The
 485  * scalars we actually use are small, so results fit without overflow
 486  */
 487 static void widefelem_scalar(widefelem out, const widelimb scalar)
 488 {
 489     out[0] *= scalar;
 490     out[1] *= scalar;
 491     out[2] *= scalar;
 492     out[3] *= scalar;
 493     out[4] *= scalar;
 494     out[5] *= scalar;
 495     out[6] *= scalar;
 496 }
 497
 498 /* Square a field element: out = in^2 */
 499 static void felem_square(widefelem out, const felem in)
 500 {
 501     limb tmp0, tmp1, tmp2;
 502     tmp0 = 2 * in[0];
 503     tmp1 = 2 * in[1];
 504     tmp2 = 2 * in[2];
 505     out[0] = ((widelimb) in[0]) * in[0];
 506     out[1] = ((widelimb) in[0]) * tmp1;
 507     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 508     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 509     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 510     out[5] = ((widelimb) in[3]) * tmp2;
 511     out[6] = ((widelimb) in[3]) * in[3];
 512 }
 513
 514 /* Multiply two field elements: out = in1 * in2 */
 515 static void felem_mul(widefelem out, const felem in1, const felem in2)
 516 {
 517     out[0] = ((widelimb) in1[0]) * in2[0];
 518     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 519     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 520              ((widelimb) in1[2]) * in2[0];
 521     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 522              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 523     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 524              ((widelimb) in1[3]) * in2[1];
 525     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 526     out[6] = ((widelimb) in1[3]) * in2[3];
 527 }
 528
 529 /*-
 530  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 531  * Requires in[i] < 2^126,
 532  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 533 static void felem_reduce(felem out, const widefelem in)
 534 {
 535     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 536         (((widelimb) 1) << 15);
 537     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 538         (((widelimb) 1) << 71);
 539     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 540         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 541     widelimb output[5];
 542
 543     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 544     output[0] = in[0] + two127p15;
 545     output[1] = in[1] + two127m71m55;
 546     output[2] = in[2] + two127m71;
 547     output[3] = in[3];
 548     output[4] = in[4];
 549
 550     /* Eliminate in[4], in[5], in[6] */
 551     output[4] += in[6] >> 16;
 552     output[3] += (in[6] & 0xffff) << 40;
 553     output[2] -= in[6];
 554
 555     output[3] += in[5] >> 16;
 556     output[2] += (in[5] & 0xffff) << 40;
 557     output[1] -= in[5];
 558
 559     output[2] += output[4] >> 16;
 560     output[1] += (output[4] & 0xffff) << 40;
 561     output[0] -= output[4];
 562
 563     /* Carry 2 -> 3 -> 4 */
 564     output[3] += output[2] >> 56;
 565     output[2] &= 0x00ffffffffffffff;
 566
 567     output[4] = output[3] >> 56;
 568     output[3] &= 0x00ffffffffffffff;
 569
 570     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 571
 572     /* Eliminate output[4] */
 573     output[2] += output[4] >> 16;
 574     /* output[2] < 2^56 + 2^56 = 2^57 */
 575     output[1] += (output[4] & 0xffff) << 40;
 576     output[0] -= output[4];
 577
 578     /* Carry 0 -> 1 -> 2 -> 3 */
 579     output[1] += output[0] >> 56;
 580     out[0] = output[0] & 0x00ffffffffffffff;
 581
 582     output[2] += output[1] >> 56;
 583     /* output[2] < 2^57 + 2^72 */
 584     out[1] = output[1] & 0x00ffffffffffffff;
 585     output[3] += output[2] >> 56;
 586     /* output[3] <= 2^56 + 2^16 */
 587     out[2] = output[2] & 0x00ffffffffffffff;
 588
 589     /*-
 590      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 591      * out[3] <= 2^56 + 2^16 (due to final carry),
 592      * so out < 2*p
 593      */
 594     out[3] = output[3];
 595 }
 596
 597 static void felem_square_reduce(felem out, const felem in)
 598 {
 599     widefelem tmp;
 600     felem_square(tmp, in);
 601     felem_reduce(out, tmp);
 602 }
 603
 604 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 605 {
 606     widefelem tmp;
 607     felem_mul(tmp, in1, in2);
 608     felem_reduce(out, tmp);
 609 }
 610
 611 /*
 612  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 613  * call felem_reduce first)
 614  */
 615 static void felem_contract(felem out, const felem in)
 616 {
 617     static const int64_t two56 = ((limb) 1) << 56;
 618     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 619     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 620     int64_t tmp[4], a;
 621     tmp[0] = in[0];
 622     tmp[1] = in[1];
 623     tmp[2] = in[2];
 624     tmp[3] = in[3];
 625     /* Case 1: a = 1 iff in >= 2^224 */
 626     a = (in[3] >> 56);
 627     tmp[0] -= a;
 628     tmp[1] += a << 40;
 629     tmp[3] &= 0x00ffffffffffffff;
 630     /*
 631      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 632      * and the lower part is non-zero
 633      */
 634     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 635         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 636     a &= 0x00ffffffffffffff;
 637     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 638     a = (a - 1) >> 63;
 639     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 640     tmp[3] &= a ^ 0xffffffffffffffff;
 641     tmp[2] &= a ^ 0xffffffffffffffff;
 642     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 643     tmp[0] -= 1 & a;
 644
 645     /*
 646      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 647      * non-zero, so we only need one step
 648      */
 649     a = tmp[0] >> 63;
 650     tmp[0] += two56 & a;
 651     tmp[1] -= 1 & a;
 652
 653     /* carry 1 -> 2 -> 3 */
 654     tmp[2] += tmp[1] >> 56;
 655     tmp[1] &= 0x00ffffffffffffff;
 656
 657     tmp[3] += tmp[2] >> 56;
 658     tmp[2] &= 0x00ffffffffffffff;
 659
 660     /* Now 0 <= out < p */
 661     out[0] = tmp[0];
 662     out[1] = tmp[1];
 663     out[2] = tmp[2];
 664     out[3] = tmp[3];
 665 }
 666
 667 /*
 668  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 669  * elements are reduced to in < 2^225, so we only need to check three cases:
 670  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 671  */
 672 static limb felem_is_zero(const felem in)
 673 {
 674     limb zero, two224m96p1, two225m97p2;
 675
 676     zero = in[0] | in[1] | in[2] | in[3];
 677     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 678     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 679         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 680     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 681     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 682         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 683     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 684     return (zero | two224m96p1 | two225m97p2);
 685 }
 686
 687 static limb felem_is_zero_int(const felem in)
 688 {
 689     return (int)(felem_is_zero(in) & ((limb) 1));
 690 }
 691
 692 /* Invert a field element */
 693 /* Computation chain copied from djb's code */
 694 static void felem_inv(felem out, const felem in)
 695 {
 696     felem ftmp, ftmp2, ftmp3, ftmp4;
 697     widefelem tmp;
 698     unsigned i;
 699
 700     felem_square(tmp, in);
 701     felem_reduce(ftmp, tmp);    /* 2 */
 702     felem_mul(tmp, in, ftmp);
 703     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 704     felem_square(tmp, ftmp);
 705     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 706     felem_mul(tmp, in, ftmp);
 707     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 708     felem_square(tmp, ftmp);
 709     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 712     felem_square(tmp, ftmp2);
 713     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 714     felem_mul(tmp, ftmp2, ftmp);
 715     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 716     felem_square(tmp, ftmp);
 717     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 718     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 719         felem_square(tmp, ftmp2);
 720         felem_reduce(ftmp2, tmp);
 721     }
 722     felem_mul(tmp, ftmp2, ftmp);
 723     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 724     felem_square(tmp, ftmp2);
 725     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 726     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 727         felem_square(tmp, ftmp3);
 728         felem_reduce(ftmp3, tmp);
 729     }
 730     felem_mul(tmp, ftmp3, ftmp2);
 731     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 732     felem_square(tmp, ftmp2);
 733     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 734     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 735         felem_square(tmp, ftmp3);
 736         felem_reduce(ftmp3, tmp);
 737     }
 738     felem_mul(tmp, ftmp3, ftmp2);
 739     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 740     felem_square(tmp, ftmp3);
 741     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 742     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 743         felem_square(tmp, ftmp4);
 744         felem_reduce(ftmp4, tmp);
 745     }
 746     felem_mul(tmp, ftmp3, ftmp4);
 747     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 748     felem_square(tmp, ftmp3);
 749     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 750     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 751         felem_square(tmp, ftmp4);
 752         felem_reduce(ftmp4, tmp);
 753     }
 754     felem_mul(tmp, ftmp2, ftmp4);
 755     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 756     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 757         felem_square(tmp, ftmp2);
 758         felem_reduce(ftmp2, tmp);
 759     }
 760     felem_mul(tmp, ftmp2, ftmp);
 761     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 762     felem_square(tmp, ftmp);
 763     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 764     felem_mul(tmp, ftmp, in);
 765     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 766     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 767         felem_square(tmp, ftmp);
 768         felem_reduce(ftmp, tmp);
 769     }
 770     felem_mul(tmp, ftmp, ftmp3);
 771     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 772 }
 773
 774 /*
 775  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 776  * out to itself.
 777  */
 778 static void copy_conditional(felem out, const felem in, limb icopy)
 779 {
 780     unsigned i;
 781     /*
 782      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 783      */
 784     const limb copy = -icopy;
 785     for (i = 0; i < 4; ++i) {
 786         const limb tmp = copy & (in[i] ^ out[i]);
 787         out[i] ^= tmp;
 788     }
 789 }
 790
 791 /******************************************************************************/
 792 /*-
 793  *                       ELLIPTIC CURVE POINT OPERATIONS
 794  *
 795  * Points are represented in Jacobian projective coordinates:
 796  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 797  * or to the point at infinity if Z == 0.
 798  *
 799  */
 800
 801 /*-
 802  * Double an elliptic curve point:
 803  * (X', Y', Z') = 2 * (X, Y, Z), where
 804  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 805  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 806  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 807  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 808  * while x_out == y_in is not (maybe this works, but it's not tested).
 809  */
 810 static void
 811 point_double(felem x_out, felem y_out, felem z_out,
 812              const felem x_in, const felem y_in, const felem z_in)
 813 {
 814     widefelem tmp, tmp2;
 815     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 816
 817     felem_assign(ftmp, x_in);
 818     felem_assign(ftmp2, x_in);
 819
 820     /* delta = z^2 */
 821     felem_square(tmp, z_in);
 822     felem_reduce(delta, tmp);
 823
 824     /* gamma = y^2 */
 825     felem_square(tmp, y_in);
 826     felem_reduce(gamma, tmp);
 827
 828     /* beta = x*gamma */
 829     felem_mul(tmp, x_in, gamma);
 830     felem_reduce(beta, tmp);
 831
 832     /* alpha = 3*(x-delta)*(x+delta) */
 833     felem_diff(ftmp, delta);
 834     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 835     felem_sum(ftmp2, delta);
 836     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 837     felem_scalar(ftmp2, 3);
 838     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 839     felem_mul(tmp, ftmp, ftmp2);
 840     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 841     felem_reduce(alpha, tmp);
 842
 843     /* x' = alpha^2 - 8*beta */
 844     felem_square(tmp, alpha);
 845     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 846     felem_assign(ftmp, beta);
 847     felem_scalar(ftmp, 8);
 848     /* ftmp[i] < 8 * 2^57 = 2^60 */
 849     felem_diff_128_64(tmp, ftmp);
 850     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 851     felem_reduce(x_out, tmp);
 852
 853     /* z' = (y + z)^2 - gamma - delta */
 854     felem_sum(delta, gamma);
 855     /* delta[i] < 2^57 + 2^57 = 2^58 */
 856     felem_assign(ftmp, y_in);
 857     felem_sum(ftmp, z_in);
 858     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 859     felem_square(tmp, ftmp);
 860     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 861     felem_diff_128_64(tmp, delta);
 862     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 863     felem_reduce(z_out, tmp);
 864
 865     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 866     felem_scalar(beta, 4);
 867     /* beta[i] < 4 * 2^57 = 2^59 */
 868     felem_diff(beta, x_out);
 869     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 870     felem_mul(tmp, alpha, beta);
 871     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 872     felem_square(tmp2, gamma);
 873     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 874     widefelem_scalar(tmp2, 8);
 875     /* tmp2[i] < 8 * 2^116 = 2^119 */
 876     widefelem_diff(tmp, tmp2);
 877     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 878     felem_reduce(y_out, tmp);
 879 }
 880
 881 /*-
 882  * Add two elliptic curve points:
 883  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 884  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 885  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 886  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 887  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 888  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 889  *
 890  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 891  */
 892
 893 /*
 894  * This function is not entirely constant-time: it includes a branch for
 895  * checking whether the two input points are equal, (while not equal to the
 896  * point at infinity). This case never happens during single point
 897  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 898  */
 899 static void point_add(felem x3, felem y3, felem z3,
 900                       const felem x1, const felem y1, const felem z1,
 901                       const int mixed, const felem x2, const felem y2,
 902                       const felem z2)
 903 {
 904     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 905     widefelem tmp, tmp2;
 906     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 907
 908     if (!mixed) {
 909         /* ftmp2 = z2^2 */
 910         felem_square(tmp, z2);
 911         felem_reduce(ftmp2, tmp);
 912
 913         /* ftmp4 = z2^3 */
 914         felem_mul(tmp, ftmp2, z2);
 915         felem_reduce(ftmp4, tmp);
 916
 917         /* ftmp4 = z2^3*y1 */
 918         felem_mul(tmp2, ftmp4, y1);
 919         felem_reduce(ftmp4, tmp2);
 920
 921         /* ftmp2 = z2^2*x1 */
 922         felem_mul(tmp2, ftmp2, x1);
 923         felem_reduce(ftmp2, tmp2);
 924     } else {
 925         /*
 926          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 927          */
 928
 929         /* ftmp4 = z2^3*y1 */
 930         felem_assign(ftmp4, y1);
 931
 932         /* ftmp2 = z2^2*x1 */
 933         felem_assign(ftmp2, x1);
 934     }
 935
 936     /* ftmp = z1^2 */
 937     felem_square(tmp, z1);
 938     felem_reduce(ftmp, tmp);
 939
 940     /* ftmp3 = z1^3 */
 941     felem_mul(tmp, ftmp, z1);
 942     felem_reduce(ftmp3, tmp);
 943
 944     /* tmp = z1^3*y2 */
 945     felem_mul(tmp, ftmp3, y2);
 946     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 947
 948     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 949     felem_diff_128_64(tmp, ftmp4);
 950     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 951     felem_reduce(ftmp3, tmp);
 952
 953     /* tmp = z1^2*x2 */
 954     felem_mul(tmp, ftmp, x2);
 955     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 956
 957     /* ftmp = z1^2*x2 - z2^2*x1 */
 958     felem_diff_128_64(tmp, ftmp2);
 959     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 960     felem_reduce(ftmp, tmp);
 961
 962     /*
 963      * the formulae are incorrect if the points are equal so we check for
 964      * this and do doubling if this happens
 965      */
 966     x_equal = felem_is_zero(ftmp);
 967     y_equal = felem_is_zero(ftmp3);
 968     z1_is_zero = felem_is_zero(z1);
 969     z2_is_zero = felem_is_zero(z2);
 970     /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
 971     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
 972         point_double(x3, y3, z3, x1, y1, z1);
 973         return;
 974     }
 975
 976     /* ftmp5 = z1*z2 */
 977     if (!mixed) {
 978         felem_mul(tmp, z1, z2);
 979         felem_reduce(ftmp5, tmp);
 980     } else {
 981         /* special case z2 = 0 is handled later */
 982         felem_assign(ftmp5, z1);
 983     }
 984
 985     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
 986     felem_mul(tmp, ftmp, ftmp5);
 987     felem_reduce(z_out, tmp);
 988
 989     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
 990     felem_assign(ftmp5, ftmp);
 991     felem_square(tmp, ftmp);
 992     felem_reduce(ftmp, tmp);
 993
 994     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
 995     felem_mul(tmp, ftmp, ftmp5);
 996     felem_reduce(ftmp5, tmp);
 997
 998     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
 999     felem_mul(tmp, ftmp2, ftmp);
1000     felem_reduce(ftmp2, tmp);
1001
1002     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1003     felem_mul(tmp, ftmp4, ftmp5);
1004     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1005
1006     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1007     felem_square(tmp2, ftmp3);
1008     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1009
1010     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1011     felem_diff_128_64(tmp2, ftmp5);
1012     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1013
1014     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1015     felem_assign(ftmp5, ftmp2);
1016     felem_scalar(ftmp5, 2);
1017     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1018
1019     /*-
1020      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1021      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1022      */
1023     felem_diff_128_64(tmp2, ftmp5);
1024     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1025     felem_reduce(x_out, tmp2);
1026
1027     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1028     felem_diff(ftmp2, x_out);
1029     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1030
1031     /*
1032      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1033      */
1034     felem_mul(tmp2, ftmp3, ftmp2);
1035     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1036
1037     /*-
1038      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1039      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1040      */
1041     widefelem_diff(tmp2, tmp);
1042     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1043     felem_reduce(y_out, tmp2);
1044
1045     /*
1046      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1047      * the point at infinity, so we need to check for this separately
1048      */
1049
1050     /*
1051      * if point 1 is at infinity, copy point 2 to output, and vice versa
1052      */
1053     copy_conditional(x_out, x2, z1_is_zero);
1054     copy_conditional(x_out, x1, z2_is_zero);
1055     copy_conditional(y_out, y2, z1_is_zero);
1056     copy_conditional(y_out, y1, z2_is_zero);
1057     copy_conditional(z_out, z2, z1_is_zero);
1058     copy_conditional(z_out, z1, z2_is_zero);
1059     felem_assign(x3, x_out);
1060     felem_assign(y3, y_out);
1061     felem_assign(z3, z_out);
1062 }
1063
1064 /*
1065  * select_point selects the |idx|th point from a precomputation table and
1066  * copies it to out.
1067  * The pre_comp array argument should be size of |size| argument
1068  */
1069 static void select_point(const u64 idx, unsigned int size,
1070                          const felem pre_comp[][3], felem out[3])
1071 {
1072     unsigned i, j;
1073     limb *outlimbs = &out[0][0];
1074
1075     memset(out, 0, sizeof(*out) * 3);
1076     for (i = 0; i < size; i++) {
1077         const limb *inlimbs = &pre_comp[i][0][0];
1078         u64 mask = i ^ idx;
1079         mask |= mask >> 4;
1080         mask |= mask >> 2;
1081         mask |= mask >> 1;
1082         mask &= 1;
1083         mask--;
1084         for (j = 0; j < 4 * 3; j++)
1085             outlimbs[j] |= inlimbs[j] & mask;
1086     }
1087 }
1088
1089 /* get_bit returns the |i|th bit in |in| */
1090 static char get_bit(const felem_bytearray in, unsigned i)
1091 {
1092     if (i >= 224)
1093         return 0;
1094     return (in[i >> 3] >> (i & 7)) & 1;
1095 }
1096
1097 /*
1098  * Interleaved point multiplication using precomputed point multiples: The
1099  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1100  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1101  * generator, using certain (large) precomputed multiples in g_pre_comp.
1102  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1103  */
1104 static void batch_mul(felem x_out, felem y_out, felem z_out,
1105                       const felem_bytearray scalars[],
1106                       const unsigned num_points, const u8 *g_scalar,
1107                       const int mixed, const felem pre_comp[][17][3],
1108                       const felem g_pre_comp[2][16][3])
1109 {
1110     int i, skip;
1111     unsigned num;
1112     unsigned gen_mul = (g_scalar != NULL);
1113     felem nq[3], tmp[4];
1114     u64 bits;
1115     u8 sign, digit;
1116
1117     /* set nq to the point at infinity */
1118     memset(nq, 0, sizeof(nq));
1119
1120     /*
1121      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1122      * of the generator (two in each of the last 28 rounds) and additions of
1123      * other points multiples (every 5th round).
1124      */
1125     skip = 1;                   /* save two point operations in the first
1126                                  * round */
1127     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1128         /* double */
1129         if (!skip)
1130             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1131
1132         /* add multiples of the generator */
1133         if (gen_mul && (i <= 27)) {
1134             /* first, look 28 bits upwards */
1135             bits = get_bit(g_scalar, i + 196) << 3;
1136             bits |= get_bit(g_scalar, i + 140) << 2;
1137             bits |= get_bit(g_scalar, i + 84) << 1;
1138             bits |= get_bit(g_scalar, i + 28);
1139             /* select the point to add, in constant time */
1140             select_point(bits, 16, g_pre_comp[1], tmp);
1141
1142             if (!skip) {
1143                 /* value 1 below is argument for "mixed" */
1144                 point_add(nq[0], nq[1], nq[2],
1145                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1146             } else {
1147                 memcpy(nq, tmp, 3 * sizeof(felem));
1148                 skip = 0;
1149             }
1150
1151             /* second, look at the current position */
1152             bits = get_bit(g_scalar, i + 168) << 3;
1153             bits |= get_bit(g_scalar, i + 112) << 2;
1154             bits |= get_bit(g_scalar, i + 56) << 1;
1155             bits |= get_bit(g_scalar, i);
1156             /* select the point to add, in constant time */
1157             select_point(bits, 16, g_pre_comp[0], tmp);
1158             point_add(nq[0], nq[1], nq[2],
1159                       nq[0], nq[1], nq[2],
1160                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1161         }
1162
1163         /* do other additions every 5 doublings */
1164         if (num_points && (i % 5 == 0)) {
1165             /* loop over all scalars */
1166             for (num = 0; num < num_points; ++num) {
1167                 bits = get_bit(scalars[num], i + 4) << 5;
1168                 bits |= get_bit(scalars[num], i + 3) << 4;
1169                 bits |= get_bit(scalars[num], i + 2) << 3;
1170                 bits |= get_bit(scalars[num], i + 1) << 2;
1171                 bits |= get_bit(scalars[num], i) << 1;
1172                 bits |= get_bit(scalars[num], i - 1);
1173                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1174
1175                 /* select the point to add or subtract */
1176                 select_point(digit, 17, pre_comp[num], tmp);
1177                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1178                                             * point */
1179                 copy_conditional(tmp[1], tmp[3], sign);
1180
1181                 if (!skip) {
1182                     point_add(nq[0], nq[1], nq[2],
1183                               nq[0], nq[1], nq[2],
1184                               mixed, tmp[0], tmp[1], tmp[2]);
1185                 } else {
1186                     memcpy(nq, tmp, 3 * sizeof(felem));
1187                     skip = 0;
1188                 }
1189             }
1190         }
1191     }
1192     felem_assign(x_out, nq[0]);
1193     felem_assign(y_out, nq[1]);
1194     felem_assign(z_out, nq[2]);
1195 }
1196
1197 /******************************************************************************/
1198 /*
1199  * FUNCTIONS TO MANAGE PRECOMPUTATION
1200  */
1201
1202 static NISTP224_PRE_COMP *nistp224_pre_comp_new()
1203 {
1204     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1205
1206     if (!ret) {
1207         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1208         return ret;
1209     }
1210     ret->references = 1;
1211     return ret;
1212 }
1213
1214 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1215 {
1216     if (p != NULL)
1217         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1218     return p;
1219 }
1220
1221 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1222 {
1223     if (p == NULL
1224         || CRYPTO_add(&p->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1225         return;
1226     OPENSSL_free(p);
1227 }
1228
1229 /******************************************************************************/
1230 /*
1231  * OPENSSL EC_METHOD FUNCTIONS
1232  */
1233
1234 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1235 {
1236     int ret;
1237     ret = ec_GFp_simple_group_init(group);
1238     group->a_is_minus3 = 1;
1239     return ret;
1240 }
1241
1242 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1243                                     const BIGNUM *a, const BIGNUM *b,
1244                                     BN_CTX *ctx)
1245 {
1246     int ret = 0;
1247     BN_CTX *new_ctx = NULL;
1248     BIGNUM *curve_p, *curve_a, *curve_b;
1249
1250     if (ctx == NULL)
1251         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1252             return 0;
1253     BN_CTX_start(ctx);
1254     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1255         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1256         ((curve_b = BN_CTX_get(ctx)) == NULL))
1257         goto err;
1258     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1259     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1260     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1261     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1262         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1263               EC_R_WRONG_CURVE_PARAMETERS);
1264         goto err;
1265     }
1266     group->field_mod_func = BN_nist_mod_224;
1267     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1268  err:
1269     BN_CTX_end(ctx);
1270     BN_CTX_free(new_ctx);
1271     return ret;
1272 }
1273
1274 /*
1275  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1276  * (X/Z^2, Y/Z^3)
1277  */
1278 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1279                                                  const EC_POINT *point,
1280                                                  BIGNUM *x, BIGNUM *y,
1281                                                  BN_CTX *ctx)
1282 {
1283     felem z1, z2, x_in, y_in, x_out, y_out;
1284     widefelem tmp;
1285
1286     if (EC_POINT_is_at_infinity(group, point)) {
1287         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1288               EC_R_POINT_AT_INFINITY);
1289         return 0;
1290     }
1291     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1292         (!BN_to_felem(z1, point->Z)))
1293         return 0;
1294     felem_inv(z2, z1);
1295     felem_square(tmp, z2);
1296     felem_reduce(z1, tmp);
1297     felem_mul(tmp, x_in, z1);
1298     felem_reduce(x_in, tmp);
1299     felem_contract(x_out, x_in);
1300     if (x != NULL) {
1301         if (!felem_to_BN(x, x_out)) {
1302             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1303                   ERR_R_BN_LIB);
1304             return 0;
1305         }
1306     }
1307     felem_mul(tmp, z1, z2);
1308     felem_reduce(z1, tmp);
1309     felem_mul(tmp, y_in, z1);
1310     felem_reduce(y_in, tmp);
1311     felem_contract(y_out, y_in);
1312     if (y != NULL) {
1313         if (!felem_to_BN(y, y_out)) {
1314             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1315                   ERR_R_BN_LIB);
1316             return 0;
1317         }
1318     }
1319     return 1;
1320 }
1321
1322 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1323                                felem tmp_felems[ /* num+1 */ ])
1324 {
1325     /*
1326      * Runs in constant time, unless an input is the point at infinity (which
1327      * normally shouldn't happen).
1328      */
1329     ec_GFp_nistp_points_make_affine_internal(num,
1330                                              points,
1331                                              sizeof(felem),
1332                                              tmp_felems,
1333                                              (void (*)(void *))felem_one,
1334                                              (int (*)(const void *))
1335                                              felem_is_zero_int,
1336                                              (void (*)(void *, const void *))
1337                                              felem_assign,
1338                                              (void (*)(void *, const void *))
1339                                              felem_square_reduce, (void (*)
1340                                                                    (void *,
1341                                                                     const void
1342                                                                     *,
1343                                                                     const void
1344                                                                     *))
1345                                              felem_mul_reduce,
1346                                              (void (*)(void *, const void *))
1347                                              felem_inv,
1348                                              (void (*)(void *, const void *))
1349                                              felem_contract);
1350 }
1351
1352 /*
1353  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1354  * values Result is stored in r (r can equal one of the inputs).
1355  */
1356 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1357                                const BIGNUM *scalar, size_t num,
1358                                const EC_POINT *points[],
1359                                const BIGNUM *scalars[], BN_CTX *ctx)
1360 {
1361     int ret = 0;
1362     int j;
1363     unsigned i;
1364     int mixed = 0;
1365     BN_CTX *new_ctx = NULL;
1366     BIGNUM *x, *y, *z, *tmp_scalar;
1367     felem_bytearray g_secret;
1368     felem_bytearray *secrets = NULL;
1369     felem (*pre_comp)[17][3] = NULL;
1370     felem *tmp_felems = NULL;
1371     felem_bytearray tmp;
1372     unsigned num_bytes;
1373     int have_pre_comp = 0;
1374     size_t num_points = num;
1375     felem x_in, y_in, z_in, x_out, y_out, z_out;
1376     NISTP224_PRE_COMP *pre = NULL;
1377     const felem(*g_pre_comp)[16][3] = NULL;
1378     EC_POINT *generator = NULL;
1379     const EC_POINT *p = NULL;
1380     const BIGNUM *p_scalar = NULL;
1381
1382     if (ctx == NULL)
1383         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1384             return 0;
1385     BN_CTX_start(ctx);
1386     if (((x = BN_CTX_get(ctx)) == NULL) ||
1387         ((y = BN_CTX_get(ctx)) == NULL) ||
1388         ((z = BN_CTX_get(ctx)) == NULL) ||
1389         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1390         goto err;
1391
1392     if (scalar != NULL) {
1393         pre = group->pre_comp.nistp224;
1394         if (pre)
1395             /* we have precomputation, try to use it */
1396             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1397         else
1398             /* try to use the standard precomputation */
1399             g_pre_comp = &gmul[0];
1400         generator = EC_POINT_new(group);
1401         if (generator == NULL)
1402             goto err;
1403         /* get the generator from precomputation */
1404         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1405             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1406             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1407             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1408             goto err;
1409         }
1410         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1411                                                       generator, x, y, z,
1412                                                       ctx))
1413             goto err;
1414         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1415             /* precomputation matches generator */
1416             have_pre_comp = 1;
1417         else
1418             /*
1419              * we don't have valid precomputation: treat the generator as a
1420              * random point
1421              */
1422             num_points = num_points + 1;
1423     }
1424
1425     if (num_points > 0) {
1426         if (num_points >= 3) {
1427             /*
1428              * unless we precompute multiples for just one or two points,
1429              * converting those into affine form is time well spent
1430              */
1431             mixed = 1;
1432         }
1433         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1434         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1435         if (mixed)
1436             tmp_felems =
1437                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1438         if ((secrets == NULL) || (pre_comp == NULL)
1439             || (mixed && (tmp_felems == NULL))) {
1440             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1441             goto err;
1442         }
1443
1444         /*
1445          * we treat NULL scalars as 0, and NULL points as points at infinity,
1446          * i.e., they contribute nothing to the linear combination
1447          */
1448         for (i = 0; i < num_points; ++i) {
1449             if (i == num)
1450                 /* the generator */
1451             {
1452                 p = EC_GROUP_get0_generator(group);
1453                 p_scalar = scalar;
1454             } else
1455                 /* the i^th point */
1456             {
1457                 p = points[i];
1458                 p_scalar = scalars[i];
1459             }
1460             if ((p_scalar != NULL) && (p != NULL)) {
1461                 /* reduce scalar to 0 <= scalar < 2^224 */
1462                 if ((BN_num_bits(p_scalar) > 224)
1463                     || (BN_is_negative(p_scalar))) {
1464                     /*
1465                      * this is an unusual input, and we don't guarantee
1466                      * constant-timeness
1467                      */
1468                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1469                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1470                         goto err;
1471                     }
1472                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1473                 } else
1474                     num_bytes = BN_bn2bin(p_scalar, tmp);
1475                 flip_endian(secrets[i], tmp, num_bytes);
1476                 /* precompute multiples */
1477                 if ((!BN_to_felem(x_out, p->X)) ||
1478                     (!BN_to_felem(y_out, p->Y)) ||
1479                     (!BN_to_felem(z_out, p->Z)))
1480                     goto err;
1481                 felem_assign(pre_comp[i][1][0], x_out);
1482                 felem_assign(pre_comp[i][1][1], y_out);
1483                 felem_assign(pre_comp[i][1][2], z_out);
1484                 for (j = 2; j <= 16; ++j) {
1485                     if (j & 1) {
1486                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1487                                   pre_comp[i][j][2], pre_comp[i][1][0],
1488                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1489                                   pre_comp[i][j - 1][0],
1490                                   pre_comp[i][j - 1][1],
1491                                   pre_comp[i][j - 1][2]);
1492                     } else {
1493                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1494                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1495                                      pre_comp[i][j / 2][1],
1496                                      pre_comp[i][j / 2][2]);
1497                     }
1498                 }
1499             }
1500         }
1501         if (mixed)
1502             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1503     }
1504
1505     /* the scalar for the generator */
1506     if ((scalar != NULL) && (have_pre_comp)) {
1507         memset(g_secret, 0, sizeof(g_secret));
1508         /* reduce scalar to 0 <= scalar < 2^224 */
1509         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1510             /*
1511              * this is an unusual input, and we don't guarantee
1512              * constant-timeness
1513              */
1514             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1515                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1516                 goto err;
1517             }
1518             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1519         } else
1520             num_bytes = BN_bn2bin(scalar, tmp);
1521         flip_endian(g_secret, tmp, num_bytes);
1522         /* do the multiplication with generator precomputation */
1523         batch_mul(x_out, y_out, z_out,
1524                   (const felem_bytearray(*))secrets, num_points,
1525                   g_secret,
1526                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1527     } else
1528         /* do the multiplication without generator precomputation */
1529         batch_mul(x_out, y_out, z_out,
1530                   (const felem_bytearray(*))secrets, num_points,
1531                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1532     /* reduce the output to its unique minimal representation */
1533     felem_contract(x_in, x_out);
1534     felem_contract(y_in, y_out);
1535     felem_contract(z_in, z_out);
1536     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1537         (!felem_to_BN(z, z_in))) {
1538         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1539         goto err;
1540     }
1541     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1542
1543  err:
1544     BN_CTX_end(ctx);
1545     EC_POINT_free(generator);
1546     BN_CTX_free(new_ctx);
1547     OPENSSL_free(secrets);
1548     OPENSSL_free(pre_comp);
1549     OPENSSL_free(tmp_felems);
1550     return ret;
1551 }
1552
1553 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1554 {
1555     int ret = 0;
1556     NISTP224_PRE_COMP *pre = NULL;
1557     int i, j;
1558     BN_CTX *new_ctx = NULL;
1559     BIGNUM *x, *y;
1560     EC_POINT *generator = NULL;
1561     felem tmp_felems[32];
1562
1563     /* throw away old precomputation */
1564     EC_pre_comp_free(group);
1565     if (ctx == NULL)
1566         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1567             return 0;
1568     BN_CTX_start(ctx);
1569     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
1570         goto err;
1571     /* get the generator */
1572     if (group->generator == NULL)
1573         goto err;
1574     generator = EC_POINT_new(group);
1575     if (generator == NULL)
1576         goto err;
1577     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1578     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1579     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1580         goto err;
1581     if ((pre = nistp224_pre_comp_new()) == NULL)
1582         goto err;
1583     /*
1584      * if the generator is the standard one, use built-in precomputation
1585      */
1586     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1587         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1588         goto done;
1589     }
1590     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1591         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1592         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1593         goto err;
1594     /*
1595      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1596      * 2^140*G, 2^196*G for the second one
1597      */
1598     for (i = 1; i <= 8; i <<= 1) {
1599         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1600                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1601                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1602         for (j = 0; j < 27; ++j) {
1603             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1604                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1605                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1606         }
1607         if (i == 8)
1608             break;
1609         point_double(pre->g_pre_comp[0][2 * i][0],
1610                      pre->g_pre_comp[0][2 * i][1],
1611                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1612                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1613         for (j = 0; j < 27; ++j) {
1614             point_double(pre->g_pre_comp[0][2 * i][0],
1615                          pre->g_pre_comp[0][2 * i][1],
1616                          pre->g_pre_comp[0][2 * i][2],
1617                          pre->g_pre_comp[0][2 * i][0],
1618                          pre->g_pre_comp[0][2 * i][1],
1619                          pre->g_pre_comp[0][2 * i][2]);
1620         }
1621     }
1622     for (i = 0; i < 2; i++) {
1623         /* g_pre_comp[i][0] is the point at infinity */
1624         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1625         /* the remaining multiples */
1626         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1627         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1628                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1629                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1630                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1631                   pre->g_pre_comp[i][2][2]);
1632         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1633         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1634                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1635                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1636                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1637                   pre->g_pre_comp[i][2][2]);
1638         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1639         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1640                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1641                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1642                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1643                   pre->g_pre_comp[i][4][2]);
1644         /*
1645          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1646          */
1647         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1648                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1649                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1650                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1651                   pre->g_pre_comp[i][2][2]);
1652         for (j = 1; j < 8; ++j) {
1653             /* odd multiples: add G resp. 2^28*G */
1654             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1655                       pre->g_pre_comp[i][2 * j + 1][1],
1656                       pre->g_pre_comp[i][2 * j + 1][2],
1657                       pre->g_pre_comp[i][2 * j][0],
1658                       pre->g_pre_comp[i][2 * j][1],
1659                       pre->g_pre_comp[i][2 * j][2], 0,
1660                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1661                       pre->g_pre_comp[i][1][2]);
1662         }
1663     }
1664     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1665
1666  done:
1667     SETPRECOMP(group, nistp224, pre);
1668     pre = NULL;
1669     ret = 1;
1670  err:
1671     BN_CTX_end(ctx);
1672     EC_POINT_free(generator);
1673     BN_CTX_free(new_ctx);
1674     EC_nistp224_pre_comp_free(pre);
1675     return ret;
1676 }
1677
1678 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1679 {
1680     return HAVEPRECOMP(group, nistp224);
1681 }
1682
1683 #endif