crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/e_os2.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
/*
 * This file needs a 128-bit unsigned integer type for the wide limbs of a
 * 'largefelem'. The check below accepts only compilers that advertise a
 * 16-byte __int128 and stops the build otherwise.
 */
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Your compiler doesn't appear to support 128-bit integer types"
# endif

/* Short aliases for the fixed-width types used throughout this file. */
typedef uint8_t u8;
typedef uint64_t u64;

/*
 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
 * element of this field into 66 bytes where the most significant byte
 * contains only a single bit (521 = 65 * 8 + 1). We call this an
 * felem_bytearray.
 */

typedef u8 felem_bytearray[66];
  61
/*
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian.
 *
 * The five 66-byte rows are, in order: the field prime p = 2^521 - 1, the
 * curve coefficient a (= p - 3, i.e. -3 mod p), the curve coefficient b, and
 * the x and y values labelled below (the coordinates of the standard base
 * point in the referenced FIPS section).
 */
static const felem_bytearray nistp521_curve_params[5] = {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
};
 113
/*-
 * The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with nine values. These values are either 64 or
 * 128 bits and the field element represented is:
 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 * 58 bits apart, but are greater than 58 bits in length, the most significant
 * bits of each limb overlap with the least significant bits of the next.
 *
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 * 'largefelem'.
 */

/* Number of limbs in both the 64-bit and the 128-bit representation. */
# define NLIMBS 9

typedef uint64_t limb;
typedef limb felem[NLIMBS];
typedef uint128_t largefelem[NLIMBS];

/*
 * Masks selecting the low 57 and low 58 bits of a limb. Limbs 0..7 each
 * cover 58 bits of the value; limb 8 covers the remaining 521 - 8*58 = 57.
 */
static const limb bottom57bits = 0x1ffffffffffffff;
static const limb bottom58bits = 0x3ffffffffffffff;
 136
 137 /*
 138  * bin66_to_felem takes a little-endian byte array and converts it into felem
 139  * form. This assumes that the CPU is little-endian.
 140  */
 141 static void bin66_to_felem(felem out, const u8 in[66])
 142 {
 143     out[0] = (*((limb *) & in[0])) & bottom58bits;
 144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 147     out[4] = (*((limb *) & in[29])) & bottom58bits;
 148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 151     out[8] = (*((limb *) & in[58])) & bottom57bits;
 152 }
 153
 154 /*
 155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 156  * array. This assumes that the CPU is little-endian.
 157  */
 158 static void felem_to_bin66(u8 out[66], const felem in)
 159 {
 160     memset(out, 0, 66);
 161     (*((limb *) & out[0])) = in[0];
 162     (*((limb *) & out[7])) |= in[1] << 2;
 163     (*((limb *) & out[14])) |= in[2] << 4;
 164     (*((limb *) & out[21])) |= in[3] << 6;
 165     (*((limb *) & out[29])) = in[4];
 166     (*((limb *) & out[36])) |= in[5] << 2;
 167     (*((limb *) & out[43])) |= in[6] << 4;
 168     (*((limb *) & out[50])) |= in[7] << 6;
 169     (*((limb *) & out[58])) = in[8];
 170 }
 171
 172 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 173 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 174 {
 175     unsigned i;
 176     for (i = 0; i < len; ++i)
 177         out[i] = in[len - 1 - i];
 178 }
 179
 180 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 181 static int BN_to_felem(felem out, const BIGNUM *bn)
 182 {
 183     felem_bytearray b_in;
 184     felem_bytearray b_out;
 185     unsigned num_bytes;
 186
 187     /* BN_bn2bin eats leading zeroes */
 188     memset(b_out, 0, sizeof(b_out));
 189     num_bytes = BN_num_bytes(bn);
 190     if (num_bytes > sizeof(b_out)) {
 191         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 192         return 0;
 193     }
 194     if (BN_is_negative(bn)) {
 195         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 196         return 0;
 197     }
 198     num_bytes = BN_bn2bin(bn, b_in);
 199     flip_endian(b_out, b_in, num_bytes);
 200     bin66_to_felem(out, b_out);
 201     return 1;
 202 }
 203
 204 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 205 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 206 {
 207     felem_bytearray b_in, b_out;
 208     felem_to_bin66(b_in, in);
 209     flip_endian(b_out, b_in, sizeof(b_out));
 210     return BN_bin2bn(b_out, sizeof(b_out), out);
 211 }
 212
 213 /*-
 214  * Field operations
 215  * ----------------
 216  */
 217
 218 static void felem_one(felem out)
 219 {
 220     out[0] = 1;
 221     out[1] = 0;
 222     out[2] = 0;
 223     out[3] = 0;
 224     out[4] = 0;
 225     out[5] = 0;
 226     out[6] = 0;
 227     out[7] = 0;
 228     out[8] = 0;
 229 }
 230
 231 static void felem_assign(felem out, const felem in)
 232 {
 233     out[0] = in[0];
 234     out[1] = in[1];
 235     out[2] = in[2];
 236     out[3] = in[3];
 237     out[4] = in[4];
 238     out[5] = in[5];
 239     out[6] = in[6];
 240     out[7] = in[7];
 241     out[8] = in[8];
 242 }
 243
 244 /* felem_sum64 sets out = out + in. */
 245 static void felem_sum64(felem out, const felem in)
 246 {
 247     out[0] += in[0];
 248     out[1] += in[1];
 249     out[2] += in[2];
 250     out[3] += in[3];
 251     out[4] += in[4];
 252     out[5] += in[5];
 253     out[6] += in[6];
 254     out[7] += in[7];
 255     out[8] += in[8];
 256 }
 257
 258 /* felem_scalar sets out = in * scalar */
 259 static void felem_scalar(felem out, const felem in, limb scalar)
 260 {
 261     out[0] = in[0] * scalar;
 262     out[1] = in[1] * scalar;
 263     out[2] = in[2] * scalar;
 264     out[3] = in[3] * scalar;
 265     out[4] = in[4] * scalar;
 266     out[5] = in[5] * scalar;
 267     out[6] = in[6] * scalar;
 268     out[7] = in[7] * scalar;
 269     out[8] = in[8] * scalar;
 270 }
 271
 272 /* felem_scalar64 sets out = out * scalar */
 273 static void felem_scalar64(felem out, limb scalar)
 274 {
 275     out[0] *= scalar;
 276     out[1] *= scalar;
 277     out[2] *= scalar;
 278     out[3] *= scalar;
 279     out[4] *= scalar;
 280     out[5] *= scalar;
 281     out[6] *= scalar;
 282     out[7] *= scalar;
 283     out[8] *= scalar;
 284 }
 285
 286 /* felem_scalar128 sets out = out * scalar */
 287 static void felem_scalar128(largefelem out, limb scalar)
 288 {
 289     out[0] *= scalar;
 290     out[1] *= scalar;
 291     out[2] *= scalar;
 292     out[3] *= scalar;
 293     out[4] *= scalar;
 294     out[5] *= scalar;
 295     out[6] *= scalar;
 296     out[7] *= scalar;
 297     out[8] *= scalar;
 298 }
 299
 300 /*-
 301  * felem_neg sets |out| to |-in|
 302  * On entry:
 303  *   in[i] < 2^59 + 2^14
 304  * On exit:
 305  *   out[i] < 2^62
 306  */
 307 static void felem_neg(felem out, const felem in)
 308 {
 309     /* In order to prevent underflow, we subtract from 0 mod p. */
 310     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 311     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 312
 313     out[0] = two62m3 - in[0];
 314     out[1] = two62m2 - in[1];
 315     out[2] = two62m2 - in[2];
 316     out[3] = two62m2 - in[3];
 317     out[4] = two62m2 - in[4];
 318     out[5] = two62m2 - in[5];
 319     out[6] = two62m2 - in[6];
 320     out[7] = two62m2 - in[7];
 321     out[8] = two62m2 - in[8];
 322 }
 323
 324 /*-
 325  * felem_diff64 subtracts |in| from |out|
 326  * On entry:
 327  *   in[i] < 2^59 + 2^14
 328  * On exit:
 329  *   out[i] < out[i] + 2^62
 330  */
 331 static void felem_diff64(felem out, const felem in)
 332 {
 333     /*
 334      * In order to prevent underflow, we add 0 mod p before subtracting.
 335      */
 336     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 337     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 338
 339     out[0] += two62m3 - in[0];
 340     out[1] += two62m2 - in[1];
 341     out[2] += two62m2 - in[2];
 342     out[3] += two62m2 - in[3];
 343     out[4] += two62m2 - in[4];
 344     out[5] += two62m2 - in[5];
 345     out[6] += two62m2 - in[6];
 346     out[7] += two62m2 - in[7];
 347     out[8] += two62m2 - in[8];
 348 }
 349
 350 /*-
 351  * felem_diff_128_64 subtracts |in| from |out|
 352  * On entry:
 353  *   in[i] < 2^62 + 2^17
 354  * On exit:
 355  *   out[i] < out[i] + 2^63
 356  */
 357 static void felem_diff_128_64(largefelem out, const felem in)
 358 {
 359     /*
 360      * In order to prevent underflow, we add 64p mod p (which is equivalent
 361      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 362      * digit number with all bits set to 1. See "The representation of field
 363      * elements" comment above for a description of how limbs are used to
 364      * represent a number. 64p is represented with 8 limbs containing a number
 365      * with 58 bits set and one limb with a number with 57 bits set.
 366      */
 367     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 368     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 369
 370     out[0] += two63m6 - in[0];
 371     out[1] += two63m5 - in[1];
 372     out[2] += two63m5 - in[2];
 373     out[3] += two63m5 - in[3];
 374     out[4] += two63m5 - in[4];
 375     out[5] += two63m5 - in[5];
 376     out[6] += two63m5 - in[6];
 377     out[7] += two63m5 - in[7];
 378     out[8] += two63m5 - in[8];
 379 }
 380
 381 /*-
 382  * felem_diff_128_64 subtracts |in| from |out|
 383  * On entry:
 384  *   in[i] < 2^126
 385  * On exit:
 386  *   out[i] < out[i] + 2^127 - 2^69
 387  */
 388 static void felem_diff128(largefelem out, const largefelem in)
 389 {
 390     /*
 391      * In order to prevent underflow, we add 0 mod p before subtracting.
 392      */
 393     static const uint128_t two127m70 =
 394         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 395     static const uint128_t two127m69 =
 396         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 397
 398     out[0] += (two127m70 - in[0]);
 399     out[1] += (two127m69 - in[1]);
 400     out[2] += (two127m69 - in[2]);
 401     out[3] += (two127m69 - in[3]);
 402     out[4] += (two127m69 - in[4]);
 403     out[5] += (two127m69 - in[5]);
 404     out[6] += (two127m69 - in[6]);
 405     out[7] += (two127m69 - in[7]);
 406     out[8] += (two127m69 - in[8]);
 407 }
 408
 409 /*-
 410  * felem_square sets |out| = |in|^2
 411  * On entry:
 412  *   in[i] < 2^62
 413  * On exit:
 414  *   out[i] < 17 * max(in[i]) * max(in[i])
 415  */
 416 static void felem_square(largefelem out, const felem in)
 417 {
 418     felem inx2, inx4;
 419     felem_scalar(inx2, in, 2);
 420     felem_scalar(inx4, in, 4);
 421
 422     /*-
 423      * We have many cases were we want to do
 424      *   in[x] * in[y] +
 425      *   in[y] * in[x]
 426      * This is obviously just
 427      *   2 * in[x] * in[y]
 428      * However, rather than do the doubling on the 128 bit result, we
 429      * double one of the inputs to the multiplication by reading from
 430      * |inx2|
 431      */
 432
 433     out[0] = ((uint128_t) in[0]) * in[0];
 434     out[1] = ((uint128_t) in[0]) * inx2[1];
 435     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 436     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 437     out[4] = ((uint128_t) in[0]) * inx2[4] +
 438              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 439     out[5] = ((uint128_t) in[0]) * inx2[5] +
 440              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 441     out[6] = ((uint128_t) in[0]) * inx2[6] +
 442              ((uint128_t) in[1]) * inx2[5] +
 443              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 444     out[7] = ((uint128_t) in[0]) * inx2[7] +
 445              ((uint128_t) in[1]) * inx2[6] +
 446              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 447     out[8] = ((uint128_t) in[0]) * inx2[8] +
 448              ((uint128_t) in[1]) * inx2[7] +
 449              ((uint128_t) in[2]) * inx2[6] +
 450              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 451
 452     /*
 453      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 454      * They correspond to locations one bit up from the limbs produced above
 455      * so we would have to multiply by two to align them. Again, rather than
 456      * operate on the 128-bit result, we double one of the inputs to the
 457      * multiplication. If we want to double for both this reason, and the
 458      * reason above, then we end up multiplying by four.
 459      */
 460
 461     /* 9 */
 462     out[0] += ((uint128_t) in[1]) * inx4[8] +
 463               ((uint128_t) in[2]) * inx4[7] +
 464               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 465
 466     /* 10 */
 467     out[1] += ((uint128_t) in[2]) * inx4[8] +
 468               ((uint128_t) in[3]) * inx4[7] +
 469               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 470
 471     /* 11 */
 472     out[2] += ((uint128_t) in[3]) * inx4[8] +
 473               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 474
 475     /* 12 */
 476     out[3] += ((uint128_t) in[4]) * inx4[8] +
 477               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 478
 479     /* 13 */
 480     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 481
 482     /* 14 */
 483     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 484
 485     /* 15 */
 486     out[6] += ((uint128_t) in[7]) * inx4[8];
 487
 488     /* 16 */
 489     out[7] += ((uint128_t) in[8]) * inx2[8];
 490 }
 491
 492 /*-
 493  * felem_mul sets |out| = |in1| * |in2|
 494  * On entry:
 495  *   in1[i] < 2^64
 496  *   in2[i] < 2^63
 497  * On exit:
 498  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 499  */
 500 static void felem_mul(largefelem out, const felem in1, const felem in2)
 501 {
 502     felem in2x2;
 503     felem_scalar(in2x2, in2, 2);
 504
 505     out[0] = ((uint128_t) in1[0]) * in2[0];
 506
 507     out[1] = ((uint128_t) in1[0]) * in2[1] +
 508              ((uint128_t) in1[1]) * in2[0];
 509
 510     out[2] = ((uint128_t) in1[0]) * in2[2] +
 511              ((uint128_t) in1[1]) * in2[1] +
 512              ((uint128_t) in1[2]) * in2[0];
 513
 514     out[3] = ((uint128_t) in1[0]) * in2[3] +
 515              ((uint128_t) in1[1]) * in2[2] +
 516              ((uint128_t) in1[2]) * in2[1] +
 517              ((uint128_t) in1[3]) * in2[0];
 518
 519     out[4] = ((uint128_t) in1[0]) * in2[4] +
 520              ((uint128_t) in1[1]) * in2[3] +
 521              ((uint128_t) in1[2]) * in2[2] +
 522              ((uint128_t) in1[3]) * in2[1] +
 523              ((uint128_t) in1[4]) * in2[0];
 524
 525     out[5] = ((uint128_t) in1[0]) * in2[5] +
 526              ((uint128_t) in1[1]) * in2[4] +
 527              ((uint128_t) in1[2]) * in2[3] +
 528              ((uint128_t) in1[3]) * in2[2] +
 529              ((uint128_t) in1[4]) * in2[1] +
 530              ((uint128_t) in1[5]) * in2[0];
 531
 532     out[6] = ((uint128_t) in1[0]) * in2[6] +
 533              ((uint128_t) in1[1]) * in2[5] +
 534              ((uint128_t) in1[2]) * in2[4] +
 535              ((uint128_t) in1[3]) * in2[3] +
 536              ((uint128_t) in1[4]) * in2[2] +
 537              ((uint128_t) in1[5]) * in2[1] +
 538              ((uint128_t) in1[6]) * in2[0];
 539
 540     out[7] = ((uint128_t) in1[0]) * in2[7] +
 541              ((uint128_t) in1[1]) * in2[6] +
 542              ((uint128_t) in1[2]) * in2[5] +
 543              ((uint128_t) in1[3]) * in2[4] +
 544              ((uint128_t) in1[4]) * in2[3] +
 545              ((uint128_t) in1[5]) * in2[2] +
 546              ((uint128_t) in1[6]) * in2[1] +
 547              ((uint128_t) in1[7]) * in2[0];
 548
 549     out[8] = ((uint128_t) in1[0]) * in2[8] +
 550              ((uint128_t) in1[1]) * in2[7] +
 551              ((uint128_t) in1[2]) * in2[6] +
 552              ((uint128_t) in1[3]) * in2[5] +
 553              ((uint128_t) in1[4]) * in2[4] +
 554              ((uint128_t) in1[5]) * in2[3] +
 555              ((uint128_t) in1[6]) * in2[2] +
 556              ((uint128_t) in1[7]) * in2[1] +
 557              ((uint128_t) in1[8]) * in2[0];
 558
 559     /* See comment in felem_square about the use of in2x2 here */
 560
 561     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 562               ((uint128_t) in1[2]) * in2x2[7] +
 563               ((uint128_t) in1[3]) * in2x2[6] +
 564               ((uint128_t) in1[4]) * in2x2[5] +
 565               ((uint128_t) in1[5]) * in2x2[4] +
 566               ((uint128_t) in1[6]) * in2x2[3] +
 567               ((uint128_t) in1[7]) * in2x2[2] +
 568               ((uint128_t) in1[8]) * in2x2[1];
 569
 570     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 571               ((uint128_t) in1[3]) * in2x2[7] +
 572               ((uint128_t) in1[4]) * in2x2[6] +
 573               ((uint128_t) in1[5]) * in2x2[5] +
 574               ((uint128_t) in1[6]) * in2x2[4] +
 575               ((uint128_t) in1[7]) * in2x2[3] +
 576               ((uint128_t) in1[8]) * in2x2[2];
 577
 578     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 579               ((uint128_t) in1[4]) * in2x2[7] +
 580               ((uint128_t) in1[5]) * in2x2[6] +
 581               ((uint128_t) in1[6]) * in2x2[5] +
 582               ((uint128_t) in1[7]) * in2x2[4] +
 583               ((uint128_t) in1[8]) * in2x2[3];
 584
 585     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 586               ((uint128_t) in1[5]) * in2x2[7] +
 587               ((uint128_t) in1[6]) * in2x2[6] +
 588               ((uint128_t) in1[7]) * in2x2[5] +
 589               ((uint128_t) in1[8]) * in2x2[4];
 590
 591     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 592               ((uint128_t) in1[6]) * in2x2[7] +
 593               ((uint128_t) in1[7]) * in2x2[6] +
 594               ((uint128_t) in1[8]) * in2x2[5];
 595
 596     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 597               ((uint128_t) in1[7]) * in2x2[7] +
 598               ((uint128_t) in1[8]) * in2x2[6];
 599
 600     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 601               ((uint128_t) in1[8]) * in2x2[7];
 602
 603     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 604 }
 605
 606 static const limb bottom52bits = 0xfffffffffffff;
 607
 608 /*-
 609  * felem_reduce converts a largefelem to an felem.
 610  * On entry:
 611  *   in[i] < 2^128
 612  * On exit:
 613  *   out[i] < 2^59 + 2^14
 614  */
 615 static void felem_reduce(felem out, const largefelem in)
 616 {
 617     u64 overflow1, overflow2;
 618
 619     out[0] = ((limb) in[0]) & bottom58bits;
 620     out[1] = ((limb) in[1]) & bottom58bits;
 621     out[2] = ((limb) in[2]) & bottom58bits;
 622     out[3] = ((limb) in[3]) & bottom58bits;
 623     out[4] = ((limb) in[4]) & bottom58bits;
 624     out[5] = ((limb) in[5]) & bottom58bits;
 625     out[6] = ((limb) in[6]) & bottom58bits;
 626     out[7] = ((limb) in[7]) & bottom58bits;
 627     out[8] = ((limb) in[8]) & bottom58bits;
 628
 629     /* out[i] < 2^58 */
 630
 631     out[1] += ((limb) in[0]) >> 58;
 632     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 633     /*-
 634      * out[1] < 2^58 + 2^6 + 2^58
 635      *        = 2^59 + 2^6
 636      */
 637     out[2] += ((limb) (in[0] >> 64)) >> 52;
 638
 639     out[2] += ((limb) in[1]) >> 58;
 640     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 641     out[3] += ((limb) (in[1] >> 64)) >> 52;
 642
 643     out[3] += ((limb) in[2]) >> 58;
 644     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 645     out[4] += ((limb) (in[2] >> 64)) >> 52;
 646
 647     out[4] += ((limb) in[3]) >> 58;
 648     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 649     out[5] += ((limb) (in[3] >> 64)) >> 52;
 650
 651     out[5] += ((limb) in[4]) >> 58;
 652     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 653     out[6] += ((limb) (in[4] >> 64)) >> 52;
 654
 655     out[6] += ((limb) in[5]) >> 58;
 656     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 657     out[7] += ((limb) (in[5] >> 64)) >> 52;
 658
 659     out[7] += ((limb) in[6]) >> 58;
 660     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 661     out[8] += ((limb) (in[6] >> 64)) >> 52;
 662
 663     out[8] += ((limb) in[7]) >> 58;
 664     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 665     /*-
 666      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 667      *            < 2^59 + 2^13
 668      */
 669     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 670
 671     overflow1 += ((limb) in[8]) >> 58;
 672     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 673     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 674
 675     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 676     overflow2 <<= 1;            /* overflow2 < 2^13 */
 677
 678     out[0] += overflow1;        /* out[0] < 2^60 */
 679     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 680
 681     out[1] += out[0] >> 58;
 682     out[0] &= bottom58bits;
 683     /*-
 684      * out[0] < 2^58
 685      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 686      *        < 2^59 + 2^14
 687      */
 688 }
 689
 690 static void felem_square_reduce(felem out, const felem in)
 691 {
 692     largefelem tmp;
 693     felem_square(tmp, in);
 694     felem_reduce(out, tmp);
 695 }
 696
 697 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 698 {
 699     largefelem tmp;
 700     felem_mul(tmp, in1, in2);
 701     felem_reduce(out, tmp);
 702 }
 703
 704 /*-
 705  * felem_inv calculates |out| = |in|^{-1}
 706  *
 707  * Based on Fermat's Little Theorem:
 708  *   a^p = a (mod p)
 709  *   a^{p-1} = 1 (mod p)
 710  *   a^{p-2} = a^{-1} (mod p)
 711  */
 712 static void felem_inv(felem out, const felem in)
 713 {
 714     felem ftmp, ftmp2, ftmp3, ftmp4;
 715     largefelem tmp;
 716     unsigned i;
 717
 718     felem_square(tmp, in);
 719     felem_reduce(ftmp, tmp);    /* 2^1 */
 720     felem_mul(tmp, in, ftmp);
 721     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 722     felem_assign(ftmp2, ftmp);
 723     felem_square(tmp, ftmp);
 724     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 725     felem_mul(tmp, in, ftmp);
 726     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 727     felem_square(tmp, ftmp);
 728     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 729
 730     felem_square(tmp, ftmp2);
 731     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 732     felem_square(tmp, ftmp3);
 733     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 734     felem_mul(tmp, ftmp3, ftmp2);
 735     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 736
 737     felem_assign(ftmp2, ftmp3);
 738     felem_square(tmp, ftmp3);
 739     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 740     felem_square(tmp, ftmp3);
 741     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 742     felem_square(tmp, ftmp3);
 743     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 744     felem_square(tmp, ftmp3);
 745     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 746     felem_assign(ftmp4, ftmp3);
 747     felem_mul(tmp, ftmp3, ftmp);
 748     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 749     felem_square(tmp, ftmp4);
 750     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 751     felem_mul(tmp, ftmp3, ftmp2);
 752     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 753     felem_assign(ftmp2, ftmp3);
 754
 755     for (i = 0; i < 8; i++) {
 756         felem_square(tmp, ftmp3);
 757         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 758     }
 759     felem_mul(tmp, ftmp3, ftmp2);
 760     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 761     felem_assign(ftmp2, ftmp3);
 762
 763     for (i = 0; i < 16; i++) {
 764         felem_square(tmp, ftmp3);
 765         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 766     }
 767     felem_mul(tmp, ftmp3, ftmp2);
 768     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 769     felem_assign(ftmp2, ftmp3);
 770
 771     for (i = 0; i < 32; i++) {
 772         felem_square(tmp, ftmp3);
 773         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 774     }
 775     felem_mul(tmp, ftmp3, ftmp2);
 776     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 777     felem_assign(ftmp2, ftmp3);
 778
 779     for (i = 0; i < 64; i++) {
 780         felem_square(tmp, ftmp3);
 781         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 782     }
 783     felem_mul(tmp, ftmp3, ftmp2);
 784     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 785     felem_assign(ftmp2, ftmp3);
 786
 787     for (i = 0; i < 128; i++) {
 788         felem_square(tmp, ftmp3);
 789         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 790     }
 791     felem_mul(tmp, ftmp3, ftmp2);
 792     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 793     felem_assign(ftmp2, ftmp3);
 794
 795     for (i = 0; i < 256; i++) {
 796         felem_square(tmp, ftmp3);
 797         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 798     }
 799     felem_mul(tmp, ftmp3, ftmp2);
 800     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 801
 802     for (i = 0; i < 9; i++) {
 803         felem_square(tmp, ftmp3);
 804         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 805     }
 806     felem_mul(tmp, ftmp3, ftmp4);
 807     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 808     felem_mul(tmp, ftmp3, in);
 809     felem_reduce(out, tmp);     /* 2^512 - 3 */
 810 }
 811
/*
 * This is 2^521-1, expressed as an felem: limbs 0..7 have all 58 low bits
 * set and limb 8 has its 57 low bits set (8 * 58 + 57 = 521 one-bits).
 */
static const felem kPrime = {
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
};
 818
 819 /*-
 820  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 821  * otherwise.
 822  * On entry:
 823  *   in[i] < 2^59 + 2^14
 824  */
 825 static limb felem_is_zero(const felem in)
 826 {
 827     felem ftmp;
 828     limb is_zero, is_p;
 829     felem_assign(ftmp, in);
 830
 831     ftmp[0] += ftmp[8] >> 57;
 832     ftmp[8] &= bottom57bits;
 833     /* ftmp[8] < 2^57 */
 834     ftmp[1] += ftmp[0] >> 58;
 835     ftmp[0] &= bottom58bits;
 836     ftmp[2] += ftmp[1] >> 58;
 837     ftmp[1] &= bottom58bits;
 838     ftmp[3] += ftmp[2] >> 58;
 839     ftmp[2] &= bottom58bits;
 840     ftmp[4] += ftmp[3] >> 58;
 841     ftmp[3] &= bottom58bits;
 842     ftmp[5] += ftmp[4] >> 58;
 843     ftmp[4] &= bottom58bits;
 844     ftmp[6] += ftmp[5] >> 58;
 845     ftmp[5] &= bottom58bits;
 846     ftmp[7] += ftmp[6] >> 58;
 847     ftmp[6] &= bottom58bits;
 848     ftmp[8] += ftmp[7] >> 58;
 849     ftmp[7] &= bottom58bits;
 850     /* ftmp[8] < 2^57 + 4 */
 851
 852     /*
 853      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 854      * than our bound for ftmp[8]. Therefore we only have to check if the
 855      * zero is zero or 2^521-1.
 856      */
 857
 858     is_zero = 0;
 859     is_zero |= ftmp[0];
 860     is_zero |= ftmp[1];
 861     is_zero |= ftmp[2];
 862     is_zero |= ftmp[3];
 863     is_zero |= ftmp[4];
 864     is_zero |= ftmp[5];
 865     is_zero |= ftmp[6];
 866     is_zero |= ftmp[7];
 867     is_zero |= ftmp[8];
 868
 869     is_zero--;
 870     /*
 871      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 872      * can be set is if is_zero was 0 before the decrement.
 873      */
 874     is_zero = 0 - (is_zero >> 63);
 875
 876     is_p = ftmp[0] ^ kPrime[0];
 877     is_p |= ftmp[1] ^ kPrime[1];
 878     is_p |= ftmp[2] ^ kPrime[2];
 879     is_p |= ftmp[3] ^ kPrime[3];
 880     is_p |= ftmp[4] ^ kPrime[4];
 881     is_p |= ftmp[5] ^ kPrime[5];
 882     is_p |= ftmp[6] ^ kPrime[6];
 883     is_p |= ftmp[7] ^ kPrime[7];
 884     is_p |= ftmp[8] ^ kPrime[8];
 885
 886     is_p--;
 887     is_p = 0 - (is_p >> 63);
 888
 889     is_zero |= is_p;
 890     return is_zero;
 891 }
 892
 893 static int felem_is_zero_int(const void *in)
 894 {
 895     return (int)(felem_is_zero(in) & ((limb) 1));
 896 }
 897
 898 /*-
 899  * felem_contract converts |in| to its unique, minimal representation.
 900  * On entry:
 901  *   in[i] < 2^59 + 2^14
 902  */
 903 static void felem_contract(felem out, const felem in)
 904 {
 905     limb is_p, is_greater, sign;
 906     static const limb two58 = ((limb) 1) << 58;
 907
 908     felem_assign(out, in);
 909
 910     out[0] += out[8] >> 57;
 911     out[8] &= bottom57bits;
 912     /* out[8] < 2^57 */
 913     out[1] += out[0] >> 58;
 914     out[0] &= bottom58bits;
 915     out[2] += out[1] >> 58;
 916     out[1] &= bottom58bits;
 917     out[3] += out[2] >> 58;
 918     out[2] &= bottom58bits;
 919     out[4] += out[3] >> 58;
 920     out[3] &= bottom58bits;
 921     out[5] += out[4] >> 58;
 922     out[4] &= bottom58bits;
 923     out[6] += out[5] >> 58;
 924     out[5] &= bottom58bits;
 925     out[7] += out[6] >> 58;
 926     out[6] &= bottom58bits;
 927     out[8] += out[7] >> 58;
 928     out[7] &= bottom58bits;
 929     /* out[8] < 2^57 + 4 */
 930
 931     /*
 932      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 933      * out. See the comments in felem_is_zero regarding why we don't test for
 934      * other multiples of the prime.
 935      */
 936
 937     /*
 938      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 939      */
 940
 941     is_p = out[0] ^ kPrime[0];
 942     is_p |= out[1] ^ kPrime[1];
 943     is_p |= out[2] ^ kPrime[2];
 944     is_p |= out[3] ^ kPrime[3];
 945     is_p |= out[4] ^ kPrime[4];
 946     is_p |= out[5] ^ kPrime[5];
 947     is_p |= out[6] ^ kPrime[6];
 948     is_p |= out[7] ^ kPrime[7];
 949     is_p |= out[8] ^ kPrime[8];
 950
 951     is_p--;
 952     is_p &= is_p << 32;
 953     is_p &= is_p << 16;
 954     is_p &= is_p << 8;
 955     is_p &= is_p << 4;
 956     is_p &= is_p << 2;
 957     is_p &= is_p << 1;
 958     is_p = 0 - (is_p >> 63);
 959     is_p = ~is_p;
 960
 961     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 962
 963     out[0] &= is_p;
 964     out[1] &= is_p;
 965     out[2] &= is_p;
 966     out[3] &= is_p;
 967     out[4] &= is_p;
 968     out[5] &= is_p;
 969     out[6] &= is_p;
 970     out[7] &= is_p;
 971     out[8] &= is_p;
 972
 973     /*
 974      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 975      * 57 is greater than zero as (2^521-1) + x >= 2^522
 976      */
 977     is_greater = out[8] >> 57;
 978     is_greater |= is_greater << 32;
 979     is_greater |= is_greater << 16;
 980     is_greater |= is_greater << 8;
 981     is_greater |= is_greater << 4;
 982     is_greater |= is_greater << 2;
 983     is_greater |= is_greater << 1;
 984     is_greater = 0 - (is_greater >> 63);
 985
 986     out[0] -= kPrime[0] & is_greater;
 987     out[1] -= kPrime[1] & is_greater;
 988     out[2] -= kPrime[2] & is_greater;
 989     out[3] -= kPrime[3] & is_greater;
 990     out[4] -= kPrime[4] & is_greater;
 991     out[5] -= kPrime[5] & is_greater;
 992     out[6] -= kPrime[6] & is_greater;
 993     out[7] -= kPrime[7] & is_greater;
 994     out[8] -= kPrime[8] & is_greater;
 995
 996     /* Eliminate negative coefficients */
 997     sign = -(out[0] >> 63);
 998     out[0] += (two58 & sign);
 999     out[1] -= (1 & sign);
1000     sign = -(out[1] >> 63);
1001     out[1] += (two58 & sign);
1002     out[2] -= (1 & sign);
1003     sign = -(out[2] >> 63);
1004     out[2] += (two58 & sign);
1005     out[3] -= (1 & sign);
1006     sign = -(out[3] >> 63);
1007     out[3] += (two58 & sign);
1008     out[4] -= (1 & sign);
1009     sign = -(out[4] >> 63);
1010     out[4] += (two58 & sign);
1011     out[5] -= (1 & sign);
1012     sign = -(out[0] >> 63);
1013     out[5] += (two58 & sign);
1014     out[6] -= (1 & sign);
1015     sign = -(out[6] >> 63);
1016     out[6] += (two58 & sign);
1017     out[7] -= (1 & sign);
1018     sign = -(out[7] >> 63);
1019     out[7] += (two58 & sign);
1020     out[8] -= (1 & sign);
1021     sign = -(out[5] >> 63);
1022     out[5] += (two58 & sign);
1023     out[6] -= (1 & sign);
1024     sign = -(out[6] >> 63);
1025     out[6] += (two58 & sign);
1026     out[7] -= (1 & sign);
1027     sign = -(out[7] >> 63);
1028     out[7] += (two58 & sign);
1029     out[8] -= (1 & sign);
1030 }
1031
1032 /*-
1033  * Group operations
1034  * ----------------
1035  *
1036  * Building on top of the field operations we have the operations on the
1037  * elliptic curve group itself. Points on the curve are represented in Jacobian
1038  * coordinates */
1039
/*-
 * point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * The result is written to (x_out, y_out, z_out). The steps below compute
 *   delta = z^2, gamma = y^2, beta = x*gamma,
 *   alpha = 3*(x - delta)*(x + delta),
 *   x' = alpha^2 - 8*beta,
 *   z' = (y + z)^2 - gamma - delta,
 *   y' = alpha*(4*beta - x') - 8*gamma^2.
 * The inline "< 2^k" comments track per-limb bounds so that every value
 * handed to felem_reduce still fits in a 128-bit limb.
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 * while x_out == y_in is not (maybe this works, but it's not tested).
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    largefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* Two working copies of x, used for (x - delta) and (x + delta). */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);
    /* ftmp[i] < 2^61 */
    felem_sum64(ftmp2, delta);
    /* ftmp2[i] < 2^60 + 2^15 */
    felem_scalar64(ftmp2, 3);
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
    felem_mul(tmp, ftmp, ftmp2);
    /*-
     * tmp[i] < 17(3*2^121 + 3*2^76)
     *        = 51*2^121 + 51*2^76
     *        < 64*2^121 + 64*2^76
     *        = 2^127 + 2^82
     *        < 2^128
     */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /*
     * tmp[i] < 17*2^120 < 2^125
     */
    felem_assign(ftmp, beta);
    felem_scalar64(ftmp, 8);
    /* ftmp[i] < 2^62 + 2^17 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is reused to hold gamma + delta so one subtraction suffices. */
    felem_sum64(delta, gamma);
    /* delta[i] < 2^60 + 2^15 */
    felem_assign(ftmp, y_in);
    felem_sum64(ftmp, z_in);
    /* ftmp[i] < 2^60 + 2^15 */
    felem_square(tmp, ftmp);
    /*
     * tmp[i] < 17(2^122) < 2^127
     */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^127 + 2^63 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);
    /* beta[i] < 2^61 + 2^16 */
    felem_diff64(beta, x_out);
    /* beta[i] < 2^61 + 2^60 + 2^16 */
    felem_mul(tmp, alpha, beta);
    /*-
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        < 2^128
     */
    felem_square(tmp2, gamma);
    /*-
     * tmp2[i] < 17*(2^59 + 2^14)^2
     *         = 17*(2^118 + 2^74 + 2^28)
     */
    felem_scalar128(tmp2, 8);
    /*-
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
     *         < 2^126
     */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        = 2^127 + 2^124 + 2^123 + 2^120 + 2^119 + 2^80 + 2^78 + 2^76 +
     *          2^74 + 2^34 + 2^30 - 2^69
     *        < 2^128
     */
    felem_reduce(y_out, tmp);
}
1145
1146 /* copy_conditional copies in to out iff mask is all ones. */
1147 static void copy_conditional(felem out, const felem in, limb mask)
1148 {
1149     unsigned i;
1150     for (i = 0; i < NLIMBS; ++i) {
1151         const limb tmp = mask & (in[i] ^ out[i]);
1152         out[i] ^= tmp;
1153     }
1154 }
1155
1156 /*-
1157  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1158  *
1159  * The method is taken from
1160  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1161  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1162  *
1163  * This function includes a branch for checking whether the two input points
1164  * are equal (while not equal to the point at infinity). See comment below
1165  * on constant-time.
1166  */
1167 static void point_add(felem x3, felem y3, felem z3,
1168                       const felem x1, const felem y1, const felem z1,
1169                       const int mixed, const felem x2, const felem y2,
1170                       const felem z2)
1171 {
1172     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1173     largefelem tmp, tmp2;
1174     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1175
1176     z1_is_zero = felem_is_zero(z1);
1177     z2_is_zero = felem_is_zero(z2);
1178
1179     /* ftmp = z1z1 = z1**2 */
1180     felem_square(tmp, z1);
1181     felem_reduce(ftmp, tmp);
1182
1183     if (!mixed) {
1184         /* ftmp2 = z2z2 = z2**2 */
1185         felem_square(tmp, z2);
1186         felem_reduce(ftmp2, tmp);
1187
1188         /* u1 = ftmp3 = x1*z2z2 */
1189         felem_mul(tmp, x1, ftmp2);
1190         felem_reduce(ftmp3, tmp);
1191
1192         /* ftmp5 = z1 + z2 */
1193         felem_assign(ftmp5, z1);
1194         felem_sum64(ftmp5, z2);
1195         /* ftmp5[i] < 2^61 */
1196
1197         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1198         felem_square(tmp, ftmp5);
1199         /* tmp[i] < 17*2^122 */
1200         felem_diff_128_64(tmp, ftmp);
1201         /* tmp[i] < 17*2^122 + 2^63 */
1202         felem_diff_128_64(tmp, ftmp2);
1203         /* tmp[i] < 17*2^122 + 2^64 */
1204         felem_reduce(ftmp5, tmp);
1205
1206         /* ftmp2 = z2 * z2z2 */
1207         felem_mul(tmp, ftmp2, z2);
1208         felem_reduce(ftmp2, tmp);
1209
1210         /* s1 = ftmp6 = y1 * z2**3 */
1211         felem_mul(tmp, y1, ftmp2);
1212         felem_reduce(ftmp6, tmp);
1213     } else {
1214         /*
1215          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1216          */
1217
1218         /* u1 = ftmp3 = x1*z2z2 */
1219         felem_assign(ftmp3, x1);
1220
1221         /* ftmp5 = 2*z1z2 */
1222         felem_scalar(ftmp5, z1, 2);
1223
1224         /* s1 = ftmp6 = y1 * z2**3 */
1225         felem_assign(ftmp6, y1);
1226     }
1227
1228     /* u2 = x2*z1z1 */
1229     felem_mul(tmp, x2, ftmp);
1230     /* tmp[i] < 17*2^120 */
1231
1232     /* h = ftmp4 = u2 - u1 */
1233     felem_diff_128_64(tmp, ftmp3);
1234     /* tmp[i] < 17*2^120 + 2^63 */
1235     felem_reduce(ftmp4, tmp);
1236
1237     x_equal = felem_is_zero(ftmp4);
1238
1239     /* z_out = ftmp5 * h */
1240     felem_mul(tmp, ftmp5, ftmp4);
1241     felem_reduce(z_out, tmp);
1242
1243     /* ftmp = z1 * z1z1 */
1244     felem_mul(tmp, ftmp, z1);
1245     felem_reduce(ftmp, tmp);
1246
1247     /* s2 = tmp = y2 * z1**3 */
1248     felem_mul(tmp, y2, ftmp);
1249     /* tmp[i] < 17*2^120 */
1250
1251     /* r = ftmp5 = (s2 - s1)*2 */
1252     felem_diff_128_64(tmp, ftmp6);
1253     /* tmp[i] < 17*2^120 + 2^63 */
1254     felem_reduce(ftmp5, tmp);
1255     y_equal = felem_is_zero(ftmp5);
1256     felem_scalar64(ftmp5, 2);
1257     /* ftmp5[i] < 2^61 */
1258
1259     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1260         /*
1261          * This is obviously not constant-time but it will almost-never happen
1262          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1263          * where the intermediate value gets very close to the group order.
1264          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1265          * the scalar, it's possible for the intermediate value to be a small
1266          * negative multiple of the base point, and for the final signed digit
1267          * to be the same value. We believe that this only occurs for the scalar
1268          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1269          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1270          * 71e913863f7, in that case the penultimate intermediate is -9G and
1271          * the final digit is also -9G. Since this only happens for a single
1272          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1273          * check whether a secret scalar was that exact value, can already do
1274          * so.)
1275          */
1276         point_double(x3, y3, z3, x1, y1, z1);
1277         return;
1278     }
1279
1280     /* I = ftmp = (2h)**2 */
1281     felem_assign(ftmp, ftmp4);
1282     felem_scalar64(ftmp, 2);
1283     /* ftmp[i] < 2^61 */
1284     felem_square(tmp, ftmp);
1285     /* tmp[i] < 17*2^122 */
1286     felem_reduce(ftmp, tmp);
1287
1288     /* J = ftmp2 = h * I */
1289     felem_mul(tmp, ftmp4, ftmp);
1290     felem_reduce(ftmp2, tmp);
1291
1292     /* V = ftmp4 = U1 * I */
1293     felem_mul(tmp, ftmp3, ftmp);
1294     felem_reduce(ftmp4, tmp);
1295
1296     /* x_out = r**2 - J - 2V */
1297     felem_square(tmp, ftmp5);
1298     /* tmp[i] < 17*2^122 */
1299     felem_diff_128_64(tmp, ftmp2);
1300     /* tmp[i] < 17*2^122 + 2^63 */
1301     felem_assign(ftmp3, ftmp4);
1302     felem_scalar64(ftmp4, 2);
1303     /* ftmp4[i] < 2^61 */
1304     felem_diff_128_64(tmp, ftmp4);
1305     /* tmp[i] < 17*2^122 + 2^64 */
1306     felem_reduce(x_out, tmp);
1307
1308     /* y_out = r(V-x_out) - 2 * s1 * J */
1309     felem_diff64(ftmp3, x_out);
1310     /*
1311      * ftmp3[i] < 2^60 + 2^60 = 2^61
1312      */
1313     felem_mul(tmp, ftmp5, ftmp3);
1314     /* tmp[i] < 17*2^122 */
1315     felem_mul(tmp2, ftmp6, ftmp2);
1316     /* tmp2[i] < 17*2^120 */
1317     felem_scalar128(tmp2, 2);
1318     /* tmp2[i] < 17*2^121 */
1319     felem_diff128(tmp, tmp2);
1320         /*-
1321          * tmp[i] < 2^127 - 2^69 + 17*2^122
1322          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1323          *        < 2^127
1324          */
1325     felem_reduce(y_out, tmp);
1326
1327     copy_conditional(x_out, x2, z1_is_zero);
1328     copy_conditional(x_out, x1, z2_is_zero);
1329     copy_conditional(y_out, y2, z1_is_zero);
1330     copy_conditional(y_out, y1, z2_is_zero);
1331     copy_conditional(z_out, z2, z1_is_zero);
1332     copy_conditional(z_out, z1, z2_is_zero);
1333     felem_assign(x3, x_out);
1334     felem_assign(y3, y_out);
1335     felem_assign(z3, z_out);
1336 }
1337
1338 /*-
1339  * Base point pre computation
1340  * --------------------------
1341  *
1342  * Two different sorts of precomputed tables are used in the following code.
1343  * Each contain various points on the curve, where each point is three field
1344  * elements (x, y, z).
1345  *
1346  * For the base point table, z is usually 1 (0 for the point at infinity).
1347  * This table has 16 elements:
1348  * index | bits    | point
1349  * ------+---------+------------------------------
1350  *     0 | 0 0 0 0 | 0G
1351  *     1 | 0 0 0 1 | 1G
1352  *     2 | 0 0 1 0 | 2^130G
1353  *     3 | 0 0 1 1 | (2^130 + 1)G
1354  *     4 | 0 1 0 0 | 2^260G
1355  *     5 | 0 1 0 1 | (2^260 + 1)G
1356  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1357  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1358  *     8 | 1 0 0 0 | 2^390G
1359  *     9 | 1 0 0 1 | (2^390 + 1)G
1360  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1361  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1362  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1363  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1364  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1365  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1366  *
1367  * The reason for this is so that we can clock bits into four different
1368  * locations when doing simple scalar multiplies against the base point.
1369  *
1370  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1371
/*
 * gmul is the table of precomputed base points. It has 16 entries, each
 * holding three field elements (x, y, z) of nine limbs. Entry 0 is all
 * zeros (z = 0, the point at infinity); every other entry has z = 1.
 */
static const felem gmul[16][3] = {
{{0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
1483
1484 /*
1485  * select_point selects the |idx|th point from a precomputation table and
1486  * copies it to out.
1487  */
1488  /* pre_comp below is of the size provided in |size| */
1489 static void select_point(const limb idx, unsigned int size,
1490                          const felem pre_comp[][3], felem out[3])
1491 {
1492     unsigned i, j;
1493     limb *outlimbs = &out[0][0];
1494
1495     memset(out, 0, sizeof(*out) * 3);
1496
1497     for (i = 0; i < size; i++) {
1498         const limb *inlimbs = &pre_comp[i][0][0];
1499         limb mask = i ^ idx;
1500         mask |= mask >> 4;
1501         mask |= mask >> 2;
1502         mask |= mask >> 1;
1503         mask &= 1;
1504         mask--;
1505         for (j = 0; j < NLIMBS * 3; j++)
1506             outlimbs[j] |= inlimbs[j] & mask;
1507     }
1508 }
1509
1510 /* get_bit returns the |i|th bit in |in| */
1511 static char get_bit(const felem_bytearray in, int i)
1512 {
1513     if (i < 0)
1514         return 0;
1515     return (in[i >> 3] >> (i & 7)) & 1;
1516 }
1517
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * |num_points| is the number of entries in scalars[] / pre_comp[]; when it
 * is 0 only the generator part is computed and the loop runs over 131 rounds
 * instead of 521. |mixed| is forwarded to point_add for the additions of
 * entries taken from pre_comp[]; additions of generator multiples always use
 * mixed = 1.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[16][3])
{
    int i, skip;
    unsigned num, gen_mul = (g_scalar != NULL);
    /* nq is the accumulator; tmp[0..2] hold the selected point, tmp[3] -Y */
    felem nq[3], tmp[4];
    limb bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (last quarter of rounds) and additions of other
     * points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 520 : 130); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 130)) {
            /*
             * Build a 4-bit table index from the scalar bits at positions
             * i + 390, i + 260, i + 130 and i. In round i == 130 only the
             * top bit (position 520) is used.
             */
            bits = get_bit(g_scalar, i + 390) << 3;
            if (i < 130) {
                bits |= get_bit(g_scalar, i + 260) << 2;
                bits |= get_bit(g_scalar, i + 130) << 1;
                bits |= get_bit(g_scalar, i);
            }
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp, tmp);
            if (!skip) {
                /* The 1 argument below is for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* First contribution: just load it into the accumulator. */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Gather a 6-bit window: scalar bits i + 4 down to i - 1
                 * (get_bit returns 0 for the position -1 when i == 0).
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                /* Recode the window into a sign and a table digit. */
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                /* Replace Y by -Y when |sign| is 1 (mask is then all ones). */
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* First contribution: just load it into the accumulator. */
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1607
/*
 * Precomputation for the group generator. Objects of this type are
 * reference counted; see EC_nistp521_pre_comp_dup and
 * EC_nistp521_pre_comp_free.
 */
struct nistp521_pre_comp_st {
    /* 16 points of three field elements each, same shape as gmul */
    felem g_pre_comp[16][3];
    /* reference count, set to 1 on creation */
    CRYPTO_REF_COUNT references;
    /* lock handed to CRYPTO_UP_REF / CRYPTO_DOWN_REF along with the count */
    CRYPTO_RWLOCK *lock;
};
1614
/*
 * EC_GFp_nistp521_method returns a pointer to the statically allocated
 * EC_METHOD table for this implementation. The initializer is positional:
 * entries must stay in the order of the EC_METHOD members. Entries set to 0
 * are left unset here. Functions named ec_GFp_nistp521_* are the ones
 * provided by this file; the remaining entries reuse the generic
 * ec_GFp_simple_*, ec_GFp_nist_*, ec_key_simple_*, ecdh_simple_* and
 * ecdsa_simple_* routines.
 */
const EC_METHOD *EC_GFp_nistp521_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp521_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp521_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp521_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp521_points_mul,
        ec_GFp_nistp521_precompute_mult,
        ec_GFp_nistp521_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        ec_GFp_simple_field_inv,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key,
        ecdsa_simple_sign_setup,
        ecdsa_simple_sign_sig,
        ecdsa_simple_verify_sig,
        0, /* field_inverse_mod_ord */
        0, /* blind_coordinates */
        0, /* ladder_pre */
        0, /* ladder_step */
        0  /* ladder_post */
    };

    return &ret;
}
1680
1681 /******************************************************************************/
1682 /*
1683  * FUNCTIONS TO MANAGE PRECOMPUTATION
1684  */
1685
1686 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1687 {
1688     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1689
1690     if (ret == NULL) {
1691         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1692         return ret;
1693     }
1694
1695     ret->references = 1;
1696
1697     ret->lock = CRYPTO_THREAD_lock_new();
1698     if (ret->lock == NULL) {
1699         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1700         OPENSSL_free(ret);
1701         return NULL;
1702     }
1703     return ret;
1704 }
1705
1706 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1707 {
1708     int i;
1709     if (p != NULL)
1710         CRYPTO_UP_REF(&p->references, &i, p->lock);
1711     return p;
1712 }
1713
1714 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1715 {
1716     int i;
1717
1718     if (p == NULL)
1719         return;
1720
1721     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1722     REF_PRINT_COUNT("EC_nistp521", x);
1723     if (i > 0)
1724         return;
1725     REF_ASSERT_ISNT(i < 0);
1726
1727     CRYPTO_THREAD_lock_free(p->lock);
1728     OPENSSL_free(p);
1729 }
1730
1731 /******************************************************************************/
1732 /*
1733  * OPENSSL EC_METHOD FUNCTIONS
1734  */
1735
1736 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1737 {
1738     int ret;
1739     ret = ec_GFp_simple_group_init(group);
1740     group->a_is_minus3 = 1;
1741     return ret;
1742 }
1743
1744 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1745                                     const BIGNUM *a, const BIGNUM *b,
1746                                     BN_CTX *ctx)
1747 {
1748     int ret = 0;
1749     BIGNUM *curve_p, *curve_a, *curve_b;
1750 #ifndef FIPS_MODE
1751     BN_CTX *new_ctx = NULL;
1752
1753     if (ctx == NULL)
1754         ctx = new_ctx = BN_CTX_new();
1755 #endif
1756     if (ctx == NULL)
1757         return 0;
1758
1759     BN_CTX_start(ctx);
1760     curve_p = BN_CTX_get(ctx);
1761     curve_a = BN_CTX_get(ctx);
1762     curve_b = BN_CTX_get(ctx);
1763     if (curve_b == NULL)
1764         goto err;
1765     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1766     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1767     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1768     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1769         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1770               EC_R_WRONG_CURVE_PARAMETERS);
1771         goto err;
1772     }
1773     group->field_mod_func = BN_nist_mod_521;
1774     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1775  err:
1776     BN_CTX_end(ctx);
1777 #ifndef FIPS_MODE
1778     BN_CTX_free(new_ctx);
1779 #endif
1780     return ret;
1781 }
1782
1783 /*
1784  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1785  * (X/Z^2, Y/Z^3)
1786  */
1787 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1788                                                  const EC_POINT *point,
1789                                                  BIGNUM *x, BIGNUM *y,
1790                                                  BN_CTX *ctx)
1791 {
1792     felem z1, z2, x_in, y_in, x_out, y_out;
1793     largefelem tmp;
1794
1795     if (EC_POINT_is_at_infinity(group, point)) {
1796         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1797               EC_R_POINT_AT_INFINITY);
1798         return 0;
1799     }
1800     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1801         (!BN_to_felem(z1, point->Z)))
1802         return 0;
1803     felem_inv(z2, z1);
1804     felem_square(tmp, z2);
1805     felem_reduce(z1, tmp);
1806     felem_mul(tmp, x_in, z1);
1807     felem_reduce(x_in, tmp);
1808     felem_contract(x_out, x_in);
1809     if (x != NULL) {
1810         if (!felem_to_BN(x, x_out)) {
1811             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1812                   ERR_R_BN_LIB);
1813             return 0;
1814         }
1815     }
1816     felem_mul(tmp, z1, z2);
1817     felem_reduce(z1, tmp);
1818     felem_mul(tmp, y_in, z1);
1819     felem_reduce(y_in, tmp);
1820     felem_contract(y_out, y_in);
1821     if (y != NULL) {
1822         if (!felem_to_BN(y, y_out)) {
1823             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1824                   ERR_R_BN_LIB);
1825             return 0;
1826         }
1827     }
1828     return 1;
1829 }
1830
1831 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1832 static void make_points_affine(size_t num, felem points[][3],
1833                                felem tmp_felems[])
1834 {
1835     /*
1836      * Runs in constant time, unless an input is the point at infinity (which
1837      * normally shouldn't happen).
1838      */
1839     ec_GFp_nistp_points_make_affine_internal(num,
1840                                              points,
1841                                              sizeof(felem),
1842                                              tmp_felems,
1843                                              (void (*)(void *))felem_one,
1844                                              felem_is_zero_int,
1845                                              (void (*)(void *, const void *))
1846                                              felem_assign,
1847                                              (void (*)(void *, const void *))
1848                                              felem_square_reduce, (void (*)
1849                                                                    (void *,
1850                                                                     const void
1851                                                                     *,
1852                                                                     const void
1853                                                                     *))
1854                                              felem_mul_reduce,
1855                                              (void (*)(void *, const void *))
1856                                              felem_inv,
1857                                              (void (*)(void *, const void *))
1858                                              felem_contract);
1859 }
1860
/*
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values.  Result is stored in r (r can equal one of the inputs).
 *
 * group:   the curve; its generator, order and any stored P-521
 *          precomputation are read.
 * r:       receives the result in Jacobian projective coordinates.
 * scalar:  multiplier for the generator, or NULL for no generator term.
 * num:     number of entries in |points| and |scalars|.
 * ctx:     used without a NULL check (it is passed straight to
 *          BN_CTX_start), so the caller must supply a valid context.
 *
 * Returns 0 on failure, otherwise the result of
 * EC_POINT_set_Jprojective_coordinates_GFp() on |r|.
 */
int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
                               const BIGNUM *scalar, size_t num,
                               const EC_POINT *points[],
                               const BIGNUM *scalars[], BN_CTX *ctx)
{
    int ret = 0;
    int j;
    int mixed = 0;
    BIGNUM *x, *y, *z, *tmp_scalar;
    felem_bytearray g_secret;
    felem_bytearray *secrets = NULL;
    /* per point: multiples 0..16 of that point, three coordinates each */
    felem (*pre_comp)[17][3] = NULL;
    felem *tmp_felems = NULL;
    felem_bytearray tmp;
    unsigned i, num_bytes;
    int have_pre_comp = 0;
    size_t num_points = num;
    felem x_in, y_in, z_in, x_out, y_out, z_out;
    NISTP521_PRE_COMP *pre = NULL;
    felem(*g_pre_comp)[3] = NULL;
    EC_POINT *generator = NULL;
    const EC_POINT *p = NULL;
    const BIGNUM *p_scalar = NULL;

    BN_CTX_start(ctx);
    x = BN_CTX_get(ctx);
    y = BN_CTX_get(ctx);
    z = BN_CTX_get(ctx);
    tmp_scalar = BN_CTX_get(ctx);
    /* only the last BN_CTX_get() result is tested */
    if (tmp_scalar == NULL)
        goto err;

    if (scalar != NULL) {
        pre = group->pre_comp.nistp521;
        if (pre)
            /* we have precomputation, try to use it */
            g_pre_comp = &pre->g_pre_comp[0];
        else
            /* try to use the standard precomputation */
            g_pre_comp = (felem(*)[3]) gmul;
        generator = EC_POINT_new(group);
        if (generator == NULL)
            goto err;
        /* get the generator from precomputation */
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
            !felem_to_BN(y, g_pre_comp[1][1]) ||
            !felem_to_BN(z, g_pre_comp[1][2])) {
            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
            goto err;
        }
        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
                                                      generator, x, y, z,
                                                      ctx))
            goto err;
        /*
         * The table is only usable if its entry 1 is the generator this
         * group actually uses.
         */
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
            /* precomputation matches generator */
            have_pre_comp = 1;
        else
            /*
             * we don't have valid precomputation: treat the generator as a
             * random point
             */
            num_points++;
    }

    if (num_points > 0) {
        if (num_points >= 2) {
            /*
             * unless we precompute multiples for just one point, converting
             * those into affine form is time well spent
             */
            mixed = 1;
        }
        /*
         * secrets and pre_comp are zero-allocated, so slots that are never
         * filled in below stay all-zero.  tmp_felems is scratch space for
         * make_points_affine: one element more than the num_points * 17
         * table entries passed to it.
         */
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
        if (mixed)
            tmp_felems =
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
        if ((secrets == NULL) || (pre_comp == NULL)
            || (mixed && (tmp_felems == NULL))) {
            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
            goto err;
        }

        /*
         * we treat NULL scalars as 0, and NULL points as points at infinity,
         * i.e., they contribute nothing to the linear combination
         */
        for (i = 0; i < num_points; ++i) {
            /*
             * Index |num| exists only when num_points was incremented above,
             * i.e. when the generator has to be handled as an ordinary point.
             */
            if (i == num)
                /*
                 * we didn't have a valid precomputation, so we pick the
                 * generator
                 */
            {
                p = EC_GROUP_get0_generator(group);
                p_scalar = scalar;
            } else
                /* the i^th point */
            {
                p = points[i];
                p_scalar = scalars[i];
            }
            if ((p_scalar != NULL) && (p != NULL)) {
                /* reduce scalar to 0 <= scalar < 2^521 */
                if ((BN_num_bits(p_scalar) > 521)
                    || (BN_is_negative(p_scalar))) {
                    /*
                     * this is an unusual input, and we don't guarantee
                     * constant-timeness
                     */
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
                        goto err;
                    }
                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
                } else
                    num_bytes = BN_bn2bin(p_scalar, tmp);
                /*
                 * Only num_bytes bytes are handed on; the rest of secrets[i]
                 * keeps the zeros from OPENSSL_zalloc.  flip_endian
                 * presumably reverses the byte order of the BN_bn2bin
                 * output -- confirm against its definition.
                 */
                flip_endian(secrets[i], tmp, num_bytes);
                /* precompute multiples */
                if ((!BN_to_felem(x_out, p->X)) ||
                    (!BN_to_felem(y_out, p->Y)) ||
                    (!BN_to_felem(z_out, p->Z)))
                    goto err;
                /* entry 1 is the point itself; entry 0 stays all-zero */
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
                /*
                 * Fill entries 2..16: an odd entry j is built from entries 1
                 * and j - 1 with point_add, an even entry j from entry j / 2
                 * with point_double.
                 */
                for (j = 2; j <= 16; ++j) {
                    if (j & 1) {
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
                                  pre_comp[i][j][2], pre_comp[i][1][0],
                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
                                  pre_comp[i][j - 1][0],
                                  pre_comp[i][j - 1][1],
                                  pre_comp[i][j - 1][2]);
                    } else {
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
                                     pre_comp[i][j / 2][1],
                                     pre_comp[i][j / 2][2]);
                    }
                }
            }
        }
        /* the whole table is passed as one flat run of num_points * 17 points */
        if (mixed)
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
    }

    /* the scalar for the generator */
    if ((scalar != NULL) && (have_pre_comp)) {
        /* clear first: only num_bytes bytes are written below */
        memset(g_secret, 0, sizeof(g_secret));
        /* reduce scalar to 0 <= scalar < 2^521 */
        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
            /*
             * this is an unusual input, and we don't guarantee
             * constant-timeness
             */
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
                goto err;
            }
            num_bytes = BN_bn2bin(tmp_scalar, tmp);
        } else
            num_bytes = BN_bn2bin(scalar, tmp);
        flip_endian(g_secret, tmp, num_bytes);
        /* do the multiplication with generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  g_secret,
                  mixed, (const felem(*)[17][3])pre_comp,
                  (const felem(*)[3])g_pre_comp);
    } else
        /* do the multiplication without generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
    /* reduce the output to its unique minimal representation */
    felem_contract(x_in, x_out);
    felem_contract(y_in, y_out);
    felem_contract(z_in, z_out);
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
        (!felem_to_BN(z, z_in))) {
        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
        goto err;
    }
    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

    /* shared exit path: reached on success as well as on failure */
 err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    OPENSSL_free(secrets);
    OPENSSL_free(pre_comp);
    OPENSSL_free(tmp_felems);
    return ret;
}
2060
2061 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2062 {
2063     int ret = 0;
2064     NISTP521_PRE_COMP *pre = NULL;
2065     int i, j;
2066     BIGNUM *x, *y;
2067     EC_POINT *generator = NULL;
2068     felem tmp_felems[16];
2069 #ifndef FIPS_MODE
2070     BN_CTX *new_ctx = NULL;
2071 #endif
2072
2073     /* throw away old precomputation */
2074     EC_pre_comp_free(group);
2075
2076 #ifndef FIPS_MODE
2077     if (ctx == NULL)
2078         ctx = new_ctx = BN_CTX_new();
2079 #endif
2080     if (ctx == NULL)
2081         return 0;
2082
2083     BN_CTX_start(ctx);
2084     x = BN_CTX_get(ctx);
2085     y = BN_CTX_get(ctx);
2086     if (y == NULL)
2087         goto err;
2088     /* get the generator */
2089     if (group->generator == NULL)
2090         goto err;
2091     generator = EC_POINT_new(group);
2092     if (generator == NULL)
2093         goto err;
2094     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2095     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2096     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2097         goto err;
2098     if ((pre = nistp521_pre_comp_new()) == NULL)
2099         goto err;
2100     /*
2101      * if the generator is the standard one, use built-in precomputation
2102      */
2103     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2104         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2105         goto done;
2106     }
2107     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2108         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2109         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2110         goto err;
2111     /* compute 2^130*G, 2^260*G, 2^390*G */
2112     for (i = 1; i <= 4; i <<= 1) {
2113         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2114                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2115                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2116         for (j = 0; j < 129; ++j) {
2117             point_double(pre->g_pre_comp[2 * i][0],
2118                          pre->g_pre_comp[2 * i][1],
2119                          pre->g_pre_comp[2 * i][2],
2120                          pre->g_pre_comp[2 * i][0],
2121                          pre->g_pre_comp[2 * i][1],
2122                          pre->g_pre_comp[2 * i][2]);
2123         }
2124     }
2125     /* g_pre_comp[0] is the point at infinity */
2126     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2127     /* the remaining multiples */
2128     /* 2^130*G + 2^260*G */
2129     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2130               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2131               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2132               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2133               pre->g_pre_comp[2][2]);
2134     /* 2^130*G + 2^390*G */
2135     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2136               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2137               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2138               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2139               pre->g_pre_comp[2][2]);
2140     /* 2^260*G + 2^390*G */
2141     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2142               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2143               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2144               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2145               pre->g_pre_comp[4][2]);
2146     /* 2^130*G + 2^260*G + 2^390*G */
2147     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2148               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2149               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2150               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2151               pre->g_pre_comp[2][2]);
2152     for (i = 1; i < 8; ++i) {
2153         /* odd multiples: add G */
2154         point_add(pre->g_pre_comp[2 * i + 1][0],
2155                   pre->g_pre_comp[2 * i + 1][1],
2156                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2157                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2158                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2159                   pre->g_pre_comp[1][2]);
2160     }
2161     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2162
2163  done:
2164     SETPRECOMP(group, nistp521, pre);
2165     ret = 1;
2166     pre = NULL;
2167  err:
2168     BN_CTX_end(ctx);
2169     EC_POINT_free(generator);
2170 #ifndef FIPS_MODE
2171     BN_CTX_free(new_ctx);
2172 #endif
2173     EC_nistp521_pre_comp_free(pre);
2174     return ret;
2175 }
2176
2177 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2178 {
2179     return HAVEPRECOMP(group, nistp521);
2180 }
2181
2182 #endif