crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/opensslconf.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # ifndef OPENSSL_SYS_VMS
  40 #  include <stdint.h>
  41 # else
  42 #  include <inttypes.h>
  43 # endif
  44
  45 # include <string.h>
  46 # include <openssl/err.h>
  47 # include "ec_lcl.h"
  48
  49 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  50   /* even with gcc, the typedef won't work for 32-bit platforms */
  51 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  52                                  * platforms */
  53 # else
  54 #  error "Need GCC 3.1 or later to define type uint128_t"
  55 # endif
  56
  57 typedef uint8_t u8;
  58 typedef uint64_t u64;
  59 typedef int64_t s64;
  60
  61 /*
  62  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  63  * element of this field into 66 bytes where the most significant byte
  64  * contains only a single bit. We call this an felem_bytearray.
  65  */
  66
  67 typedef u8 felem_bytearray[66];
  68
  69 /*
  70  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  71  * These values are big-endian.
  72  */
  73 static const felem_bytearray nistp521_curve_params[5] = {
  74     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff},
  83     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  86      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  87      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  88      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  89      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  90      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  91      0xff, 0xfc},
  92     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  93      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  94      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  95      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  96      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  97      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  98      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  99      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
 100      0x3f, 0x00},
 101     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
 102      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
 103      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
 104      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
 105      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 106      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 107      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 108      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 109      0xbd, 0x66},
 110     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 111      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 112      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 113      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 114      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 115      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 116      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 117      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 118      0x66, 0x50}
 119 };
 120
 121 /*-
 122  * The representation of field elements.
 123  * ------------------------------------
 124  *
 125  * We represent field elements with nine values. These values are either 64 or
 126  * 128 bits and the field element represented is:
 127  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 128  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 129  * 58 bits apart, but are greater than 58 bits in length, the most significant
 130  * bits of each limb overlap with the least significant bits of the next.
 131  *
 132  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 133  * 'largefelem' */
 134
 135 # define NLIMBS 9
 136
 137 typedef uint64_t limb;
 138 typedef limb felem[NLIMBS];
 139 typedef uint128_t largefelem[NLIMBS];
 140
 141 static const limb bottom57bits = 0x1ffffffffffffff;
 142 static const limb bottom58bits = 0x3ffffffffffffff;
 143
 144 /*
 145  * bin66_to_felem takes a little-endian byte array and converts it into felem
 146  * form. This assumes that the CPU is little-endian.
 147  */
 148 static void bin66_to_felem(felem out, const u8 in[66])
 149 {
 150     out[0] = (*((limb *) & in[0])) & bottom58bits;
 151     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 152     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 153     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 154     out[4] = (*((limb *) & in[29])) & bottom58bits;
 155     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 156     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 157     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 158     out[8] = (*((limb *) & in[58])) & bottom57bits;
 159 }
 160
 161 /*
 162  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 163  * array. This assumes that the CPU is little-endian.
 164  */
 165 static void felem_to_bin66(u8 out[66], const felem in)
 166 {
 167     memset(out, 0, 66);
 168     (*((limb *) & out[0])) = in[0];
 169     (*((limb *) & out[7])) |= in[1] << 2;
 170     (*((limb *) & out[14])) |= in[2] << 4;
 171     (*((limb *) & out[21])) |= in[3] << 6;
 172     (*((limb *) & out[29])) = in[4];
 173     (*((limb *) & out[36])) |= in[5] << 2;
 174     (*((limb *) & out[43])) |= in[6] << 4;
 175     (*((limb *) & out[50])) |= in[7] << 6;
 176     (*((limb *) & out[58])) = in[8];
 177 }
 178
 179 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 180 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 181 {
 182     unsigned i;
 183     for (i = 0; i < len; ++i)
 184         out[i] = in[len - 1 - i];
 185 }
 186
 187 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 188 static int BN_to_felem(felem out, const BIGNUM *bn)
 189 {
 190     felem_bytearray b_in;
 191     felem_bytearray b_out;
 192     unsigned num_bytes;
 193
 194     /* BN_bn2bin eats leading zeroes */
 195     memset(b_out, 0, sizeof(b_out));
 196     num_bytes = BN_num_bytes(bn);
 197     if (num_bytes > sizeof b_out) {
 198         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 199         return 0;
 200     }
 201     if (BN_is_negative(bn)) {
 202         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 203         return 0;
 204     }
 205     num_bytes = BN_bn2bin(bn, b_in);
 206     flip_endian(b_out, b_in, num_bytes);
 207     bin66_to_felem(out, b_out);
 208     return 1;
 209 }
 210
 211 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 212 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 213 {
 214     felem_bytearray b_in, b_out;
 215     felem_to_bin66(b_in, in);
 216     flip_endian(b_out, b_in, sizeof b_out);
 217     return BN_bin2bn(b_out, sizeof b_out, out);
 218 }
 219
 220 /*-
 221  * Field operations
 222  * ----------------
 223  */
 224
 225 static void felem_one(felem out)
 226 {
 227     out[0] = 1;
 228     out[1] = 0;
 229     out[2] = 0;
 230     out[3] = 0;
 231     out[4] = 0;
 232     out[5] = 0;
 233     out[6] = 0;
 234     out[7] = 0;
 235     out[8] = 0;
 236 }
 237
 238 static void felem_assign(felem out, const felem in)
 239 {
 240     out[0] = in[0];
 241     out[1] = in[1];
 242     out[2] = in[2];
 243     out[3] = in[3];
 244     out[4] = in[4];
 245     out[5] = in[5];
 246     out[6] = in[6];
 247     out[7] = in[7];
 248     out[8] = in[8];
 249 }
 250
 251 /* felem_sum64 sets out = out + in. */
 252 static void felem_sum64(felem out, const felem in)
 253 {
 254     out[0] += in[0];
 255     out[1] += in[1];
 256     out[2] += in[2];
 257     out[3] += in[3];
 258     out[4] += in[4];
 259     out[5] += in[5];
 260     out[6] += in[6];
 261     out[7] += in[7];
 262     out[8] += in[8];
 263 }
 264
 265 /* felem_scalar sets out = in * scalar */
 266 static void felem_scalar(felem out, const felem in, limb scalar)
 267 {
 268     out[0] = in[0] * scalar;
 269     out[1] = in[1] * scalar;
 270     out[2] = in[2] * scalar;
 271     out[3] = in[3] * scalar;
 272     out[4] = in[4] * scalar;
 273     out[5] = in[5] * scalar;
 274     out[6] = in[6] * scalar;
 275     out[7] = in[7] * scalar;
 276     out[8] = in[8] * scalar;
 277 }
 278
 279 /* felem_scalar64 sets out = out * scalar */
 280 static void felem_scalar64(felem out, limb scalar)
 281 {
 282     out[0] *= scalar;
 283     out[1] *= scalar;
 284     out[2] *= scalar;
 285     out[3] *= scalar;
 286     out[4] *= scalar;
 287     out[5] *= scalar;
 288     out[6] *= scalar;
 289     out[7] *= scalar;
 290     out[8] *= scalar;
 291 }
 292
 293 /* felem_scalar128 sets out = out * scalar */
 294 static void felem_scalar128(largefelem out, limb scalar)
 295 {
 296     out[0] *= scalar;
 297     out[1] *= scalar;
 298     out[2] *= scalar;
 299     out[3] *= scalar;
 300     out[4] *= scalar;
 301     out[5] *= scalar;
 302     out[6] *= scalar;
 303     out[7] *= scalar;
 304     out[8] *= scalar;
 305 }
 306
 307 /*-
 308  * felem_neg sets |out| to |-in|
 309  * On entry:
 310  *   in[i] < 2^59 + 2^14
 311  * On exit:
 312  *   out[i] < 2^62
 313  */
 314 static void felem_neg(felem out, const felem in)
 315 {
 316     /* In order to prevent underflow, we subtract from 0 mod p. */
 317     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 318     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 319
 320     out[0] = two62m3 - in[0];
 321     out[1] = two62m2 - in[1];
 322     out[2] = two62m2 - in[2];
 323     out[3] = two62m2 - in[3];
 324     out[4] = two62m2 - in[4];
 325     out[5] = two62m2 - in[5];
 326     out[6] = two62m2 - in[6];
 327     out[7] = two62m2 - in[7];
 328     out[8] = two62m2 - in[8];
 329 }
 330
 331 /*-
 332  * felem_diff64 subtracts |in| from |out|
 333  * On entry:
 334  *   in[i] < 2^59 + 2^14
 335  * On exit:
 336  *   out[i] < out[i] + 2^62
 337  */
 338 static void felem_diff64(felem out, const felem in)
 339 {
 340     /*
 341      * In order to prevent underflow, we add 0 mod p before subtracting.
 342      */
 343     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 344     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 345
 346     out[0] += two62m3 - in[0];
 347     out[1] += two62m2 - in[1];
 348     out[2] += two62m2 - in[2];
 349     out[3] += two62m2 - in[3];
 350     out[4] += two62m2 - in[4];
 351     out[5] += two62m2 - in[5];
 352     out[6] += two62m2 - in[6];
 353     out[7] += two62m2 - in[7];
 354     out[8] += two62m2 - in[8];
 355 }
 356
 357 /*-
 358  * felem_diff_128_64 subtracts |in| from |out|
 359  * On entry:
 360  *   in[i] < 2^62 + 2^17
 361  * On exit:
 362  *   out[i] < out[i] + 2^63
 363  */
 364 static void felem_diff_128_64(largefelem out, const felem in)
 365 {
 366     /*
 367      * In order to prevent underflow, we add 0 mod p before subtracting.
 368      */
 369     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
 370     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
 371
 372     out[0] += two63m6 - in[0];
 373     out[1] += two63m5 - in[1];
 374     out[2] += two63m5 - in[2];
 375     out[3] += two63m5 - in[3];
 376     out[4] += two63m5 - in[4];
 377     out[5] += two63m5 - in[5];
 378     out[6] += two63m5 - in[6];
 379     out[7] += two63m5 - in[7];
 380     out[8] += two63m5 - in[8];
 381 }
 382
 383 /*-
 384  * felem_diff_128_64 subtracts |in| from |out|
 385  * On entry:
 386  *   in[i] < 2^126
 387  * On exit:
 388  *   out[i] < out[i] + 2^127 - 2^69
 389  */
 390 static void felem_diff128(largefelem out, const largefelem in)
 391 {
 392     /*
 393      * In order to prevent underflow, we add 0 mod p before subtracting.
 394      */
 395     static const uint128_t two127m70 =
 396         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 397     static const uint128_t two127m69 =
 398         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 399
 400     out[0] += (two127m70 - in[0]);
 401     out[1] += (two127m69 - in[1]);
 402     out[2] += (two127m69 - in[2]);
 403     out[3] += (two127m69 - in[3]);
 404     out[4] += (two127m69 - in[4]);
 405     out[5] += (two127m69 - in[5]);
 406     out[6] += (two127m69 - in[6]);
 407     out[7] += (two127m69 - in[7]);
 408     out[8] += (two127m69 - in[8]);
 409 }
 410
 411 /*-
 412  * felem_square sets |out| = |in|^2
 413  * On entry:
 414  *   in[i] < 2^62
 415  * On exit:
 416  *   out[i] < 17 * max(in[i]) * max(in[i])
 417  */
 418 static void felem_square(largefelem out, const felem in)
 419 {
 420     felem inx2, inx4;
 421     felem_scalar(inx2, in, 2);
 422     felem_scalar(inx4, in, 4);
 423
 424     /*-
 425      * We have many cases were we want to do
 426      *   in[x] * in[y] +
 427      *   in[y] * in[x]
 428      * This is obviously just
 429      *   2 * in[x] * in[y]
 430      * However, rather than do the doubling on the 128 bit result, we
 431      * double one of the inputs to the multiplication by reading from
 432      * |inx2|
 433      */
 434
 435     out[0] = ((uint128_t) in[0]) * in[0];
 436     out[1] = ((uint128_t) in[0]) * inx2[1];
 437     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 438     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 439     out[4] = ((uint128_t) in[0]) * inx2[4] +
 440              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 441     out[5] = ((uint128_t) in[0]) * inx2[5] +
 442              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 443     out[6] = ((uint128_t) in[0]) * inx2[6] +
 444              ((uint128_t) in[1]) * inx2[5] +
 445              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 446     out[7] = ((uint128_t) in[0]) * inx2[7] +
 447              ((uint128_t) in[1]) * inx2[6] +
 448              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 449     out[8] = ((uint128_t) in[0]) * inx2[8] +
 450              ((uint128_t) in[1]) * inx2[7] +
 451              ((uint128_t) in[2]) * inx2[6] +
 452              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 453
 454     /*
 455      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 456      * They correspond to locations one bit up from the limbs produced above
 457      * so we would have to multiply by two to align them. Again, rather than
 458      * operate on the 128-bit result, we double one of the inputs to the
 459      * multiplication. If we want to double for both this reason, and the
 460      * reason above, then we end up multiplying by four.
 461      */
 462
 463     /* 9 */
 464     out[0] += ((uint128_t) in[1]) * inx4[8] +
 465               ((uint128_t) in[2]) * inx4[7] +
 466               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 467
 468     /* 10 */
 469     out[1] += ((uint128_t) in[2]) * inx4[8] +
 470               ((uint128_t) in[3]) * inx4[7] +
 471               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 472
 473     /* 11 */
 474     out[2] += ((uint128_t) in[3]) * inx4[8] +
 475               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 476
 477     /* 12 */
 478     out[3] += ((uint128_t) in[4]) * inx4[8] +
 479               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 480
 481     /* 13 */
 482     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 483
 484     /* 14 */
 485     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 486
 487     /* 15 */
 488     out[6] += ((uint128_t) in[7]) * inx4[8];
 489
 490     /* 16 */
 491     out[7] += ((uint128_t) in[8]) * inx2[8];
 492 }
 493
 494 /*-
 495  * felem_mul sets |out| = |in1| * |in2|
 496  * On entry:
 497  *   in1[i] < 2^64
 498  *   in2[i] < 2^63
 499  * On exit:
 500  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 501  */
 502 static void felem_mul(largefelem out, const felem in1, const felem in2)
 503 {
 504     felem in2x2;
 505     felem_scalar(in2x2, in2, 2);
 506
 507     out[0] = ((uint128_t) in1[0]) * in2[0];
 508
 509     out[1] = ((uint128_t) in1[0]) * in2[1] +
 510              ((uint128_t) in1[1]) * in2[0];
 511
 512     out[2] = ((uint128_t) in1[0]) * in2[2] +
 513              ((uint128_t) in1[1]) * in2[1] +
 514              ((uint128_t) in1[2]) * in2[0];
 515
 516     out[3] = ((uint128_t) in1[0]) * in2[3] +
 517              ((uint128_t) in1[1]) * in2[2] +
 518              ((uint128_t) in1[2]) * in2[1] +
 519              ((uint128_t) in1[3]) * in2[0];
 520
 521     out[4] = ((uint128_t) in1[0]) * in2[4] +
 522              ((uint128_t) in1[1]) * in2[3] +
 523              ((uint128_t) in1[2]) * in2[2] +
 524              ((uint128_t) in1[3]) * in2[1] +
 525              ((uint128_t) in1[4]) * in2[0];
 526
 527     out[5] = ((uint128_t) in1[0]) * in2[5] +
 528              ((uint128_t) in1[1]) * in2[4] +
 529              ((uint128_t) in1[2]) * in2[3] +
 530              ((uint128_t) in1[3]) * in2[2] +
 531              ((uint128_t) in1[4]) * in2[1] +
 532              ((uint128_t) in1[5]) * in2[0];
 533
 534     out[6] = ((uint128_t) in1[0]) * in2[6] +
 535              ((uint128_t) in1[1]) * in2[5] +
 536              ((uint128_t) in1[2]) * in2[4] +
 537              ((uint128_t) in1[3]) * in2[3] +
 538              ((uint128_t) in1[4]) * in2[2] +
 539              ((uint128_t) in1[5]) * in2[1] +
 540              ((uint128_t) in1[6]) * in2[0];
 541
 542     out[7] = ((uint128_t) in1[0]) * in2[7] +
 543              ((uint128_t) in1[1]) * in2[6] +
 544              ((uint128_t) in1[2]) * in2[5] +
 545              ((uint128_t) in1[3]) * in2[4] +
 546              ((uint128_t) in1[4]) * in2[3] +
 547              ((uint128_t) in1[5]) * in2[2] +
 548              ((uint128_t) in1[6]) * in2[1] +
 549              ((uint128_t) in1[7]) * in2[0];
 550
 551     out[8] = ((uint128_t) in1[0]) * in2[8] +
 552              ((uint128_t) in1[1]) * in2[7] +
 553              ((uint128_t) in1[2]) * in2[6] +
 554              ((uint128_t) in1[3]) * in2[5] +
 555              ((uint128_t) in1[4]) * in2[4] +
 556              ((uint128_t) in1[5]) * in2[3] +
 557              ((uint128_t) in1[6]) * in2[2] +
 558              ((uint128_t) in1[7]) * in2[1] +
 559              ((uint128_t) in1[8]) * in2[0];
 560
 561     /* See comment in felem_square about the use of in2x2 here */
 562
 563     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 564               ((uint128_t) in1[2]) * in2x2[7] +
 565               ((uint128_t) in1[3]) * in2x2[6] +
 566               ((uint128_t) in1[4]) * in2x2[5] +
 567               ((uint128_t) in1[5]) * in2x2[4] +
 568               ((uint128_t) in1[6]) * in2x2[3] +
 569               ((uint128_t) in1[7]) * in2x2[2] +
 570               ((uint128_t) in1[8]) * in2x2[1];
 571
 572     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 573               ((uint128_t) in1[3]) * in2x2[7] +
 574               ((uint128_t) in1[4]) * in2x2[6] +
 575               ((uint128_t) in1[5]) * in2x2[5] +
 576               ((uint128_t) in1[6]) * in2x2[4] +
 577               ((uint128_t) in1[7]) * in2x2[3] +
 578               ((uint128_t) in1[8]) * in2x2[2];
 579
 580     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 581               ((uint128_t) in1[4]) * in2x2[7] +
 582               ((uint128_t) in1[5]) * in2x2[6] +
 583               ((uint128_t) in1[6]) * in2x2[5] +
 584               ((uint128_t) in1[7]) * in2x2[4] +
 585               ((uint128_t) in1[8]) * in2x2[3];
 586
 587     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 588               ((uint128_t) in1[5]) * in2x2[7] +
 589               ((uint128_t) in1[6]) * in2x2[6] +
 590               ((uint128_t) in1[7]) * in2x2[5] +
 591               ((uint128_t) in1[8]) * in2x2[4];
 592
 593     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 594               ((uint128_t) in1[6]) * in2x2[7] +
 595               ((uint128_t) in1[7]) * in2x2[6] +
 596               ((uint128_t) in1[8]) * in2x2[5];
 597
 598     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 599               ((uint128_t) in1[7]) * in2x2[7] +
 600               ((uint128_t) in1[8]) * in2x2[6];
 601
 602     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 603               ((uint128_t) in1[8]) * in2x2[7];
 604
 605     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 606 }
 607
 608 static const limb bottom52bits = 0xfffffffffffff;
 609
 610 /*-
 611  * felem_reduce converts a largefelem to an felem.
 612  * On entry:
 613  *   in[i] < 2^128
 614  * On exit:
 615  *   out[i] < 2^59 + 2^14
 616  */
 617 static void felem_reduce(felem out, const largefelem in)
 618 {
 619     u64 overflow1, overflow2;
 620
 621     out[0] = ((limb) in[0]) & bottom58bits;
 622     out[1] = ((limb) in[1]) & bottom58bits;
 623     out[2] = ((limb) in[2]) & bottom58bits;
 624     out[3] = ((limb) in[3]) & bottom58bits;
 625     out[4] = ((limb) in[4]) & bottom58bits;
 626     out[5] = ((limb) in[5]) & bottom58bits;
 627     out[6] = ((limb) in[6]) & bottom58bits;
 628     out[7] = ((limb) in[7]) & bottom58bits;
 629     out[8] = ((limb) in[8]) & bottom58bits;
 630
 631     /* out[i] < 2^58 */
 632
 633     out[1] += ((limb) in[0]) >> 58;
 634     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 635     /*-
 636      * out[1] < 2^58 + 2^6 + 2^58
 637      *        = 2^59 + 2^6
 638      */
 639     out[2] += ((limb) (in[0] >> 64)) >> 52;
 640
 641     out[2] += ((limb) in[1]) >> 58;
 642     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 643     out[3] += ((limb) (in[1] >> 64)) >> 52;
 644
 645     out[3] += ((limb) in[2]) >> 58;
 646     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 647     out[4] += ((limb) (in[2] >> 64)) >> 52;
 648
 649     out[4] += ((limb) in[3]) >> 58;
 650     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 651     out[5] += ((limb) (in[3] >> 64)) >> 52;
 652
 653     out[5] += ((limb) in[4]) >> 58;
 654     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 655     out[6] += ((limb) (in[4] >> 64)) >> 52;
 656
 657     out[6] += ((limb) in[5]) >> 58;
 658     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 659     out[7] += ((limb) (in[5] >> 64)) >> 52;
 660
 661     out[7] += ((limb) in[6]) >> 58;
 662     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 663     out[8] += ((limb) (in[6] >> 64)) >> 52;
 664
 665     out[8] += ((limb) in[7]) >> 58;
 666     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 667     /*-
 668      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 669      *            < 2^59 + 2^13
 670      */
 671     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 672
 673     overflow1 += ((limb) in[8]) >> 58;
 674     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 675     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 676
 677     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 678     overflow2 <<= 1;            /* overflow2 < 2^13 */
 679
 680     out[0] += overflow1;        /* out[0] < 2^60 */
 681     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 682
 683     out[1] += out[0] >> 58;
 684     out[0] &= bottom58bits;
 685     /*-
 686      * out[0] < 2^58
 687      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 688      *        < 2^59 + 2^14
 689      */
 690 }
 691
 692 static void felem_square_reduce(felem out, const felem in)
 693 {
 694     largefelem tmp;
 695     felem_square(tmp, in);
 696     felem_reduce(out, tmp);
 697 }
 698
 699 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 700 {
 701     largefelem tmp;
 702     felem_mul(tmp, in1, in2);
 703     felem_reduce(out, tmp);
 704 }
 705
 706 /*-
 707  * felem_inv calculates |out| = |in|^{-1}
 708  *
 709  * Based on Fermat's Little Theorem:
 710  *   a^p = a (mod p)
 711  *   a^{p-1} = 1 (mod p)
 712  *   a^{p-2} = a^{-1} (mod p)
 713  */
 714 static void felem_inv(felem out, const felem in)
 715 {
 716     felem ftmp, ftmp2, ftmp3, ftmp4;
 717     largefelem tmp;
 718     unsigned i;
 719
 720     felem_square(tmp, in);
 721     felem_reduce(ftmp, tmp);    /* 2^1 */
 722     felem_mul(tmp, in, ftmp);
 723     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 724     felem_assign(ftmp2, ftmp);
 725     felem_square(tmp, ftmp);
 726     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 727     felem_mul(tmp, in, ftmp);
 728     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 729     felem_square(tmp, ftmp);
 730     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 731
 732     felem_square(tmp, ftmp2);
 733     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 734     felem_square(tmp, ftmp3);
 735     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 736     felem_mul(tmp, ftmp3, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 738
 739     felem_assign(ftmp2, ftmp3);
 740     felem_square(tmp, ftmp3);
 741     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 742     felem_square(tmp, ftmp3);
 743     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 744     felem_square(tmp, ftmp3);
 745     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 746     felem_square(tmp, ftmp3);
 747     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 748     felem_assign(ftmp4, ftmp3);
 749     felem_mul(tmp, ftmp3, ftmp);
 750     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 751     felem_square(tmp, ftmp4);
 752     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 753     felem_mul(tmp, ftmp3, ftmp2);
 754     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 755     felem_assign(ftmp2, ftmp3);
 756
 757     for (i = 0; i < 8; i++) {
 758         felem_square(tmp, ftmp3);
 759         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 760     }
 761     felem_mul(tmp, ftmp3, ftmp2);
 762     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 763     felem_assign(ftmp2, ftmp3);
 764
 765     for (i = 0; i < 16; i++) {
 766         felem_square(tmp, ftmp3);
 767         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 768     }
 769     felem_mul(tmp, ftmp3, ftmp2);
 770     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 771     felem_assign(ftmp2, ftmp3);
 772
 773     for (i = 0; i < 32; i++) {
 774         felem_square(tmp, ftmp3);
 775         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 776     }
 777     felem_mul(tmp, ftmp3, ftmp2);
 778     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 779     felem_assign(ftmp2, ftmp3);
 780
 781     for (i = 0; i < 64; i++) {
 782         felem_square(tmp, ftmp3);
 783         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 784     }
 785     felem_mul(tmp, ftmp3, ftmp2);
 786     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 787     felem_assign(ftmp2, ftmp3);
 788
 789     for (i = 0; i < 128; i++) {
 790         felem_square(tmp, ftmp3);
 791         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 792     }
 793     felem_mul(tmp, ftmp3, ftmp2);
 794     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 795     felem_assign(ftmp2, ftmp3);
 796
 797     for (i = 0; i < 256; i++) {
 798         felem_square(tmp, ftmp3);
 799         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 800     }
 801     felem_mul(tmp, ftmp3, ftmp2);
 802     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 803
 804     for (i = 0; i < 9; i++) {
 805         felem_square(tmp, ftmp3);
 806         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 807     }
 808     felem_mul(tmp, ftmp3, ftmp4);
 809     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 810     felem_mul(tmp, ftmp3, in);
 811     felem_reduce(out, tmp);     /* 2^512 - 3 */
 812 }
 813
 814 /* This is 2^521-1, expressed as an felem */
 815 static const felem kPrime = {
 816     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 817     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 818     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 819 };
 820
 821 /*-
 822  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 823  * otherwise.
 824  * On entry:
 825  *   in[i] < 2^59 + 2^14
 826  */
 827 static limb felem_is_zero(const felem in)
 828 {
 829     felem ftmp;
 830     limb is_zero, is_p;
 831     felem_assign(ftmp, in);
 832
 833     ftmp[0] += ftmp[8] >> 57;
 834     ftmp[8] &= bottom57bits;
 835     /* ftmp[8] < 2^57 */
 836     ftmp[1] += ftmp[0] >> 58;
 837     ftmp[0] &= bottom58bits;
 838     ftmp[2] += ftmp[1] >> 58;
 839     ftmp[1] &= bottom58bits;
 840     ftmp[3] += ftmp[2] >> 58;
 841     ftmp[2] &= bottom58bits;
 842     ftmp[4] += ftmp[3] >> 58;
 843     ftmp[3] &= bottom58bits;
 844     ftmp[5] += ftmp[4] >> 58;
 845     ftmp[4] &= bottom58bits;
 846     ftmp[6] += ftmp[5] >> 58;
 847     ftmp[5] &= bottom58bits;
 848     ftmp[7] += ftmp[6] >> 58;
 849     ftmp[6] &= bottom58bits;
 850     ftmp[8] += ftmp[7] >> 58;
 851     ftmp[7] &= bottom58bits;
 852     /* ftmp[8] < 2^57 + 4 */
 853
 854     /*
 855      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 856      * than our bound for ftmp[8]. Therefore we only have to check if the
 857      * zero is zero or 2^521-1.
 858      */
 859
 860     is_zero = 0;
 861     is_zero |= ftmp[0];
 862     is_zero |= ftmp[1];
 863     is_zero |= ftmp[2];
 864     is_zero |= ftmp[3];
 865     is_zero |= ftmp[4];
 866     is_zero |= ftmp[5];
 867     is_zero |= ftmp[6];
 868     is_zero |= ftmp[7];
 869     is_zero |= ftmp[8];
 870
 871     is_zero--;
 872     /*
 873      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 874      * can be set is if is_zero was 0 before the decrement.
 875      */
 876     is_zero = ((s64) is_zero) >> 63;
 877
 878     is_p = ftmp[0] ^ kPrime[0];
 879     is_p |= ftmp[1] ^ kPrime[1];
 880     is_p |= ftmp[2] ^ kPrime[2];
 881     is_p |= ftmp[3] ^ kPrime[3];
 882     is_p |= ftmp[4] ^ kPrime[4];
 883     is_p |= ftmp[5] ^ kPrime[5];
 884     is_p |= ftmp[6] ^ kPrime[6];
 885     is_p |= ftmp[7] ^ kPrime[7];
 886     is_p |= ftmp[8] ^ kPrime[8];
 887
 888     is_p--;
 889     is_p = ((s64) is_p) >> 63;
 890
 891     is_zero |= is_p;
 892     return is_zero;
 893 }
 894
 895 static int felem_is_zero_int(const felem in)
 896 {
 897     return (int)(felem_is_zero(in) & ((limb) 1));
 898 }
 899
 900 /*-
 901  * felem_contract converts |in| to its unique, minimal representation.
 902  * On entry:
 903  *   in[i] < 2^59 + 2^14
 904  */
 905 static void felem_contract(felem out, const felem in)
 906 {
 907     limb is_p, is_greater, sign;
 908     static const limb two58 = ((limb) 1) << 58;
 909
 910     felem_assign(out, in);
 911
 912     out[0] += out[8] >> 57;
 913     out[8] &= bottom57bits;
 914     /* out[8] < 2^57 */
 915     out[1] += out[0] >> 58;
 916     out[0] &= bottom58bits;
 917     out[2] += out[1] >> 58;
 918     out[1] &= bottom58bits;
 919     out[3] += out[2] >> 58;
 920     out[2] &= bottom58bits;
 921     out[4] += out[3] >> 58;
 922     out[3] &= bottom58bits;
 923     out[5] += out[4] >> 58;
 924     out[4] &= bottom58bits;
 925     out[6] += out[5] >> 58;
 926     out[5] &= bottom58bits;
 927     out[7] += out[6] >> 58;
 928     out[6] &= bottom58bits;
 929     out[8] += out[7] >> 58;
 930     out[7] &= bottom58bits;
 931     /* out[8] < 2^57 + 4 */
 932
 933     /*
 934      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 935      * out. See the comments in felem_is_zero regarding why we don't test for
 936      * other multiples of the prime.
 937      */
 938
 939     /*
 940      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 941      */
 942
 943     is_p = out[0] ^ kPrime[0];
 944     is_p |= out[1] ^ kPrime[1];
 945     is_p |= out[2] ^ kPrime[2];
 946     is_p |= out[3] ^ kPrime[3];
 947     is_p |= out[4] ^ kPrime[4];
 948     is_p |= out[5] ^ kPrime[5];
 949     is_p |= out[6] ^ kPrime[6];
 950     is_p |= out[7] ^ kPrime[7];
 951     is_p |= out[8] ^ kPrime[8];
 952
 953     is_p--;
 954     is_p &= is_p << 32;
 955     is_p &= is_p << 16;
 956     is_p &= is_p << 8;
 957     is_p &= is_p << 4;
 958     is_p &= is_p << 2;
 959     is_p &= is_p << 1;
 960     is_p = ((s64) is_p) >> 63;
 961     is_p = ~is_p;
 962
 963     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 964
 965     out[0] &= is_p;
 966     out[1] &= is_p;
 967     out[2] &= is_p;
 968     out[3] &= is_p;
 969     out[4] &= is_p;
 970     out[5] &= is_p;
 971     out[6] &= is_p;
 972     out[7] &= is_p;
 973     out[8] &= is_p;
 974
 975     /*
 976      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 977      * 57 is greater than zero as (2^521-1) + x >= 2^522
 978      */
 979     is_greater = out[8] >> 57;
 980     is_greater |= is_greater << 32;
 981     is_greater |= is_greater << 16;
 982     is_greater |= is_greater << 8;
 983     is_greater |= is_greater << 4;
 984     is_greater |= is_greater << 2;
 985     is_greater |= is_greater << 1;
 986     is_greater = ((s64) is_greater) >> 63;
 987
 988     out[0] -= kPrime[0] & is_greater;
 989     out[1] -= kPrime[1] & is_greater;
 990     out[2] -= kPrime[2] & is_greater;
 991     out[3] -= kPrime[3] & is_greater;
 992     out[4] -= kPrime[4] & is_greater;
 993     out[5] -= kPrime[5] & is_greater;
 994     out[6] -= kPrime[6] & is_greater;
 995     out[7] -= kPrime[7] & is_greater;
 996     out[8] -= kPrime[8] & is_greater;
 997
 998     /* Eliminate negative coefficients */
 999     sign = -(out[0] >> 63);
1000     out[0] += (two58 & sign);
1001     out[1] -= (1 & sign);
1002     sign = -(out[1] >> 63);
1003     out[1] += (two58 & sign);
1004     out[2] -= (1 & sign);
1005     sign = -(out[2] >> 63);
1006     out[2] += (two58 & sign);
1007     out[3] -= (1 & sign);
1008     sign = -(out[3] >> 63);
1009     out[3] += (two58 & sign);
1010     out[4] -= (1 & sign);
1011     sign = -(out[4] >> 63);
1012     out[4] += (two58 & sign);
1013     out[5] -= (1 & sign);
1014     sign = -(out[0] >> 63);
1015     out[5] += (two58 & sign);
1016     out[6] -= (1 & sign);
1017     sign = -(out[6] >> 63);
1018     out[6] += (two58 & sign);
1019     out[7] -= (1 & sign);
1020     sign = -(out[7] >> 63);
1021     out[7] += (two58 & sign);
1022     out[8] -= (1 & sign);
1023     sign = -(out[5] >> 63);
1024     out[5] += (two58 & sign);
1025     out[6] -= (1 & sign);
1026     sign = -(out[6] >> 63);
1027     out[6] += (two58 & sign);
1028     out[7] -= (1 & sign);
1029     sign = -(out[7] >> 63);
1030     out[7] += (two58 & sign);
1031     out[8] -= (1 & sign);
1032 }
1033
1034 /*-
1035  * Group operations
1036  * ----------------
1037  *
1038  * Building on top of the field operations we have the operations on the
1039  * elliptic curve group itself. Points on the curve are represented in Jacobian
1040  * coordinates */
1041
1042 /*-
1043  * point_double calculates 2*(x_in, y_in, z_in)
1044  *
1045  * The method is taken from:
1046  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1047  *
1048  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1049  * while x_out == y_in is not (maybe this works, but it's not tested). */
1050 static void
1051 point_double(felem x_out, felem y_out, felem z_out,
1052              const felem x_in, const felem y_in, const felem z_in)
1053 {
1054     largefelem tmp, tmp2;
1055     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1056
1057     felem_assign(ftmp, x_in);
1058     felem_assign(ftmp2, x_in);
1059
1060     /* delta = z^2 */
1061     felem_square(tmp, z_in);
1062     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1063
1064     /* gamma = y^2 */
1065     felem_square(tmp, y_in);
1066     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1067
1068     /* beta = x*gamma */
1069     felem_mul(tmp, x_in, gamma);
1070     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1071
1072     /* alpha = 3*(x-delta)*(x+delta) */
1073     felem_diff64(ftmp, delta);
1074     /* ftmp[i] < 2^61 */
1075     felem_sum64(ftmp2, delta);
1076     /* ftmp2[i] < 2^60 + 2^15 */
1077     felem_scalar64(ftmp2, 3);
1078     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1079     felem_mul(tmp, ftmp, ftmp2);
1080     /*-
1081      * tmp[i] < 17(3*2^121 + 3*2^76)
1082      *        = 61*2^121 + 61*2^76
1083      *        < 64*2^121 + 64*2^76
1084      *        = 2^127 + 2^82
1085      *        < 2^128
1086      */
1087     felem_reduce(alpha, tmp);
1088
1089     /* x' = alpha^2 - 8*beta */
1090     felem_square(tmp, alpha);
1091     /*
1092      * tmp[i] < 17*2^120 < 2^125
1093      */
1094     felem_assign(ftmp, beta);
1095     felem_scalar64(ftmp, 8);
1096     /* ftmp[i] < 2^62 + 2^17 */
1097     felem_diff_128_64(tmp, ftmp);
1098     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1099     felem_reduce(x_out, tmp);
1100
1101     /* z' = (y + z)^2 - gamma - delta */
1102     felem_sum64(delta, gamma);
1103     /* delta[i] < 2^60 + 2^15 */
1104     felem_assign(ftmp, y_in);
1105     felem_sum64(ftmp, z_in);
1106     /* ftmp[i] < 2^60 + 2^15 */
1107     felem_square(tmp, ftmp);
1108     /*
1109      * tmp[i] < 17(2^122) < 2^127
1110      */
1111     felem_diff_128_64(tmp, delta);
1112     /* tmp[i] < 2^127 + 2^63 */
1113     felem_reduce(z_out, tmp);
1114
1115     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1116     felem_scalar64(beta, 4);
1117     /* beta[i] < 2^61 + 2^16 */
1118     felem_diff64(beta, x_out);
1119     /* beta[i] < 2^61 + 2^60 + 2^16 */
1120     felem_mul(tmp, alpha, beta);
1121     /*-
1122      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1123      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1124      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1125      *        < 2^128
1126      */
1127     felem_square(tmp2, gamma);
1128     /*-
1129      * tmp2[i] < 17*(2^59 + 2^14)^2
1130      *         = 17*(2^118 + 2^74 + 2^28)
1131      */
1132     felem_scalar128(tmp2, 8);
1133     /*-
1134      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1135      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1136      *         < 2^126
1137      */
1138     felem_diff128(tmp, tmp2);
1139     /*-
1140      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1141      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1142      *          2^74 + 2^69 + 2^34 + 2^30
1143      *        < 2^128
1144      */
1145     felem_reduce(y_out, tmp);
1146 }
1147
1148 /* copy_conditional copies in to out iff mask is all ones. */
1149 static void copy_conditional(felem out, const felem in, limb mask)
1150 {
1151     unsigned i;
1152     for (i = 0; i < NLIMBS; ++i) {
1153         const limb tmp = mask & (in[i] ^ out[i]);
1154         out[i] ^= tmp;
1155     }
1156 }
1157
1158 /*-
1159  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1160  *
1161  * The method is taken from
1162  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1163  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1164  *
1165  * This function includes a branch for checking whether the two input points
1166  * are equal (while not equal to the point at infinity). This case never
1167  * happens during single point multiplication, so there is no timing leak for
1168  * ECDH or ECDSA signing. */
1169 static void point_add(felem x3, felem y3, felem z3,
1170                       const felem x1, const felem y1, const felem z1,
1171                       const int mixed, const felem x2, const felem y2,
1172                       const felem z2)
1173 {
1174     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1175     largefelem tmp, tmp2;
1176     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1177
1178     z1_is_zero = felem_is_zero(z1);
1179     z2_is_zero = felem_is_zero(z2);
1180
1181     /* ftmp = z1z1 = z1**2 */
1182     felem_square(tmp, z1);
1183     felem_reduce(ftmp, tmp);
1184
1185     if (!mixed) {
1186         /* ftmp2 = z2z2 = z2**2 */
1187         felem_square(tmp, z2);
1188         felem_reduce(ftmp2, tmp);
1189
1190         /* u1 = ftmp3 = x1*z2z2 */
1191         felem_mul(tmp, x1, ftmp2);
1192         felem_reduce(ftmp3, tmp);
1193
1194         /* ftmp5 = z1 + z2 */
1195         felem_assign(ftmp5, z1);
1196         felem_sum64(ftmp5, z2);
1197         /* ftmp5[i] < 2^61 */
1198
1199         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1200         felem_square(tmp, ftmp5);
1201         /* tmp[i] < 17*2^122 */
1202         felem_diff_128_64(tmp, ftmp);
1203         /* tmp[i] < 17*2^122 + 2^63 */
1204         felem_diff_128_64(tmp, ftmp2);
1205         /* tmp[i] < 17*2^122 + 2^64 */
1206         felem_reduce(ftmp5, tmp);
1207
1208         /* ftmp2 = z2 * z2z2 */
1209         felem_mul(tmp, ftmp2, z2);
1210         felem_reduce(ftmp2, tmp);
1211
1212         /* s1 = ftmp6 = y1 * z2**3 */
1213         felem_mul(tmp, y1, ftmp2);
1214         felem_reduce(ftmp6, tmp);
1215     } else {
1216         /*
1217          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1218          */
1219
1220         /* u1 = ftmp3 = x1*z2z2 */
1221         felem_assign(ftmp3, x1);
1222
1223         /* ftmp5 = 2*z1z2 */
1224         felem_scalar(ftmp5, z1, 2);
1225
1226         /* s1 = ftmp6 = y1 * z2**3 */
1227         felem_assign(ftmp6, y1);
1228     }
1229
1230     /* u2 = x2*z1z1 */
1231     felem_mul(tmp, x2, ftmp);
1232     /* tmp[i] < 17*2^120 */
1233
1234     /* h = ftmp4 = u2 - u1 */
1235     felem_diff_128_64(tmp, ftmp3);
1236     /* tmp[i] < 17*2^120 + 2^63 */
1237     felem_reduce(ftmp4, tmp);
1238
1239     x_equal = felem_is_zero(ftmp4);
1240
1241     /* z_out = ftmp5 * h */
1242     felem_mul(tmp, ftmp5, ftmp4);
1243     felem_reduce(z_out, tmp);
1244
1245     /* ftmp = z1 * z1z1 */
1246     felem_mul(tmp, ftmp, z1);
1247     felem_reduce(ftmp, tmp);
1248
1249     /* s2 = tmp = y2 * z1**3 */
1250     felem_mul(tmp, y2, ftmp);
1251     /* tmp[i] < 17*2^120 */
1252
1253     /* r = ftmp5 = (s2 - s1)*2 */
1254     felem_diff_128_64(tmp, ftmp6);
1255     /* tmp[i] < 17*2^120 + 2^63 */
1256     felem_reduce(ftmp5, tmp);
1257     y_equal = felem_is_zero(ftmp5);
1258     felem_scalar64(ftmp5, 2);
1259     /* ftmp5[i] < 2^61 */
1260
1261     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1262         point_double(x3, y3, z3, x1, y1, z1);
1263         return;
1264     }
1265
1266     /* I = ftmp = (2h)**2 */
1267     felem_assign(ftmp, ftmp4);
1268     felem_scalar64(ftmp, 2);
1269     /* ftmp[i] < 2^61 */
1270     felem_square(tmp, ftmp);
1271     /* tmp[i] < 17*2^122 */
1272     felem_reduce(ftmp, tmp);
1273
1274     /* J = ftmp2 = h * I */
1275     felem_mul(tmp, ftmp4, ftmp);
1276     felem_reduce(ftmp2, tmp);
1277
1278     /* V = ftmp4 = U1 * I */
1279     felem_mul(tmp, ftmp3, ftmp);
1280     felem_reduce(ftmp4, tmp);
1281
1282     /* x_out = r**2 - J - 2V */
1283     felem_square(tmp, ftmp5);
1284     /* tmp[i] < 17*2^122 */
1285     felem_diff_128_64(tmp, ftmp2);
1286     /* tmp[i] < 17*2^122 + 2^63 */
1287     felem_assign(ftmp3, ftmp4);
1288     felem_scalar64(ftmp4, 2);
1289     /* ftmp4[i] < 2^61 */
1290     felem_diff_128_64(tmp, ftmp4);
1291     /* tmp[i] < 17*2^122 + 2^64 */
1292     felem_reduce(x_out, tmp);
1293
1294     /* y_out = r(V-x_out) - 2 * s1 * J */
1295     felem_diff64(ftmp3, x_out);
1296     /*
1297      * ftmp3[i] < 2^60 + 2^60 = 2^61
1298      */
1299     felem_mul(tmp, ftmp5, ftmp3);
1300     /* tmp[i] < 17*2^122 */
1301     felem_mul(tmp2, ftmp6, ftmp2);
1302     /* tmp2[i] < 17*2^120 */
1303     felem_scalar128(tmp2, 2);
1304     /* tmp2[i] < 17*2^121 */
1305     felem_diff128(tmp, tmp2);
1306         /*-
1307          * tmp[i] < 2^127 - 2^69 + 17*2^122
1308          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1309          *        < 2^127
1310          */
1311     felem_reduce(y_out, tmp);
1312
1313     copy_conditional(x_out, x2, z1_is_zero);
1314     copy_conditional(x_out, x1, z2_is_zero);
1315     copy_conditional(y_out, y2, z1_is_zero);
1316     copy_conditional(y_out, y1, z2_is_zero);
1317     copy_conditional(z_out, z2, z1_is_zero);
1318     copy_conditional(z_out, z1, z2_is_zero);
1319     felem_assign(x3, x_out);
1320     felem_assign(y3, y_out);
1321     felem_assign(z3, z_out);
1322 }
1323
1324 /*-
1325  * Base point pre computation
1326  * --------------------------
1327  *
1328  * Two different sorts of precomputed tables are used in the following code.
1329  * Each contain various points on the curve, where each point is three field
1330  * elements (x, y, z).
1331  *
1332  * For the base point table, z is usually 1 (0 for the point at infinity).
1333  * This table has 16 elements:
1334  * index | bits    | point
1335  * ------+---------+------------------------------
1336  *     0 | 0 0 0 0 | 0G
1337  *     1 | 0 0 0 1 | 1G
1338  *     2 | 0 0 1 0 | 2^130G
1339  *     3 | 0 0 1 1 | (2^130 + 1)G
1340  *     4 | 0 1 0 0 | 2^260G
1341  *     5 | 0 1 0 1 | (2^260 + 1)G
1342  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1343  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1344  *     8 | 1 0 0 0 | 2^390G
1345  *     9 | 1 0 0 1 | (2^390 + 1)G
1346  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1347  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1348  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1349  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1350  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1351  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1352  *
1353  * The reason for this is so that we can clock bits into four different
1354  * locations when doing simple scalar multiplies against the base point.
1355  *
1356  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1357
1358 /* gmul is the table of precomputed base points */
1359 static const felem gmul[16][3] = {
1360 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1361  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1362  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1363 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1364   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1365   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1366  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1367   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1368   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1369  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1370 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1371   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1372   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1373  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1374   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1375   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1376  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1377 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1378   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1379   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1380  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1381   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1382   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1383  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1384 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1385   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1386   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1387  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1388   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1389   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1390  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1391 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1392   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1393   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1394  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1395   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1396   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1397  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1398 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1399   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1400   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1401  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1402   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1403   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1404  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1405 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1406   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1407   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1408  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1409   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1410   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1411  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1412 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1413   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1414   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1415  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1416   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1417   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1418  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1419 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1420   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1421   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1422  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1423   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1424   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1425  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1426 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1427   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1428   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1429  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1430   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1431   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1432  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1433 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1434   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1435   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1436  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1437   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1438   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1439  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1440 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1441   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1442   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1443  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1444   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1445   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1446  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1447 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1448   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1449   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1450  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1451   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1452   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1453  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1454 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1455   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1456   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1457  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1458   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1459   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1460  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1461 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1462   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1463   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1464  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1465   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1466   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1467  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1468 };
1469
1470 /*
1471  * select_point selects the |idx|th point from a precomputation table and
1472  * copies it to out.
1473  */
1474  /* pre_comp below is of the size provided in |size| */
1475 static void select_point(const limb idx, unsigned int size,
1476                          const felem pre_comp[][3], felem out[3])
1477 {
1478     unsigned i, j;
1479     limb *outlimbs = &out[0][0];
1480
1481     memset(out, 0, sizeof(*out) * 3);
1482
1483     for (i = 0; i < size; i++) {
1484         const limb *inlimbs = &pre_comp[i][0][0];
1485         limb mask = i ^ idx;
1486         mask |= mask >> 4;
1487         mask |= mask >> 2;
1488         mask |= mask >> 1;
1489         mask &= 1;
1490         mask--;
1491         for (j = 0; j < NLIMBS * 3; j++)
1492             outlimbs[j] |= inlimbs[j] & mask;
1493     }
1494 }
1495
1496 /* get_bit returns the |i|th bit in |in| */
1497 static char get_bit(const felem_bytearray in, int i)
1498 {
1499     if (i < 0)
1500         return 0;
1501     return (in[i >> 3] >> (i & 7)) & 1;
1502 }
1503
1504 /*
1505  * Interleaved point multiplication using precomputed point multiples: The
1506  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1507  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1508  * generator, using certain (large) precomputed multiples in g_pre_comp.
1509  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1510  */
1511 static void batch_mul(felem x_out, felem y_out, felem z_out,
1512                       const felem_bytearray scalars[],
1513                       const unsigned num_points, const u8 *g_scalar,
1514                       const int mixed, const felem pre_comp[][17][3],
1515                       const felem g_pre_comp[16][3])
1516 {
1517     int i, skip;
1518     unsigned num, gen_mul = (g_scalar != NULL);
1519     felem nq[3], tmp[4];
1520     limb bits;
1521     u8 sign, digit;
1522
1523     /* set nq to the point at infinity */
1524     memset(nq, 0, sizeof(nq));
1525
1526     /*
1527      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1528      * of the generator (last quarter of rounds) and additions of other
1529      * points multiples (every 5th round).
1530      */
1531     skip = 1;                   /* save two point operations in the first
1532                                  * round */
1533     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1534         /* double */
1535         if (!skip)
1536             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1537
1538         /* add multiples of the generator */
1539         if (gen_mul && (i <= 130)) {
1540             bits = get_bit(g_scalar, i + 390) << 3;
1541             if (i < 130) {
1542                 bits |= get_bit(g_scalar, i + 260) << 2;
1543                 bits |= get_bit(g_scalar, i + 130) << 1;
1544                 bits |= get_bit(g_scalar, i);
1545             }
1546             /* select the point to add, in constant time */
1547             select_point(bits, 16, g_pre_comp, tmp);
1548             if (!skip) {
1549                 /* The 1 argument below is for "mixed" */
1550                 point_add(nq[0], nq[1], nq[2],
1551                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1552             } else {
1553                 memcpy(nq, tmp, 3 * sizeof(felem));
1554                 skip = 0;
1555             }
1556         }
1557
1558         /* do other additions every 5 doublings */
1559         if (num_points && (i % 5 == 0)) {
1560             /* loop over all scalars */
1561             for (num = 0; num < num_points; ++num) {
1562                 bits = get_bit(scalars[num], i + 4) << 5;
1563                 bits |= get_bit(scalars[num], i + 3) << 4;
1564                 bits |= get_bit(scalars[num], i + 2) << 3;
1565                 bits |= get_bit(scalars[num], i + 1) << 2;
1566                 bits |= get_bit(scalars[num], i) << 1;
1567                 bits |= get_bit(scalars[num], i - 1);
1568                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1569
1570                 /*
1571                  * select the point to add or subtract, in constant time
1572                  */
1573                 select_point(digit, 17, pre_comp[num], tmp);
1574                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1575                                             * point */
1576                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1577
1578                 if (!skip) {
1579                     point_add(nq[0], nq[1], nq[2],
1580                               nq[0], nq[1], nq[2],
1581                               mixed, tmp[0], tmp[1], tmp[2]);
1582                 } else {
1583                     memcpy(nq, tmp, 3 * sizeof(felem));
1584                     skip = 0;
1585                 }
1586             }
1587         }
1588     }
1589     felem_assign(x_out, nq[0]);
1590     felem_assign(y_out, nq[1]);
1591     felem_assign(z_out, nq[2]);
1592 }
1593
1594 /* Precomputation for the group generator. */
1595 struct nistp521_pre_comp_st {
1596     felem g_pre_comp[16][3];
1597     int references;
1598     CRYPTO_RWLOCK *lock;
1599 };
1600
1601 const EC_METHOD *EC_GFp_nistp521_method(void)
1602 {
1603     static const EC_METHOD ret = {
1604         EC_FLAGS_DEFAULT_OCT,
1605         NID_X9_62_prime_field,
1606         ec_GFp_nistp521_group_init,
1607         ec_GFp_simple_group_finish,
1608         ec_GFp_simple_group_clear_finish,
1609         ec_GFp_nist_group_copy,
1610         ec_GFp_nistp521_group_set_curve,
1611         ec_GFp_simple_group_get_curve,
1612         ec_GFp_simple_group_get_degree,
1613         ec_group_simple_order_bits,
1614         ec_GFp_simple_group_check_discriminant,
1615         ec_GFp_simple_point_init,
1616         ec_GFp_simple_point_finish,
1617         ec_GFp_simple_point_clear_finish,
1618         ec_GFp_simple_point_copy,
1619         ec_GFp_simple_point_set_to_infinity,
1620         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1621         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1622         ec_GFp_simple_point_set_affine_coordinates,
1623         ec_GFp_nistp521_point_get_affine_coordinates,
1624         0 /* point_set_compressed_coordinates */ ,
1625         0 /* point2oct */ ,
1626         0 /* oct2point */ ,
1627         ec_GFp_simple_add,
1628         ec_GFp_simple_dbl,
1629         ec_GFp_simple_invert,
1630         ec_GFp_simple_is_at_infinity,
1631         ec_GFp_simple_is_on_curve,
1632         ec_GFp_simple_cmp,
1633         ec_GFp_simple_make_affine,
1634         ec_GFp_simple_points_make_affine,
1635         ec_GFp_nistp521_points_mul,
1636         ec_GFp_nistp521_precompute_mult,
1637         ec_GFp_nistp521_have_precompute_mult,
1638         ec_GFp_nist_field_mul,
1639         ec_GFp_nist_field_sqr,
1640         0 /* field_div */ ,
1641         0 /* field_encode */ ,
1642         0 /* field_decode */ ,
1643         0,                      /* field_set_to_one */
1644         ec_key_simple_priv2oct,
1645         ec_key_simple_oct2priv,
1646         0, /* set private */
1647         ec_key_simple_generate_key,
1648         ec_key_simple_check_key,
1649         ec_key_simple_generate_public_key,
1650         0, /* keycopy */
1651         0, /* keyfinish */
1652         ecdh_simple_compute_key
1653     };
1654
1655     return &ret;
1656 }
1657
1658 /******************************************************************************/
1659 /*
1660  * FUNCTIONS TO MANAGE PRECOMPUTATION
1661  */
1662
1663 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1664 {
1665     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1666
1667     if (ret == NULL) {
1668         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1669         return ret;
1670     }
1671
1672     ret->references = 1;
1673
1674     ret->lock = CRYPTO_THREAD_lock_new();
1675     if (ret->lock == NULL) {
1676         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1677         OPENSSL_free(ret);
1678         return NULL;
1679     }
1680     return ret;
1681 }
1682
1683 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1684 {
1685     int i;
1686     if (p != NULL)
1687         CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
1688     return p;
1689 }
1690
1691 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1692 {
1693     int i;
1694
1695     if (p == NULL)
1696         return;
1697
1698     CRYPTO_atomic_add(&p->references, -1, &i, p->lock);
1699     REF_PRINT_COUNT("EC_nistp521", x);
1700     if (i > 0)
1701         return;
1702     REF_ASSERT_ISNT(i < 0);
1703
1704     CRYPTO_THREAD_lock_free(p->lock);
1705     OPENSSL_free(p);
1706 }
1707
1708 /******************************************************************************/
1709 /*
1710  * OPENSSL EC_METHOD FUNCTIONS
1711  */
1712
1713 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1714 {
1715     int ret;
1716     ret = ec_GFp_simple_group_init(group);
1717     group->a_is_minus3 = 1;
1718     return ret;
1719 }
1720
1721 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1722                                     const BIGNUM *a, const BIGNUM *b,
1723                                     BN_CTX *ctx)
1724 {
1725     int ret = 0;
1726     BN_CTX *new_ctx = NULL;
1727     BIGNUM *curve_p, *curve_a, *curve_b;
1728
1729     if (ctx == NULL)
1730         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1731             return 0;
1732     BN_CTX_start(ctx);
1733     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1734         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1735         ((curve_b = BN_CTX_get(ctx)) == NULL))
1736         goto err;
1737     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1738     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1739     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1740     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1741         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1742               EC_R_WRONG_CURVE_PARAMETERS);
1743         goto err;
1744     }
1745     group->field_mod_func = BN_nist_mod_521;
1746     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1747  err:
1748     BN_CTX_end(ctx);
1749     BN_CTX_free(new_ctx);
1750     return ret;
1751 }
1752
1753 /*
1754  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1755  * (X/Z^2, Y/Z^3)
1756  */
1757 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1758                                                  const EC_POINT *point,
1759                                                  BIGNUM *x, BIGNUM *y,
1760                                                  BN_CTX *ctx)
1761 {
1762     felem z1, z2, x_in, y_in, x_out, y_out;
1763     largefelem tmp;
1764
1765     if (EC_POINT_is_at_infinity(group, point)) {
1766         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1767               EC_R_POINT_AT_INFINITY);
1768         return 0;
1769     }
1770     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1771         (!BN_to_felem(z1, point->Z)))
1772         return 0;
1773     felem_inv(z2, z1);
1774     felem_square(tmp, z2);
1775     felem_reduce(z1, tmp);
1776     felem_mul(tmp, x_in, z1);
1777     felem_reduce(x_in, tmp);
1778     felem_contract(x_out, x_in);
1779     if (x != NULL) {
1780         if (!felem_to_BN(x, x_out)) {
1781             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1782                   ERR_R_BN_LIB);
1783             return 0;
1784         }
1785     }
1786     felem_mul(tmp, z1, z2);
1787     felem_reduce(z1, tmp);
1788     felem_mul(tmp, y_in, z1);
1789     felem_reduce(y_in, tmp);
1790     felem_contract(y_out, y_in);
1791     if (y != NULL) {
1792         if (!felem_to_BN(y, y_out)) {
1793             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1794                   ERR_R_BN_LIB);
1795             return 0;
1796         }
1797     }
1798     return 1;
1799 }
1800
1801 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1802 static void make_points_affine(size_t num, felem points[][3],
1803                                felem tmp_felems[])
1804 {
1805     /*
1806      * Runs in constant time, unless an input is the point at infinity (which
1807      * normally shouldn't happen).
1808      */
1809     ec_GFp_nistp_points_make_affine_internal(num,
1810                                              points,
1811                                              sizeof(felem),
1812                                              tmp_felems,
1813                                              (void (*)(void *))felem_one,
1814                                              (int (*)(const void *))
1815                                              felem_is_zero_int,
1816                                              (void (*)(void *, const void *))
1817                                              felem_assign,
1818                                              (void (*)(void *, const void *))
1819                                              felem_square_reduce, (void (*)
1820                                                                    (void *,
1821                                                                     const void
1822                                                                     *,
1823                                                                     const void
1824                                                                     *))
1825                                              felem_mul_reduce,
1826                                              (void (*)(void *, const void *))
1827                                              felem_inv,
1828                                              (void (*)(void *, const void *))
1829                                              felem_contract);
1830 }
1831
1832 /*
1833  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1834  * values Result is stored in r (r can equal one of the inputs).
1835  */
1836 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1837                                const BIGNUM *scalar, size_t num,
1838                                const EC_POINT *points[],
1839                                const BIGNUM *scalars[], BN_CTX *ctx)
1840 {
1841     int ret = 0;
1842     int j;
1843     int mixed = 0;
1844     BN_CTX *new_ctx = NULL;
1845     BIGNUM *x, *y, *z, *tmp_scalar;
1846     felem_bytearray g_secret;
1847     felem_bytearray *secrets = NULL;
1848     felem (*pre_comp)[17][3] = NULL;
1849     felem *tmp_felems = NULL;
1850     felem_bytearray tmp;
1851     unsigned i, num_bytes;
1852     int have_pre_comp = 0;
1853     size_t num_points = num;
1854     felem x_in, y_in, z_in, x_out, y_out, z_out;
1855     NISTP521_PRE_COMP *pre = NULL;
1856     felem(*g_pre_comp)[3] = NULL;
1857     EC_POINT *generator = NULL;
1858     const EC_POINT *p = NULL;
1859     const BIGNUM *p_scalar = NULL;
1860
1861     if (ctx == NULL)
1862         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1863             return 0;
1864     BN_CTX_start(ctx);
1865     if (((x = BN_CTX_get(ctx)) == NULL) ||
1866         ((y = BN_CTX_get(ctx)) == NULL) ||
1867         ((z = BN_CTX_get(ctx)) == NULL) ||
1868         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1869         goto err;
1870
1871     if (scalar != NULL) {
1872         pre = group->pre_comp.nistp521;
1873         if (pre)
1874             /* we have precomputation, try to use it */
1875             g_pre_comp = &pre->g_pre_comp[0];
1876         else
1877             /* try to use the standard precomputation */
1878             g_pre_comp = (felem(*)[3]) gmul;
1879         generator = EC_POINT_new(group);
1880         if (generator == NULL)
1881             goto err;
1882         /* get the generator from precomputation */
1883         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1884             !felem_to_BN(y, g_pre_comp[1][1]) ||
1885             !felem_to_BN(z, g_pre_comp[1][2])) {
1886             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1887             goto err;
1888         }
1889         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1890                                                       generator, x, y, z,
1891                                                       ctx))
1892             goto err;
1893         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1894             /* precomputation matches generator */
1895             have_pre_comp = 1;
1896         else
1897             /*
1898              * we don't have valid precomputation: treat the generator as a
1899              * random point
1900              */
1901             num_points++;
1902     }
1903
1904     if (num_points > 0) {
1905         if (num_points >= 2) {
1906             /*
1907              * unless we precompute multiples for just one point, converting
1908              * those into affine form is time well spent
1909              */
1910             mixed = 1;
1911         }
1912         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1913         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1914         if (mixed)
1915             tmp_felems =
1916                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1917         if ((secrets == NULL) || (pre_comp == NULL)
1918             || (mixed && (tmp_felems == NULL))) {
1919             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1920             goto err;
1921         }
1922
1923         /*
1924          * we treat NULL scalars as 0, and NULL points as points at infinity,
1925          * i.e., they contribute nothing to the linear combination
1926          */
1927         for (i = 0; i < num_points; ++i) {
1928             if (i == num)
1929                 /*
1930                  * we didn't have a valid precomputation, so we pick the
1931                  * generator
1932                  */
1933             {
1934                 p = EC_GROUP_get0_generator(group);
1935                 p_scalar = scalar;
1936             } else
1937                 /* the i^th point */
1938             {
1939                 p = points[i];
1940                 p_scalar = scalars[i];
1941             }
1942             if ((p_scalar != NULL) && (p != NULL)) {
1943                 /* reduce scalar to 0 <= scalar < 2^521 */
1944                 if ((BN_num_bits(p_scalar) > 521)
1945                     || (BN_is_negative(p_scalar))) {
1946                     /*
1947                      * this is an unusual input, and we don't guarantee
1948                      * constant-timeness
1949                      */
1950                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1951                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1952                         goto err;
1953                     }
1954                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1955                 } else
1956                     num_bytes = BN_bn2bin(p_scalar, tmp);
1957                 flip_endian(secrets[i], tmp, num_bytes);
1958                 /* precompute multiples */
1959                 if ((!BN_to_felem(x_out, p->X)) ||
1960                     (!BN_to_felem(y_out, p->Y)) ||
1961                     (!BN_to_felem(z_out, p->Z)))
1962                     goto err;
1963                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1964                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1965                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1966                 for (j = 2; j <= 16; ++j) {
1967                     if (j & 1) {
1968                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1969                                   pre_comp[i][j][2], pre_comp[i][1][0],
1970                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1971                                   pre_comp[i][j - 1][0],
1972                                   pre_comp[i][j - 1][1],
1973                                   pre_comp[i][j - 1][2]);
1974                     } else {
1975                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1976                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1977                                      pre_comp[i][j / 2][1],
1978                                      pre_comp[i][j / 2][2]);
1979                     }
1980                 }
1981             }
1982         }
1983         if (mixed)
1984             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1985     }
1986
1987     /* the scalar for the generator */
1988     if ((scalar != NULL) && (have_pre_comp)) {
1989         memset(g_secret, 0, sizeof(g_secret));
1990         /* reduce scalar to 0 <= scalar < 2^521 */
1991         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1992             /*
1993              * this is an unusual input, and we don't guarantee
1994              * constant-timeness
1995              */
1996             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1997                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1998                 goto err;
1999             }
2000             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2001         } else
2002             num_bytes = BN_bn2bin(scalar, tmp);
2003         flip_endian(g_secret, tmp, num_bytes);
2004         /* do the multiplication with generator precomputation */
2005         batch_mul(x_out, y_out, z_out,
2006                   (const felem_bytearray(*))secrets, num_points,
2007                   g_secret,
2008                   mixed, (const felem(*)[17][3])pre_comp,
2009                   (const felem(*)[3])g_pre_comp);
2010     } else
2011         /* do the multiplication without generator precomputation */
2012         batch_mul(x_out, y_out, z_out,
2013                   (const felem_bytearray(*))secrets, num_points,
2014                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2015     /* reduce the output to its unique minimal representation */
2016     felem_contract(x_in, x_out);
2017     felem_contract(y_in, y_out);
2018     felem_contract(z_in, z_out);
2019     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2020         (!felem_to_BN(z, z_in))) {
2021         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2022         goto err;
2023     }
2024     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2025
2026  err:
2027     BN_CTX_end(ctx);
2028     EC_POINT_free(generator);
2029     BN_CTX_free(new_ctx);
2030     OPENSSL_free(secrets);
2031     OPENSSL_free(pre_comp);
2032     OPENSSL_free(tmp_felems);
2033     return ret;
2034 }
2035
2036 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2037 {
2038     int ret = 0;
2039     NISTP521_PRE_COMP *pre = NULL;
2040     int i, j;
2041     BN_CTX *new_ctx = NULL;
2042     BIGNUM *x, *y;
2043     EC_POINT *generator = NULL;
2044     felem tmp_felems[16];
2045
2046     /* throw away old precomputation */
2047     EC_pre_comp_free(group);
2048     if (ctx == NULL)
2049         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2050             return 0;
2051     BN_CTX_start(ctx);
2052     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2053         goto err;
2054     /* get the generator */
2055     if (group->generator == NULL)
2056         goto err;
2057     generator = EC_POINT_new(group);
2058     if (generator == NULL)
2059         goto err;
2060     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2061     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2062     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2063         goto err;
2064     if ((pre = nistp521_pre_comp_new()) == NULL)
2065         goto err;
2066     /*
2067      * if the generator is the standard one, use built-in precomputation
2068      */
2069     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2070         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2071         goto done;
2072     }
2073     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2074         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2075         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2076         goto err;
2077     /* compute 2^130*G, 2^260*G, 2^390*G */
2078     for (i = 1; i <= 4; i <<= 1) {
2079         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2080                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2081                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2082         for (j = 0; j < 129; ++j) {
2083             point_double(pre->g_pre_comp[2 * i][0],
2084                          pre->g_pre_comp[2 * i][1],
2085                          pre->g_pre_comp[2 * i][2],
2086                          pre->g_pre_comp[2 * i][0],
2087                          pre->g_pre_comp[2 * i][1],
2088                          pre->g_pre_comp[2 * i][2]);
2089         }
2090     }
2091     /* g_pre_comp[0] is the point at infinity */
2092     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2093     /* the remaining multiples */
2094     /* 2^130*G + 2^260*G */
2095     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2096               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2097               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2098               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2099               pre->g_pre_comp[2][2]);
2100     /* 2^130*G + 2^390*G */
2101     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2102               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2103               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2104               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2105               pre->g_pre_comp[2][2]);
2106     /* 2^260*G + 2^390*G */
2107     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2108               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2109               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2110               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2111               pre->g_pre_comp[4][2]);
2112     /* 2^130*G + 2^260*G + 2^390*G */
2113     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2114               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2115               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2116               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2117               pre->g_pre_comp[2][2]);
2118     for (i = 1; i < 8; ++i) {
2119         /* odd multiples: add G */
2120         point_add(pre->g_pre_comp[2 * i + 1][0],
2121                   pre->g_pre_comp[2 * i + 1][1],
2122                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2123                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2124                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2125                   pre->g_pre_comp[1][2]);
2126     }
2127     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2128
2129  done:
2130     SETPRECOMP(group, nistp521, pre);
2131     ret = 1;
2132     pre = NULL;
2133  err:
2134     BN_CTX_end(ctx);
2135     EC_POINT_free(generator);
2136     BN_CTX_free(new_ctx);
2137     EC_nistp521_pre_comp_free(pre);
2138     return ret;
2139 }
2140
2141 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2142 {
2143     return HAVEPRECOMP(group, nistp521);
2144 }
2145
2146 #endif