crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # ifndef OPENSSL_SYS_VMS
  33 #  include <stdint.h>
  34 # else
  35 #  include <inttypes.h>
  36 # endif
  37
  38 # include <string.h>
  39 # include <openssl/err.h>
  40 # include "ec_lcl.h"
  41
  42 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  43   /* even with gcc, the typedef won't work for 32-bit platforms */
  44 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  45                                  * platforms */
  46 # else
  47 #  error "Need GCC 3.1 or later to define type uint128_t"
  48 # endif
  49
  50 typedef uint8_t u8;
  51 typedef uint64_t u64;
  52 typedef int64_t s64;
  53
  54 /*
  55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  56  * element of this field into 66 bytes where the most significant byte
  57  * contains only a single bit. We call this an felem_bytearray.
  58  */
  59
  60 typedef u8 felem_bytearray[66];
  61
  62 /*
  63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  64  * These values are big-endian.
  65  */
  66 static const felem_bytearray nistp521_curve_params[5] = {
  67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff},
  76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xfc},
  85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  93      0x3f, 0x00},
  94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 102      0xbd, 0x66},
 103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 111      0x66, 0x50}
 112 };
 113
 114 /*-
 115  * The representation of field elements.
 116  * ------------------------------------
 117  *
 118  * We represent field elements with nine values. These values are either 64 or
 119  * 128 bits and the field element represented is:
 120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 122  * 58 bits apart, but are greater than 58 bits in length, the most significant
 123  * bits of each limb overlap with the least significant bits of the next.
 124  *
 125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 126  * 'largefelem' */
 127
 128 # define NLIMBS 9
 129
 130 typedef uint64_t limb;
 131 typedef limb felem[NLIMBS];
 132 typedef uint128_t largefelem[NLIMBS];
 133
 134 static const limb bottom57bits = 0x1ffffffffffffff;
 135 static const limb bottom58bits = 0x3ffffffffffffff;
 136
 137 /*
 138  * bin66_to_felem takes a little-endian byte array and converts it into felem
 139  * form. This assumes that the CPU is little-endian.
 140  */
 141 static void bin66_to_felem(felem out, const u8 in[66])
 142 {
 143     out[0] = (*((limb *) & in[0])) & bottom58bits;
 144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 147     out[4] = (*((limb *) & in[29])) & bottom58bits;
 148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 151     out[8] = (*((limb *) & in[58])) & bottom57bits;
 152 }
 153
 154 /*
 155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 156  * array. This assumes that the CPU is little-endian.
 157  */
 158 static void felem_to_bin66(u8 out[66], const felem in)
 159 {
 160     memset(out, 0, 66);
 161     (*((limb *) & out[0])) = in[0];
 162     (*((limb *) & out[7])) |= in[1] << 2;
 163     (*((limb *) & out[14])) |= in[2] << 4;
 164     (*((limb *) & out[21])) |= in[3] << 6;
 165     (*((limb *) & out[29])) = in[4];
 166     (*((limb *) & out[36])) |= in[5] << 2;
 167     (*((limb *) & out[43])) |= in[6] << 4;
 168     (*((limb *) & out[50])) |= in[7] << 6;
 169     (*((limb *) & out[58])) = in[8];
 170 }
 171
 172 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 173 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 174 {
 175     unsigned i;
 176     for (i = 0; i < len; ++i)
 177         out[i] = in[len - 1 - i];
 178 }
 179
 180 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 181 static int BN_to_felem(felem out, const BIGNUM *bn)
 182 {
 183     felem_bytearray b_in;
 184     felem_bytearray b_out;
 185     unsigned num_bytes;
 186
 187     /* BN_bn2bin eats leading zeroes */
 188     memset(b_out, 0, sizeof b_out);
 189     num_bytes = BN_num_bytes(bn);
 190     if (num_bytes > sizeof b_out) {
 191         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 192         return 0;
 193     }
 194     if (BN_is_negative(bn)) {
 195         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 196         return 0;
 197     }
 198     num_bytes = BN_bn2bin(bn, b_in);
 199     flip_endian(b_out, b_in, num_bytes);
 200     bin66_to_felem(out, b_out);
 201     return 1;
 202 }
 203
 204 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 205 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 206 {
 207     felem_bytearray b_in, b_out;
 208     felem_to_bin66(b_in, in);
 209     flip_endian(b_out, b_in, sizeof b_out);
 210     return BN_bin2bn(b_out, sizeof b_out, out);
 211 }
 212
 213 /*-
 214  * Field operations
 215  * ----------------
 216  */
 217
 218 static void felem_one(felem out)
 219 {
 220     out[0] = 1;
 221     out[1] = 0;
 222     out[2] = 0;
 223     out[3] = 0;
 224     out[4] = 0;
 225     out[5] = 0;
 226     out[6] = 0;
 227     out[7] = 0;
 228     out[8] = 0;
 229 }
 230
 231 static void felem_assign(felem out, const felem in)
 232 {
 233     out[0] = in[0];
 234     out[1] = in[1];
 235     out[2] = in[2];
 236     out[3] = in[3];
 237     out[4] = in[4];
 238     out[5] = in[5];
 239     out[6] = in[6];
 240     out[7] = in[7];
 241     out[8] = in[8];
 242 }
 243
 244 /* felem_sum64 sets out = out + in. */
 245 static void felem_sum64(felem out, const felem in)
 246 {
 247     out[0] += in[0];
 248     out[1] += in[1];
 249     out[2] += in[2];
 250     out[3] += in[3];
 251     out[4] += in[4];
 252     out[5] += in[5];
 253     out[6] += in[6];
 254     out[7] += in[7];
 255     out[8] += in[8];
 256 }
 257
 258 /* felem_scalar sets out = in * scalar */
 259 static void felem_scalar(felem out, const felem in, limb scalar)
 260 {
 261     out[0] = in[0] * scalar;
 262     out[1] = in[1] * scalar;
 263     out[2] = in[2] * scalar;
 264     out[3] = in[3] * scalar;
 265     out[4] = in[4] * scalar;
 266     out[5] = in[5] * scalar;
 267     out[6] = in[6] * scalar;
 268     out[7] = in[7] * scalar;
 269     out[8] = in[8] * scalar;
 270 }
 271
 272 /* felem_scalar64 sets out = out * scalar */
 273 static void felem_scalar64(felem out, limb scalar)
 274 {
 275     out[0] *= scalar;
 276     out[1] *= scalar;
 277     out[2] *= scalar;
 278     out[3] *= scalar;
 279     out[4] *= scalar;
 280     out[5] *= scalar;
 281     out[6] *= scalar;
 282     out[7] *= scalar;
 283     out[8] *= scalar;
 284 }
 285
 286 /* felem_scalar128 sets out = out * scalar */
 287 static void felem_scalar128(largefelem out, limb scalar)
 288 {
 289     out[0] *= scalar;
 290     out[1] *= scalar;
 291     out[2] *= scalar;
 292     out[3] *= scalar;
 293     out[4] *= scalar;
 294     out[5] *= scalar;
 295     out[6] *= scalar;
 296     out[7] *= scalar;
 297     out[8] *= scalar;
 298 }
 299
 300 /*-
 301  * felem_neg sets |out| to |-in|
 302  * On entry:
 303  *   in[i] < 2^59 + 2^14
 304  * On exit:
 305  *   out[i] < 2^62
 306  */
 307 static void felem_neg(felem out, const felem in)
 308 {
 309     /* In order to prevent underflow, we subtract from 0 mod p. */
 310     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 311     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 312
 313     out[0] = two62m3 - in[0];
 314     out[1] = two62m2 - in[1];
 315     out[2] = two62m2 - in[2];
 316     out[3] = two62m2 - in[3];
 317     out[4] = two62m2 - in[4];
 318     out[5] = two62m2 - in[5];
 319     out[6] = two62m2 - in[6];
 320     out[7] = two62m2 - in[7];
 321     out[8] = two62m2 - in[8];
 322 }
 323
 324 /*-
 325  * felem_diff64 subtracts |in| from |out|
 326  * On entry:
 327  *   in[i] < 2^59 + 2^14
 328  * On exit:
 329  *   out[i] < out[i] + 2^62
 330  */
 331 static void felem_diff64(felem out, const felem in)
 332 {
 333     /*
 334      * In order to prevent underflow, we add 0 mod p before subtracting.
 335      */
 336     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 337     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 338
 339     out[0] += two62m3 - in[0];
 340     out[1] += two62m2 - in[1];
 341     out[2] += two62m2 - in[2];
 342     out[3] += two62m2 - in[3];
 343     out[4] += two62m2 - in[4];
 344     out[5] += two62m2 - in[5];
 345     out[6] += two62m2 - in[6];
 346     out[7] += two62m2 - in[7];
 347     out[8] += two62m2 - in[8];
 348 }
 349
 350 /*-
 351  * felem_diff_128_64 subtracts |in| from |out|
 352  * On entry:
 353  *   in[i] < 2^62 + 2^17
 354  * On exit:
 355  *   out[i] < out[i] + 2^63
 356  */
 357 static void felem_diff_128_64(largefelem out, const felem in)
 358 {
 359     /*
 360      * In order to prevent underflow, we add 0 mod p before subtracting.
 361      */
 362     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
 363     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
 364
 365     out[0] += two63m6 - in[0];
 366     out[1] += two63m5 - in[1];
 367     out[2] += two63m5 - in[2];
 368     out[3] += two63m5 - in[3];
 369     out[4] += two63m5 - in[4];
 370     out[5] += two63m5 - in[5];
 371     out[6] += two63m5 - in[6];
 372     out[7] += two63m5 - in[7];
 373     out[8] += two63m5 - in[8];
 374 }
 375
 376 /*-
 377  * felem_diff_128_64 subtracts |in| from |out|
 378  * On entry:
 379  *   in[i] < 2^126
 380  * On exit:
 381  *   out[i] < out[i] + 2^127 - 2^69
 382  */
 383 static void felem_diff128(largefelem out, const largefelem in)
 384 {
 385     /*
 386      * In order to prevent underflow, we add 0 mod p before subtracting.
 387      */
 388     static const uint128_t two127m70 =
 389         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 390     static const uint128_t two127m69 =
 391         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 392
 393     out[0] += (two127m70 - in[0]);
 394     out[1] += (two127m69 - in[1]);
 395     out[2] += (two127m69 - in[2]);
 396     out[3] += (two127m69 - in[3]);
 397     out[4] += (two127m69 - in[4]);
 398     out[5] += (two127m69 - in[5]);
 399     out[6] += (two127m69 - in[6]);
 400     out[7] += (two127m69 - in[7]);
 401     out[8] += (two127m69 - in[8]);
 402 }
 403
 404 /*-
 405  * felem_square sets |out| = |in|^2
 406  * On entry:
 407  *   in[i] < 2^62
 408  * On exit:
 409  *   out[i] < 17 * max(in[i]) * max(in[i])
 410  */
 411 static void felem_square(largefelem out, const felem in)
 412 {
 413     felem inx2, inx4;
 414     felem_scalar(inx2, in, 2);
 415     felem_scalar(inx4, in, 4);
 416
 417         /*-
 418          * We have many cases were we want to do
 419          *   in[x] * in[y] +
 420          *   in[y] * in[x]
 421          * This is obviously just
 422          *   2 * in[x] * in[y]
 423          * However, rather than do the doubling on the 128 bit result, we
 424          * double one of the inputs to the multiplication by reading from
 425          * |inx2| */
 426
 427     out[0] = ((uint128_t) in[0]) * in[0];
 428     out[1] = ((uint128_t) in[0]) * inx2[1];
 429     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 430     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 431     out[4] = ((uint128_t) in[0]) * inx2[4] +
 432         ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 433     out[5] = ((uint128_t) in[0]) * inx2[5] +
 434         ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 435     out[6] = ((uint128_t) in[0]) * inx2[6] +
 436         ((uint128_t) in[1]) * inx2[5] +
 437         ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 438     out[7] = ((uint128_t) in[0]) * inx2[7] +
 439         ((uint128_t) in[1]) * inx2[6] +
 440         ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 441     out[8] = ((uint128_t) in[0]) * inx2[8] +
 442         ((uint128_t) in[1]) * inx2[7] +
 443         ((uint128_t) in[2]) * inx2[6] +
 444         ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 445
 446     /*
 447      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 448      * They correspond to locations one bit up from the limbs produced above
 449      * so we would have to multiply by two to align them. Again, rather than
 450      * operate on the 128-bit result, we double one of the inputs to the
 451      * multiplication. If we want to double for both this reason, and the
 452      * reason above, then we end up multiplying by four.
 453      */
 454
 455     /* 9 */
 456     out[0] += ((uint128_t) in[1]) * inx4[8] +
 457         ((uint128_t) in[2]) * inx4[7] +
 458         ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 459
 460     /* 10 */
 461     out[1] += ((uint128_t) in[2]) * inx4[8] +
 462         ((uint128_t) in[3]) * inx4[7] +
 463         ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 464
 465     /* 11 */
 466     out[2] += ((uint128_t) in[3]) * inx4[8] +
 467         ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 468
 469     /* 12 */
 470     out[3] += ((uint128_t) in[4]) * inx4[8] +
 471         ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 472
 473     /* 13 */
 474     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 475
 476     /* 14 */
 477     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 478
 479     /* 15 */
 480     out[6] += ((uint128_t) in[7]) * inx4[8];
 481
 482     /* 16 */
 483     out[7] += ((uint128_t) in[8]) * inx2[8];
 484 }
 485
 486 /*-
 487  * felem_mul sets |out| = |in1| * |in2|
 488  * On entry:
 489  *   in1[i] < 2^64
 490  *   in2[i] < 2^63
 491  * On exit:
 492  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 493  */
 494 static void felem_mul(largefelem out, const felem in1, const felem in2)
 495 {
 496     felem in2x2;
 497     felem_scalar(in2x2, in2, 2);
 498
 499     out[0] = ((uint128_t) in1[0]) * in2[0];
 500
 501     out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
 502
 503     out[2] = ((uint128_t) in1[0]) * in2[2] +
 504         ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
 505
 506     out[3] = ((uint128_t) in1[0]) * in2[3] +
 507         ((uint128_t) in1[1]) * in2[2] +
 508         ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
 509
 510     out[4] = ((uint128_t) in1[0]) * in2[4] +
 511         ((uint128_t) in1[1]) * in2[3] +
 512         ((uint128_t) in1[2]) * in2[2] +
 513         ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
 514
 515     out[5] = ((uint128_t) in1[0]) * in2[5] +
 516         ((uint128_t) in1[1]) * in2[4] +
 517         ((uint128_t) in1[2]) * in2[3] +
 518         ((uint128_t) in1[3]) * in2[2] +
 519         ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
 520
 521     out[6] = ((uint128_t) in1[0]) * in2[6] +
 522         ((uint128_t) in1[1]) * in2[5] +
 523         ((uint128_t) in1[2]) * in2[4] +
 524         ((uint128_t) in1[3]) * in2[3] +
 525         ((uint128_t) in1[4]) * in2[2] +
 526         ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
 527
 528     out[7] = ((uint128_t) in1[0]) * in2[7] +
 529         ((uint128_t) in1[1]) * in2[6] +
 530         ((uint128_t) in1[2]) * in2[5] +
 531         ((uint128_t) in1[3]) * in2[4] +
 532         ((uint128_t) in1[4]) * in2[3] +
 533         ((uint128_t) in1[5]) * in2[2] +
 534         ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
 535
 536     out[8] = ((uint128_t) in1[0]) * in2[8] +
 537         ((uint128_t) in1[1]) * in2[7] +
 538         ((uint128_t) in1[2]) * in2[6] +
 539         ((uint128_t) in1[3]) * in2[5] +
 540         ((uint128_t) in1[4]) * in2[4] +
 541         ((uint128_t) in1[5]) * in2[3] +
 542         ((uint128_t) in1[6]) * in2[2] +
 543         ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
 544
 545     /* See comment in felem_square about the use of in2x2 here */
 546
 547     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 548         ((uint128_t) in1[2]) * in2x2[7] +
 549         ((uint128_t) in1[3]) * in2x2[6] +
 550         ((uint128_t) in1[4]) * in2x2[5] +
 551         ((uint128_t) in1[5]) * in2x2[4] +
 552         ((uint128_t) in1[6]) * in2x2[3] +
 553         ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
 554
 555     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 556         ((uint128_t) in1[3]) * in2x2[7] +
 557         ((uint128_t) in1[4]) * in2x2[6] +
 558         ((uint128_t) in1[5]) * in2x2[5] +
 559         ((uint128_t) in1[6]) * in2x2[4] +
 560         ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
 561
 562     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 563         ((uint128_t) in1[4]) * in2x2[7] +
 564         ((uint128_t) in1[5]) * in2x2[6] +
 565         ((uint128_t) in1[6]) * in2x2[5] +
 566         ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
 567
 568     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 569         ((uint128_t) in1[5]) * in2x2[7] +
 570         ((uint128_t) in1[6]) * in2x2[6] +
 571         ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
 572
 573     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 574         ((uint128_t) in1[6]) * in2x2[7] +
 575         ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
 576
 577     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 578         ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
 579
 580     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 581         ((uint128_t) in1[8]) * in2x2[7];
 582
 583     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 584 }
 585
 586 static const limb bottom52bits = 0xfffffffffffff;
 587
 588 /*-
 589  * felem_reduce converts a largefelem to an felem.
 590  * On entry:
 591  *   in[i] < 2^128
 592  * On exit:
 593  *   out[i] < 2^59 + 2^14
 594  */
 595 static void felem_reduce(felem out, const largefelem in)
 596 {
 597     u64 overflow1, overflow2;
 598
 599     out[0] = ((limb) in[0]) & bottom58bits;
 600     out[1] = ((limb) in[1]) & bottom58bits;
 601     out[2] = ((limb) in[2]) & bottom58bits;
 602     out[3] = ((limb) in[3]) & bottom58bits;
 603     out[4] = ((limb) in[4]) & bottom58bits;
 604     out[5] = ((limb) in[5]) & bottom58bits;
 605     out[6] = ((limb) in[6]) & bottom58bits;
 606     out[7] = ((limb) in[7]) & bottom58bits;
 607     out[8] = ((limb) in[8]) & bottom58bits;
 608
 609     /* out[i] < 2^58 */
 610
 611     out[1] += ((limb) in[0]) >> 58;
 612     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 613         /*-
 614          * out[1] < 2^58 + 2^6 + 2^58
 615          *        = 2^59 + 2^6
 616          */
 617     out[2] += ((limb) (in[0] >> 64)) >> 52;
 618
 619     out[2] += ((limb) in[1]) >> 58;
 620     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 621     out[3] += ((limb) (in[1] >> 64)) >> 52;
 622
 623     out[3] += ((limb) in[2]) >> 58;
 624     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 625     out[4] += ((limb) (in[2] >> 64)) >> 52;
 626
 627     out[4] += ((limb) in[3]) >> 58;
 628     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 629     out[5] += ((limb) (in[3] >> 64)) >> 52;
 630
 631     out[5] += ((limb) in[4]) >> 58;
 632     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 633     out[6] += ((limb) (in[4] >> 64)) >> 52;
 634
 635     out[6] += ((limb) in[5]) >> 58;
 636     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 637     out[7] += ((limb) (in[5] >> 64)) >> 52;
 638
 639     out[7] += ((limb) in[6]) >> 58;
 640     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 641     out[8] += ((limb) (in[6] >> 64)) >> 52;
 642
 643     out[8] += ((limb) in[7]) >> 58;
 644     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 645         /*-
 646          * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 647          *            < 2^59 + 2^13
 648          */
 649     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 650
 651     overflow1 += ((limb) in[8]) >> 58;
 652     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 653     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 654
 655     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 656     overflow2 <<= 1;            /* overflow2 < 2^13 */
 657
 658     out[0] += overflow1;        /* out[0] < 2^60 */
 659     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 660
 661     out[1] += out[0] >> 58;
 662     out[0] &= bottom58bits;
 663         /*-
 664          * out[0] < 2^58
 665          * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 666          *        < 2^59 + 2^14
 667          */
 668 }
 669
 670 static void felem_square_reduce(felem out, const felem in)
 671 {
 672     largefelem tmp;
 673     felem_square(tmp, in);
 674     felem_reduce(out, tmp);
 675 }
 676
 677 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 678 {
 679     largefelem tmp;
 680     felem_mul(tmp, in1, in2);
 681     felem_reduce(out, tmp);
 682 }
 683
 684 /*-
 685  * felem_inv calculates |out| = |in|^{-1}
 686  *
 687  * Based on Fermat's Little Theorem:
 688  *   a^p = a (mod p)
 689  *   a^{p-1} = 1 (mod p)
 690  *   a^{p-2} = a^{-1} (mod p)
 691  */
 692 static void felem_inv(felem out, const felem in)
 693 {
 694     felem ftmp, ftmp2, ftmp3, ftmp4;
 695     largefelem tmp;
 696     unsigned i;
 697
 698     felem_square(tmp, in);
 699     felem_reduce(ftmp, tmp);    /* 2^1 */
 700     felem_mul(tmp, in, ftmp);
 701     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 702     felem_assign(ftmp2, ftmp);
 703     felem_square(tmp, ftmp);
 704     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 705     felem_mul(tmp, in, ftmp);
 706     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 707     felem_square(tmp, ftmp);
 708     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 709
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 712     felem_square(tmp, ftmp3);
 713     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 714     felem_mul(tmp, ftmp3, ftmp2);
 715     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 716
 717     felem_assign(ftmp2, ftmp3);
 718     felem_square(tmp, ftmp3);
 719     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 720     felem_square(tmp, ftmp3);
 721     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 722     felem_square(tmp, ftmp3);
 723     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 724     felem_square(tmp, ftmp3);
 725     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 726     felem_assign(ftmp4, ftmp3);
 727     felem_mul(tmp, ftmp3, ftmp);
 728     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 729     felem_square(tmp, ftmp4);
 730     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 731     felem_mul(tmp, ftmp3, ftmp2);
 732     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 733     felem_assign(ftmp2, ftmp3);
 734
 735     for (i = 0; i < 8; i++) {
 736         felem_square(tmp, ftmp3);
 737         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 738     }
 739     felem_mul(tmp, ftmp3, ftmp2);
 740     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 741     felem_assign(ftmp2, ftmp3);
 742
 743     for (i = 0; i < 16; i++) {
 744         felem_square(tmp, ftmp3);
 745         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 746     }
 747     felem_mul(tmp, ftmp3, ftmp2);
 748     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 749     felem_assign(ftmp2, ftmp3);
 750
 751     for (i = 0; i < 32; i++) {
 752         felem_square(tmp, ftmp3);
 753         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 754     }
 755     felem_mul(tmp, ftmp3, ftmp2);
 756     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 757     felem_assign(ftmp2, ftmp3);
 758
 759     for (i = 0; i < 64; i++) {
 760         felem_square(tmp, ftmp3);
 761         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 762     }
 763     felem_mul(tmp, ftmp3, ftmp2);
 764     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 765     felem_assign(ftmp2, ftmp3);
 766
 767     for (i = 0; i < 128; i++) {
 768         felem_square(tmp, ftmp3);
 769         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 770     }
 771     felem_mul(tmp, ftmp3, ftmp2);
 772     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 773     felem_assign(ftmp2, ftmp3);
 774
 775     for (i = 0; i < 256; i++) {
 776         felem_square(tmp, ftmp3);
 777         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 778     }
 779     felem_mul(tmp, ftmp3, ftmp2);
 780     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 781
 782     for (i = 0; i < 9; i++) {
 783         felem_square(tmp, ftmp3);
 784         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 785     }
 786     felem_mul(tmp, ftmp3, ftmp4);
 787     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 788     felem_mul(tmp, ftmp3, in);
 789     felem_reduce(out, tmp);     /* 2^512 - 3 */
 790 }
 791
 792 /* This is 2^521-1, expressed as an felem */
 793 static const felem kPrime = {
 794     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 795     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 796     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 797 };
 798
 799 /*-
 800  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 801  * otherwise.
 802  * On entry:
 803  *   in[i] < 2^59 + 2^14
 804  */
 805 static limb felem_is_zero(const felem in)
 806 {
 807     felem ftmp;
 808     limb is_zero, is_p;
 809     felem_assign(ftmp, in);
 810
 811     ftmp[0] += ftmp[8] >> 57;
 812     ftmp[8] &= bottom57bits;
 813     /* ftmp[8] < 2^57 */
 814     ftmp[1] += ftmp[0] >> 58;
 815     ftmp[0] &= bottom58bits;
 816     ftmp[2] += ftmp[1] >> 58;
 817     ftmp[1] &= bottom58bits;
 818     ftmp[3] += ftmp[2] >> 58;
 819     ftmp[2] &= bottom58bits;
 820     ftmp[4] += ftmp[3] >> 58;
 821     ftmp[3] &= bottom58bits;
 822     ftmp[5] += ftmp[4] >> 58;
 823     ftmp[4] &= bottom58bits;
 824     ftmp[6] += ftmp[5] >> 58;
 825     ftmp[5] &= bottom58bits;
 826     ftmp[7] += ftmp[6] >> 58;
 827     ftmp[6] &= bottom58bits;
 828     ftmp[8] += ftmp[7] >> 58;
 829     ftmp[7] &= bottom58bits;
 830     /* ftmp[8] < 2^57 + 4 */
 831
 832     /*
 833      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 834      * than our bound for ftmp[8]. Therefore we only have to check if the
 835      * zero is zero or 2^521-1.
 836      */
 837
 838     is_zero = 0;
 839     is_zero |= ftmp[0];
 840     is_zero |= ftmp[1];
 841     is_zero |= ftmp[2];
 842     is_zero |= ftmp[3];
 843     is_zero |= ftmp[4];
 844     is_zero |= ftmp[5];
 845     is_zero |= ftmp[6];
 846     is_zero |= ftmp[7];
 847     is_zero |= ftmp[8];
 848
 849     is_zero--;
 850     /*
 851      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 852      * can be set is if is_zero was 0 before the decrement.
 853      */
 854     is_zero = ((s64) is_zero) >> 63;
 855
 856     is_p = ftmp[0] ^ kPrime[0];
 857     is_p |= ftmp[1] ^ kPrime[1];
 858     is_p |= ftmp[2] ^ kPrime[2];
 859     is_p |= ftmp[3] ^ kPrime[3];
 860     is_p |= ftmp[4] ^ kPrime[4];
 861     is_p |= ftmp[5] ^ kPrime[5];
 862     is_p |= ftmp[6] ^ kPrime[6];
 863     is_p |= ftmp[7] ^ kPrime[7];
 864     is_p |= ftmp[8] ^ kPrime[8];
 865
 866     is_p--;
 867     is_p = ((s64) is_p) >> 63;
 868
 869     is_zero |= is_p;
 870     return is_zero;
 871 }
 872
 873 static int felem_is_zero_int(const felem in)
 874 {
 875     return (int)(felem_is_zero(in) & ((limb) 1));
 876 }
 877
 878 /*-
 879  * felem_contract converts |in| to its unique, minimal representation.
 880  * On entry:
 881  *   in[i] < 2^59 + 2^14
 882  */
 883 static void felem_contract(felem out, const felem in)
 884 {
 885     limb is_p, is_greater, sign;
 886     static const limb two58 = ((limb) 1) << 58;
 887
 888     felem_assign(out, in);
 889
 890     out[0] += out[8] >> 57;
 891     out[8] &= bottom57bits;
 892     /* out[8] < 2^57 */
 893     out[1] += out[0] >> 58;
 894     out[0] &= bottom58bits;
 895     out[2] += out[1] >> 58;
 896     out[1] &= bottom58bits;
 897     out[3] += out[2] >> 58;
 898     out[2] &= bottom58bits;
 899     out[4] += out[3] >> 58;
 900     out[3] &= bottom58bits;
 901     out[5] += out[4] >> 58;
 902     out[4] &= bottom58bits;
 903     out[6] += out[5] >> 58;
 904     out[5] &= bottom58bits;
 905     out[7] += out[6] >> 58;
 906     out[6] &= bottom58bits;
 907     out[8] += out[7] >> 58;
 908     out[7] &= bottom58bits;
 909     /* out[8] < 2^57 + 4 */
 910
 911     /*
 912      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 913      * out. See the comments in felem_is_zero regarding why we don't test for
 914      * other multiples of the prime.
 915      */
 916
 917     /*
 918      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 919      */
 920
 921     is_p = out[0] ^ kPrime[0];
 922     is_p |= out[1] ^ kPrime[1];
 923     is_p |= out[2] ^ kPrime[2];
 924     is_p |= out[3] ^ kPrime[3];
 925     is_p |= out[4] ^ kPrime[4];
 926     is_p |= out[5] ^ kPrime[5];
 927     is_p |= out[6] ^ kPrime[6];
 928     is_p |= out[7] ^ kPrime[7];
 929     is_p |= out[8] ^ kPrime[8];
 930
 931     is_p--;
 932     is_p &= is_p << 32;
 933     is_p &= is_p << 16;
 934     is_p &= is_p << 8;
 935     is_p &= is_p << 4;
 936     is_p &= is_p << 2;
 937     is_p &= is_p << 1;
 938     is_p = ((s64) is_p) >> 63;
 939     is_p = ~is_p;
 940
 941     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 942
 943     out[0] &= is_p;
 944     out[1] &= is_p;
 945     out[2] &= is_p;
 946     out[3] &= is_p;
 947     out[4] &= is_p;
 948     out[5] &= is_p;
 949     out[6] &= is_p;
 950     out[7] &= is_p;
 951     out[8] &= is_p;
 952
 953     /*
 954      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 955      * 57 is greater than zero as (2^521-1) + x >= 2^522
 956      */
 957     is_greater = out[8] >> 57;
 958     is_greater |= is_greater << 32;
 959     is_greater |= is_greater << 16;
 960     is_greater |= is_greater << 8;
 961     is_greater |= is_greater << 4;
 962     is_greater |= is_greater << 2;
 963     is_greater |= is_greater << 1;
 964     is_greater = ((s64) is_greater) >> 63;
 965
 966     out[0] -= kPrime[0] & is_greater;
 967     out[1] -= kPrime[1] & is_greater;
 968     out[2] -= kPrime[2] & is_greater;
 969     out[3] -= kPrime[3] & is_greater;
 970     out[4] -= kPrime[4] & is_greater;
 971     out[5] -= kPrime[5] & is_greater;
 972     out[6] -= kPrime[6] & is_greater;
 973     out[7] -= kPrime[7] & is_greater;
 974     out[8] -= kPrime[8] & is_greater;
 975
 976     /* Eliminate negative coefficients */
 977     sign = -(out[0] >> 63);
 978     out[0] += (two58 & sign);
 979     out[1] -= (1 & sign);
 980     sign = -(out[1] >> 63);
 981     out[1] += (two58 & sign);
 982     out[2] -= (1 & sign);
 983     sign = -(out[2] >> 63);
 984     out[2] += (two58 & sign);
 985     out[3] -= (1 & sign);
 986     sign = -(out[3] >> 63);
 987     out[3] += (two58 & sign);
 988     out[4] -= (1 & sign);
 989     sign = -(out[4] >> 63);
 990     out[4] += (two58 & sign);
 991     out[5] -= (1 & sign);
 992     sign = -(out[0] >> 63);
 993     out[5] += (two58 & sign);
 994     out[6] -= (1 & sign);
 995     sign = -(out[6] >> 63);
 996     out[6] += (two58 & sign);
 997     out[7] -= (1 & sign);
 998     sign = -(out[7] >> 63);
 999     out[7] += (two58 & sign);
1000     out[8] -= (1 & sign);
1001     sign = -(out[5] >> 63);
1002     out[5] += (two58 & sign);
1003     out[6] -= (1 & sign);
1004     sign = -(out[6] >> 63);
1005     out[6] += (two58 & sign);
1006     out[7] -= (1 & sign);
1007     sign = -(out[7] >> 63);
1008     out[7] += (two58 & sign);
1009     out[8] -= (1 & sign);
1010 }
1011
1012 /*-
1013  * Group operations
1014  * ----------------
1015  *
1016  * Building on top of the field operations we have the operations on the
1017  * elliptic curve group itself. Points on the curve are represented in Jacobian
1018  * coordinates */
1019
1020 /*-
1021  * point_double calcuates 2*(x_in, y_in, z_in)
1022  *
1023  * The method is taken from:
1024  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1025  *
1026  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1027  * while x_out == y_in is not (maybe this works, but it's not tested). */
1028 static void
1029 point_double(felem x_out, felem y_out, felem z_out,
1030              const felem x_in, const felem y_in, const felem z_in)
1031 {
1032     largefelem tmp, tmp2;
1033     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1034
1035     felem_assign(ftmp, x_in);
1036     felem_assign(ftmp2, x_in);
1037
1038     /* delta = z^2 */
1039     felem_square(tmp, z_in);
1040     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1041
1042     /* gamma = y^2 */
1043     felem_square(tmp, y_in);
1044     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1045
1046     /* beta = x*gamma */
1047     felem_mul(tmp, x_in, gamma);
1048     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1049
1050     /* alpha = 3*(x-delta)*(x+delta) */
1051     felem_diff64(ftmp, delta);
1052     /* ftmp[i] < 2^61 */
1053     felem_sum64(ftmp2, delta);
1054     /* ftmp2[i] < 2^60 + 2^15 */
1055     felem_scalar64(ftmp2, 3);
1056     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1057     felem_mul(tmp, ftmp, ftmp2);
1058         /*-
1059          * tmp[i] < 17(3*2^121 + 3*2^76)
1060          *        = 61*2^121 + 61*2^76
1061          *        < 64*2^121 + 64*2^76
1062          *        = 2^127 + 2^82
1063          *        < 2^128
1064          */
1065     felem_reduce(alpha, tmp);
1066
1067     /* x' = alpha^2 - 8*beta */
1068     felem_square(tmp, alpha);
1069     /*
1070      * tmp[i] < 17*2^120 < 2^125
1071      */
1072     felem_assign(ftmp, beta);
1073     felem_scalar64(ftmp, 8);
1074     /* ftmp[i] < 2^62 + 2^17 */
1075     felem_diff_128_64(tmp, ftmp);
1076     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1077     felem_reduce(x_out, tmp);
1078
1079     /* z' = (y + z)^2 - gamma - delta */
1080     felem_sum64(delta, gamma);
1081     /* delta[i] < 2^60 + 2^15 */
1082     felem_assign(ftmp, y_in);
1083     felem_sum64(ftmp, z_in);
1084     /* ftmp[i] < 2^60 + 2^15 */
1085     felem_square(tmp, ftmp);
1086     /*
1087      * tmp[i] < 17(2^122) < 2^127
1088      */
1089     felem_diff_128_64(tmp, delta);
1090     /* tmp[i] < 2^127 + 2^63 */
1091     felem_reduce(z_out, tmp);
1092
1093     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1094     felem_scalar64(beta, 4);
1095     /* beta[i] < 2^61 + 2^16 */
1096     felem_diff64(beta, x_out);
1097     /* beta[i] < 2^61 + 2^60 + 2^16 */
1098     felem_mul(tmp, alpha, beta);
1099         /*-
1100          * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1101          *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1102          *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1103          *        < 2^128
1104          */
1105     felem_square(tmp2, gamma);
1106         /*-
1107          * tmp2[i] < 17*(2^59 + 2^14)^2
1108          *         = 17*(2^118 + 2^74 + 2^28)
1109          */
1110     felem_scalar128(tmp2, 8);
1111         /*-
1112          * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1113          *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1114          *         < 2^126
1115          */
1116     felem_diff128(tmp, tmp2);
1117         /*-
1118          * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1119          *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1120          *          2^74 + 2^69 + 2^34 + 2^30
1121          *        < 2^128
1122          */
1123     felem_reduce(y_out, tmp);
1124 }
1125
1126 /* copy_conditional copies in to out iff mask is all ones. */
1127 static void copy_conditional(felem out, const felem in, limb mask)
1128 {
1129     unsigned i;
1130     for (i = 0; i < NLIMBS; ++i) {
1131         const limb tmp = mask & (in[i] ^ out[i]);
1132         out[i] ^= tmp;
1133     }
1134 }
1135
1136 /*-
1137  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1138  *
1139  * The method is taken from
1140  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1141  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1142  *
1143  * This function includes a branch for checking whether the two input points
1144  * are equal (while not equal to the point at infinity). This case never
1145  * happens during single point multiplication, so there is no timing leak for
1146  * ECDH or ECDSA signing. */
1147 static void point_add(felem x3, felem y3, felem z3,
1148                       const felem x1, const felem y1, const felem z1,
1149                       const int mixed, const felem x2, const felem y2,
1150                       const felem z2)
1151 {
1152     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1153     largefelem tmp, tmp2;
1154     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1155
1156     z1_is_zero = felem_is_zero(z1);
1157     z2_is_zero = felem_is_zero(z2);
1158
1159     /* ftmp = z1z1 = z1**2 */
1160     felem_square(tmp, z1);
1161     felem_reduce(ftmp, tmp);
1162
1163     if (!mixed) {
1164         /* ftmp2 = z2z2 = z2**2 */
1165         felem_square(tmp, z2);
1166         felem_reduce(ftmp2, tmp);
1167
1168         /* u1 = ftmp3 = x1*z2z2 */
1169         felem_mul(tmp, x1, ftmp2);
1170         felem_reduce(ftmp3, tmp);
1171
1172         /* ftmp5 = z1 + z2 */
1173         felem_assign(ftmp5, z1);
1174         felem_sum64(ftmp5, z2);
1175         /* ftmp5[i] < 2^61 */
1176
1177         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1178         felem_square(tmp, ftmp5);
1179         /* tmp[i] < 17*2^122 */
1180         felem_diff_128_64(tmp, ftmp);
1181         /* tmp[i] < 17*2^122 + 2^63 */
1182         felem_diff_128_64(tmp, ftmp2);
1183         /* tmp[i] < 17*2^122 + 2^64 */
1184         felem_reduce(ftmp5, tmp);
1185
1186         /* ftmp2 = z2 * z2z2 */
1187         felem_mul(tmp, ftmp2, z2);
1188         felem_reduce(ftmp2, tmp);
1189
1190         /* s1 = ftmp6 = y1 * z2**3 */
1191         felem_mul(tmp, y1, ftmp2);
1192         felem_reduce(ftmp6, tmp);
1193     } else {
1194         /*
1195          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1196          */
1197
1198         /* u1 = ftmp3 = x1*z2z2 */
1199         felem_assign(ftmp3, x1);
1200
1201         /* ftmp5 = 2*z1z2 */
1202         felem_scalar(ftmp5, z1, 2);
1203
1204         /* s1 = ftmp6 = y1 * z2**3 */
1205         felem_assign(ftmp6, y1);
1206     }
1207
1208     /* u2 = x2*z1z1 */
1209     felem_mul(tmp, x2, ftmp);
1210     /* tmp[i] < 17*2^120 */
1211
1212     /* h = ftmp4 = u2 - u1 */
1213     felem_diff_128_64(tmp, ftmp3);
1214     /* tmp[i] < 17*2^120 + 2^63 */
1215     felem_reduce(ftmp4, tmp);
1216
1217     x_equal = felem_is_zero(ftmp4);
1218
1219     /* z_out = ftmp5 * h */
1220     felem_mul(tmp, ftmp5, ftmp4);
1221     felem_reduce(z_out, tmp);
1222
1223     /* ftmp = z1 * z1z1 */
1224     felem_mul(tmp, ftmp, z1);
1225     felem_reduce(ftmp, tmp);
1226
1227     /* s2 = tmp = y2 * z1**3 */
1228     felem_mul(tmp, y2, ftmp);
1229     /* tmp[i] < 17*2^120 */
1230
1231     /* r = ftmp5 = (s2 - s1)*2 */
1232     felem_diff_128_64(tmp, ftmp6);
1233     /* tmp[i] < 17*2^120 + 2^63 */
1234     felem_reduce(ftmp5, tmp);
1235     y_equal = felem_is_zero(ftmp5);
1236     felem_scalar64(ftmp5, 2);
1237     /* ftmp5[i] < 2^61 */
1238
1239     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1240         point_double(x3, y3, z3, x1, y1, z1);
1241         return;
1242     }
1243
1244     /* I = ftmp = (2h)**2 */
1245     felem_assign(ftmp, ftmp4);
1246     felem_scalar64(ftmp, 2);
1247     /* ftmp[i] < 2^61 */
1248     felem_square(tmp, ftmp);
1249     /* tmp[i] < 17*2^122 */
1250     felem_reduce(ftmp, tmp);
1251
1252     /* J = ftmp2 = h * I */
1253     felem_mul(tmp, ftmp4, ftmp);
1254     felem_reduce(ftmp2, tmp);
1255
1256     /* V = ftmp4 = U1 * I */
1257     felem_mul(tmp, ftmp3, ftmp);
1258     felem_reduce(ftmp4, tmp);
1259
1260     /* x_out = r**2 - J - 2V */
1261     felem_square(tmp, ftmp5);
1262     /* tmp[i] < 17*2^122 */
1263     felem_diff_128_64(tmp, ftmp2);
1264     /* tmp[i] < 17*2^122 + 2^63 */
1265     felem_assign(ftmp3, ftmp4);
1266     felem_scalar64(ftmp4, 2);
1267     /* ftmp4[i] < 2^61 */
1268     felem_diff_128_64(tmp, ftmp4);
1269     /* tmp[i] < 17*2^122 + 2^64 */
1270     felem_reduce(x_out, tmp);
1271
1272     /* y_out = r(V-x_out) - 2 * s1 * J */
1273     felem_diff64(ftmp3, x_out);
1274     /*
1275      * ftmp3[i] < 2^60 + 2^60 = 2^61
1276      */
1277     felem_mul(tmp, ftmp5, ftmp3);
1278     /* tmp[i] < 17*2^122 */
1279     felem_mul(tmp2, ftmp6, ftmp2);
1280     /* tmp2[i] < 17*2^120 */
1281     felem_scalar128(tmp2, 2);
1282     /* tmp2[i] < 17*2^121 */
1283     felem_diff128(tmp, tmp2);
1284         /*-
1285          * tmp[i] < 2^127 - 2^69 + 17*2^122
1286          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1287          *        < 2^127
1288          */
1289     felem_reduce(y_out, tmp);
1290
1291     copy_conditional(x_out, x2, z1_is_zero);
1292     copy_conditional(x_out, x1, z2_is_zero);
1293     copy_conditional(y_out, y2, z1_is_zero);
1294     copy_conditional(y_out, y1, z2_is_zero);
1295     copy_conditional(z_out, z2, z1_is_zero);
1296     copy_conditional(z_out, z1, z2_is_zero);
1297     felem_assign(x3, x_out);
1298     felem_assign(y3, y_out);
1299     felem_assign(z3, z_out);
1300 }
1301
1302 /*-
1303  * Base point pre computation
1304  * --------------------------
1305  *
1306  * Two different sorts of precomputed tables are used in the following code.
1307  * Each contain various points on the curve, where each point is three field
1308  * elements (x, y, z).
1309  *
1310  * For the base point table, z is usually 1 (0 for the point at infinity).
1311  * This table has 16 elements:
1312  * index | bits    | point
1313  * ------+---------+------------------------------
1314  *     0 | 0 0 0 0 | 0G
1315  *     1 | 0 0 0 1 | 1G
1316  *     2 | 0 0 1 0 | 2^130G
1317  *     3 | 0 0 1 1 | (2^130 + 1)G
1318  *     4 | 0 1 0 0 | 2^260G
1319  *     5 | 0 1 0 1 | (2^260 + 1)G
1320  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1321  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1322  *     8 | 1 0 0 0 | 2^390G
1323  *     9 | 1 0 0 1 | (2^390 + 1)G
1324  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1325  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1326  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1327  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1328  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1329  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1330  *
1331  * The reason for this is so that we can clock bits into four different
1332  * locations when doing simple scalar multiplies against the base point.
1333  *
1334  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1335
1336 /* gmul is the table of precomputed base points */
1337 static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1338                                     {0, 0, 0, 0, 0, 0, 0, 0, 0},
1339                                     {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1340 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1341   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1342   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1343  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1344   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1345   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1346  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1347 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1348   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1349   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1350  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1351   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1352   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1353  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1354 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1355   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1356   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1357  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1358   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1359   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1360  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1361 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1362   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1363   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1364  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1365   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1366   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1367  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1368 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1369   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1370   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1371  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1372   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1373   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1374  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1375 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1376   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1377   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1378  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1379   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1380   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1381  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1382 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1383   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1384   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1385  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1386   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1387   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1388  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1390   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1391   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1392  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1393   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1394   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1395  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1397   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1398   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1399  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1400   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1401   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1402  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1404   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1405   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1406  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1407   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1408   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1409  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1411   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1412   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1413  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1414   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1415   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1416  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1418   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1419   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1420  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1421   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1422   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1423  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1425   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1426   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1427  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1428   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1429   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1430  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1432   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1433   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1434  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1435   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1436   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1437  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1439   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1440   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1441  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1442   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1443   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1444  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1445 };
1446
1447 /*
1448  * select_point selects the |idx|th point from a precomputation table and
1449  * copies it to out.
1450  */
1451  /* pre_comp below is of the size provided in |size| */
1452 static void select_point(const limb idx, unsigned int size,
1453                          const felem pre_comp[][3], felem out[3])
1454 {
1455     unsigned i, j;
1456     limb *outlimbs = &out[0][0];
1457     memset(outlimbs, 0, 3 * sizeof(felem));
1458
1459     for (i = 0; i < size; i++) {
1460         const limb *inlimbs = &pre_comp[i][0][0];
1461         limb mask = i ^ idx;
1462         mask |= mask >> 4;
1463         mask |= mask >> 2;
1464         mask |= mask >> 1;
1465         mask &= 1;
1466         mask--;
1467         for (j = 0; j < NLIMBS * 3; j++)
1468             outlimbs[j] |= inlimbs[j] & mask;
1469     }
1470 }
1471
1472 /* get_bit returns the |i|th bit in |in| */
1473 static char get_bit(const felem_bytearray in, int i)
1474 {
1475     if (i < 0)
1476         return 0;
1477     return (in[i >> 3] >> (i & 7)) & 1;
1478 }
1479
1480 /*
1481  * Interleaved point multiplication using precomputed point multiples: The
1482  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1483  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1484  * generator, using certain (large) precomputed multiples in g_pre_comp.
1485  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1486  */
1487 static void batch_mul(felem x_out, felem y_out, felem z_out,
1488                       const felem_bytearray scalars[],
1489                       const unsigned num_points, const u8 *g_scalar,
1490                       const int mixed, const felem pre_comp[][17][3],
1491                       const felem g_pre_comp[16][3])
1492 {
1493     int i, skip;
1494     unsigned num, gen_mul = (g_scalar != NULL);
1495     felem nq[3], tmp[4];
1496     limb bits;
1497     u8 sign, digit;
1498
1499     /* set nq to the point at infinity */
1500     memset(nq, 0, 3 * sizeof(felem));
1501
1502     /*
1503      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1504      * of the generator (last quarter of rounds) and additions of other
1505      * points multiples (every 5th round).
1506      */
1507     skip = 1;                   /* save two point operations in the first
1508                                  * round */
1509     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1510         /* double */
1511         if (!skip)
1512             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1513
1514         /* add multiples of the generator */
1515         if (gen_mul && (i <= 130)) {
1516             bits = get_bit(g_scalar, i + 390) << 3;
1517             if (i < 130) {
1518                 bits |= get_bit(g_scalar, i + 260) << 2;
1519                 bits |= get_bit(g_scalar, i + 130) << 1;
1520                 bits |= get_bit(g_scalar, i);
1521             }
1522             /* select the point to add, in constant time */
1523             select_point(bits, 16, g_pre_comp, tmp);
1524             if (!skip) {
1525                 /* The 1 argument below is for "mixed" */
1526                 point_add(nq[0], nq[1], nq[2],
1527                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1528             } else {
1529                 memcpy(nq, tmp, 3 * sizeof(felem));
1530                 skip = 0;
1531             }
1532         }
1533
1534         /* do other additions every 5 doublings */
1535         if (num_points && (i % 5 == 0)) {
1536             /* loop over all scalars */
1537             for (num = 0; num < num_points; ++num) {
1538                 bits = get_bit(scalars[num], i + 4) << 5;
1539                 bits |= get_bit(scalars[num], i + 3) << 4;
1540                 bits |= get_bit(scalars[num], i + 2) << 3;
1541                 bits |= get_bit(scalars[num], i + 1) << 2;
1542                 bits |= get_bit(scalars[num], i) << 1;
1543                 bits |= get_bit(scalars[num], i - 1);
1544                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1545
1546                 /*
1547                  * select the point to add or subtract, in constant time
1548                  */
1549                 select_point(digit, 17, pre_comp[num], tmp);
1550                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1551                                             * point */
1552                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1553
1554                 if (!skip) {
1555                     point_add(nq[0], nq[1], nq[2],
1556                               nq[0], nq[1], nq[2],
1557                               mixed, tmp[0], tmp[1], tmp[2]);
1558                 } else {
1559                     memcpy(nq, tmp, 3 * sizeof(felem));
1560                     skip = 0;
1561                 }
1562             }
1563         }
1564     }
1565     felem_assign(x_out, nq[0]);
1566     felem_assign(y_out, nq[1]);
1567     felem_assign(z_out, nq[2]);
1568 }
1569
1570 /* Precomputation for the group generator. */
1571 typedef struct {
1572     felem g_pre_comp[16][3];
1573     int references;
1574 } NISTP521_PRE_COMP;
1575
1576 const EC_METHOD *EC_GFp_nistp521_method(void)
1577 {
1578     static const EC_METHOD ret = {
1579         EC_FLAGS_DEFAULT_OCT,
1580         NID_X9_62_prime_field,
1581         ec_GFp_nistp521_group_init,
1582         ec_GFp_simple_group_finish,
1583         ec_GFp_simple_group_clear_finish,
1584         ec_GFp_nist_group_copy,
1585         ec_GFp_nistp521_group_set_curve,
1586         ec_GFp_simple_group_get_curve,
1587         ec_GFp_simple_group_get_degree,
1588         ec_GFp_simple_group_check_discriminant,
1589         ec_GFp_simple_point_init,
1590         ec_GFp_simple_point_finish,
1591         ec_GFp_simple_point_clear_finish,
1592         ec_GFp_simple_point_copy,
1593         ec_GFp_simple_point_set_to_infinity,
1594         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1595         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1596         ec_GFp_simple_point_set_affine_coordinates,
1597         ec_GFp_nistp521_point_get_affine_coordinates,
1598         0 /* point_set_compressed_coordinates */ ,
1599         0 /* point2oct */ ,
1600         0 /* oct2point */ ,
1601         ec_GFp_simple_add,
1602         ec_GFp_simple_dbl,
1603         ec_GFp_simple_invert,
1604         ec_GFp_simple_is_at_infinity,
1605         ec_GFp_simple_is_on_curve,
1606         ec_GFp_simple_cmp,
1607         ec_GFp_simple_make_affine,
1608         ec_GFp_simple_points_make_affine,
1609         ec_GFp_nistp521_points_mul,
1610         ec_GFp_nistp521_precompute_mult,
1611         ec_GFp_nistp521_have_precompute_mult,
1612         ec_GFp_nist_field_mul,
1613         ec_GFp_nist_field_sqr,
1614         0 /* field_div */ ,
1615         0 /* field_encode */ ,
1616         0 /* field_decode */ ,
1617         0                       /* field_set_to_one */
1618     };
1619
1620     return &ret;
1621 }
1622
1623 /******************************************************************************/
1624 /*
1625  * FUNCTIONS TO MANAGE PRECOMPUTATION
1626  */
1627
1628 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1629 {
1630     NISTP521_PRE_COMP *ret = NULL;
1631     ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1632     if (!ret) {
1633         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1634         return ret;
1635     }
1636     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1637     ret->references = 1;
1638     return ret;
1639 }
1640
1641 static void *nistp521_pre_comp_dup(void *src_)
1642 {
1643     NISTP521_PRE_COMP *src = src_;
1644
1645     /* no need to actually copy, these objects never change! */
1646     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1647
1648     return src_;
1649 }
1650
1651 static void nistp521_pre_comp_free(void *pre_)
1652 {
1653     int i;
1654     NISTP521_PRE_COMP *pre = pre_;
1655
1656     if (!pre)
1657         return;
1658
1659     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1660     if (i > 0)
1661         return;
1662
1663     OPENSSL_free(pre);
1664 }
1665
1666 static void nistp521_pre_comp_clear_free(void *pre_)
1667 {
1668     int i;
1669     NISTP521_PRE_COMP *pre = pre_;
1670
1671     if (!pre)
1672         return;
1673
1674     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1675     if (i > 0)
1676         return;
1677
1678     OPENSSL_cleanse(pre, sizeof(*pre));
1679     OPENSSL_free(pre);
1680 }
1681
1682 /******************************************************************************/
1683 /*
1684  * OPENSSL EC_METHOD FUNCTIONS
1685  */
1686
1687 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1688 {
1689     int ret;
1690     ret = ec_GFp_simple_group_init(group);
1691     group->a_is_minus3 = 1;
1692     return ret;
1693 }
1694
1695 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1696                                     const BIGNUM *a, const BIGNUM *b,
1697                                     BN_CTX *ctx)
1698 {
1699     int ret = 0;
1700     BN_CTX *new_ctx = NULL;
1701     BIGNUM *curve_p, *curve_a, *curve_b;
1702
1703     if (ctx == NULL)
1704         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1705             return 0;
1706     BN_CTX_start(ctx);
1707     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1708         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1709         ((curve_b = BN_CTX_get(ctx)) == NULL))
1710         goto err;
1711     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1712     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1713     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1714     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1715         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1716               EC_R_WRONG_CURVE_PARAMETERS);
1717         goto err;
1718     }
1719     group->field_mod_func = BN_nist_mod_521;
1720     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1721  err:
1722     BN_CTX_end(ctx);
1723     if (new_ctx != NULL)
1724         BN_CTX_free(new_ctx);
1725     return ret;
1726 }
1727
1728 /*
1729  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1730  * (X/Z^2, Y/Z^3)
1731  */
1732 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1733                                                  const EC_POINT *point,
1734                                                  BIGNUM *x, BIGNUM *y,
1735                                                  BN_CTX *ctx)
1736 {
1737     felem z1, z2, x_in, y_in, x_out, y_out;
1738     largefelem tmp;
1739
1740     if (EC_POINT_is_at_infinity(group, point)) {
1741         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1742               EC_R_POINT_AT_INFINITY);
1743         return 0;
1744     }
1745     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1746         (!BN_to_felem(z1, &point->Z)))
1747         return 0;
1748     felem_inv(z2, z1);
1749     felem_square(tmp, z2);
1750     felem_reduce(z1, tmp);
1751     felem_mul(tmp, x_in, z1);
1752     felem_reduce(x_in, tmp);
1753     felem_contract(x_out, x_in);
1754     if (x != NULL) {
1755         if (!felem_to_BN(x, x_out)) {
1756             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1757                   ERR_R_BN_LIB);
1758             return 0;
1759         }
1760     }
1761     felem_mul(tmp, z1, z2);
1762     felem_reduce(z1, tmp);
1763     felem_mul(tmp, y_in, z1);
1764     felem_reduce(y_in, tmp);
1765     felem_contract(y_out, y_in);
1766     if (y != NULL) {
1767         if (!felem_to_BN(y, y_out)) {
1768             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1769                   ERR_R_BN_LIB);
1770             return 0;
1771         }
1772     }
1773     return 1;
1774 }
1775
1776 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1777 static void make_points_affine(size_t num, felem points[][3],
1778                                felem tmp_felems[])
1779 {
1780     /*
1781      * Runs in constant time, unless an input is the point at infinity (which
1782      * normally shouldn't happen).
1783      */
1784     ec_GFp_nistp_points_make_affine_internal(num,
1785                                              points,
1786                                              sizeof(felem),
1787                                              tmp_felems,
1788                                              (void (*)(void *))felem_one,
1789                                              (int (*)(const void *))
1790                                              felem_is_zero_int,
1791                                              (void (*)(void *, const void *))
1792                                              felem_assign,
1793                                              (void (*)(void *, const void *))
1794                                              felem_square_reduce, (void (*)
1795                                                                    (void *,
1796                                                                     const void
1797                                                                     *,
1798                                                                     const void
1799                                                                     *))
1800                                              felem_mul_reduce,
1801                                              (void (*)(void *, const void *))
1802                                              felem_inv,
1803                                              (void (*)(void *, const void *))
1804                                              felem_contract);
1805 }
1806
1807 /*
1808  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1809  * values Result is stored in r (r can equal one of the inputs).
1810  */
1811 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1812                                const BIGNUM *scalar, size_t num,
1813                                const EC_POINT *points[],
1814                                const BIGNUM *scalars[], BN_CTX *ctx)
1815 {
1816     int ret = 0;
1817     int j;
1818     int mixed = 0;
1819     BN_CTX *new_ctx = NULL;
1820     BIGNUM *x, *y, *z, *tmp_scalar;
1821     felem_bytearray g_secret;
1822     felem_bytearray *secrets = NULL;
1823     felem(*pre_comp)[17][3] = NULL;
1824     felem *tmp_felems = NULL;
1825     felem_bytearray tmp;
1826     unsigned i, num_bytes;
1827     int have_pre_comp = 0;
1828     size_t num_points = num;
1829     felem x_in, y_in, z_in, x_out, y_out, z_out;
1830     NISTP521_PRE_COMP *pre = NULL;
1831     felem(*g_pre_comp)[3] = NULL;
1832     EC_POINT *generator = NULL;
1833     const EC_POINT *p = NULL;
1834     const BIGNUM *p_scalar = NULL;
1835
1836     if (ctx == NULL)
1837         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1838             return 0;
1839     BN_CTX_start(ctx);
1840     if (((x = BN_CTX_get(ctx)) == NULL) ||
1841         ((y = BN_CTX_get(ctx)) == NULL) ||
1842         ((z = BN_CTX_get(ctx)) == NULL) ||
1843         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1844         goto err;
1845
1846     if (scalar != NULL) {
1847         pre = EC_EX_DATA_get_data(group->extra_data,
1848                                   nistp521_pre_comp_dup,
1849                                   nistp521_pre_comp_free,
1850                                   nistp521_pre_comp_clear_free);
1851         if (pre)
1852             /* we have precomputation, try to use it */
1853             g_pre_comp = &pre->g_pre_comp[0];
1854         else
1855             /* try to use the standard precomputation */
1856             g_pre_comp = (felem(*)[3]) gmul;
1857         generator = EC_POINT_new(group);
1858         if (generator == NULL)
1859             goto err;
1860         /* get the generator from precomputation */
1861         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1862             !felem_to_BN(y, g_pre_comp[1][1]) ||
1863             !felem_to_BN(z, g_pre_comp[1][2])) {
1864             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1865             goto err;
1866         }
1867         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1868                                                       generator, x, y, z,
1869                                                       ctx))
1870             goto err;
1871         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1872             /* precomputation matches generator */
1873             have_pre_comp = 1;
1874         else
1875             /*
1876              * we don't have valid precomputation: treat the generator as a
1877              * random point
1878              */
1879             num_points++;
1880     }
1881
1882     if (num_points > 0) {
1883         if (num_points >= 2) {
1884             /*
1885              * unless we precompute multiples for just one point, converting
1886              * those into affine form is time well spent
1887              */
1888             mixed = 1;
1889         }
1890         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1891         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1892         if (mixed)
1893             tmp_felems =
1894                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1895         if ((secrets == NULL) || (pre_comp == NULL)
1896             || (mixed && (tmp_felems == NULL))) {
1897             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1898             goto err;
1899         }
1900
1901         /*
1902          * we treat NULL scalars as 0, and NULL points as points at infinity,
1903          * i.e., they contribute nothing to the linear combination
1904          */
1905         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1906         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1907         for (i = 0; i < num_points; ++i) {
1908             if (i == num)
1909                 /*
1910                  * we didn't have a valid precomputation, so we pick the
1911                  * generator
1912                  */
1913             {
1914                 p = EC_GROUP_get0_generator(group);
1915                 p_scalar = scalar;
1916             } else
1917                 /* the i^th point */
1918             {
1919                 p = points[i];
1920                 p_scalar = scalars[i];
1921             }
1922             if ((p_scalar != NULL) && (p != NULL)) {
1923                 /* reduce scalar to 0 <= scalar < 2^521 */
1924                 if ((BN_num_bits(p_scalar) > 521)
1925                     || (BN_is_negative(p_scalar))) {
1926                     /*
1927                      * this is an unusual input, and we don't guarantee
1928                      * constant-timeness
1929                      */
1930                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1931                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1932                         goto err;
1933                     }
1934                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1935                 } else
1936                     num_bytes = BN_bn2bin(p_scalar, tmp);
1937                 flip_endian(secrets[i], tmp, num_bytes);
1938                 /* precompute multiples */
1939                 if ((!BN_to_felem(x_out, &p->X)) ||
1940                     (!BN_to_felem(y_out, &p->Y)) ||
1941                     (!BN_to_felem(z_out, &p->Z)))
1942                     goto err;
1943                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1944                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1945                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1946                 for (j = 2; j <= 16; ++j) {
1947                     if (j & 1) {
1948                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1949                                   pre_comp[i][j][2], pre_comp[i][1][0],
1950                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1951                                   pre_comp[i][j - 1][0],
1952                                   pre_comp[i][j - 1][1],
1953                                   pre_comp[i][j - 1][2]);
1954                     } else {
1955                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1956                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1957                                      pre_comp[i][j / 2][1],
1958                                      pre_comp[i][j / 2][2]);
1959                     }
1960                 }
1961             }
1962         }
1963         if (mixed)
1964             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1965     }
1966
1967     /* the scalar for the generator */
1968     if ((scalar != NULL) && (have_pre_comp)) {
1969         memset(g_secret, 0, sizeof(g_secret));
1970         /* reduce scalar to 0 <= scalar < 2^521 */
1971         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1972             /*
1973              * this is an unusual input, and we don't guarantee
1974              * constant-timeness
1975              */
1976             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1977                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1978                 goto err;
1979             }
1980             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1981         } else
1982             num_bytes = BN_bn2bin(scalar, tmp);
1983         flip_endian(g_secret, tmp, num_bytes);
1984         /* do the multiplication with generator precomputation */
1985         batch_mul(x_out, y_out, z_out,
1986                   (const felem_bytearray(*))secrets, num_points,
1987                   g_secret,
1988                   mixed, (const felem(*)[17][3])pre_comp,
1989                   (const felem(*)[3])g_pre_comp);
1990     } else
1991         /* do the multiplication without generator precomputation */
1992         batch_mul(x_out, y_out, z_out,
1993                   (const felem_bytearray(*))secrets, num_points,
1994                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1995     /* reduce the output to its unique minimal representation */
1996     felem_contract(x_in, x_out);
1997     felem_contract(y_in, y_out);
1998     felem_contract(z_in, z_out);
1999     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2000         (!felem_to_BN(z, z_in))) {
2001         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2002         goto err;
2003     }
2004     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2005
2006  err:
2007     BN_CTX_end(ctx);
2008     if (generator != NULL)
2009         EC_POINT_free(generator);
2010     if (new_ctx != NULL)
2011         BN_CTX_free(new_ctx);
2012     if (secrets != NULL)
2013         OPENSSL_free(secrets);
2014     if (pre_comp != NULL)
2015         OPENSSL_free(pre_comp);
2016     if (tmp_felems != NULL)
2017         OPENSSL_free(tmp_felems);
2018     return ret;
2019 }
2020
2021 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2022 {
2023     int ret = 0;
2024     NISTP521_PRE_COMP *pre = NULL;
2025     int i, j;
2026     BN_CTX *new_ctx = NULL;
2027     BIGNUM *x, *y;
2028     EC_POINT *generator = NULL;
2029     felem tmp_felems[16];
2030
2031     /* throw away old precomputation */
2032     EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2033                          nistp521_pre_comp_free,
2034                          nistp521_pre_comp_clear_free);
2035     if (ctx == NULL)
2036         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2037             return 0;
2038     BN_CTX_start(ctx);
2039     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2040         goto err;
2041     /* get the generator */
2042     if (group->generator == NULL)
2043         goto err;
2044     generator = EC_POINT_new(group);
2045     if (generator == NULL)
2046         goto err;
2047     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2048     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2049     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2050         goto err;
2051     if ((pre = nistp521_pre_comp_new()) == NULL)
2052         goto err;
2053     /*
2054      * if the generator is the standard one, use built-in precomputation
2055      */
2056     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2057         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2058         ret = 1;
2059         goto err;
2060     }
2061     if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2062         (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2063         (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2064         goto err;
2065     /* compute 2^130*G, 2^260*G, 2^390*G */
2066     for (i = 1; i <= 4; i <<= 1) {
2067         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2068                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2069                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2070         for (j = 0; j < 129; ++j) {
2071             point_double(pre->g_pre_comp[2 * i][0],
2072                          pre->g_pre_comp[2 * i][1],
2073                          pre->g_pre_comp[2 * i][2],
2074                          pre->g_pre_comp[2 * i][0],
2075                          pre->g_pre_comp[2 * i][1],
2076                          pre->g_pre_comp[2 * i][2]);
2077         }
2078     }
2079     /* g_pre_comp[0] is the point at infinity */
2080     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2081     /* the remaining multiples */
2082     /* 2^130*G + 2^260*G */
2083     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2084               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2085               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2086               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2087               pre->g_pre_comp[2][2]);
2088     /* 2^130*G + 2^390*G */
2089     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2090               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2091               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2092               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2093               pre->g_pre_comp[2][2]);
2094     /* 2^260*G + 2^390*G */
2095     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2096               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2097               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2098               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2099               pre->g_pre_comp[4][2]);
2100     /* 2^130*G + 2^260*G + 2^390*G */
2101     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2102               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2103               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2104               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2105               pre->g_pre_comp[2][2]);
2106     for (i = 1; i < 8; ++i) {
2107         /* odd multiples: add G */
2108         point_add(pre->g_pre_comp[2 * i + 1][0],
2109                   pre->g_pre_comp[2 * i + 1][1],
2110                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2111                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2112                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2113                   pre->g_pre_comp[1][2]);
2114     }
2115     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2116
2117     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2118                              nistp521_pre_comp_free,
2119                              nistp521_pre_comp_clear_free))
2120         goto err;
2121     ret = 1;
2122     pre = NULL;
2123  err:
2124     BN_CTX_end(ctx);
2125     if (generator != NULL)
2126         EC_POINT_free(generator);
2127     if (new_ctx != NULL)
2128         BN_CTX_free(new_ctx);
2129     if (pre)
2130         nistp521_pre_comp_free(pre);
2131     return ret;
2132 }
2133
2134 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2135 {
2136     if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2137                             nistp521_pre_comp_free,
2138                             nistp521_pre_comp_clear_free)
2139         != NULL)
2140         return 1;
2141     else
2142         return 0;
2143 }
2144
2145 #else
2146 static void *dummy = &dummy;
2147 #endif