crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/e_os2.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
/*
 * The wide limb arithmetic in this file needs a 128-bit unsigned integer.
 * The only provider accepted here is GCC's nonstandard __uint128_t
 * (GCC 3.1 or later); any other compiler stops at the #error below.
 */
# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Need GCC 3.1 or later to define type uint128_t"
# endif

/* Short aliases for the fixed-width integer types used in this file. */
typedef uint8_t u8;
typedef uint64_t u64;
typedef int64_t s64;
  54
/*
 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
 * element of this field into 66 bytes where the most significant byte
 * contains only a single bit (521 = 65 * 8 + 1). We call this an
 * felem_bytearray.
 */

typedef u8 felem_bytearray[66];
  62
/*
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian.
 *
 * The five rows are, in order: p, a, b, x, y (see the tag on each row).
 * NOTE(review): x and y are presumably the coordinates of the curve's base
 * point - confirm against the code that consumes this table.
 */
static const felem_bytearray nistp521_curve_params[5] = {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
};
 114
/*-
 * The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with nine values. These values are either 64 or
 * 128 bits and the field element represented is:
 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 * 58 bits apart, but are greater than 58 bits in length, the most significant
 * bits of each limb overlap with the least significant bits of the next.
 *
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 * 'largefelem' */

/* Number of limbs in a field element. */
# define NLIMBS 9

typedef uint64_t limb;
typedef limb felem[NLIMBS];
typedef uint128_t largefelem[NLIMBS];

/* Masks selecting the low 57 and the low 58 bits of a limb, respectively. */
static const limb bottom57bits = 0x1ffffffffffffff;
static const limb bottom58bits = 0x3ffffffffffffff;
 137
 138 /*
 139  * bin66_to_felem takes a little-endian byte array and converts it into felem
 140  * form. This assumes that the CPU is little-endian.
 141  */
 142 static void bin66_to_felem(felem out, const u8 in[66])
 143 {
 144     out[0] = (*((limb *) & in[0])) & bottom58bits;
 145     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 146     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 147     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 148     out[4] = (*((limb *) & in[29])) & bottom58bits;
 149     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 150     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 151     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 152     out[8] = (*((limb *) & in[58])) & bottom57bits;
 153 }
 154
 155 /*
 156  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 157  * array. This assumes that the CPU is little-endian.
 158  */
 159 static void felem_to_bin66(u8 out[66], const felem in)
 160 {
 161     memset(out, 0, 66);
 162     (*((limb *) & out[0])) = in[0];
 163     (*((limb *) & out[7])) |= in[1] << 2;
 164     (*((limb *) & out[14])) |= in[2] << 4;
 165     (*((limb *) & out[21])) |= in[3] << 6;
 166     (*((limb *) & out[29])) = in[4];
 167     (*((limb *) & out[36])) |= in[5] << 2;
 168     (*((limb *) & out[43])) |= in[6] << 4;
 169     (*((limb *) & out[50])) |= in[7] << 6;
 170     (*((limb *) & out[58])) = in[8];
 171 }
 172
 173 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 174 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 175 {
 176     unsigned i;
 177     for (i = 0; i < len; ++i)
 178         out[i] = in[len - 1 - i];
 179 }
 180
 181 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 182 static int BN_to_felem(felem out, const BIGNUM *bn)
 183 {
 184     felem_bytearray b_in;
 185     felem_bytearray b_out;
 186     unsigned num_bytes;
 187
 188     /* BN_bn2bin eats leading zeroes */
 189     memset(b_out, 0, sizeof(b_out));
 190     num_bytes = BN_num_bytes(bn);
 191     if (num_bytes > sizeof b_out) {
 192         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 193         return 0;
 194     }
 195     if (BN_is_negative(bn)) {
 196         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 197         return 0;
 198     }
 199     num_bytes = BN_bn2bin(bn, b_in);
 200     flip_endian(b_out, b_in, num_bytes);
 201     bin66_to_felem(out, b_out);
 202     return 1;
 203 }
 204
 205 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 206 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 207 {
 208     felem_bytearray b_in, b_out;
 209     felem_to_bin66(b_in, in);
 210     flip_endian(b_out, b_in, sizeof b_out);
 211     return BN_bin2bn(b_out, sizeof b_out, out);
 212 }
 213
 214 /*-
 215  * Field operations
 216  * ----------------
 217  */
 218
 219 static void felem_one(felem out)
 220 {
 221     out[0] = 1;
 222     out[1] = 0;
 223     out[2] = 0;
 224     out[3] = 0;
 225     out[4] = 0;
 226     out[5] = 0;
 227     out[6] = 0;
 228     out[7] = 0;
 229     out[8] = 0;
 230 }
 231
 232 static void felem_assign(felem out, const felem in)
 233 {
 234     out[0] = in[0];
 235     out[1] = in[1];
 236     out[2] = in[2];
 237     out[3] = in[3];
 238     out[4] = in[4];
 239     out[5] = in[5];
 240     out[6] = in[6];
 241     out[7] = in[7];
 242     out[8] = in[8];
 243 }
 244
 245 /* felem_sum64 sets out = out + in. */
 246 static void felem_sum64(felem out, const felem in)
 247 {
 248     out[0] += in[0];
 249     out[1] += in[1];
 250     out[2] += in[2];
 251     out[3] += in[3];
 252     out[4] += in[4];
 253     out[5] += in[5];
 254     out[6] += in[6];
 255     out[7] += in[7];
 256     out[8] += in[8];
 257 }
 258
 259 /* felem_scalar sets out = in * scalar */
 260 static void felem_scalar(felem out, const felem in, limb scalar)
 261 {
 262     out[0] = in[0] * scalar;
 263     out[1] = in[1] * scalar;
 264     out[2] = in[2] * scalar;
 265     out[3] = in[3] * scalar;
 266     out[4] = in[4] * scalar;
 267     out[5] = in[5] * scalar;
 268     out[6] = in[6] * scalar;
 269     out[7] = in[7] * scalar;
 270     out[8] = in[8] * scalar;
 271 }
 272
 273 /* felem_scalar64 sets out = out * scalar */
 274 static void felem_scalar64(felem out, limb scalar)
 275 {
 276     out[0] *= scalar;
 277     out[1] *= scalar;
 278     out[2] *= scalar;
 279     out[3] *= scalar;
 280     out[4] *= scalar;
 281     out[5] *= scalar;
 282     out[6] *= scalar;
 283     out[7] *= scalar;
 284     out[8] *= scalar;
 285 }
 286
 287 /* felem_scalar128 sets out = out * scalar */
 288 static void felem_scalar128(largefelem out, limb scalar)
 289 {
 290     out[0] *= scalar;
 291     out[1] *= scalar;
 292     out[2] *= scalar;
 293     out[3] *= scalar;
 294     out[4] *= scalar;
 295     out[5] *= scalar;
 296     out[6] *= scalar;
 297     out[7] *= scalar;
 298     out[8] *= scalar;
 299 }
 300
 301 /*-
 302  * felem_neg sets |out| to |-in|
 303  * On entry:
 304  *   in[i] < 2^59 + 2^14
 305  * On exit:
 306  *   out[i] < 2^62
 307  */
 308 static void felem_neg(felem out, const felem in)
 309 {
 310     /* In order to prevent underflow, we subtract from 0 mod p. */
 311     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 312     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 313
 314     out[0] = two62m3 - in[0];
 315     out[1] = two62m2 - in[1];
 316     out[2] = two62m2 - in[2];
 317     out[3] = two62m2 - in[3];
 318     out[4] = two62m2 - in[4];
 319     out[5] = two62m2 - in[5];
 320     out[6] = two62m2 - in[6];
 321     out[7] = two62m2 - in[7];
 322     out[8] = two62m2 - in[8];
 323 }
 324
 325 /*-
 326  * felem_diff64 subtracts |in| from |out|
 327  * On entry:
 328  *   in[i] < 2^59 + 2^14
 329  * On exit:
 330  *   out[i] < out[i] + 2^62
 331  */
 332 static void felem_diff64(felem out, const felem in)
 333 {
 334     /*
 335      * In order to prevent underflow, we add 0 mod p before subtracting.
 336      */
 337     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 338     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 339
 340     out[0] += two62m3 - in[0];
 341     out[1] += two62m2 - in[1];
 342     out[2] += two62m2 - in[2];
 343     out[3] += two62m2 - in[3];
 344     out[4] += two62m2 - in[4];
 345     out[5] += two62m2 - in[5];
 346     out[6] += two62m2 - in[6];
 347     out[7] += two62m2 - in[7];
 348     out[8] += two62m2 - in[8];
 349 }
 350
 351 /*-
 352  * felem_diff_128_64 subtracts |in| from |out|
 353  * On entry:
 354  *   in[i] < 2^62 + 2^17
 355  * On exit:
 356  *   out[i] < out[i] + 2^63
 357  */
 358 static void felem_diff_128_64(largefelem out, const felem in)
 359 {
 360     /*
 361      * In order to prevent underflow, we add 0 mod p before subtracting.
 362      */
 363     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
 364     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
 365
 366     out[0] += two63m6 - in[0];
 367     out[1] += two63m5 - in[1];
 368     out[2] += two63m5 - in[2];
 369     out[3] += two63m5 - in[3];
 370     out[4] += two63m5 - in[4];
 371     out[5] += two63m5 - in[5];
 372     out[6] += two63m5 - in[6];
 373     out[7] += two63m5 - in[7];
 374     out[8] += two63m5 - in[8];
 375 }
 376
 377 /*-
 378  * felem_diff_128_64 subtracts |in| from |out|
 379  * On entry:
 380  *   in[i] < 2^126
 381  * On exit:
 382  *   out[i] < out[i] + 2^127 - 2^69
 383  */
 384 static void felem_diff128(largefelem out, const largefelem in)
 385 {
 386     /*
 387      * In order to prevent underflow, we add 0 mod p before subtracting.
 388      */
 389     static const uint128_t two127m70 =
 390         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 391     static const uint128_t two127m69 =
 392         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 393
 394     out[0] += (two127m70 - in[0]);
 395     out[1] += (two127m69 - in[1]);
 396     out[2] += (two127m69 - in[2]);
 397     out[3] += (two127m69 - in[3]);
 398     out[4] += (two127m69 - in[4]);
 399     out[5] += (two127m69 - in[5]);
 400     out[6] += (two127m69 - in[6]);
 401     out[7] += (two127m69 - in[7]);
 402     out[8] += (two127m69 - in[8]);
 403 }
 404
 405 /*-
 406  * felem_square sets |out| = |in|^2
 407  * On entry:
 408  *   in[i] < 2^62
 409  * On exit:
 410  *   out[i] < 17 * max(in[i]) * max(in[i])
 411  */
 412 static void felem_square(largefelem out, const felem in)
 413 {
 414     felem inx2, inx4;
 415     felem_scalar(inx2, in, 2);
 416     felem_scalar(inx4, in, 4);
 417
 418     /*-
 419      * We have many cases were we want to do
 420      *   in[x] * in[y] +
 421      *   in[y] * in[x]
 422      * This is obviously just
 423      *   2 * in[x] * in[y]
 424      * However, rather than do the doubling on the 128 bit result, we
 425      * double one of the inputs to the multiplication by reading from
 426      * |inx2|
 427      */
 428
 429     out[0] = ((uint128_t) in[0]) * in[0];
 430     out[1] = ((uint128_t) in[0]) * inx2[1];
 431     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 432     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 433     out[4] = ((uint128_t) in[0]) * inx2[4] +
 434              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 435     out[5] = ((uint128_t) in[0]) * inx2[5] +
 436              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 437     out[6] = ((uint128_t) in[0]) * inx2[6] +
 438              ((uint128_t) in[1]) * inx2[5] +
 439              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 440     out[7] = ((uint128_t) in[0]) * inx2[7] +
 441              ((uint128_t) in[1]) * inx2[6] +
 442              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 443     out[8] = ((uint128_t) in[0]) * inx2[8] +
 444              ((uint128_t) in[1]) * inx2[7] +
 445              ((uint128_t) in[2]) * inx2[6] +
 446              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 447
 448     /*
 449      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 450      * They correspond to locations one bit up from the limbs produced above
 451      * so we would have to multiply by two to align them. Again, rather than
 452      * operate on the 128-bit result, we double one of the inputs to the
 453      * multiplication. If we want to double for both this reason, and the
 454      * reason above, then we end up multiplying by four.
 455      */
 456
 457     /* 9 */
 458     out[0] += ((uint128_t) in[1]) * inx4[8] +
 459               ((uint128_t) in[2]) * inx4[7] +
 460               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 461
 462     /* 10 */
 463     out[1] += ((uint128_t) in[2]) * inx4[8] +
 464               ((uint128_t) in[3]) * inx4[7] +
 465               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 466
 467     /* 11 */
 468     out[2] += ((uint128_t) in[3]) * inx4[8] +
 469               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 470
 471     /* 12 */
 472     out[3] += ((uint128_t) in[4]) * inx4[8] +
 473               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 474
 475     /* 13 */
 476     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 477
 478     /* 14 */
 479     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 480
 481     /* 15 */
 482     out[6] += ((uint128_t) in[7]) * inx4[8];
 483
 484     /* 16 */
 485     out[7] += ((uint128_t) in[8]) * inx2[8];
 486 }
 487
 488 /*-
 489  * felem_mul sets |out| = |in1| * |in2|
 490  * On entry:
 491  *   in1[i] < 2^64
 492  *   in2[i] < 2^63
 493  * On exit:
 494  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 495  */
 496 static void felem_mul(largefelem out, const felem in1, const felem in2)
 497 {
 498     felem in2x2;
 499     felem_scalar(in2x2, in2, 2);
 500
 501     out[0] = ((uint128_t) in1[0]) * in2[0];
 502
 503     out[1] = ((uint128_t) in1[0]) * in2[1] +
 504              ((uint128_t) in1[1]) * in2[0];
 505
 506     out[2] = ((uint128_t) in1[0]) * in2[2] +
 507              ((uint128_t) in1[1]) * in2[1] +
 508              ((uint128_t) in1[2]) * in2[0];
 509
 510     out[3] = ((uint128_t) in1[0]) * in2[3] +
 511              ((uint128_t) in1[1]) * in2[2] +
 512              ((uint128_t) in1[2]) * in2[1] +
 513              ((uint128_t) in1[3]) * in2[0];
 514
 515     out[4] = ((uint128_t) in1[0]) * in2[4] +
 516              ((uint128_t) in1[1]) * in2[3] +
 517              ((uint128_t) in1[2]) * in2[2] +
 518              ((uint128_t) in1[3]) * in2[1] +
 519              ((uint128_t) in1[4]) * in2[0];
 520
 521     out[5] = ((uint128_t) in1[0]) * in2[5] +
 522              ((uint128_t) in1[1]) * in2[4] +
 523              ((uint128_t) in1[2]) * in2[3] +
 524              ((uint128_t) in1[3]) * in2[2] +
 525              ((uint128_t) in1[4]) * in2[1] +
 526              ((uint128_t) in1[5]) * in2[0];
 527
 528     out[6] = ((uint128_t) in1[0]) * in2[6] +
 529              ((uint128_t) in1[1]) * in2[5] +
 530              ((uint128_t) in1[2]) * in2[4] +
 531              ((uint128_t) in1[3]) * in2[3] +
 532              ((uint128_t) in1[4]) * in2[2] +
 533              ((uint128_t) in1[5]) * in2[1] +
 534              ((uint128_t) in1[6]) * in2[0];
 535
 536     out[7] = ((uint128_t) in1[0]) * in2[7] +
 537              ((uint128_t) in1[1]) * in2[6] +
 538              ((uint128_t) in1[2]) * in2[5] +
 539              ((uint128_t) in1[3]) * in2[4] +
 540              ((uint128_t) in1[4]) * in2[3] +
 541              ((uint128_t) in1[5]) * in2[2] +
 542              ((uint128_t) in1[6]) * in2[1] +
 543              ((uint128_t) in1[7]) * in2[0];
 544
 545     out[8] = ((uint128_t) in1[0]) * in2[8] +
 546              ((uint128_t) in1[1]) * in2[7] +
 547              ((uint128_t) in1[2]) * in2[6] +
 548              ((uint128_t) in1[3]) * in2[5] +
 549              ((uint128_t) in1[4]) * in2[4] +
 550              ((uint128_t) in1[5]) * in2[3] +
 551              ((uint128_t) in1[6]) * in2[2] +
 552              ((uint128_t) in1[7]) * in2[1] +
 553              ((uint128_t) in1[8]) * in2[0];
 554
 555     /* See comment in felem_square about the use of in2x2 here */
 556
 557     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 558               ((uint128_t) in1[2]) * in2x2[7] +
 559               ((uint128_t) in1[3]) * in2x2[6] +
 560               ((uint128_t) in1[4]) * in2x2[5] +
 561               ((uint128_t) in1[5]) * in2x2[4] +
 562               ((uint128_t) in1[6]) * in2x2[3] +
 563               ((uint128_t) in1[7]) * in2x2[2] +
 564               ((uint128_t) in1[8]) * in2x2[1];
 565
 566     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 567               ((uint128_t) in1[3]) * in2x2[7] +
 568               ((uint128_t) in1[4]) * in2x2[6] +
 569               ((uint128_t) in1[5]) * in2x2[5] +
 570               ((uint128_t) in1[6]) * in2x2[4] +
 571               ((uint128_t) in1[7]) * in2x2[3] +
 572               ((uint128_t) in1[8]) * in2x2[2];
 573
 574     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 575               ((uint128_t) in1[4]) * in2x2[7] +
 576               ((uint128_t) in1[5]) * in2x2[6] +
 577               ((uint128_t) in1[6]) * in2x2[5] +
 578               ((uint128_t) in1[7]) * in2x2[4] +
 579               ((uint128_t) in1[8]) * in2x2[3];
 580
 581     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 582               ((uint128_t) in1[5]) * in2x2[7] +
 583               ((uint128_t) in1[6]) * in2x2[6] +
 584               ((uint128_t) in1[7]) * in2x2[5] +
 585               ((uint128_t) in1[8]) * in2x2[4];
 586
 587     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 588               ((uint128_t) in1[6]) * in2x2[7] +
 589               ((uint128_t) in1[7]) * in2x2[6] +
 590               ((uint128_t) in1[8]) * in2x2[5];
 591
 592     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 593               ((uint128_t) in1[7]) * in2x2[7] +
 594               ((uint128_t) in1[8]) * in2x2[6];
 595
 596     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 597               ((uint128_t) in1[8]) * in2x2[7];
 598
 599     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 600 }
 601
 602 static const limb bottom52bits = 0xfffffffffffff;
 603
 604 /*-
 605  * felem_reduce converts a largefelem to an felem.
 606  * On entry:
 607  *   in[i] < 2^128
 608  * On exit:
 609  *   out[i] < 2^59 + 2^14
 610  */
 611 static void felem_reduce(felem out, const largefelem in)
 612 {
 613     u64 overflow1, overflow2;
 614
 615     out[0] = ((limb) in[0]) & bottom58bits;
 616     out[1] = ((limb) in[1]) & bottom58bits;
 617     out[2] = ((limb) in[2]) & bottom58bits;
 618     out[3] = ((limb) in[3]) & bottom58bits;
 619     out[4] = ((limb) in[4]) & bottom58bits;
 620     out[5] = ((limb) in[5]) & bottom58bits;
 621     out[6] = ((limb) in[6]) & bottom58bits;
 622     out[7] = ((limb) in[7]) & bottom58bits;
 623     out[8] = ((limb) in[8]) & bottom58bits;
 624
 625     /* out[i] < 2^58 */
 626
 627     out[1] += ((limb) in[0]) >> 58;
 628     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 629     /*-
 630      * out[1] < 2^58 + 2^6 + 2^58
 631      *        = 2^59 + 2^6
 632      */
 633     out[2] += ((limb) (in[0] >> 64)) >> 52;
 634
 635     out[2] += ((limb) in[1]) >> 58;
 636     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 637     out[3] += ((limb) (in[1] >> 64)) >> 52;
 638
 639     out[3] += ((limb) in[2]) >> 58;
 640     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 641     out[4] += ((limb) (in[2] >> 64)) >> 52;
 642
 643     out[4] += ((limb) in[3]) >> 58;
 644     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 645     out[5] += ((limb) (in[3] >> 64)) >> 52;
 646
 647     out[5] += ((limb) in[4]) >> 58;
 648     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 649     out[6] += ((limb) (in[4] >> 64)) >> 52;
 650
 651     out[6] += ((limb) in[5]) >> 58;
 652     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 653     out[7] += ((limb) (in[5] >> 64)) >> 52;
 654
 655     out[7] += ((limb) in[6]) >> 58;
 656     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 657     out[8] += ((limb) (in[6] >> 64)) >> 52;
 658
 659     out[8] += ((limb) in[7]) >> 58;
 660     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 661     /*-
 662      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 663      *            < 2^59 + 2^13
 664      */
 665     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 666
 667     overflow1 += ((limb) in[8]) >> 58;
 668     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 669     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 670
 671     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 672     overflow2 <<= 1;            /* overflow2 < 2^13 */
 673
 674     out[0] += overflow1;        /* out[0] < 2^60 */
 675     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 676
 677     out[1] += out[0] >> 58;
 678     out[0] &= bottom58bits;
 679     /*-
 680      * out[0] < 2^58
 681      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 682      *        < 2^59 + 2^14
 683      */
 684 }
 685
 686 static void felem_square_reduce(felem out, const felem in)
 687 {
 688     largefelem tmp;
 689     felem_square(tmp, in);
 690     felem_reduce(out, tmp);
 691 }
 692
 693 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 694 {
 695     largefelem tmp;
 696     felem_mul(tmp, in1, in2);
 697     felem_reduce(out, tmp);
 698 }
 699
/*-
 * felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * With p = 2^521 - 1 the exponent is p - 2 = 2^521 - 3. It is built up by a
 * fixed sequence of squarings and multiplications. The comment after each
 * step gives the exponent of |in| held by the destination at that point;
 * inside a loop it gives the exponent after the final iteration.
 */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2, ftmp3, ftmp4;
    largefelem tmp;
    unsigned i;

    felem_square(tmp, in);
    felem_reduce(ftmp, tmp);    /* 2^1 */
    felem_mul(tmp, in, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
    felem_assign(ftmp2, ftmp);
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
    felem_mul(tmp, in, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */

    felem_square(tmp, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */

    felem_assign(ftmp2, ftmp3);
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
    /*
     * NOTE(review): this copy looks redundant - ftmp4 is overwritten by the
     * felem_reduce two lines below before it is ever read.
     */
    felem_assign(ftmp4, ftmp3);
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
    felem_square(tmp, ftmp4);
    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    /*
     * From here on each stage doubles the length of the run of one bits:
     * square k times, then multiply by the saved pre-squaring value.
     */
    for (i = 0; i < 8; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 16; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 32; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 64; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 128; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 256; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */

    for (i = 0; i < 9; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
    }
    /* ftmp4 holds 2^9 - 2^2, which fills in bits 2..8 of the exponent */
    felem_mul(tmp, ftmp3, ftmp4);
    felem_reduce(ftmp3, tmp);   /* 2^521 - 2^2 */
    felem_mul(tmp, ftmp3, in);
    felem_reduce(out, tmp);     /* 2^521 - 3 */
}
 807
/*
 * This is 2^521-1, expressed as an felem: eight limbs with all 58 low bits
 * set, followed by a top limb with its 57 low bits set (8 * 58 + 57 = 521).
 */
static const felem kPrime = {
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
};
 814
 815 /*-
 816  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 817  * otherwise.
 818  * On entry:
 819  *   in[i] < 2^59 + 2^14
 820  */
 821 static limb felem_is_zero(const felem in)
 822 {
 823     felem ftmp;
 824     limb is_zero, is_p;
 825     felem_assign(ftmp, in);
 826
 827     ftmp[0] += ftmp[8] >> 57;
 828     ftmp[8] &= bottom57bits;
 829     /* ftmp[8] < 2^57 */
 830     ftmp[1] += ftmp[0] >> 58;
 831     ftmp[0] &= bottom58bits;
 832     ftmp[2] += ftmp[1] >> 58;
 833     ftmp[1] &= bottom58bits;
 834     ftmp[3] += ftmp[2] >> 58;
 835     ftmp[2] &= bottom58bits;
 836     ftmp[4] += ftmp[3] >> 58;
 837     ftmp[3] &= bottom58bits;
 838     ftmp[5] += ftmp[4] >> 58;
 839     ftmp[4] &= bottom58bits;
 840     ftmp[6] += ftmp[5] >> 58;
 841     ftmp[5] &= bottom58bits;
 842     ftmp[7] += ftmp[6] >> 58;
 843     ftmp[6] &= bottom58bits;
 844     ftmp[8] += ftmp[7] >> 58;
 845     ftmp[7] &= bottom58bits;
 846     /* ftmp[8] < 2^57 + 4 */
 847
 848     /*
 849      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 850      * than our bound for ftmp[8]. Therefore we only have to check if the
 851      * zero is zero or 2^521-1.
 852      */
 853
 854     is_zero = 0;
 855     is_zero |= ftmp[0];
 856     is_zero |= ftmp[1];
 857     is_zero |= ftmp[2];
 858     is_zero |= ftmp[3];
 859     is_zero |= ftmp[4];
 860     is_zero |= ftmp[5];
 861     is_zero |= ftmp[6];
 862     is_zero |= ftmp[7];
 863     is_zero |= ftmp[8];
 864
 865     is_zero--;
 866     /*
 867      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 868      * can be set is if is_zero was 0 before the decrement.
 869      */
 870     is_zero = ((s64) is_zero) >> 63;
 871
 872     is_p = ftmp[0] ^ kPrime[0];
 873     is_p |= ftmp[1] ^ kPrime[1];
 874     is_p |= ftmp[2] ^ kPrime[2];
 875     is_p |= ftmp[3] ^ kPrime[3];
 876     is_p |= ftmp[4] ^ kPrime[4];
 877     is_p |= ftmp[5] ^ kPrime[5];
 878     is_p |= ftmp[6] ^ kPrime[6];
 879     is_p |= ftmp[7] ^ kPrime[7];
 880     is_p |= ftmp[8] ^ kPrime[8];
 881
 882     is_p--;
 883     is_p = ((s64) is_p) >> 63;
 884
 885     is_zero |= is_p;
 886     return is_zero;
 887 }
 888
 889 static int felem_is_zero_int(const felem in)
 890 {
 891     return (int)(felem_is_zero(in) & ((limb) 1));
 892 }
 893
 894 /*-
 895  * felem_contract converts |in| to its unique, minimal representation.
 896  * On entry:
 897  *   in[i] < 2^59 + 2^14
 898  */
 899 static void felem_contract(felem out, const felem in)
 900 {
 901     limb is_p, is_greater, sign;
 902     static const limb two58 = ((limb) 1) << 58;
 903
 904     felem_assign(out, in);
 905
 906     out[0] += out[8] >> 57;
 907     out[8] &= bottom57bits;
 908     /* out[8] < 2^57 */
 909     out[1] += out[0] >> 58;
 910     out[0] &= bottom58bits;
 911     out[2] += out[1] >> 58;
 912     out[1] &= bottom58bits;
 913     out[3] += out[2] >> 58;
 914     out[2] &= bottom58bits;
 915     out[4] += out[3] >> 58;
 916     out[3] &= bottom58bits;
 917     out[5] += out[4] >> 58;
 918     out[4] &= bottom58bits;
 919     out[6] += out[5] >> 58;
 920     out[5] &= bottom58bits;
 921     out[7] += out[6] >> 58;
 922     out[6] &= bottom58bits;
 923     out[8] += out[7] >> 58;
 924     out[7] &= bottom58bits;
 925     /* out[8] < 2^57 + 4 */
 926
 927     /*
 928      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 929      * out. See the comments in felem_is_zero regarding why we don't test for
 930      * other multiples of the prime.
 931      */
 932
 933     /*
 934      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 935      */
 936
 937     is_p = out[0] ^ kPrime[0];
 938     is_p |= out[1] ^ kPrime[1];
 939     is_p |= out[2] ^ kPrime[2];
 940     is_p |= out[3] ^ kPrime[3];
 941     is_p |= out[4] ^ kPrime[4];
 942     is_p |= out[5] ^ kPrime[5];
 943     is_p |= out[6] ^ kPrime[6];
 944     is_p |= out[7] ^ kPrime[7];
 945     is_p |= out[8] ^ kPrime[8];
 946
 947     is_p--;
 948     is_p &= is_p << 32;
 949     is_p &= is_p << 16;
 950     is_p &= is_p << 8;
 951     is_p &= is_p << 4;
 952     is_p &= is_p << 2;
 953     is_p &= is_p << 1;
 954     is_p = ((s64) is_p) >> 63;
 955     is_p = ~is_p;
 956
 957     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 958
 959     out[0] &= is_p;
 960     out[1] &= is_p;
 961     out[2] &= is_p;
 962     out[3] &= is_p;
 963     out[4] &= is_p;
 964     out[5] &= is_p;
 965     out[6] &= is_p;
 966     out[7] &= is_p;
 967     out[8] &= is_p;
 968
 969     /*
 970      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 971      * 57 is greater than zero as (2^521-1) + x >= 2^522
 972      */
 973     is_greater = out[8] >> 57;
 974     is_greater |= is_greater << 32;
 975     is_greater |= is_greater << 16;
 976     is_greater |= is_greater << 8;
 977     is_greater |= is_greater << 4;
 978     is_greater |= is_greater << 2;
 979     is_greater |= is_greater << 1;
 980     is_greater = ((s64) is_greater) >> 63;
 981
 982     out[0] -= kPrime[0] & is_greater;
 983     out[1] -= kPrime[1] & is_greater;
 984     out[2] -= kPrime[2] & is_greater;
 985     out[3] -= kPrime[3] & is_greater;
 986     out[4] -= kPrime[4] & is_greater;
 987     out[5] -= kPrime[5] & is_greater;
 988     out[6] -= kPrime[6] & is_greater;
 989     out[7] -= kPrime[7] & is_greater;
 990     out[8] -= kPrime[8] & is_greater;
 991
 992     /* Eliminate negative coefficients */
 993     sign = -(out[0] >> 63);
 994     out[0] += (two58 & sign);
 995     out[1] -= (1 & sign);
 996     sign = -(out[1] >> 63);
 997     out[1] += (two58 & sign);
 998     out[2] -= (1 & sign);
 999     sign = -(out[2] >> 63);
1000     out[2] += (two58 & sign);
1001     out[3] -= (1 & sign);
1002     sign = -(out[3] >> 63);
1003     out[3] += (two58 & sign);
1004     out[4] -= (1 & sign);
1005     sign = -(out[4] >> 63);
1006     out[4] += (two58 & sign);
1007     out[5] -= (1 & sign);
1008     sign = -(out[0] >> 63);
1009     out[5] += (two58 & sign);
1010     out[6] -= (1 & sign);
1011     sign = -(out[6] >> 63);
1012     out[6] += (two58 & sign);
1013     out[7] -= (1 & sign);
1014     sign = -(out[7] >> 63);
1015     out[7] += (two58 & sign);
1016     out[8] -= (1 & sign);
1017     sign = -(out[5] >> 63);
1018     out[5] += (two58 & sign);
1019     out[6] -= (1 & sign);
1020     sign = -(out[6] >> 63);
1021     out[6] += (two58 & sign);
1022     out[7] -= (1 & sign);
1023     sign = -(out[7] >> 63);
1024     out[7] += (two58 & sign);
1025     out[8] -= (1 & sign);
1026 }
1027
1028 /*-
1029  * Group operations
1030  * ----------------
1031  *
1032  * Building on top of the field operations we have the operations on the
1033  * elliptic curve group itself. Points on the curve are represented in Jacobian
1034  * coordinates */
1035
/*-
 * point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * The comments inside the body track an upper bound on every limb so that
 * each intermediate stays within what the following operation accepts.
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    largefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* ftmp and ftmp2 start as copies of x; they become x-delta and x+delta */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);
    /* ftmp[i] < 2^61 */
    felem_sum64(ftmp2, delta);
    /* ftmp2[i] < 2^60 + 2^15 */
    felem_scalar64(ftmp2, 3);
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
    felem_mul(tmp, ftmp, ftmp2);
    /*-
     * tmp[i] < 17(3*2^121 + 3*2^76)
     *        = 51*2^121 + 51*2^76
     *        < 64*2^121 + 64*2^76
     *        = 2^127 + 2^82
     *        < 2^128
     */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /*
     * tmp[i] < 17*2^120 < 2^125
     */
    felem_assign(ftmp, beta);
    felem_scalar64(ftmp, 8);
    /* ftmp[i] < 2^62 + 2^17 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is overwritten with gamma + delta so one subtraction suffices */
    felem_sum64(delta, gamma);
    /* delta[i] < 2^60 + 2^15 */
    felem_assign(ftmp, y_in);
    felem_sum64(ftmp, z_in);
    /* ftmp[i] < 2^60 + 2^15 */
    felem_square(tmp, ftmp);
    /*
     * tmp[i] < 17(2^122) < 2^127
     */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^127 + 2^63 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);
    /* beta[i] < 2^61 + 2^16 */
    felem_diff64(beta, x_out);
    /* beta[i] < 2^61 + 2^60 + 2^16 */
    felem_mul(tmp, alpha, beta);
    /*-
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        < 2^128
     */
    felem_square(tmp2, gamma);
    /*-
     * tmp2[i] < 17*(2^59 + 2^14)^2
     *         = 17*(2^118 + 2^74 + 2^28)
     */
    felem_scalar128(tmp2, 8);
    /*-
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
     *         < 2^126
     */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
     *          2^74 + 2^69 + 2^34 + 2^30
     *        < 2^128
     */
    felem_reduce(y_out, tmp);
}
1141
1142 /* copy_conditional copies in to out iff mask is all ones. */
1143 static void copy_conditional(felem out, const felem in, limb mask)
1144 {
1145     unsigned i;
1146     for (i = 0; i < NLIMBS; ++i) {
1147         const limb tmp = mask & (in[i] ^ out[i]);
1148         out[i] ^= tmp;
1149     }
1150 }
1151
/*-
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * This function includes a branch for checking whether the two input points
 * are equal (while not equal to the point at infinity). This case never
 * happens during single point multiplication, so there is no timing leak for
 * ECDH or ECDSA signing.
 *
 * The result is built in local temporaries and copied to (x3, y3, z3) only
 * at the very end. If either input has z == 0, the other input is returned
 * through masked copies instead of the computed sum.
 */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const felem x2, const felem y2,
                      const felem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
    largefelem tmp, tmp2;
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;

    /* masks (all ones / zero) recording whether each input has z == 0 */
    z1_is_zero = felem_is_zero(z1);
    z2_is_zero = felem_is_zero(z2);

    /* ftmp = z1z1 = z1**2 */
    felem_square(tmp, z1);
    felem_reduce(ftmp, tmp);

    if (!mixed) {
        /* ftmp2 = z2z2 = z2**2 */
        felem_square(tmp, z2);
        felem_reduce(ftmp2, tmp);

        /* u1 = ftmp3 = x1*z2z2 */
        felem_mul(tmp, x1, ftmp2);
        felem_reduce(ftmp3, tmp);

        /* ftmp5 = z1 + z2 */
        felem_assign(ftmp5, z1);
        felem_sum64(ftmp5, z2);
        /* ftmp5[i] < 2^61 */

        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
        felem_square(tmp, ftmp5);
        /* tmp[i] < 17*2^122 */
        felem_diff_128_64(tmp, ftmp);
        /* tmp[i] < 17*2^122 + 2^63 */
        felem_diff_128_64(tmp, ftmp2);
        /* tmp[i] < 17*2^122 + 2^64 */
        felem_reduce(ftmp5, tmp);

        /* ftmp2 = z2 * z2z2 */
        felem_mul(tmp, ftmp2, z2);
        felem_reduce(ftmp2, tmp);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_mul(tmp, y1, ftmp2);
        felem_reduce(ftmp6, tmp);
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
         */

        /* u1 = ftmp3 = x1*z2z2 */
        felem_assign(ftmp3, x1);

        /* ftmp5 = 2*z1z2 */
        felem_scalar(ftmp5, z1, 2);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_assign(ftmp6, y1);
    }

    /* u2 = x2*z1z1 */
    felem_mul(tmp, x2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* h = ftmp4 = u2 - u1 */
    felem_diff_128_64(tmp, ftmp3);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp4, tmp);

    /* h == 0 means u1 == u2 */
    x_equal = felem_is_zero(ftmp4);

    /* z_out = ftmp5 * h */
    felem_mul(tmp, ftmp5, ftmp4);
    felem_reduce(z_out, tmp);

    /* ftmp = z1 * z1z1 */
    felem_mul(tmp, ftmp, z1);
    felem_reduce(ftmp, tmp);

    /* s2 = tmp = y2 * z1**3 */
    felem_mul(tmp, y2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* r = ftmp5 = (s2 - s1)*2 */
    felem_diff_128_64(tmp, ftmp6);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp5, tmp);
    /* tested before the doubling below: s2 - s1 == 0 means s1 == s2 */
    y_equal = felem_is_zero(ftmp5);
    felem_scalar64(ftmp5, 2);
    /* ftmp5[i] < 2^61 */

    /*
     * Both inputs are the same finite point: fall back to point_double.
     * This is the data-dependent branch mentioned in the header comment.
     */
    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* I = ftmp = (2h)**2 */
    felem_assign(ftmp, ftmp4);
    felem_scalar64(ftmp, 2);
    /* ftmp[i] < 2^61 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 17*2^122 */
    felem_reduce(ftmp, tmp);

    /* J = ftmp2 = h * I */
    felem_mul(tmp, ftmp4, ftmp);
    felem_reduce(ftmp2, tmp);

    /* V = ftmp4 = U1 * I */
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);

    /* x_out = r**2 - J - 2V */
    felem_square(tmp, ftmp5);
    /* tmp[i] < 17*2^122 */
    felem_diff_128_64(tmp, ftmp2);
    /* tmp[i] < 17*2^122 + 2^63 */
    /* keep an undoubled copy of V in ftmp3 for the y_out computation */
    felem_assign(ftmp3, ftmp4);
    felem_scalar64(ftmp4, 2);
    /* ftmp4[i] < 2^61 */
    felem_diff_128_64(tmp, ftmp4);
    /* tmp[i] < 17*2^122 + 2^64 */
    felem_reduce(x_out, tmp);

    /* y_out = r(V-x_out) - 2 * s1 * J */
    felem_diff64(ftmp3, x_out);
    /*
     * ftmp3[i] < 2^60 + 2^60 = 2^61
     */
    felem_mul(tmp, ftmp5, ftmp3);
    /* tmp[i] < 17*2^122 */
    felem_mul(tmp2, ftmp6, ftmp2);
    /* tmp2[i] < 17*2^120 */
    felem_scalar128(tmp2, 2);
    /* tmp2[i] < 17*2^121 */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17*2^122
     *        = 2^127 + 2^126 + 2^122 - 2^69
     *        < 2^128
     */
    felem_reduce(y_out, tmp);

    /*
     * Handle inputs at infinity without branching: if z1 == 0 the result is
     * (x2, y2, z2); if z2 == 0 the result is (x1, y1, z1).
     */
    copy_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1317
1318 /*-
1319  * Base point pre computation
1320  * --------------------------
1321  *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
1325  *
1326  * For the base point table, z is usually 1 (0 for the point at infinity).
1327  * This table has 16 elements:
1328  * index | bits    | point
1329  * ------+---------+------------------------------
1330  *     0 | 0 0 0 0 | 0G
1331  *     1 | 0 0 0 1 | 1G
1332  *     2 | 0 0 1 0 | 2^130G
1333  *     3 | 0 0 1 1 | (2^130 + 1)G
1334  *     4 | 0 1 0 0 | 2^260G
1335  *     5 | 0 1 0 1 | (2^260 + 1)G
1336  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1337  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1338  *     8 | 1 0 0 0 | 2^390G
1339  *     9 | 1 0 0 1 | (2^390 + 1)G
1340  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1341  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1342  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1343  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1344  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1345  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1346  *
1347  * The reason for this is so that we can clock bits into four different
1348  * locations when doing simple scalar multiplies against the base point.
1349  *
 * Tables for other points have table[i] = i*P for i in 0 .. 16, where P is
 * the point being multiplied. */
1351
/*
 * gmul is the table of precomputed base points.
 *
 * Each of the 16 entries is a point stored as {x, y, z}, every coordinate
 * being a 9-limb felem. Entry 0 is all zeros (z = 0, the point at infinity);
 * every other entry has z = 1. Bits 3..0 of the index select which of
 * 2^390G, 2^260G, 2^130G and G are summed into the entry.
 */
static const felem gmul[16][3] = {
{{0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
1463
1464 /*
1465  * select_point selects the |idx|th point from a precomputation table and
1466  * copies it to out.
1467  */
1468  /* pre_comp below is of the size provided in |size| */
1469 static void select_point(const limb idx, unsigned int size,
1470                          const felem pre_comp[][3], felem out[3])
1471 {
1472     unsigned i, j;
1473     limb *outlimbs = &out[0][0];
1474
1475     memset(out, 0, sizeof(*out) * 3);
1476
1477     for (i = 0; i < size; i++) {
1478         const limb *inlimbs = &pre_comp[i][0][0];
1479         limb mask = i ^ idx;
1480         mask |= mask >> 4;
1481         mask |= mask >> 2;
1482         mask |= mask >> 1;
1483         mask &= 1;
1484         mask--;
1485         for (j = 0; j < NLIMBS * 3; j++)
1486             outlimbs[j] |= inlimbs[j] & mask;
1487     }
1488 }
1489
1490 /* get_bit returns the |i|th bit in |in| */
1491 static char get_bit(const felem_bytearray in, int i)
1492 {
1493     if (i < 0)
1494         return 0;
1495     return (in[i >> 3] >> (i & 7)) & 1;
1496 }
1497
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * |mixed| is forwarded to point_add for the additions of the non-generator
 * points; generator additions always pass 1 for it.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[16][3])
{
    int i, skip;
    unsigned num, gen_mul = (g_scalar != NULL);
    /* nq is the accumulator; tmp[0..2] hold the selected point, tmp[3] -Y */
    felem nq[3], tmp[4];
    limb bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (last quarter of rounds) and additions of other
     * points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    /* with no extra points only the 131 generator rounds are needed */
    for (i = (num_points ? 520 : 130); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 130)) {
            /*
             * Build a 4-bit table index from scalar bits i+390, i+260,
             * i+130 and i. At i == 130 only bit 520 is used.
             */
            bits = get_bit(g_scalar, i + 390) << 3;
            if (i < 130) {
                bits |= get_bit(g_scalar, i + 260) << 2;
                bits |= get_bit(g_scalar, i + 130) << 1;
                bits |= get_bit(g_scalar, i);
            }
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp, tmp);
            if (!skip) {
                /* The 1 argument below is for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* first contribution: just load it into the accumulator */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Gather the six bits i+4 .. i-1 of this scalar (get_bit
                 * returns 0 for index -1) and recode them into a sign and
                 * a digit that indexes the 17-entry table.
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                /* -(limb)sign is all ones when sign == 1, zero otherwise */
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* first contribution: just load it into the accumulator */
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1587
/*
 * Precomputation for the group generator. Objects are reference counted:
 * see nistp521_pre_comp_new, EC_nistp521_pre_comp_dup and
 * EC_nistp521_pre_comp_free.
 */
struct nistp521_pre_comp_st {
    felem g_pre_comp[16][3];    /* 16 points, each as (x, y, z); same shape
                                 * as the built-in gmul table */
    CRYPTO_REF_COUNT references; /* set to 1 on creation */
    CRYPTO_RWLOCK *lock;        /* passed to CRYPTO_UP_REF/CRYPTO_DOWN_REF */
};
1594
/*
 * EC_GFp_nistp521_method returns a pointer to the static EC_METHOD table for
 * this implementation. The initializer is positional: entries named
 * ec_GFp_nistp521_* are defined in this file, the remaining entries are
 * shared helpers, and 0 marks a slot that is left unset.
 */
const EC_METHOD *EC_GFp_nistp521_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp521_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp521_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp521_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp521_points_mul,
        ec_GFp_nistp521_precompute_mult,
        ec_GFp_nistp521_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key
    };

    return &ret;
}
1651
1652 /******************************************************************************/
1653 /*
1654  * FUNCTIONS TO MANAGE PRECOMPUTATION
1655  */
1656
1657 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1658 {
1659     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1660
1661     if (ret == NULL) {
1662         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1663         return ret;
1664     }
1665
1666     ret->references = 1;
1667
1668     ret->lock = CRYPTO_THREAD_lock_new();
1669     if (ret->lock == NULL) {
1670         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1671         OPENSSL_free(ret);
1672         return NULL;
1673     }
1674     return ret;
1675 }
1676
1677 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1678 {
1679     int i;
1680     if (p != NULL)
1681         CRYPTO_UP_REF(&p->references, &i, p->lock);
1682     return p;
1683 }
1684
1685 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1686 {
1687     int i;
1688
1689     if (p == NULL)
1690         return;
1691
1692     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1693     REF_PRINT_COUNT("EC_nistp521", x);
1694     if (i > 0)
1695         return;
1696     REF_ASSERT_ISNT(i < 0);
1697
1698     CRYPTO_THREAD_lock_free(p->lock);
1699     OPENSSL_free(p);
1700 }
1701
1702 /******************************************************************************/
1703 /*
1704  * OPENSSL EC_METHOD FUNCTIONS
1705  */
1706
1707 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1708 {
1709     int ret;
1710     ret = ec_GFp_simple_group_init(group);
1711     group->a_is_minus3 = 1;
1712     return ret;
1713 }
1714
1715 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1716                                     const BIGNUM *a, const BIGNUM *b,
1717                                     BN_CTX *ctx)
1718 {
1719     int ret = 0;
1720     BN_CTX *new_ctx = NULL;
1721     BIGNUM *curve_p, *curve_a, *curve_b;
1722
1723     if (ctx == NULL)
1724         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1725             return 0;
1726     BN_CTX_start(ctx);
1727     curve_p = BN_CTX_get(ctx);
1728     curve_a = BN_CTX_get(ctx);
1729     curve_b = BN_CTX_get(ctx);
1730     if (curve_b == NULL)
1731         goto err;
1732     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1733     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1734     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1735     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1736         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1737               EC_R_WRONG_CURVE_PARAMETERS);
1738         goto err;
1739     }
1740     group->field_mod_func = BN_nist_mod_521;
1741     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1742  err:
1743     BN_CTX_end(ctx);
1744     BN_CTX_free(new_ctx);
1745     return ret;
1746 }
1747
1748 /*
1749  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1750  * (X/Z^2, Y/Z^3)
1751  */
1752 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1753                                                  const EC_POINT *point,
1754                                                  BIGNUM *x, BIGNUM *y,
1755                                                  BN_CTX *ctx)
1756 {
1757     felem z1, z2, x_in, y_in, x_out, y_out;
1758     largefelem tmp;
1759
1760     if (EC_POINT_is_at_infinity(group, point)) {
1761         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1762               EC_R_POINT_AT_INFINITY);
1763         return 0;
1764     }
1765     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1766         (!BN_to_felem(z1, point->Z)))
1767         return 0;
1768     felem_inv(z2, z1);
1769     felem_square(tmp, z2);
1770     felem_reduce(z1, tmp);
1771     felem_mul(tmp, x_in, z1);
1772     felem_reduce(x_in, tmp);
1773     felem_contract(x_out, x_in);
1774     if (x != NULL) {
1775         if (!felem_to_BN(x, x_out)) {
1776             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1777                   ERR_R_BN_LIB);
1778             return 0;
1779         }
1780     }
1781     felem_mul(tmp, z1, z2);
1782     felem_reduce(z1, tmp);
1783     felem_mul(tmp, y_in, z1);
1784     felem_reduce(y_in, tmp);
1785     felem_contract(y_out, y_in);
1786     if (y != NULL) {
1787         if (!felem_to_BN(y, y_out)) {
1788             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1789                   ERR_R_BN_LIB);
1790             return 0;
1791         }
1792     }
1793     return 1;
1794 }
1795
1796 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1797 static void make_points_affine(size_t num, felem points[][3],
1798                                felem tmp_felems[])
1799 {
1800     /*
1801      * Runs in constant time, unless an input is the point at infinity (which
1802      * normally shouldn't happen).
1803      */
1804     ec_GFp_nistp_points_make_affine_internal(num,
1805                                              points,
1806                                              sizeof(felem),
1807                                              tmp_felems,
1808                                              (void (*)(void *))felem_one,
1809                                              (int (*)(const void *))
1810                                              felem_is_zero_int,
1811                                              (void (*)(void *, const void *))
1812                                              felem_assign,
1813                                              (void (*)(void *, const void *))
1814                                              felem_square_reduce, (void (*)
1815                                                                    (void *,
1816                                                                     const void
1817                                                                     *,
1818                                                                     const void
1819                                                                     *))
1820                                              felem_mul_reduce,
1821                                              (void (*)(void *, const void *))
1822                                              felem_inv,
1823                                              (void (*)(void *, const void *))
1824                                              felem_contract);
1825 }
1826
/*
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values. The result is stored in r (r can equal one of the inputs).
 *
 * group:   the curve; its order is used to reduce out-of-range scalars and
 *          its generator / stored precomputation are consulted when |scalar|
 *          is non-NULL.
 * scalar:  multiplier for the generator, or NULL for no generator term.
 * num:     number of entries in |points| and |scalars|.
 * points, scalars: the additional terms; a NULL entry in either array makes
 *          that term contribute nothing.
 * ctx:     optional BN_CTX; when NULL a temporary one is created and freed
 *          before returning.
 *
 * Returns 0 on any allocation or conversion failure; otherwise returns the
 * result of EC_POINT_set_Jprojective_coordinates_GFp() for |r|.
 */
int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
                               const BIGNUM *scalar, size_t num,
                               const EC_POINT *points[],
                               const BIGNUM *scalars[], BN_CTX *ctx)
{
    int ret = 0;
    int j;
    int mixed = 0;
    BN_CTX *new_ctx = NULL;
    BIGNUM *x, *y, *z, *tmp_scalar;
    felem_bytearray g_secret;
    felem_bytearray *secrets = NULL;
    felem (*pre_comp)[17][3] = NULL;
    felem *tmp_felems = NULL;
    felem_bytearray tmp;
    unsigned i, num_bytes;
    int have_pre_comp = 0;
    size_t num_points = num;
    felem x_in, y_in, z_in, x_out, y_out, z_out;
    NISTP521_PRE_COMP *pre = NULL;
    felem(*g_pre_comp)[3] = NULL;
    EC_POINT *generator = NULL;
    const EC_POINT *p = NULL;
    const BIGNUM *p_scalar = NULL;

    /* fall back to a private BN_CTX when the caller did not supply one */
    if (ctx == NULL)
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
            return 0;
    BN_CTX_start(ctx);
    x = BN_CTX_get(ctx);
    y = BN_CTX_get(ctx);
    z = BN_CTX_get(ctx);
    tmp_scalar = BN_CTX_get(ctx);
    /* only the last BN_CTX_get() result is tested */
    if (tmp_scalar == NULL)
        goto err;

    /*
     * Decide whether a generator table can be used: rebuild the point stored
     * at index 1 of the candidate table and compare it with the group's
     * actual generator.
     */
    if (scalar != NULL) {
        pre = group->pre_comp.nistp521;
        if (pre)
            /* we have precomputation, try to use it */
            g_pre_comp = &pre->g_pre_comp[0];
        else
            /* try to use the standard precomputation */
            g_pre_comp = (felem(*)[3]) gmul;
        generator = EC_POINT_new(group);
        if (generator == NULL)
            goto err;
        /* get the generator from precomputation */
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
            !felem_to_BN(y, g_pre_comp[1][1]) ||
            !felem_to_BN(z, g_pre_comp[1][2])) {
            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
            goto err;
        }
        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
                                                      generator, x, y, z,
                                                      ctx))
            goto err;
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
            /* precomputation matches generator */
            have_pre_comp = 1;
        else
            /*
             * we don't have valid precomputation: treat the generator as a
             * random point
             */
            num_points++;
    }

    if (num_points > 0) {
        if (num_points >= 2) {
            /*
             * unless we precompute multiples for just one point, converting
             * those into affine form is time well spent
             */
            mixed = 1;
        }
        /*
         * secrets and pre_comp are zero-filled, so entries skipped below
         * (NULL scalar or point) stay all-zero
         */
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
        /* scratch space for make_points_affine(): one felem per point, plus one */
        if (mixed)
            tmp_felems =
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
        if ((secrets == NULL) || (pre_comp == NULL)
            || (mixed && (tmp_felems == NULL))) {
            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
            goto err;
        }

        /*
         * we treat NULL scalars as 0, and NULL points as points at infinity,
         * i.e., they contribute nothing to the linear combination
         */
        for (i = 0; i < num_points; ++i) {
            if (i == num)
                /*
                 * we didn't have a valid precomputation, so we pick the
                 * generator
                 */
            {
                p = EC_GROUP_get0_generator(group);
                p_scalar = scalar;
            } else
                /* the i^th point */
            {
                p = points[i];
                p_scalar = scalars[i];
            }
            if ((p_scalar != NULL) && (p != NULL)) {
                /* reduce scalar to 0 <= scalar < 2^521 */
                if ((BN_num_bits(p_scalar) > 521)
                    || (BN_is_negative(p_scalar))) {
                    /*
                     * this is an unusual input, and we don't guarantee
                     * constant-timeness
                     */
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
                        goto err;
                    }
                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
                } else
                    num_bytes = BN_bn2bin(p_scalar, tmp);
                /* BN_bn2bin() output is big-endian; store it byte-reversed */
                flip_endian(secrets[i], tmp, num_bytes);
                /* precompute multiples */
                if ((!BN_to_felem(x_out, p->X)) ||
                    (!BN_to_felem(y_out, p->Y)) ||
                    (!BN_to_felem(z_out, p->Z)))
                    goto err;
                /* table entry 1 is the point itself */
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
                /*
                 * entries 2..16: an even index j is the double of entry j/2,
                 * an odd index j is entry 1 plus entry j-1
                 */
                for (j = 2; j <= 16; ++j) {
                    if (j & 1) {
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
                                  pre_comp[i][j][2], pre_comp[i][1][0],
                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
                                  pre_comp[i][j - 1][0],
                                  pre_comp[i][j - 1][1],
                                  pre_comp[i][j - 1][2]);
                    } else {
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
                                     pre_comp[i][j / 2][1],
                                     pre_comp[i][j / 2][2]);
                    }
                }
            }
        }
        /* convert all num_points * 17 table entries to affine in one batch */
        if (mixed)
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
    }

    /* the scalar for the generator */
    if ((scalar != NULL) && (have_pre_comp)) {
        memset(g_secret, 0, sizeof(g_secret));
        /* reduce scalar to 0 <= scalar < 2^521 */
        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
            /*
             * this is an unusual input, and we don't guarantee
             * constant-timeness
             */
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
                goto err;
            }
            num_bytes = BN_bn2bin(tmp_scalar, tmp);
        } else
            num_bytes = BN_bn2bin(scalar, tmp);
        flip_endian(g_secret, tmp, num_bytes);
        /* do the multiplication with generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  g_secret,
                  mixed, (const felem(*)[17][3])pre_comp,
                  (const felem(*)[3])g_pre_comp);
    } else
        /* do the multiplication without generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
    /* reduce the output to its unique minimal representation */
    felem_contract(x_in, x_out);
    felem_contract(y_in, y_out);
    felem_contract(z_in, z_out);
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
        (!felem_to_BN(z, z_in))) {
        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
        goto err;
    }
    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

    /* common exit: release everything acquired above (success and failure) */
 err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    BN_CTX_free(new_ctx);
    OPENSSL_free(secrets);
    OPENSSL_free(pre_comp);
    OPENSSL_free(tmp_felems);
    return ret;
}
2031
2032 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2033 {
2034     int ret = 0;
2035     NISTP521_PRE_COMP *pre = NULL;
2036     int i, j;
2037     BN_CTX *new_ctx = NULL;
2038     BIGNUM *x, *y;
2039     EC_POINT *generator = NULL;
2040     felem tmp_felems[16];
2041
2042     /* throw away old precomputation */
2043     EC_pre_comp_free(group);
2044     if (ctx == NULL)
2045         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2046             return 0;
2047     BN_CTX_start(ctx);
2048     x = BN_CTX_get(ctx);
2049     y = BN_CTX_get(ctx);
2050     if (y == NULL)
2051         goto err;
2052     /* get the generator */
2053     if (group->generator == NULL)
2054         goto err;
2055     generator = EC_POINT_new(group);
2056     if (generator == NULL)
2057         goto err;
2058     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2059     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2060     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2061         goto err;
2062     if ((pre = nistp521_pre_comp_new()) == NULL)
2063         goto err;
2064     /*
2065      * if the generator is the standard one, use built-in precomputation
2066      */
2067     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2068         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2069         goto done;
2070     }
2071     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2072         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2073         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2074         goto err;
2075     /* compute 2^130*G, 2^260*G, 2^390*G */
2076     for (i = 1; i <= 4; i <<= 1) {
2077         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2078                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2079                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2080         for (j = 0; j < 129; ++j) {
2081             point_double(pre->g_pre_comp[2 * i][0],
2082                          pre->g_pre_comp[2 * i][1],
2083                          pre->g_pre_comp[2 * i][2],
2084                          pre->g_pre_comp[2 * i][0],
2085                          pre->g_pre_comp[2 * i][1],
2086                          pre->g_pre_comp[2 * i][2]);
2087         }
2088     }
2089     /* g_pre_comp[0] is the point at infinity */
2090     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2091     /* the remaining multiples */
2092     /* 2^130*G + 2^260*G */
2093     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2094               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2095               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2096               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2097               pre->g_pre_comp[2][2]);
2098     /* 2^130*G + 2^390*G */
2099     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2100               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2101               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2102               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2103               pre->g_pre_comp[2][2]);
2104     /* 2^260*G + 2^390*G */
2105     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2106               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2107               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2108               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2109               pre->g_pre_comp[4][2]);
2110     /* 2^130*G + 2^260*G + 2^390*G */
2111     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2112               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2113               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2114               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2115               pre->g_pre_comp[2][2]);
2116     for (i = 1; i < 8; ++i) {
2117         /* odd multiples: add G */
2118         point_add(pre->g_pre_comp[2 * i + 1][0],
2119                   pre->g_pre_comp[2 * i + 1][1],
2120                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2121                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2122                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2123                   pre->g_pre_comp[1][2]);
2124     }
2125     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2126
2127  done:
2128     SETPRECOMP(group, nistp521, pre);
2129     ret = 1;
2130     pre = NULL;
2131  err:
2132     BN_CTX_end(ctx);
2133     EC_POINT_free(generator);
2134     BN_CTX_free(new_ctx);
2135     EC_nistp521_pre_comp_free(pre);
2136     return ret;
2137 }
2138
2139 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2140 {
2141     return HAVEPRECOMP(group, nistp521);
2142 }
2143
2144 #endif