crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/e_os2.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
  43 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  44   /* even with gcc, the typedef won't work for 32-bit platforms */
  45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  46                                  * platforms */
  47 # else
  48 #  error "Need GCC 3.1 or later to define type uint128_t"
  49 # endif
  50
  51 typedef uint8_t u8;
  52 typedef uint64_t u64;
  53
  54 /*
  55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  56  * element of this field into 66 bytes where the most significant byte
  57  * contains only a single bit. We call this an felem_bytearray.
  58  */
  59
  60 typedef u8 felem_bytearray[66];
  61
  62 /*
  63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  64  * These values are big-endian.
  65  */
  66 static const felem_bytearray nistp521_curve_params[5] = {
  67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff},
  76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xfc},
  85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  93      0x3f, 0x00},
  94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 102      0xbd, 0x66},
 103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 111      0x66, 0x50}
 112 };
 113
 114 /*-
 115  * The representation of field elements.
 116  * ------------------------------------
 117  *
 118  * We represent field elements with nine values. These values are either 64 or
 119  * 128 bits and the field element represented is:
 120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 122  * 58 bits apart, but are greater than 58 bits in length, the most significant
 123  * bits of each limb overlap with the least significant bits of the next.
 124  *
 125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 126  * 'largefelem' */
 127
 128 # define NLIMBS 9
 129
 130 typedef uint64_t limb;
 131 typedef limb felem[NLIMBS];
 132 typedef uint128_t largefelem[NLIMBS];
 133
 134 static const limb bottom57bits = 0x1ffffffffffffff;
 135 static const limb bottom58bits = 0x3ffffffffffffff;
 136
 137 /*
 138  * bin66_to_felem takes a little-endian byte array and converts it into felem
 139  * form. This assumes that the CPU is little-endian.
 140  */
 141 static void bin66_to_felem(felem out, const u8 in[66])
 142 {
 143     out[0] = (*((limb *) & in[0])) & bottom58bits;
 144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 147     out[4] = (*((limb *) & in[29])) & bottom58bits;
 148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 151     out[8] = (*((limb *) & in[58])) & bottom57bits;
 152 }
 153
 154 /*
 155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 156  * array. This assumes that the CPU is little-endian.
 157  */
 158 static void felem_to_bin66(u8 out[66], const felem in)
 159 {
 160     memset(out, 0, 66);
 161     (*((limb *) & out[0])) = in[0];
 162     (*((limb *) & out[7])) |= in[1] << 2;
 163     (*((limb *) & out[14])) |= in[2] << 4;
 164     (*((limb *) & out[21])) |= in[3] << 6;
 165     (*((limb *) & out[29])) = in[4];
 166     (*((limb *) & out[36])) |= in[5] << 2;
 167     (*((limb *) & out[43])) |= in[6] << 4;
 168     (*((limb *) & out[50])) |= in[7] << 6;
 169     (*((limb *) & out[58])) = in[8];
 170 }
 171
 172 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 173 static int BN_to_felem(felem out, const BIGNUM *bn)
 174 {
 175     felem_bytearray b_out;
 176     int num_bytes;
 177
 178     if (BN_is_negative(bn)) {
 179         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 180         return 0;
 181     }
 182     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 183     if (num_bytes < 0) {
 184         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 185         return 0;
 186     }
 187     bin66_to_felem(out, b_out);
 188     return 1;
 189 }
 190
 191 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 192 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 193 {
 194     felem_bytearray b_out;
 195     felem_to_bin66(b_out, in);
 196     return BN_lebin2bn(b_out, sizeof(b_out), out);
 197 }
 198
 199 /*-
 200  * Field operations
 201  * ----------------
 202  */
 203
 204 static void felem_one(felem out)
 205 {
 206     out[0] = 1;
 207     out[1] = 0;
 208     out[2] = 0;
 209     out[3] = 0;
 210     out[4] = 0;
 211     out[5] = 0;
 212     out[6] = 0;
 213     out[7] = 0;
 214     out[8] = 0;
 215 }
 216
 217 static void felem_assign(felem out, const felem in)
 218 {
 219     out[0] = in[0];
 220     out[1] = in[1];
 221     out[2] = in[2];
 222     out[3] = in[3];
 223     out[4] = in[4];
 224     out[5] = in[5];
 225     out[6] = in[6];
 226     out[7] = in[7];
 227     out[8] = in[8];
 228 }
 229
 230 /* felem_sum64 sets out = out + in. */
 231 static void felem_sum64(felem out, const felem in)
 232 {
 233     out[0] += in[0];
 234     out[1] += in[1];
 235     out[2] += in[2];
 236     out[3] += in[3];
 237     out[4] += in[4];
 238     out[5] += in[5];
 239     out[6] += in[6];
 240     out[7] += in[7];
 241     out[8] += in[8];
 242 }
 243
 244 /* felem_scalar sets out = in * scalar */
 245 static void felem_scalar(felem out, const felem in, limb scalar)
 246 {
 247     out[0] = in[0] * scalar;
 248     out[1] = in[1] * scalar;
 249     out[2] = in[2] * scalar;
 250     out[3] = in[3] * scalar;
 251     out[4] = in[4] * scalar;
 252     out[5] = in[5] * scalar;
 253     out[6] = in[6] * scalar;
 254     out[7] = in[7] * scalar;
 255     out[8] = in[8] * scalar;
 256 }
 257
 258 /* felem_scalar64 sets out = out * scalar */
 259 static void felem_scalar64(felem out, limb scalar)
 260 {
 261     out[0] *= scalar;
 262     out[1] *= scalar;
 263     out[2] *= scalar;
 264     out[3] *= scalar;
 265     out[4] *= scalar;
 266     out[5] *= scalar;
 267     out[6] *= scalar;
 268     out[7] *= scalar;
 269     out[8] *= scalar;
 270 }
 271
 272 /* felem_scalar128 sets out = out * scalar */
 273 static void felem_scalar128(largefelem out, limb scalar)
 274 {
 275     out[0] *= scalar;
 276     out[1] *= scalar;
 277     out[2] *= scalar;
 278     out[3] *= scalar;
 279     out[4] *= scalar;
 280     out[5] *= scalar;
 281     out[6] *= scalar;
 282     out[7] *= scalar;
 283     out[8] *= scalar;
 284 }
 285
 286 /*-
 287  * felem_neg sets |out| to |-in|
 288  * On entry:
 289  *   in[i] < 2^59 + 2^14
 290  * On exit:
 291  *   out[i] < 2^62
 292  */
 293 static void felem_neg(felem out, const felem in)
 294 {
 295     /* In order to prevent underflow, we subtract from 0 mod p. */
 296     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 297     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 298
 299     out[0] = two62m3 - in[0];
 300     out[1] = two62m2 - in[1];
 301     out[2] = two62m2 - in[2];
 302     out[3] = two62m2 - in[3];
 303     out[4] = two62m2 - in[4];
 304     out[5] = two62m2 - in[5];
 305     out[6] = two62m2 - in[6];
 306     out[7] = two62m2 - in[7];
 307     out[8] = two62m2 - in[8];
 308 }
 309
 310 /*-
 311  * felem_diff64 subtracts |in| from |out|
 312  * On entry:
 313  *   in[i] < 2^59 + 2^14
 314  * On exit:
 315  *   out[i] < out[i] + 2^62
 316  */
 317 static void felem_diff64(felem out, const felem in)
 318 {
 319     /*
 320      * In order to prevent underflow, we add 0 mod p before subtracting.
 321      */
 322     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 323     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 324
 325     out[0] += two62m3 - in[0];
 326     out[1] += two62m2 - in[1];
 327     out[2] += two62m2 - in[2];
 328     out[3] += two62m2 - in[3];
 329     out[4] += two62m2 - in[4];
 330     out[5] += two62m2 - in[5];
 331     out[6] += two62m2 - in[6];
 332     out[7] += two62m2 - in[7];
 333     out[8] += two62m2 - in[8];
 334 }
 335
 336 /*-
 337  * felem_diff_128_64 subtracts |in| from |out|
 338  * On entry:
 339  *   in[i] < 2^62 + 2^17
 340  * On exit:
 341  *   out[i] < out[i] + 2^63
 342  */
 343 static void felem_diff_128_64(largefelem out, const felem in)
 344 {
 345     /*
 346      * In order to prevent underflow, we add 64p mod p (which is equivalent
 347      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 348      * digit number with all bits set to 1. See "The representation of field
 349      * elements" comment above for a description of how limbs are used to
 350      * represent a number. 64p is represented with 8 limbs containing a number
 351      * with 58 bits set and one limb with a number with 57 bits set.
 352      */
 353     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 354     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 355
 356     out[0] += two63m6 - in[0];
 357     out[1] += two63m5 - in[1];
 358     out[2] += two63m5 - in[2];
 359     out[3] += two63m5 - in[3];
 360     out[4] += two63m5 - in[4];
 361     out[5] += two63m5 - in[5];
 362     out[6] += two63m5 - in[6];
 363     out[7] += two63m5 - in[7];
 364     out[8] += two63m5 - in[8];
 365 }
 366
 367 /*-
 368  * felem_diff_128_64 subtracts |in| from |out|
 369  * On entry:
 370  *   in[i] < 2^126
 371  * On exit:
 372  *   out[i] < out[i] + 2^127 - 2^69
 373  */
 374 static void felem_diff128(largefelem out, const largefelem in)
 375 {
 376     /*
 377      * In order to prevent underflow, we add 0 mod p before subtracting.
 378      */
 379     static const uint128_t two127m70 =
 380         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 381     static const uint128_t two127m69 =
 382         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 383
 384     out[0] += (two127m70 - in[0]);
 385     out[1] += (two127m69 - in[1]);
 386     out[2] += (two127m69 - in[2]);
 387     out[3] += (two127m69 - in[3]);
 388     out[4] += (two127m69 - in[4]);
 389     out[5] += (two127m69 - in[5]);
 390     out[6] += (two127m69 - in[6]);
 391     out[7] += (two127m69 - in[7]);
 392     out[8] += (two127m69 - in[8]);
 393 }
 394
 395 /*-
 396  * felem_square sets |out| = |in|^2
 397  * On entry:
 398  *   in[i] < 2^62
 399  * On exit:
 400  *   out[i] < 17 * max(in[i]) * max(in[i])
 401  */
 402 static void felem_square(largefelem out, const felem in)
 403 {
 404     felem inx2, inx4;
 405     felem_scalar(inx2, in, 2);
 406     felem_scalar(inx4, in, 4);
 407
 408     /*-
 409      * We have many cases were we want to do
 410      *   in[x] * in[y] +
 411      *   in[y] * in[x]
 412      * This is obviously just
 413      *   2 * in[x] * in[y]
 414      * However, rather than do the doubling on the 128 bit result, we
 415      * double one of the inputs to the multiplication by reading from
 416      * |inx2|
 417      */
 418
 419     out[0] = ((uint128_t) in[0]) * in[0];
 420     out[1] = ((uint128_t) in[0]) * inx2[1];
 421     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 422     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 423     out[4] = ((uint128_t) in[0]) * inx2[4] +
 424              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 425     out[5] = ((uint128_t) in[0]) * inx2[5] +
 426              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 427     out[6] = ((uint128_t) in[0]) * inx2[6] +
 428              ((uint128_t) in[1]) * inx2[5] +
 429              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 430     out[7] = ((uint128_t) in[0]) * inx2[7] +
 431              ((uint128_t) in[1]) * inx2[6] +
 432              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 433     out[8] = ((uint128_t) in[0]) * inx2[8] +
 434              ((uint128_t) in[1]) * inx2[7] +
 435              ((uint128_t) in[2]) * inx2[6] +
 436              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 437
 438     /*
 439      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 440      * They correspond to locations one bit up from the limbs produced above
 441      * so we would have to multiply by two to align them. Again, rather than
 442      * operate on the 128-bit result, we double one of the inputs to the
 443      * multiplication. If we want to double for both this reason, and the
 444      * reason above, then we end up multiplying by four.
 445      */
 446
 447     /* 9 */
 448     out[0] += ((uint128_t) in[1]) * inx4[8] +
 449               ((uint128_t) in[2]) * inx4[7] +
 450               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 451
 452     /* 10 */
 453     out[1] += ((uint128_t) in[2]) * inx4[8] +
 454               ((uint128_t) in[3]) * inx4[7] +
 455               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 456
 457     /* 11 */
 458     out[2] += ((uint128_t) in[3]) * inx4[8] +
 459               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 460
 461     /* 12 */
 462     out[3] += ((uint128_t) in[4]) * inx4[8] +
 463               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 464
 465     /* 13 */
 466     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 467
 468     /* 14 */
 469     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 470
 471     /* 15 */
 472     out[6] += ((uint128_t) in[7]) * inx4[8];
 473
 474     /* 16 */
 475     out[7] += ((uint128_t) in[8]) * inx2[8];
 476 }
 477
 478 /*-
 479  * felem_mul sets |out| = |in1| * |in2|
 480  * On entry:
 481  *   in1[i] < 2^64
 482  *   in2[i] < 2^63
 483  * On exit:
 484  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 485  */
 486 static void felem_mul(largefelem out, const felem in1, const felem in2)
 487 {
 488     felem in2x2;
 489     felem_scalar(in2x2, in2, 2);
 490
 491     out[0] = ((uint128_t) in1[0]) * in2[0];
 492
 493     out[1] = ((uint128_t) in1[0]) * in2[1] +
 494              ((uint128_t) in1[1]) * in2[0];
 495
 496     out[2] = ((uint128_t) in1[0]) * in2[2] +
 497              ((uint128_t) in1[1]) * in2[1] +
 498              ((uint128_t) in1[2]) * in2[0];
 499
 500     out[3] = ((uint128_t) in1[0]) * in2[3] +
 501              ((uint128_t) in1[1]) * in2[2] +
 502              ((uint128_t) in1[2]) * in2[1] +
 503              ((uint128_t) in1[3]) * in2[0];
 504
 505     out[4] = ((uint128_t) in1[0]) * in2[4] +
 506              ((uint128_t) in1[1]) * in2[3] +
 507              ((uint128_t) in1[2]) * in2[2] +
 508              ((uint128_t) in1[3]) * in2[1] +
 509              ((uint128_t) in1[4]) * in2[0];
 510
 511     out[5] = ((uint128_t) in1[0]) * in2[5] +
 512              ((uint128_t) in1[1]) * in2[4] +
 513              ((uint128_t) in1[2]) * in2[3] +
 514              ((uint128_t) in1[3]) * in2[2] +
 515              ((uint128_t) in1[4]) * in2[1] +
 516              ((uint128_t) in1[5]) * in2[0];
 517
 518     out[6] = ((uint128_t) in1[0]) * in2[6] +
 519              ((uint128_t) in1[1]) * in2[5] +
 520              ((uint128_t) in1[2]) * in2[4] +
 521              ((uint128_t) in1[3]) * in2[3] +
 522              ((uint128_t) in1[4]) * in2[2] +
 523              ((uint128_t) in1[5]) * in2[1] +
 524              ((uint128_t) in1[6]) * in2[0];
 525
 526     out[7] = ((uint128_t) in1[0]) * in2[7] +
 527              ((uint128_t) in1[1]) * in2[6] +
 528              ((uint128_t) in1[2]) * in2[5] +
 529              ((uint128_t) in1[3]) * in2[4] +
 530              ((uint128_t) in1[4]) * in2[3] +
 531              ((uint128_t) in1[5]) * in2[2] +
 532              ((uint128_t) in1[6]) * in2[1] +
 533              ((uint128_t) in1[7]) * in2[0];
 534
 535     out[8] = ((uint128_t) in1[0]) * in2[8] +
 536              ((uint128_t) in1[1]) * in2[7] +
 537              ((uint128_t) in1[2]) * in2[6] +
 538              ((uint128_t) in1[3]) * in2[5] +
 539              ((uint128_t) in1[4]) * in2[4] +
 540              ((uint128_t) in1[5]) * in2[3] +
 541              ((uint128_t) in1[6]) * in2[2] +
 542              ((uint128_t) in1[7]) * in2[1] +
 543              ((uint128_t) in1[8]) * in2[0];
 544
 545     /* See comment in felem_square about the use of in2x2 here */
 546
 547     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 548               ((uint128_t) in1[2]) * in2x2[7] +
 549               ((uint128_t) in1[3]) * in2x2[6] +
 550               ((uint128_t) in1[4]) * in2x2[5] +
 551               ((uint128_t) in1[5]) * in2x2[4] +
 552               ((uint128_t) in1[6]) * in2x2[3] +
 553               ((uint128_t) in1[7]) * in2x2[2] +
 554               ((uint128_t) in1[8]) * in2x2[1];
 555
 556     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 557               ((uint128_t) in1[3]) * in2x2[7] +
 558               ((uint128_t) in1[4]) * in2x2[6] +
 559               ((uint128_t) in1[5]) * in2x2[5] +
 560               ((uint128_t) in1[6]) * in2x2[4] +
 561               ((uint128_t) in1[7]) * in2x2[3] +
 562               ((uint128_t) in1[8]) * in2x2[2];
 563
 564     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 565               ((uint128_t) in1[4]) * in2x2[7] +
 566               ((uint128_t) in1[5]) * in2x2[6] +
 567               ((uint128_t) in1[6]) * in2x2[5] +
 568               ((uint128_t) in1[7]) * in2x2[4] +
 569               ((uint128_t) in1[8]) * in2x2[3];
 570
 571     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 572               ((uint128_t) in1[5]) * in2x2[7] +
 573               ((uint128_t) in1[6]) * in2x2[6] +
 574               ((uint128_t) in1[7]) * in2x2[5] +
 575               ((uint128_t) in1[8]) * in2x2[4];
 576
 577     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 578               ((uint128_t) in1[6]) * in2x2[7] +
 579               ((uint128_t) in1[7]) * in2x2[6] +
 580               ((uint128_t) in1[8]) * in2x2[5];
 581
 582     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 583               ((uint128_t) in1[7]) * in2x2[7] +
 584               ((uint128_t) in1[8]) * in2x2[6];
 585
 586     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 587               ((uint128_t) in1[8]) * in2x2[7];
 588
 589     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 590 }
 591
 592 static const limb bottom52bits = 0xfffffffffffff;
 593
 594 /*-
 595  * felem_reduce converts a largefelem to an felem.
 596  * On entry:
 597  *   in[i] < 2^128
 598  * On exit:
 599  *   out[i] < 2^59 + 2^14
 600  */
 601 static void felem_reduce(felem out, const largefelem in)
 602 {
 603     u64 overflow1, overflow2;
 604
 605     out[0] = ((limb) in[0]) & bottom58bits;
 606     out[1] = ((limb) in[1]) & bottom58bits;
 607     out[2] = ((limb) in[2]) & bottom58bits;
 608     out[3] = ((limb) in[3]) & bottom58bits;
 609     out[4] = ((limb) in[4]) & bottom58bits;
 610     out[5] = ((limb) in[5]) & bottom58bits;
 611     out[6] = ((limb) in[6]) & bottom58bits;
 612     out[7] = ((limb) in[7]) & bottom58bits;
 613     out[8] = ((limb) in[8]) & bottom58bits;
 614
 615     /* out[i] < 2^58 */
 616
 617     out[1] += ((limb) in[0]) >> 58;
 618     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 619     /*-
 620      * out[1] < 2^58 + 2^6 + 2^58
 621      *        = 2^59 + 2^6
 622      */
 623     out[2] += ((limb) (in[0] >> 64)) >> 52;
 624
 625     out[2] += ((limb) in[1]) >> 58;
 626     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 627     out[3] += ((limb) (in[1] >> 64)) >> 52;
 628
 629     out[3] += ((limb) in[2]) >> 58;
 630     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 631     out[4] += ((limb) (in[2] >> 64)) >> 52;
 632
 633     out[4] += ((limb) in[3]) >> 58;
 634     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 635     out[5] += ((limb) (in[3] >> 64)) >> 52;
 636
 637     out[5] += ((limb) in[4]) >> 58;
 638     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 639     out[6] += ((limb) (in[4] >> 64)) >> 52;
 640
 641     out[6] += ((limb) in[5]) >> 58;
 642     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 643     out[7] += ((limb) (in[5] >> 64)) >> 52;
 644
 645     out[7] += ((limb) in[6]) >> 58;
 646     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 647     out[8] += ((limb) (in[6] >> 64)) >> 52;
 648
 649     out[8] += ((limb) in[7]) >> 58;
 650     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 651     /*-
 652      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 653      *            < 2^59 + 2^13
 654      */
 655     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 656
 657     overflow1 += ((limb) in[8]) >> 58;
 658     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 659     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 660
 661     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 662     overflow2 <<= 1;            /* overflow2 < 2^13 */
 663
 664     out[0] += overflow1;        /* out[0] < 2^60 */
 665     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 666
 667     out[1] += out[0] >> 58;
 668     out[0] &= bottom58bits;
 669     /*-
 670      * out[0] < 2^58
 671      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 672      *        < 2^59 + 2^14
 673      */
 674 }
 675
 676 static void felem_square_reduce(felem out, const felem in)
 677 {
 678     largefelem tmp;
 679     felem_square(tmp, in);
 680     felem_reduce(out, tmp);
 681 }
 682
 683 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 684 {
 685     largefelem tmp;
 686     felem_mul(tmp, in1, in2);
 687     felem_reduce(out, tmp);
 688 }
 689
 690 /*-
 691  * felem_inv calculates |out| = |in|^{-1}
 692  *
 693  * Based on Fermat's Little Theorem:
 694  *   a^p = a (mod p)
 695  *   a^{p-1} = 1 (mod p)
 696  *   a^{p-2} = a^{-1} (mod p)
 697  */
 698 static void felem_inv(felem out, const felem in)
 699 {
 700     felem ftmp, ftmp2, ftmp3, ftmp4;
 701     largefelem tmp;
 702     unsigned i;
 703
 704     felem_square(tmp, in);
 705     felem_reduce(ftmp, tmp);    /* 2^1 */
 706     felem_mul(tmp, in, ftmp);
 707     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 708     felem_assign(ftmp2, ftmp);
 709     felem_square(tmp, ftmp);
 710     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 711     felem_mul(tmp, in, ftmp);
 712     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 713     felem_square(tmp, ftmp);
 714     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 715
 716     felem_square(tmp, ftmp2);
 717     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 718     felem_square(tmp, ftmp3);
 719     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 720     felem_mul(tmp, ftmp3, ftmp2);
 721     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 722
 723     felem_assign(ftmp2, ftmp3);
 724     felem_square(tmp, ftmp3);
 725     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 726     felem_square(tmp, ftmp3);
 727     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 728     felem_square(tmp, ftmp3);
 729     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 730     felem_square(tmp, ftmp3);
 731     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 732     felem_assign(ftmp4, ftmp3);
 733     felem_mul(tmp, ftmp3, ftmp);
 734     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 735     felem_square(tmp, ftmp4);
 736     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 737     felem_mul(tmp, ftmp3, ftmp2);
 738     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 739     felem_assign(ftmp2, ftmp3);
 740
 741     for (i = 0; i < 8; i++) {
 742         felem_square(tmp, ftmp3);
 743         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 744     }
 745     felem_mul(tmp, ftmp3, ftmp2);
 746     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 747     felem_assign(ftmp2, ftmp3);
 748
 749     for (i = 0; i < 16; i++) {
 750         felem_square(tmp, ftmp3);
 751         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 752     }
 753     felem_mul(tmp, ftmp3, ftmp2);
 754     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 755     felem_assign(ftmp2, ftmp3);
 756
 757     for (i = 0; i < 32; i++) {
 758         felem_square(tmp, ftmp3);
 759         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 760     }
 761     felem_mul(tmp, ftmp3, ftmp2);
 762     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 763     felem_assign(ftmp2, ftmp3);
 764
 765     for (i = 0; i < 64; i++) {
 766         felem_square(tmp, ftmp3);
 767         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 768     }
 769     felem_mul(tmp, ftmp3, ftmp2);
 770     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 771     felem_assign(ftmp2, ftmp3);
 772
 773     for (i = 0; i < 128; i++) {
 774         felem_square(tmp, ftmp3);
 775         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 776     }
 777     felem_mul(tmp, ftmp3, ftmp2);
 778     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 779     felem_assign(ftmp2, ftmp3);
 780
 781     for (i = 0; i < 256; i++) {
 782         felem_square(tmp, ftmp3);
 783         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 784     }
 785     felem_mul(tmp, ftmp3, ftmp2);
 786     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 787
 788     for (i = 0; i < 9; i++) {
 789         felem_square(tmp, ftmp3);
 790         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 791     }
 792     felem_mul(tmp, ftmp3, ftmp4);
 793     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 794     felem_mul(tmp, ftmp3, in);
 795     felem_reduce(out, tmp);     /* 2^512 - 3 */
 796 }
 797
 798 /* This is 2^521-1, expressed as an felem */
 799 static const felem kPrime = {
 800     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 801     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 802     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 803 };
 804
 805 /*-
 806  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 807  * otherwise.
 808  * On entry:
 809  *   in[i] < 2^59 + 2^14
 810  */
 811 static limb felem_is_zero(const felem in)
 812 {
 813     felem ftmp;
 814     limb is_zero, is_p;
 815     felem_assign(ftmp, in);
 816
 817     ftmp[0] += ftmp[8] >> 57;
 818     ftmp[8] &= bottom57bits;
 819     /* ftmp[8] < 2^57 */
 820     ftmp[1] += ftmp[0] >> 58;
 821     ftmp[0] &= bottom58bits;
 822     ftmp[2] += ftmp[1] >> 58;
 823     ftmp[1] &= bottom58bits;
 824     ftmp[3] += ftmp[2] >> 58;
 825     ftmp[2] &= bottom58bits;
 826     ftmp[4] += ftmp[3] >> 58;
 827     ftmp[3] &= bottom58bits;
 828     ftmp[5] += ftmp[4] >> 58;
 829     ftmp[4] &= bottom58bits;
 830     ftmp[6] += ftmp[5] >> 58;
 831     ftmp[5] &= bottom58bits;
 832     ftmp[7] += ftmp[6] >> 58;
 833     ftmp[6] &= bottom58bits;
 834     ftmp[8] += ftmp[7] >> 58;
 835     ftmp[7] &= bottom58bits;
 836     /* ftmp[8] < 2^57 + 4 */
 837
 838     /*
 839      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 840      * than our bound for ftmp[8]. Therefore we only have to check if the
 841      * zero is zero or 2^521-1.
 842      */
 843
 844     is_zero = 0;
 845     is_zero |= ftmp[0];
 846     is_zero |= ftmp[1];
 847     is_zero |= ftmp[2];
 848     is_zero |= ftmp[3];
 849     is_zero |= ftmp[4];
 850     is_zero |= ftmp[5];
 851     is_zero |= ftmp[6];
 852     is_zero |= ftmp[7];
 853     is_zero |= ftmp[8];
 854
 855     is_zero--;
 856     /*
 857      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 858      * can be set is if is_zero was 0 before the decrement.
 859      */
 860     is_zero = 0 - (is_zero >> 63);
 861
 862     is_p = ftmp[0] ^ kPrime[0];
 863     is_p |= ftmp[1] ^ kPrime[1];
 864     is_p |= ftmp[2] ^ kPrime[2];
 865     is_p |= ftmp[3] ^ kPrime[3];
 866     is_p |= ftmp[4] ^ kPrime[4];
 867     is_p |= ftmp[5] ^ kPrime[5];
 868     is_p |= ftmp[6] ^ kPrime[6];
 869     is_p |= ftmp[7] ^ kPrime[7];
 870     is_p |= ftmp[8] ^ kPrime[8];
 871
 872     is_p--;
 873     is_p = 0 - (is_p >> 63);
 874
 875     is_zero |= is_p;
 876     return is_zero;
 877 }
 878
 879 static int felem_is_zero_int(const void *in)
 880 {
 881     return (int)(felem_is_zero(in) & ((limb) 1));
 882 }
 883
 884 /*-
 885  * felem_contract converts |in| to its unique, minimal representation.
 886  * On entry:
 887  *   in[i] < 2^59 + 2^14
 888  */
 889 static void felem_contract(felem out, const felem in)
 890 {
 891     limb is_p, is_greater, sign;
 892     static const limb two58 = ((limb) 1) << 58;
 893
 894     felem_assign(out, in);
 895
 896     out[0] += out[8] >> 57;
 897     out[8] &= bottom57bits;
 898     /* out[8] < 2^57 */
 899     out[1] += out[0] >> 58;
 900     out[0] &= bottom58bits;
 901     out[2] += out[1] >> 58;
 902     out[1] &= bottom58bits;
 903     out[3] += out[2] >> 58;
 904     out[2] &= bottom58bits;
 905     out[4] += out[3] >> 58;
 906     out[3] &= bottom58bits;
 907     out[5] += out[4] >> 58;
 908     out[4] &= bottom58bits;
 909     out[6] += out[5] >> 58;
 910     out[5] &= bottom58bits;
 911     out[7] += out[6] >> 58;
 912     out[6] &= bottom58bits;
 913     out[8] += out[7] >> 58;
 914     out[7] &= bottom58bits;
 915     /* out[8] < 2^57 + 4 */
 916
 917     /*
 918      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 919      * out. See the comments in felem_is_zero regarding why we don't test for
 920      * other multiples of the prime.
 921      */
 922
 923     /*
 924      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 925      */
 926
 927     is_p = out[0] ^ kPrime[0];
 928     is_p |= out[1] ^ kPrime[1];
 929     is_p |= out[2] ^ kPrime[2];
 930     is_p |= out[3] ^ kPrime[3];
 931     is_p |= out[4] ^ kPrime[4];
 932     is_p |= out[5] ^ kPrime[5];
 933     is_p |= out[6] ^ kPrime[6];
 934     is_p |= out[7] ^ kPrime[7];
 935     is_p |= out[8] ^ kPrime[8];
 936
 937     is_p--;
 938     is_p &= is_p << 32;
 939     is_p &= is_p << 16;
 940     is_p &= is_p << 8;
 941     is_p &= is_p << 4;
 942     is_p &= is_p << 2;
 943     is_p &= is_p << 1;
 944     is_p = 0 - (is_p >> 63);
 945     is_p = ~is_p;
 946
 947     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 948
 949     out[0] &= is_p;
 950     out[1] &= is_p;
 951     out[2] &= is_p;
 952     out[3] &= is_p;
 953     out[4] &= is_p;
 954     out[5] &= is_p;
 955     out[6] &= is_p;
 956     out[7] &= is_p;
 957     out[8] &= is_p;
 958
 959     /*
 960      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 961      * 57 is greater than zero as (2^521-1) + x >= 2^522
 962      */
 963     is_greater = out[8] >> 57;
 964     is_greater |= is_greater << 32;
 965     is_greater |= is_greater << 16;
 966     is_greater |= is_greater << 8;
 967     is_greater |= is_greater << 4;
 968     is_greater |= is_greater << 2;
 969     is_greater |= is_greater << 1;
 970     is_greater = 0 - (is_greater >> 63);
 971
 972     out[0] -= kPrime[0] & is_greater;
 973     out[1] -= kPrime[1] & is_greater;
 974     out[2] -= kPrime[2] & is_greater;
 975     out[3] -= kPrime[3] & is_greater;
 976     out[4] -= kPrime[4] & is_greater;
 977     out[5] -= kPrime[5] & is_greater;
 978     out[6] -= kPrime[6] & is_greater;
 979     out[7] -= kPrime[7] & is_greater;
 980     out[8] -= kPrime[8] & is_greater;
 981
 982     /* Eliminate negative coefficients */
 983     sign = -(out[0] >> 63);
 984     out[0] += (two58 & sign);
 985     out[1] -= (1 & sign);
 986     sign = -(out[1] >> 63);
 987     out[1] += (two58 & sign);
 988     out[2] -= (1 & sign);
 989     sign = -(out[2] >> 63);
 990     out[2] += (two58 & sign);
 991     out[3] -= (1 & sign);
 992     sign = -(out[3] >> 63);
 993     out[3] += (two58 & sign);
 994     out[4] -= (1 & sign);
 995     sign = -(out[4] >> 63);
 996     out[4] += (two58 & sign);
 997     out[5] -= (1 & sign);
 998     sign = -(out[0] >> 63);
 999     out[5] += (two58 & sign);
1000     out[6] -= (1 & sign);
1001     sign = -(out[6] >> 63);
1002     out[6] += (two58 & sign);
1003     out[7] -= (1 & sign);
1004     sign = -(out[7] >> 63);
1005     out[7] += (two58 & sign);
1006     out[8] -= (1 & sign);
1007     sign = -(out[5] >> 63);
1008     out[5] += (two58 & sign);
1009     out[6] -= (1 & sign);
1010     sign = -(out[6] >> 63);
1011     out[6] += (two58 & sign);
1012     out[7] -= (1 & sign);
1013     sign = -(out[7] >> 63);
1014     out[7] += (two58 & sign);
1015     out[8] -= (1 & sign);
1016 }
1017
1018 /*-
1019  * Group operations
1020  * ----------------
1021  *
1022  * Building on top of the field operations we have the operations on the
1023  * elliptic curve group itself. Points on the curve are represented in Jacobian
1024  * coordinates */
1025
1026 /*-
1027  * point_double calculates 2*(x_in, y_in, z_in)
1028  *
1029  * The method is taken from:
1030  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1031  *
1032  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1033  * while x_out == y_in is not (maybe this works, but it's not tested). */
1034 static void
1035 point_double(felem x_out, felem y_out, felem z_out,
1036              const felem x_in, const felem y_in, const felem z_in)
1037 {
1038     largefelem tmp, tmp2;
1039     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1040
1041     felem_assign(ftmp, x_in);
1042     felem_assign(ftmp2, x_in);
1043
1044     /* delta = z^2 */
1045     felem_square(tmp, z_in);
1046     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1047
1048     /* gamma = y^2 */
1049     felem_square(tmp, y_in);
1050     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1051
1052     /* beta = x*gamma */
1053     felem_mul(tmp, x_in, gamma);
1054     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1055
1056     /* alpha = 3*(x-delta)*(x+delta) */
1057     felem_diff64(ftmp, delta);
1058     /* ftmp[i] < 2^61 */
1059     felem_sum64(ftmp2, delta);
1060     /* ftmp2[i] < 2^60 + 2^15 */
1061     felem_scalar64(ftmp2, 3);
1062     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1063     felem_mul(tmp, ftmp, ftmp2);
1064     /*-
1065      * tmp[i] < 17(3*2^121 + 3*2^76)
1066      *        = 61*2^121 + 61*2^76
1067      *        < 64*2^121 + 64*2^76
1068      *        = 2^127 + 2^82
1069      *        < 2^128
1070      */
1071     felem_reduce(alpha, tmp);
1072
1073     /* x' = alpha^2 - 8*beta */
1074     felem_square(tmp, alpha);
1075     /*
1076      * tmp[i] < 17*2^120 < 2^125
1077      */
1078     felem_assign(ftmp, beta);
1079     felem_scalar64(ftmp, 8);
1080     /* ftmp[i] < 2^62 + 2^17 */
1081     felem_diff_128_64(tmp, ftmp);
1082     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1083     felem_reduce(x_out, tmp);
1084
1085     /* z' = (y + z)^2 - gamma - delta */
1086     felem_sum64(delta, gamma);
1087     /* delta[i] < 2^60 + 2^15 */
1088     felem_assign(ftmp, y_in);
1089     felem_sum64(ftmp, z_in);
1090     /* ftmp[i] < 2^60 + 2^15 */
1091     felem_square(tmp, ftmp);
1092     /*
1093      * tmp[i] < 17(2^122) < 2^127
1094      */
1095     felem_diff_128_64(tmp, delta);
1096     /* tmp[i] < 2^127 + 2^63 */
1097     felem_reduce(z_out, tmp);
1098
1099     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1100     felem_scalar64(beta, 4);
1101     /* beta[i] < 2^61 + 2^16 */
1102     felem_diff64(beta, x_out);
1103     /* beta[i] < 2^61 + 2^60 + 2^16 */
1104     felem_mul(tmp, alpha, beta);
1105     /*-
1106      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1107      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1108      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1109      *        < 2^128
1110      */
1111     felem_square(tmp2, gamma);
1112     /*-
1113      * tmp2[i] < 17*(2^59 + 2^14)^2
1114      *         = 17*(2^118 + 2^74 + 2^28)
1115      */
1116     felem_scalar128(tmp2, 8);
1117     /*-
1118      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1119      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1120      *         < 2^126
1121      */
1122     felem_diff128(tmp, tmp2);
1123     /*-
1124      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1125      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1126      *          2^74 + 2^69 + 2^34 + 2^30
1127      *        < 2^128
1128      */
1129     felem_reduce(y_out, tmp);
1130 }
1131
1132 /* copy_conditional copies in to out iff mask is all ones. */
1133 static void copy_conditional(felem out, const felem in, limb mask)
1134 {
1135     unsigned i;
1136     for (i = 0; i < NLIMBS; ++i) {
1137         const limb tmp = mask & (in[i] ^ out[i]);
1138         out[i] ^= tmp;
1139     }
1140 }
1141
1142 /*-
1143  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1144  *
1145  * The method is taken from
1146  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1147  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1148  *
1149  * This function includes a branch for checking whether the two input points
1150  * are equal (while not equal to the point at infinity). This case never
1151  * happens during single point multiplication, so there is no timing leak for
1152  * ECDH or ECDSA signing. */
1153 static void point_add(felem x3, felem y3, felem z3,
1154                       const felem x1, const felem y1, const felem z1,
1155                       const int mixed, const felem x2, const felem y2,
1156                       const felem z2)
1157 {
1158     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1159     largefelem tmp, tmp2;
1160     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1161
1162     z1_is_zero = felem_is_zero(z1);
1163     z2_is_zero = felem_is_zero(z2);
1164
1165     /* ftmp = z1z1 = z1**2 */
1166     felem_square(tmp, z1);
1167     felem_reduce(ftmp, tmp);
1168
1169     if (!mixed) {
1170         /* ftmp2 = z2z2 = z2**2 */
1171         felem_square(tmp, z2);
1172         felem_reduce(ftmp2, tmp);
1173
1174         /* u1 = ftmp3 = x1*z2z2 */
1175         felem_mul(tmp, x1, ftmp2);
1176         felem_reduce(ftmp3, tmp);
1177
1178         /* ftmp5 = z1 + z2 */
1179         felem_assign(ftmp5, z1);
1180         felem_sum64(ftmp5, z2);
1181         /* ftmp5[i] < 2^61 */
1182
1183         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1184         felem_square(tmp, ftmp5);
1185         /* tmp[i] < 17*2^122 */
1186         felem_diff_128_64(tmp, ftmp);
1187         /* tmp[i] < 17*2^122 + 2^63 */
1188         felem_diff_128_64(tmp, ftmp2);
1189         /* tmp[i] < 17*2^122 + 2^64 */
1190         felem_reduce(ftmp5, tmp);
1191
1192         /* ftmp2 = z2 * z2z2 */
1193         felem_mul(tmp, ftmp2, z2);
1194         felem_reduce(ftmp2, tmp);
1195
1196         /* s1 = ftmp6 = y1 * z2**3 */
1197         felem_mul(tmp, y1, ftmp2);
1198         felem_reduce(ftmp6, tmp);
1199     } else {
1200         /*
1201          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1202          */
1203
1204         /* u1 = ftmp3 = x1*z2z2 */
1205         felem_assign(ftmp3, x1);
1206
1207         /* ftmp5 = 2*z1z2 */
1208         felem_scalar(ftmp5, z1, 2);
1209
1210         /* s1 = ftmp6 = y1 * z2**3 */
1211         felem_assign(ftmp6, y1);
1212     }
1213
1214     /* u2 = x2*z1z1 */
1215     felem_mul(tmp, x2, ftmp);
1216     /* tmp[i] < 17*2^120 */
1217
1218     /* h = ftmp4 = u2 - u1 */
1219     felem_diff_128_64(tmp, ftmp3);
1220     /* tmp[i] < 17*2^120 + 2^63 */
1221     felem_reduce(ftmp4, tmp);
1222
1223     x_equal = felem_is_zero(ftmp4);
1224
1225     /* z_out = ftmp5 * h */
1226     felem_mul(tmp, ftmp5, ftmp4);
1227     felem_reduce(z_out, tmp);
1228
1229     /* ftmp = z1 * z1z1 */
1230     felem_mul(tmp, ftmp, z1);
1231     felem_reduce(ftmp, tmp);
1232
1233     /* s2 = tmp = y2 * z1**3 */
1234     felem_mul(tmp, y2, ftmp);
1235     /* tmp[i] < 17*2^120 */
1236
1237     /* r = ftmp5 = (s2 - s1)*2 */
1238     felem_diff_128_64(tmp, ftmp6);
1239     /* tmp[i] < 17*2^120 + 2^63 */
1240     felem_reduce(ftmp5, tmp);
1241     y_equal = felem_is_zero(ftmp5);
1242     felem_scalar64(ftmp5, 2);
1243     /* ftmp5[i] < 2^61 */
1244
1245     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1246         point_double(x3, y3, z3, x1, y1, z1);
1247         return;
1248     }
1249
1250     /* I = ftmp = (2h)**2 */
1251     felem_assign(ftmp, ftmp4);
1252     felem_scalar64(ftmp, 2);
1253     /* ftmp[i] < 2^61 */
1254     felem_square(tmp, ftmp);
1255     /* tmp[i] < 17*2^122 */
1256     felem_reduce(ftmp, tmp);
1257
1258     /* J = ftmp2 = h * I */
1259     felem_mul(tmp, ftmp4, ftmp);
1260     felem_reduce(ftmp2, tmp);
1261
1262     /* V = ftmp4 = U1 * I */
1263     felem_mul(tmp, ftmp3, ftmp);
1264     felem_reduce(ftmp4, tmp);
1265
1266     /* x_out = r**2 - J - 2V */
1267     felem_square(tmp, ftmp5);
1268     /* tmp[i] < 17*2^122 */
1269     felem_diff_128_64(tmp, ftmp2);
1270     /* tmp[i] < 17*2^122 + 2^63 */
1271     felem_assign(ftmp3, ftmp4);
1272     felem_scalar64(ftmp4, 2);
1273     /* ftmp4[i] < 2^61 */
1274     felem_diff_128_64(tmp, ftmp4);
1275     /* tmp[i] < 17*2^122 + 2^64 */
1276     felem_reduce(x_out, tmp);
1277
1278     /* y_out = r(V-x_out) - 2 * s1 * J */
1279     felem_diff64(ftmp3, x_out);
1280     /*
1281      * ftmp3[i] < 2^60 + 2^60 = 2^61
1282      */
1283     felem_mul(tmp, ftmp5, ftmp3);
1284     /* tmp[i] < 17*2^122 */
1285     felem_mul(tmp2, ftmp6, ftmp2);
1286     /* tmp2[i] < 17*2^120 */
1287     felem_scalar128(tmp2, 2);
1288     /* tmp2[i] < 17*2^121 */
1289     felem_diff128(tmp, tmp2);
1290         /*-
1291          * tmp[i] < 2^127 - 2^69 + 17*2^122
1292          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1293          *        < 2^127
1294          */
1295     felem_reduce(y_out, tmp);
1296
1297     copy_conditional(x_out, x2, z1_is_zero);
1298     copy_conditional(x_out, x1, z2_is_zero);
1299     copy_conditional(y_out, y2, z1_is_zero);
1300     copy_conditional(y_out, y1, z2_is_zero);
1301     copy_conditional(z_out, z2, z1_is_zero);
1302     copy_conditional(z_out, z1, z2_is_zero);
1303     felem_assign(x3, x_out);
1304     felem_assign(y3, y_out);
1305     felem_assign(z3, z_out);
1306 }
1307
1308 /*-
1309  * Base point pre computation
1310  * --------------------------
1311  *
1312  * Two different sorts of precomputed tables are used in the following code.
1313  * Each contain various points on the curve, where each point is three field
1314  * elements (x, y, z).
1315  *
1316  * For the base point table, z is usually 1 (0 for the point at infinity).
1317  * This table has 16 elements:
1318  * index | bits    | point
1319  * ------+---------+------------------------------
1320  *     0 | 0 0 0 0 | 0G
1321  *     1 | 0 0 0 1 | 1G
1322  *     2 | 0 0 1 0 | 2^130G
1323  *     3 | 0 0 1 1 | (2^130 + 1)G
1324  *     4 | 0 1 0 0 | 2^260G
1325  *     5 | 0 1 0 1 | (2^260 + 1)G
1326  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1327  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1328  *     8 | 1 0 0 0 | 2^390G
1329  *     9 | 1 0 0 1 | (2^390 + 1)G
1330  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1331  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1332  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1333  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1334  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1335  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1336  *
1337  * The reason for this is so that we can clock bits into four different
1338  * locations when doing simple scalar multiplies against the base point.
1339  *
1340  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1341
1342 /* gmul is the table of precomputed base points */
1343 static const felem gmul[16][3] = {
1344 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1345  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1346  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1347 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1348   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1349   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1350  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1351   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1352   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1353  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1354 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1355   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1356   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1357  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1358   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1359   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1360  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1361 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1362   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1363   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1364  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1365   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1366   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1367  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1368 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1369   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1370   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1371  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1372   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1373   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1374  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1375 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1376   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1377   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1378  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1379   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1380   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1381  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1382 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1383   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1384   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1385  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1386   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1387   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1388  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1390   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1391   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1392  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1393   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1394   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1395  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1397   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1398   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1399  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1400   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1401   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1402  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1404   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1405   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1406  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1407   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1408   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1409  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1411   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1412   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1413  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1414   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1415   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1416  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1418   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1419   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1420  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1421   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1422   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1423  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1425   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1426   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1427  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1428   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1429   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1430  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1432   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1433   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1434  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1435   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1436   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1437  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1439   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1440   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1441  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1442   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1443   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1444  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1445 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1446   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1447   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1448  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1449   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1450   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1451  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1452 };
1453
1454 /*
1455  * select_point selects the |idx|th point from a precomputation table and
1456  * copies it to out.
1457  */
1458  /* pre_comp below is of the size provided in |size| */
1459 static void select_point(const limb idx, unsigned int size,
1460                          const felem pre_comp[][3], felem out[3])
1461 {
1462     unsigned i, j;
1463     limb *outlimbs = &out[0][0];
1464
1465     memset(out, 0, sizeof(*out) * 3);
1466
1467     for (i = 0; i < size; i++) {
1468         const limb *inlimbs = &pre_comp[i][0][0];
1469         limb mask = i ^ idx;
1470         mask |= mask >> 4;
1471         mask |= mask >> 2;
1472         mask |= mask >> 1;
1473         mask &= 1;
1474         mask--;
1475         for (j = 0; j < NLIMBS * 3; j++)
1476             outlimbs[j] |= inlimbs[j] & mask;
1477     }
1478 }
1479
1480 /* get_bit returns the |i|th bit in |in| */
1481 static char get_bit(const felem_bytearray in, int i)
1482 {
1483     if (i < 0)
1484         return 0;
1485     return (in[i >> 3] >> (i & 7)) & 1;
1486 }
1487
1488 /*
1489  * Interleaved point multiplication using precomputed point multiples: The
1490  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1491  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1492  * generator, using certain (large) precomputed multiples in g_pre_comp.
1493  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1494  */
1495 static void batch_mul(felem x_out, felem y_out, felem z_out,
1496                       const felem_bytearray scalars[],
1497                       const unsigned num_points, const u8 *g_scalar,
1498                       const int mixed, const felem pre_comp[][17][3],
1499                       const felem g_pre_comp[16][3])
1500 {
1501     int i, skip;
1502     unsigned num, gen_mul = (g_scalar != NULL);
1503     felem nq[3], tmp[4];
1504     limb bits;
1505     u8 sign, digit;
1506
1507     /* set nq to the point at infinity */
1508     memset(nq, 0, sizeof(nq));
1509
1510     /*
1511      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1512      * of the generator (last quarter of rounds) and additions of other
1513      * points multiples (every 5th round).
1514      */
1515     skip = 1;                   /* save two point operations in the first
1516                                  * round */
1517     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1518         /* double */
1519         if (!skip)
1520             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1521
1522         /* add multiples of the generator */
1523         if (gen_mul && (i <= 130)) {
1524             bits = get_bit(g_scalar, i + 390) << 3;
1525             if (i < 130) {
1526                 bits |= get_bit(g_scalar, i + 260) << 2;
1527                 bits |= get_bit(g_scalar, i + 130) << 1;
1528                 bits |= get_bit(g_scalar, i);
1529             }
1530             /* select the point to add, in constant time */
1531             select_point(bits, 16, g_pre_comp, tmp);
1532             if (!skip) {
1533                 /* The 1 argument below is for "mixed" */
1534                 point_add(nq[0], nq[1], nq[2],
1535                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1536             } else {
1537                 memcpy(nq, tmp, 3 * sizeof(felem));
1538                 skip = 0;
1539             }
1540         }
1541
1542         /* do other additions every 5 doublings */
1543         if (num_points && (i % 5 == 0)) {
1544             /* loop over all scalars */
1545             for (num = 0; num < num_points; ++num) {
1546                 bits = get_bit(scalars[num], i + 4) << 5;
1547                 bits |= get_bit(scalars[num], i + 3) << 4;
1548                 bits |= get_bit(scalars[num], i + 2) << 3;
1549                 bits |= get_bit(scalars[num], i + 1) << 2;
1550                 bits |= get_bit(scalars[num], i) << 1;
1551                 bits |= get_bit(scalars[num], i - 1);
1552                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1553
1554                 /*
1555                  * select the point to add or subtract, in constant time
1556                  */
1557                 select_point(digit, 17, pre_comp[num], tmp);
1558                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1559                                             * point */
1560                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1561
1562                 if (!skip) {
1563                     point_add(nq[0], nq[1], nq[2],
1564                               nq[0], nq[1], nq[2],
1565                               mixed, tmp[0], tmp[1], tmp[2]);
1566                 } else {
1567                     memcpy(nq, tmp, 3 * sizeof(felem));
1568                     skip = 0;
1569                 }
1570             }
1571         }
1572     }
1573     felem_assign(x_out, nq[0]);
1574     felem_assign(y_out, nq[1]);
1575     felem_assign(z_out, nq[2]);
1576 }
1577
1578 /* Precomputation for the group generator. */
1579 struct nistp521_pre_comp_st {
1580     felem g_pre_comp[16][3];
1581     int references;
1582     CRYPTO_RWLOCK *lock;
1583 };
1584
1585 const EC_METHOD *EC_GFp_nistp521_method(void)
1586 {
1587     static const EC_METHOD ret = {
1588         EC_FLAGS_DEFAULT_OCT,
1589         NID_X9_62_prime_field,
1590         ec_GFp_nistp521_group_init,
1591         ec_GFp_simple_group_finish,
1592         ec_GFp_simple_group_clear_finish,
1593         ec_GFp_nist_group_copy,
1594         ec_GFp_nistp521_group_set_curve,
1595         ec_GFp_simple_group_get_curve,
1596         ec_GFp_simple_group_get_degree,
1597         ec_group_simple_order_bits,
1598         ec_GFp_simple_group_check_discriminant,
1599         ec_GFp_simple_point_init,
1600         ec_GFp_simple_point_finish,
1601         ec_GFp_simple_point_clear_finish,
1602         ec_GFp_simple_point_copy,
1603         ec_GFp_simple_point_set_to_infinity,
1604         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1605         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1606         ec_GFp_simple_point_set_affine_coordinates,
1607         ec_GFp_nistp521_point_get_affine_coordinates,
1608         0 /* point_set_compressed_coordinates */ ,
1609         0 /* point2oct */ ,
1610         0 /* oct2point */ ,
1611         ec_GFp_simple_add,
1612         ec_GFp_simple_dbl,
1613         ec_GFp_simple_invert,
1614         ec_GFp_simple_is_at_infinity,
1615         ec_GFp_simple_is_on_curve,
1616         ec_GFp_simple_cmp,
1617         ec_GFp_simple_make_affine,
1618         ec_GFp_simple_points_make_affine,
1619         ec_GFp_nistp521_points_mul,
1620         ec_GFp_nistp521_precompute_mult,
1621         ec_GFp_nistp521_have_precompute_mult,
1622         ec_GFp_nist_field_mul,
1623         ec_GFp_nist_field_sqr,
1624         0 /* field_div */ ,
1625         ec_GFp_simple_field_inv,
1626         0 /* field_encode */ ,
1627         0 /* field_decode */ ,
1628         0,                      /* field_set_to_one */
1629         ec_key_simple_priv2oct,
1630         ec_key_simple_oct2priv,
1631         0, /* set private */
1632         ec_key_simple_generate_key,
1633         ec_key_simple_check_key,
1634         ec_key_simple_generate_public_key,
1635         0, /* keycopy */
1636         0, /* keyfinish */
1637         ecdh_simple_compute_key,
1638         0  /* blind_coordinates */
1639     };
1640
1641     return &ret;
1642 }
1643
1644 /******************************************************************************/
1645 /*
1646  * FUNCTIONS TO MANAGE PRECOMPUTATION
1647  */
1648
1649 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1650 {
1651     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1652
1653     if (ret == NULL) {
1654         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1655         return ret;
1656     }
1657
1658     ret->references = 1;
1659
1660     ret->lock = CRYPTO_THREAD_lock_new();
1661     if (ret->lock == NULL) {
1662         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1663         OPENSSL_free(ret);
1664         return NULL;
1665     }
1666     return ret;
1667 }
1668
1669 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1670 {
1671     int i;
1672     if (p != NULL)
1673         CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
1674     return p;
1675 }
1676
1677 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1678 {
1679     int i;
1680
1681     if (p == NULL)
1682         return;
1683
1684     CRYPTO_atomic_add(&p->references, -1, &i, p->lock);
1685     REF_PRINT_COUNT("EC_nistp521", x);
1686     if (i > 0)
1687         return;
1688     REF_ASSERT_ISNT(i < 0);
1689
1690     CRYPTO_THREAD_lock_free(p->lock);
1691     OPENSSL_free(p);
1692 }
1693
1694 /******************************************************************************/
1695 /*
1696  * OPENSSL EC_METHOD FUNCTIONS
1697  */
1698
1699 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1700 {
1701     int ret;
1702     ret = ec_GFp_simple_group_init(group);
1703     group->a_is_minus3 = 1;
1704     return ret;
1705 }
1706
1707 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1708                                     const BIGNUM *a, const BIGNUM *b,
1709                                     BN_CTX *ctx)
1710 {
1711     int ret = 0;
1712     BN_CTX *new_ctx = NULL;
1713     BIGNUM *curve_p, *curve_a, *curve_b;
1714
1715     if (ctx == NULL)
1716         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1717             return 0;
1718     BN_CTX_start(ctx);
1719     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1720         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1721         ((curve_b = BN_CTX_get(ctx)) == NULL))
1722         goto err;
1723     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1724     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1725     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1726     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1727         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1728               EC_R_WRONG_CURVE_PARAMETERS);
1729         goto err;
1730     }
1731     group->field_mod_func = BN_nist_mod_521;
1732     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1733  err:
1734     BN_CTX_end(ctx);
1735     BN_CTX_free(new_ctx);
1736     return ret;
1737 }
1738
1739 /*
1740  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1741  * (X/Z^2, Y/Z^3)
1742  */
1743 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1744                                                  const EC_POINT *point,
1745                                                  BIGNUM *x, BIGNUM *y,
1746                                                  BN_CTX *ctx)
1747 {
1748     felem z1, z2, x_in, y_in, x_out, y_out;
1749     largefelem tmp;
1750
1751     if (EC_POINT_is_at_infinity(group, point)) {
1752         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1753               EC_R_POINT_AT_INFINITY);
1754         return 0;
1755     }
1756     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1757         (!BN_to_felem(z1, point->Z)))
1758         return 0;
1759     felem_inv(z2, z1);
1760     felem_square(tmp, z2);
1761     felem_reduce(z1, tmp);
1762     felem_mul(tmp, x_in, z1);
1763     felem_reduce(x_in, tmp);
1764     felem_contract(x_out, x_in);
1765     if (x != NULL) {
1766         if (!felem_to_BN(x, x_out)) {
1767             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1768                   ERR_R_BN_LIB);
1769             return 0;
1770         }
1771     }
1772     felem_mul(tmp, z1, z2);
1773     felem_reduce(z1, tmp);
1774     felem_mul(tmp, y_in, z1);
1775     felem_reduce(y_in, tmp);
1776     felem_contract(y_out, y_in);
1777     if (y != NULL) {
1778         if (!felem_to_BN(y, y_out)) {
1779             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1780                   ERR_R_BN_LIB);
1781             return 0;
1782         }
1783     }
1784     return 1;
1785 }
1786
1787 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1788 static void make_points_affine(size_t num, felem points[][3],
1789                                felem tmp_felems[])
1790 {
1791     /*
1792      * Runs in constant time, unless an input is the point at infinity (which
1793      * normally shouldn't happen).
1794      */
1795     ec_GFp_nistp_points_make_affine_internal(num,
1796                                              points,
1797                                              sizeof(felem),
1798                                              tmp_felems,
1799                                              (void (*)(void *))felem_one,
1800                                              felem_is_zero_int,
1801                                              (void (*)(void *, const void *))
1802                                              felem_assign,
1803                                              (void (*)(void *, const void *))
1804                                              felem_square_reduce, (void (*)
1805                                                                    (void *,
1806                                                                     const void
1807                                                                     *,
1808                                                                     const void
1809                                                                     *))
1810                                              felem_mul_reduce,
1811                                              (void (*)(void *, const void *))
1812                                              felem_inv,
1813                                              (void (*)(void *, const void *))
1814                                              felem_contract);
1815 }
1816
1817 /*
1818  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1819  * values Result is stored in r (r can equal one of the inputs).
1820  */
1821 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1822                                const BIGNUM *scalar, size_t num,
1823                                const EC_POINT *points[],
1824                                const BIGNUM *scalars[], BN_CTX *ctx)
1825 {
1826     int ret = 0;
1827     int j;
1828     int mixed = 0;
1829     BN_CTX *new_ctx = NULL;
1830     BIGNUM *x, *y, *z, *tmp_scalar;
1831     felem_bytearray g_secret;
1832     felem_bytearray *secrets = NULL;
1833     felem (*pre_comp)[17][3] = NULL;
1834     felem *tmp_felems = NULL;
1835     unsigned i;
1836     int num_bytes;
1837     int have_pre_comp = 0;
1838     size_t num_points = num;
1839     felem x_in, y_in, z_in, x_out, y_out, z_out;
1840     NISTP521_PRE_COMP *pre = NULL;
1841     felem(*g_pre_comp)[3] = NULL;
1842     EC_POINT *generator = NULL;
1843     const EC_POINT *p = NULL;
1844     const BIGNUM *p_scalar = NULL;
1845
1846     if (ctx == NULL)
1847         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1848             return 0;
1849     BN_CTX_start(ctx);
1850     if (((x = BN_CTX_get(ctx)) == NULL) ||
1851         ((y = BN_CTX_get(ctx)) == NULL) ||
1852         ((z = BN_CTX_get(ctx)) == NULL) ||
1853         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1854         goto err;
1855
1856     if (scalar != NULL) {
1857         pre = group->pre_comp.nistp521;
1858         if (pre)
1859             /* we have precomputation, try to use it */
1860             g_pre_comp = &pre->g_pre_comp[0];
1861         else
1862             /* try to use the standard precomputation */
1863             g_pre_comp = (felem(*)[3]) gmul;
1864         generator = EC_POINT_new(group);
1865         if (generator == NULL)
1866             goto err;
1867         /* get the generator from precomputation */
1868         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1869             !felem_to_BN(y, g_pre_comp[1][1]) ||
1870             !felem_to_BN(z, g_pre_comp[1][2])) {
1871             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1872             goto err;
1873         }
1874         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1875                                                       generator, x, y, z,
1876                                                       ctx))
1877             goto err;
1878         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1879             /* precomputation matches generator */
1880             have_pre_comp = 1;
1881         else
1882             /*
1883              * we don't have valid precomputation: treat the generator as a
1884              * random point
1885              */
1886             num_points++;
1887     }
1888
1889     if (num_points > 0) {
1890         if (num_points >= 2) {
1891             /*
1892              * unless we precompute multiples for just one point, converting
1893              * those into affine form is time well spent
1894              */
1895             mixed = 1;
1896         }
1897         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1898         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1899         if (mixed)
1900             tmp_felems =
1901                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1902         if ((secrets == NULL) || (pre_comp == NULL)
1903             || (mixed && (tmp_felems == NULL))) {
1904             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1905             goto err;
1906         }
1907
1908         /*
1909          * we treat NULL scalars as 0, and NULL points as points at infinity,
1910          * i.e., they contribute nothing to the linear combination
1911          */
1912         for (i = 0; i < num_points; ++i) {
1913             if (i == num)
1914                 /*
1915                  * we didn't have a valid precomputation, so we pick the
1916                  * generator
1917                  */
1918             {
1919                 p = EC_GROUP_get0_generator(group);
1920                 p_scalar = scalar;
1921             } else
1922                 /* the i^th point */
1923             {
1924                 p = points[i];
1925                 p_scalar = scalars[i];
1926             }
1927             if ((p_scalar != NULL) && (p != NULL)) {
1928                 /* reduce scalar to 0 <= scalar < 2^521 */
1929                 if ((BN_num_bits(p_scalar) > 521)
1930                     || (BN_is_negative(p_scalar))) {
1931                     /*
1932                      * this is an unusual input, and we don't guarantee
1933                      * constant-timeness
1934                      */
1935                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1936                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1937                         goto err;
1938                     }
1939                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1940                                                secrets[i], sizeof(secrets[i]));
1941                 } else {
1942                     num_bytes = BN_bn2lebinpad(p_scalar,
1943                                                secrets[i], sizeof(secrets[i]));
1944                 }
1945                 if (num_bytes < 0) {
1946                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1947                     goto err;
1948                 }
1949                 /* precompute multiples */
1950                 if ((!BN_to_felem(x_out, p->X)) ||
1951                     (!BN_to_felem(y_out, p->Y)) ||
1952                     (!BN_to_felem(z_out, p->Z)))
1953                     goto err;
1954                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1955                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1956                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1957                 for (j = 2; j <= 16; ++j) {
1958                     if (j & 1) {
1959                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1960                                   pre_comp[i][j][2], pre_comp[i][1][0],
1961                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1962                                   pre_comp[i][j - 1][0],
1963                                   pre_comp[i][j - 1][1],
1964                                   pre_comp[i][j - 1][2]);
1965                     } else {
1966                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1967                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1968                                      pre_comp[i][j / 2][1],
1969                                      pre_comp[i][j / 2][2]);
1970                     }
1971                 }
1972             }
1973         }
1974         if (mixed)
1975             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1976     }
1977
1978     /* the scalar for the generator */
1979     if ((scalar != NULL) && (have_pre_comp)) {
1980         memset(g_secret, 0, sizeof(g_secret));
1981         /* reduce scalar to 0 <= scalar < 2^521 */
1982         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1983             /*
1984              * this is an unusual input, and we don't guarantee
1985              * constant-timeness
1986              */
1987             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1988                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1989                 goto err;
1990             }
1991             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1992         } else
1993             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1994         /* do the multiplication with generator precomputation */
1995         batch_mul(x_out, y_out, z_out,
1996                   (const felem_bytearray(*))secrets, num_points,
1997                   g_secret,
1998                   mixed, (const felem(*)[17][3])pre_comp,
1999                   (const felem(*)[3])g_pre_comp);
2000     } else
2001         /* do the multiplication without generator precomputation */
2002         batch_mul(x_out, y_out, z_out,
2003                   (const felem_bytearray(*))secrets, num_points,
2004                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2005     /* reduce the output to its unique minimal representation */
2006     felem_contract(x_in, x_out);
2007     felem_contract(y_in, y_out);
2008     felem_contract(z_in, z_out);
2009     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2010         (!felem_to_BN(z, z_in))) {
2011         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2012         goto err;
2013     }
2014     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2015
2016  err:
2017     BN_CTX_end(ctx);
2018     EC_POINT_free(generator);
2019     BN_CTX_free(new_ctx);
2020     OPENSSL_free(secrets);
2021     OPENSSL_free(pre_comp);
2022     OPENSSL_free(tmp_felems);
2023     return ret;
2024 }
2025
2026 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2027 {
2028     int ret = 0;
2029     NISTP521_PRE_COMP *pre = NULL;
2030     int i, j;
2031     BN_CTX *new_ctx = NULL;
2032     BIGNUM *x, *y;
2033     EC_POINT *generator = NULL;
2034     felem tmp_felems[16];
2035
2036     /* throw away old precomputation */
2037     EC_pre_comp_free(group);
2038     if (ctx == NULL)
2039         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2040             return 0;
2041     BN_CTX_start(ctx);
2042     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2043         goto err;
2044     /* get the generator */
2045     if (group->generator == NULL)
2046         goto err;
2047     generator = EC_POINT_new(group);
2048     if (generator == NULL)
2049         goto err;
2050     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2051     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2052     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2053         goto err;
2054     if ((pre = nistp521_pre_comp_new()) == NULL)
2055         goto err;
2056     /*
2057      * if the generator is the standard one, use built-in precomputation
2058      */
2059     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2060         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2061         goto done;
2062     }
2063     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2064         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2065         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2066         goto err;
2067     /* compute 2^130*G, 2^260*G, 2^390*G */
2068     for (i = 1; i <= 4; i <<= 1) {
2069         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2070                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2071                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2072         for (j = 0; j < 129; ++j) {
2073             point_double(pre->g_pre_comp[2 * i][0],
2074                          pre->g_pre_comp[2 * i][1],
2075                          pre->g_pre_comp[2 * i][2],
2076                          pre->g_pre_comp[2 * i][0],
2077                          pre->g_pre_comp[2 * i][1],
2078                          pre->g_pre_comp[2 * i][2]);
2079         }
2080     }
2081     /* g_pre_comp[0] is the point at infinity */
2082     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2083     /* the remaining multiples */
2084     /* 2^130*G + 2^260*G */
2085     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2086               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2087               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2088               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2089               pre->g_pre_comp[2][2]);
2090     /* 2^130*G + 2^390*G */
2091     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2092               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2093               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2094               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2095               pre->g_pre_comp[2][2]);
2096     /* 2^260*G + 2^390*G */
2097     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2098               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2099               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2100               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2101               pre->g_pre_comp[4][2]);
2102     /* 2^130*G + 2^260*G + 2^390*G */
2103     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2104               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2105               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2106               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2107               pre->g_pre_comp[2][2]);
2108     for (i = 1; i < 8; ++i) {
2109         /* odd multiples: add G */
2110         point_add(pre->g_pre_comp[2 * i + 1][0],
2111                   pre->g_pre_comp[2 * i + 1][1],
2112                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2113                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2114                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2115                   pre->g_pre_comp[1][2]);
2116     }
2117     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2118
2119  done:
2120     SETPRECOMP(group, nistp521, pre);
2121     ret = 1;
2122     pre = NULL;
2123  err:
2124     BN_CTX_end(ctx);
2125     EC_POINT_free(generator);
2126     BN_CTX_free(new_ctx);
2127     EC_nistp521_pre_comp_free(pre);
2128     return ret;
2129 }
2130
2131 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2132 {
2133     return HAVEPRECOMP(group, nistp521);
2134 }
2135
2136 #endif