crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/e_os2.h>
  41
  42 #include <string.h>
  43 #include <openssl/err.h>
  44 #include "ec_local.h"
  45
  46 #if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  47   /* even with gcc, the typedef won't work for 32-bit platforms */
  48 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  49                                  * platforms */
  50 #else
  51 # error "Your compiler doesn't appear to support 128-bit integer types"
  52 #endif
  53
  54 typedef uint8_t u8;
  55 typedef uint64_t u64;
  56
  57 /*
  58  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  59  * element of this field into 66 bytes where the most significant byte
  60  * contains only a single bit. We call this an felem_bytearray.
  61  */
  62
  63 typedef u8 felem_bytearray[66];
  64
  65 /*
  66  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  67  * These values are big-endian.
  68  */
  69 static const felem_bytearray nistp521_curve_params[5] = {
  70     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff},
  79     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  86      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  87      0xff, 0xfc},
  88     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  89      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  90      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  91      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  92      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  93      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  94      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  95      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  96      0x3f, 0x00},
  97     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  98      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  99      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
 100      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
 101      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 102      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 103      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 104      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 105      0xbd, 0x66},
 106     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 107      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 108      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 109      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 110      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 111      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 112      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 113      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 114      0x66, 0x50}
 115 };
 116
 117 /*-
 118  * The representation of field elements.
 119  * ------------------------------------
 120  *
 121  * We represent field elements with nine values. These values are either 64 or
 122  * 128 bits and the field element represented is:
 123  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 124  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 125  * 58 bits apart, but are greater than 58 bits in length, the most significant
 126  * bits of each limb overlap with the least significant bits of the next.
 127  *
 128  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 129  * 'largefelem' */
 130
 131 #define NLIMBS 9
 132
 133 typedef uint64_t limb;
 134 typedef limb felem[NLIMBS];
 135 typedef uint128_t largefelem[NLIMBS];
 136
 137 static const limb bottom57bits = 0x1ffffffffffffff;
 138 static const limb bottom58bits = 0x3ffffffffffffff;
 139
 140 /*
 141  * bin66_to_felem takes a little-endian byte array and converts it into felem
 142  * form. This assumes that the CPU is little-endian.
 143  */
 144 static void bin66_to_felem(felem out, const u8 in[66])
 145 {
 146     out[0] = (*((limb *) & in[0])) & bottom58bits;
 147     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 148     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 149     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 150     out[4] = (*((limb *) & in[29])) & bottom58bits;
 151     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 152     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 153     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 154     out[8] = (*((limb *) & in[58])) & bottom57bits;
 155 }
 156
 157 /*
 158  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 159  * array. This assumes that the CPU is little-endian.
 160  */
 161 static void felem_to_bin66(u8 out[66], const felem in)
 162 {
 163     memset(out, 0, 66);
 164     (*((limb *) & out[0])) = in[0];
 165     (*((limb *) & out[7])) |= in[1] << 2;
 166     (*((limb *) & out[14])) |= in[2] << 4;
 167     (*((limb *) & out[21])) |= in[3] << 6;
 168     (*((limb *) & out[29])) = in[4];
 169     (*((limb *) & out[36])) |= in[5] << 2;
 170     (*((limb *) & out[43])) |= in[6] << 4;
 171     (*((limb *) & out[50])) |= in[7] << 6;
 172     (*((limb *) & out[58])) = in[8];
 173 }
 174
 175 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 176 static int BN_to_felem(felem out, const BIGNUM *bn)
 177 {
 178     felem_bytearray b_out;
 179     int num_bytes;
 180
 181     if (BN_is_negative(bn)) {
 182         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 183         return 0;
 184     }
 185     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 186     if (num_bytes < 0) {
 187         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 188         return 0;
 189     }
 190     bin66_to_felem(out, b_out);
 191     return 1;
 192 }
 193
 194 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 195 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 196 {
 197     felem_bytearray b_out;
 198     felem_to_bin66(b_out, in);
 199     return BN_lebin2bn(b_out, sizeof(b_out), out);
 200 }
 201
 202 /*-
 203  * Field operations
 204  * ----------------
 205  */
 206
 207 static void felem_one(felem out)
 208 {
 209     out[0] = 1;
 210     out[1] = 0;
 211     out[2] = 0;
 212     out[3] = 0;
 213     out[4] = 0;
 214     out[5] = 0;
 215     out[6] = 0;
 216     out[7] = 0;
 217     out[8] = 0;
 218 }
 219
 220 static void felem_assign(felem out, const felem in)
 221 {
 222     out[0] = in[0];
 223     out[1] = in[1];
 224     out[2] = in[2];
 225     out[3] = in[3];
 226     out[4] = in[4];
 227     out[5] = in[5];
 228     out[6] = in[6];
 229     out[7] = in[7];
 230     out[8] = in[8];
 231 }
 232
 233 /* felem_sum64 sets out = out + in. */
 234 static void felem_sum64(felem out, const felem in)
 235 {
 236     out[0] += in[0];
 237     out[1] += in[1];
 238     out[2] += in[2];
 239     out[3] += in[3];
 240     out[4] += in[4];
 241     out[5] += in[5];
 242     out[6] += in[6];
 243     out[7] += in[7];
 244     out[8] += in[8];
 245 }
 246
 247 /* felem_scalar sets out = in * scalar */
 248 static void felem_scalar(felem out, const felem in, limb scalar)
 249 {
 250     out[0] = in[0] * scalar;
 251     out[1] = in[1] * scalar;
 252     out[2] = in[2] * scalar;
 253     out[3] = in[3] * scalar;
 254     out[4] = in[4] * scalar;
 255     out[5] = in[5] * scalar;
 256     out[6] = in[6] * scalar;
 257     out[7] = in[7] * scalar;
 258     out[8] = in[8] * scalar;
 259 }
 260
 261 /* felem_scalar64 sets out = out * scalar */
 262 static void felem_scalar64(felem out, limb scalar)
 263 {
 264     out[0] *= scalar;
 265     out[1] *= scalar;
 266     out[2] *= scalar;
 267     out[3] *= scalar;
 268     out[4] *= scalar;
 269     out[5] *= scalar;
 270     out[6] *= scalar;
 271     out[7] *= scalar;
 272     out[8] *= scalar;
 273 }
 274
 275 /* felem_scalar128 sets out = out * scalar */
 276 static void felem_scalar128(largefelem out, limb scalar)
 277 {
 278     out[0] *= scalar;
 279     out[1] *= scalar;
 280     out[2] *= scalar;
 281     out[3] *= scalar;
 282     out[4] *= scalar;
 283     out[5] *= scalar;
 284     out[6] *= scalar;
 285     out[7] *= scalar;
 286     out[8] *= scalar;
 287 }
 288
 289 /*-
 290  * felem_neg sets |out| to |-in|
 291  * On entry:
 292  *   in[i] < 2^59 + 2^14
 293  * On exit:
 294  *   out[i] < 2^62
 295  */
 296 static void felem_neg(felem out, const felem in)
 297 {
 298     /* In order to prevent underflow, we subtract from 0 mod p. */
 299     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 300     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 301
 302     out[0] = two62m3 - in[0];
 303     out[1] = two62m2 - in[1];
 304     out[2] = two62m2 - in[2];
 305     out[3] = two62m2 - in[3];
 306     out[4] = two62m2 - in[4];
 307     out[5] = two62m2 - in[5];
 308     out[6] = two62m2 - in[6];
 309     out[7] = two62m2 - in[7];
 310     out[8] = two62m2 - in[8];
 311 }
 312
 313 /*-
 314  * felem_diff64 subtracts |in| from |out|
 315  * On entry:
 316  *   in[i] < 2^59 + 2^14
 317  * On exit:
 318  *   out[i] < out[i] + 2^62
 319  */
 320 static void felem_diff64(felem out, const felem in)
 321 {
 322     /*
 323      * In order to prevent underflow, we add 0 mod p before subtracting.
 324      */
 325     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 326     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 327
 328     out[0] += two62m3 - in[0];
 329     out[1] += two62m2 - in[1];
 330     out[2] += two62m2 - in[2];
 331     out[3] += two62m2 - in[3];
 332     out[4] += two62m2 - in[4];
 333     out[5] += two62m2 - in[5];
 334     out[6] += two62m2 - in[6];
 335     out[7] += two62m2 - in[7];
 336     out[8] += two62m2 - in[8];
 337 }
 338
 339 /*-
 340  * felem_diff_128_64 subtracts |in| from |out|
 341  * On entry:
 342  *   in[i] < 2^62 + 2^17
 343  * On exit:
 344  *   out[i] < out[i] + 2^63
 345  */
 346 static void felem_diff_128_64(largefelem out, const felem in)
 347 {
 348     /*
 349      * In order to prevent underflow, we add 64p mod p (which is equivalent
 350      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 351      * digit number with all bits set to 1. See "The representation of field
 352      * elements" comment above for a description of how limbs are used to
 353      * represent a number. 64p is represented with 8 limbs containing a number
 354      * with 58 bits set and one limb with a number with 57 bits set.
 355      */
 356     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 357     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 358
 359     out[0] += two63m6 - in[0];
 360     out[1] += two63m5 - in[1];
 361     out[2] += two63m5 - in[2];
 362     out[3] += two63m5 - in[3];
 363     out[4] += two63m5 - in[4];
 364     out[5] += two63m5 - in[5];
 365     out[6] += two63m5 - in[6];
 366     out[7] += two63m5 - in[7];
 367     out[8] += two63m5 - in[8];
 368 }
 369
 370 /*-
 371  * felem_diff_128_64 subtracts |in| from |out|
 372  * On entry:
 373  *   in[i] < 2^126
 374  * On exit:
 375  *   out[i] < out[i] + 2^127 - 2^69
 376  */
 377 static void felem_diff128(largefelem out, const largefelem in)
 378 {
 379     /*
 380      * In order to prevent underflow, we add 0 mod p before subtracting.
 381      */
 382     static const uint128_t two127m70 =
 383         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 384     static const uint128_t two127m69 =
 385         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 386
 387     out[0] += (two127m70 - in[0]);
 388     out[1] += (two127m69 - in[1]);
 389     out[2] += (two127m69 - in[2]);
 390     out[3] += (two127m69 - in[3]);
 391     out[4] += (two127m69 - in[4]);
 392     out[5] += (two127m69 - in[5]);
 393     out[6] += (two127m69 - in[6]);
 394     out[7] += (two127m69 - in[7]);
 395     out[8] += (two127m69 - in[8]);
 396 }
 397
 398 /*-
 399  * felem_square sets |out| = |in|^2
 400  * On entry:
 401  *   in[i] < 2^62
 402  * On exit:
 403  *   out[i] < 17 * max(in[i]) * max(in[i])
 404  */
 405 static void felem_square(largefelem out, const felem in)
 406 {
 407     felem inx2, inx4;
 408     felem_scalar(inx2, in, 2);
 409     felem_scalar(inx4, in, 4);
 410
 411     /*-
 412      * We have many cases were we want to do
 413      *   in[x] * in[y] +
 414      *   in[y] * in[x]
 415      * This is obviously just
 416      *   2 * in[x] * in[y]
 417      * However, rather than do the doubling on the 128 bit result, we
 418      * double one of the inputs to the multiplication by reading from
 419      * |inx2|
 420      */
 421
 422     out[0] = ((uint128_t) in[0]) * in[0];
 423     out[1] = ((uint128_t) in[0]) * inx2[1];
 424     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 425     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 426     out[4] = ((uint128_t) in[0]) * inx2[4] +
 427              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 428     out[5] = ((uint128_t) in[0]) * inx2[5] +
 429              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 430     out[6] = ((uint128_t) in[0]) * inx2[6] +
 431              ((uint128_t) in[1]) * inx2[5] +
 432              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 433     out[7] = ((uint128_t) in[0]) * inx2[7] +
 434              ((uint128_t) in[1]) * inx2[6] +
 435              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 436     out[8] = ((uint128_t) in[0]) * inx2[8] +
 437              ((uint128_t) in[1]) * inx2[7] +
 438              ((uint128_t) in[2]) * inx2[6] +
 439              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 440
 441     /*
 442      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 443      * They correspond to locations one bit up from the limbs produced above
 444      * so we would have to multiply by two to align them. Again, rather than
 445      * operate on the 128-bit result, we double one of the inputs to the
 446      * multiplication. If we want to double for both this reason, and the
 447      * reason above, then we end up multiplying by four.
 448      */
 449
 450     /* 9 */
 451     out[0] += ((uint128_t) in[1]) * inx4[8] +
 452               ((uint128_t) in[2]) * inx4[7] +
 453               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 454
 455     /* 10 */
 456     out[1] += ((uint128_t) in[2]) * inx4[8] +
 457               ((uint128_t) in[3]) * inx4[7] +
 458               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 459
 460     /* 11 */
 461     out[2] += ((uint128_t) in[3]) * inx4[8] +
 462               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 463
 464     /* 12 */
 465     out[3] += ((uint128_t) in[4]) * inx4[8] +
 466               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 467
 468     /* 13 */
 469     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 470
 471     /* 14 */
 472     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 473
 474     /* 15 */
 475     out[6] += ((uint128_t) in[7]) * inx4[8];
 476
 477     /* 16 */
 478     out[7] += ((uint128_t) in[8]) * inx2[8];
 479 }
 480
 481 /*-
 482  * felem_mul sets |out| = |in1| * |in2|
 483  * On entry:
 484  *   in1[i] < 2^64
 485  *   in2[i] < 2^63
 486  * On exit:
 487  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 488  */
 489 static void felem_mul(largefelem out, const felem in1, const felem in2)
 490 {
 491     felem in2x2;
 492     felem_scalar(in2x2, in2, 2);
 493
 494     out[0] = ((uint128_t) in1[0]) * in2[0];
 495
 496     out[1] = ((uint128_t) in1[0]) * in2[1] +
 497              ((uint128_t) in1[1]) * in2[0];
 498
 499     out[2] = ((uint128_t) in1[0]) * in2[2] +
 500              ((uint128_t) in1[1]) * in2[1] +
 501              ((uint128_t) in1[2]) * in2[0];
 502
 503     out[3] = ((uint128_t) in1[0]) * in2[3] +
 504              ((uint128_t) in1[1]) * in2[2] +
 505              ((uint128_t) in1[2]) * in2[1] +
 506              ((uint128_t) in1[3]) * in2[0];
 507
 508     out[4] = ((uint128_t) in1[0]) * in2[4] +
 509              ((uint128_t) in1[1]) * in2[3] +
 510              ((uint128_t) in1[2]) * in2[2] +
 511              ((uint128_t) in1[3]) * in2[1] +
 512              ((uint128_t) in1[4]) * in2[0];
 513
 514     out[5] = ((uint128_t) in1[0]) * in2[5] +
 515              ((uint128_t) in1[1]) * in2[4] +
 516              ((uint128_t) in1[2]) * in2[3] +
 517              ((uint128_t) in1[3]) * in2[2] +
 518              ((uint128_t) in1[4]) * in2[1] +
 519              ((uint128_t) in1[5]) * in2[0];
 520
 521     out[6] = ((uint128_t) in1[0]) * in2[6] +
 522              ((uint128_t) in1[1]) * in2[5] +
 523              ((uint128_t) in1[2]) * in2[4] +
 524              ((uint128_t) in1[3]) * in2[3] +
 525              ((uint128_t) in1[4]) * in2[2] +
 526              ((uint128_t) in1[5]) * in2[1] +
 527              ((uint128_t) in1[6]) * in2[0];
 528
 529     out[7] = ((uint128_t) in1[0]) * in2[7] +
 530              ((uint128_t) in1[1]) * in2[6] +
 531              ((uint128_t) in1[2]) * in2[5] +
 532              ((uint128_t) in1[3]) * in2[4] +
 533              ((uint128_t) in1[4]) * in2[3] +
 534              ((uint128_t) in1[5]) * in2[2] +
 535              ((uint128_t) in1[6]) * in2[1] +
 536              ((uint128_t) in1[7]) * in2[0];
 537
 538     out[8] = ((uint128_t) in1[0]) * in2[8] +
 539              ((uint128_t) in1[1]) * in2[7] +
 540              ((uint128_t) in1[2]) * in2[6] +
 541              ((uint128_t) in1[3]) * in2[5] +
 542              ((uint128_t) in1[4]) * in2[4] +
 543              ((uint128_t) in1[5]) * in2[3] +
 544              ((uint128_t) in1[6]) * in2[2] +
 545              ((uint128_t) in1[7]) * in2[1] +
 546              ((uint128_t) in1[8]) * in2[0];
 547
 548     /* See comment in felem_square about the use of in2x2 here */
 549
 550     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 551               ((uint128_t) in1[2]) * in2x2[7] +
 552               ((uint128_t) in1[3]) * in2x2[6] +
 553               ((uint128_t) in1[4]) * in2x2[5] +
 554               ((uint128_t) in1[5]) * in2x2[4] +
 555               ((uint128_t) in1[6]) * in2x2[3] +
 556               ((uint128_t) in1[7]) * in2x2[2] +
 557               ((uint128_t) in1[8]) * in2x2[1];
 558
 559     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 560               ((uint128_t) in1[3]) * in2x2[7] +
 561               ((uint128_t) in1[4]) * in2x2[6] +
 562               ((uint128_t) in1[5]) * in2x2[5] +
 563               ((uint128_t) in1[6]) * in2x2[4] +
 564               ((uint128_t) in1[7]) * in2x2[3] +
 565               ((uint128_t) in1[8]) * in2x2[2];
 566
 567     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 568               ((uint128_t) in1[4]) * in2x2[7] +
 569               ((uint128_t) in1[5]) * in2x2[6] +
 570               ((uint128_t) in1[6]) * in2x2[5] +
 571               ((uint128_t) in1[7]) * in2x2[4] +
 572               ((uint128_t) in1[8]) * in2x2[3];
 573
 574     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 575               ((uint128_t) in1[5]) * in2x2[7] +
 576               ((uint128_t) in1[6]) * in2x2[6] +
 577               ((uint128_t) in1[7]) * in2x2[5] +
 578               ((uint128_t) in1[8]) * in2x2[4];
 579
 580     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 581               ((uint128_t) in1[6]) * in2x2[7] +
 582               ((uint128_t) in1[7]) * in2x2[6] +
 583               ((uint128_t) in1[8]) * in2x2[5];
 584
 585     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 586               ((uint128_t) in1[7]) * in2x2[7] +
 587               ((uint128_t) in1[8]) * in2x2[6];
 588
 589     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 590               ((uint128_t) in1[8]) * in2x2[7];
 591
 592     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 593 }
 594
 595 static const limb bottom52bits = 0xfffffffffffff;
 596
 597 /*-
 598  * felem_reduce converts a largefelem to an felem.
 599  * On entry:
 600  *   in[i] < 2^128
 601  * On exit:
 602  *   out[i] < 2^59 + 2^14
 603  */
 604 static void felem_reduce(felem out, const largefelem in)
 605 {
 606     u64 overflow1, overflow2;
 607
 608     out[0] = ((limb) in[0]) & bottom58bits;
 609     out[1] = ((limb) in[1]) & bottom58bits;
 610     out[2] = ((limb) in[2]) & bottom58bits;
 611     out[3] = ((limb) in[3]) & bottom58bits;
 612     out[4] = ((limb) in[4]) & bottom58bits;
 613     out[5] = ((limb) in[5]) & bottom58bits;
 614     out[6] = ((limb) in[6]) & bottom58bits;
 615     out[7] = ((limb) in[7]) & bottom58bits;
 616     out[8] = ((limb) in[8]) & bottom58bits;
 617
 618     /* out[i] < 2^58 */
 619
 620     out[1] += ((limb) in[0]) >> 58;
 621     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 622     /*-
 623      * out[1] < 2^58 + 2^6 + 2^58
 624      *        = 2^59 + 2^6
 625      */
 626     out[2] += ((limb) (in[0] >> 64)) >> 52;
 627
 628     out[2] += ((limb) in[1]) >> 58;
 629     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 630     out[3] += ((limb) (in[1] >> 64)) >> 52;
 631
 632     out[3] += ((limb) in[2]) >> 58;
 633     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 634     out[4] += ((limb) (in[2] >> 64)) >> 52;
 635
 636     out[4] += ((limb) in[3]) >> 58;
 637     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 638     out[5] += ((limb) (in[3] >> 64)) >> 52;
 639
 640     out[5] += ((limb) in[4]) >> 58;
 641     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 642     out[6] += ((limb) (in[4] >> 64)) >> 52;
 643
 644     out[6] += ((limb) in[5]) >> 58;
 645     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 646     out[7] += ((limb) (in[5] >> 64)) >> 52;
 647
 648     out[7] += ((limb) in[6]) >> 58;
 649     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 650     out[8] += ((limb) (in[6] >> 64)) >> 52;
 651
 652     out[8] += ((limb) in[7]) >> 58;
 653     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 654     /*-
 655      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 656      *            < 2^59 + 2^13
 657      */
 658     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 659
 660     overflow1 += ((limb) in[8]) >> 58;
 661     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 662     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 663
 664     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 665     overflow2 <<= 1;            /* overflow2 < 2^13 */
 666
 667     out[0] += overflow1;        /* out[0] < 2^60 */
 668     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 669
 670     out[1] += out[0] >> 58;
 671     out[0] &= bottom58bits;
 672     /*-
 673      * out[0] < 2^58
 674      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 675      *        < 2^59 + 2^14
 676      */
 677 }
 678
 679 static void felem_square_reduce(felem out, const felem in)
 680 {
 681     largefelem tmp;
 682     felem_square(tmp, in);
 683     felem_reduce(out, tmp);
 684 }
 685
 686 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 687 {
 688     largefelem tmp;
 689     felem_mul(tmp, in1, in2);
 690     felem_reduce(out, tmp);
 691 }
 692
 693 /*-
 694  * felem_inv calculates |out| = |in|^{-1}
 695  *
 696  * Based on Fermat's Little Theorem:
 697  *   a^p = a (mod p)
 698  *   a^{p-1} = 1 (mod p)
 699  *   a^{p-2} = a^{-1} (mod p)
 700  */
 701 static void felem_inv(felem out, const felem in)
 702 {
 703     felem ftmp, ftmp2, ftmp3, ftmp4;
 704     largefelem tmp;
 705     unsigned i;
 706
 707     felem_square(tmp, in);
 708     felem_reduce(ftmp, tmp);    /* 2^1 */
 709     felem_mul(tmp, in, ftmp);
 710     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 711     felem_assign(ftmp2, ftmp);
 712     felem_square(tmp, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 714     felem_mul(tmp, in, ftmp);
 715     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 716     felem_square(tmp, ftmp);
 717     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 718
 719     felem_square(tmp, ftmp2);
 720     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 721     felem_square(tmp, ftmp3);
 722     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 723     felem_mul(tmp, ftmp3, ftmp2);
 724     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 725
 726     felem_assign(ftmp2, ftmp3);
 727     felem_square(tmp, ftmp3);
 728     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 729     felem_square(tmp, ftmp3);
 730     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 731     felem_square(tmp, ftmp3);
 732     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 733     felem_square(tmp, ftmp3);
 734     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 735     felem_assign(ftmp4, ftmp3);
 736     felem_mul(tmp, ftmp3, ftmp);
 737     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 738     felem_square(tmp, ftmp4);
 739     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 740     felem_mul(tmp, ftmp3, ftmp2);
 741     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 742     felem_assign(ftmp2, ftmp3);
 743
 744     for (i = 0; i < 8; i++) {
 745         felem_square(tmp, ftmp3);
 746         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 747     }
 748     felem_mul(tmp, ftmp3, ftmp2);
 749     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 750     felem_assign(ftmp2, ftmp3);
 751
 752     for (i = 0; i < 16; i++) {
 753         felem_square(tmp, ftmp3);
 754         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 755     }
 756     felem_mul(tmp, ftmp3, ftmp2);
 757     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 758     felem_assign(ftmp2, ftmp3);
 759
 760     for (i = 0; i < 32; i++) {
 761         felem_square(tmp, ftmp3);
 762         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 763     }
 764     felem_mul(tmp, ftmp3, ftmp2);
 765     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 766     felem_assign(ftmp2, ftmp3);
 767
 768     for (i = 0; i < 64; i++) {
 769         felem_square(tmp, ftmp3);
 770         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 771     }
 772     felem_mul(tmp, ftmp3, ftmp2);
 773     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 774     felem_assign(ftmp2, ftmp3);
 775
 776     for (i = 0; i < 128; i++) {
 777         felem_square(tmp, ftmp3);
 778         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 779     }
 780     felem_mul(tmp, ftmp3, ftmp2);
 781     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 782     felem_assign(ftmp2, ftmp3);
 783
 784     for (i = 0; i < 256; i++) {
 785         felem_square(tmp, ftmp3);
 786         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 787     }
 788     felem_mul(tmp, ftmp3, ftmp2);
 789     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 790
 791     for (i = 0; i < 9; i++) {
 792         felem_square(tmp, ftmp3);
 793         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 794     }
 795     felem_mul(tmp, ftmp3, ftmp4);
 796     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 797     felem_mul(tmp, ftmp3, in);
 798     felem_reduce(out, tmp);     /* 2^512 - 3 */
 799 }
 800
 801 /* This is 2^521-1, expressed as an felem */
 802 static const felem kPrime = {
 803     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 804     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 805     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 806 };
 807
 808 /*-
 809  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 810  * otherwise.
 811  * On entry:
 812  *   in[i] < 2^59 + 2^14
 813  */
 814 static limb felem_is_zero(const felem in)
 815 {
 816     felem ftmp;
 817     limb is_zero, is_p;
 818     felem_assign(ftmp, in);
 819
 820     ftmp[0] += ftmp[8] >> 57;
 821     ftmp[8] &= bottom57bits;
 822     /* ftmp[8] < 2^57 */
 823     ftmp[1] += ftmp[0] >> 58;
 824     ftmp[0] &= bottom58bits;
 825     ftmp[2] += ftmp[1] >> 58;
 826     ftmp[1] &= bottom58bits;
 827     ftmp[3] += ftmp[2] >> 58;
 828     ftmp[2] &= bottom58bits;
 829     ftmp[4] += ftmp[3] >> 58;
 830     ftmp[3] &= bottom58bits;
 831     ftmp[5] += ftmp[4] >> 58;
 832     ftmp[4] &= bottom58bits;
 833     ftmp[6] += ftmp[5] >> 58;
 834     ftmp[5] &= bottom58bits;
 835     ftmp[7] += ftmp[6] >> 58;
 836     ftmp[6] &= bottom58bits;
 837     ftmp[8] += ftmp[7] >> 58;
 838     ftmp[7] &= bottom58bits;
 839     /* ftmp[8] < 2^57 + 4 */
 840
 841     /*
 842      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 843      * than our bound for ftmp[8]. Therefore we only have to check if the
 844      * zero is zero or 2^521-1.
 845      */
 846
 847     is_zero = 0;
 848     is_zero |= ftmp[0];
 849     is_zero |= ftmp[1];
 850     is_zero |= ftmp[2];
 851     is_zero |= ftmp[3];
 852     is_zero |= ftmp[4];
 853     is_zero |= ftmp[5];
 854     is_zero |= ftmp[6];
 855     is_zero |= ftmp[7];
 856     is_zero |= ftmp[8];
 857
 858     is_zero--;
 859     /*
 860      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 861      * can be set is if is_zero was 0 before the decrement.
 862      */
 863     is_zero = 0 - (is_zero >> 63);
 864
 865     is_p = ftmp[0] ^ kPrime[0];
 866     is_p |= ftmp[1] ^ kPrime[1];
 867     is_p |= ftmp[2] ^ kPrime[2];
 868     is_p |= ftmp[3] ^ kPrime[3];
 869     is_p |= ftmp[4] ^ kPrime[4];
 870     is_p |= ftmp[5] ^ kPrime[5];
 871     is_p |= ftmp[6] ^ kPrime[6];
 872     is_p |= ftmp[7] ^ kPrime[7];
 873     is_p |= ftmp[8] ^ kPrime[8];
 874
 875     is_p--;
 876     is_p = 0 - (is_p >> 63);
 877
 878     is_zero |= is_p;
 879     return is_zero;
 880 }
 881
 882 static int felem_is_zero_int(const void *in)
 883 {
 884     return (int)(felem_is_zero(in) & ((limb) 1));
 885 }
 886
 887 /*-
 888  * felem_contract converts |in| to its unique, minimal representation.
 889  * On entry:
 890  *   in[i] < 2^59 + 2^14
 891  */
 892 static void felem_contract(felem out, const felem in)
 893 {
 894     limb is_p, is_greater, sign;
 895     static const limb two58 = ((limb) 1) << 58;
 896
 897     felem_assign(out, in);
 898
 899     out[0] += out[8] >> 57;
 900     out[8] &= bottom57bits;
 901     /* out[8] < 2^57 */
 902     out[1] += out[0] >> 58;
 903     out[0] &= bottom58bits;
 904     out[2] += out[1] >> 58;
 905     out[1] &= bottom58bits;
 906     out[3] += out[2] >> 58;
 907     out[2] &= bottom58bits;
 908     out[4] += out[3] >> 58;
 909     out[3] &= bottom58bits;
 910     out[5] += out[4] >> 58;
 911     out[4] &= bottom58bits;
 912     out[6] += out[5] >> 58;
 913     out[5] &= bottom58bits;
 914     out[7] += out[6] >> 58;
 915     out[6] &= bottom58bits;
 916     out[8] += out[7] >> 58;
 917     out[7] &= bottom58bits;
 918     /* out[8] < 2^57 + 4 */
 919
 920     /*
 921      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 922      * out. See the comments in felem_is_zero regarding why we don't test for
 923      * other multiples of the prime.
 924      */
 925
 926     /*
 927      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 928      */
 929
 930     is_p = out[0] ^ kPrime[0];
 931     is_p |= out[1] ^ kPrime[1];
 932     is_p |= out[2] ^ kPrime[2];
 933     is_p |= out[3] ^ kPrime[3];
 934     is_p |= out[4] ^ kPrime[4];
 935     is_p |= out[5] ^ kPrime[5];
 936     is_p |= out[6] ^ kPrime[6];
 937     is_p |= out[7] ^ kPrime[7];
 938     is_p |= out[8] ^ kPrime[8];
 939
 940     is_p--;
 941     is_p &= is_p << 32;
 942     is_p &= is_p << 16;
 943     is_p &= is_p << 8;
 944     is_p &= is_p << 4;
 945     is_p &= is_p << 2;
 946     is_p &= is_p << 1;
 947     is_p = 0 - (is_p >> 63);
 948     is_p = ~is_p;
 949
 950     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 951
 952     out[0] &= is_p;
 953     out[1] &= is_p;
 954     out[2] &= is_p;
 955     out[3] &= is_p;
 956     out[4] &= is_p;
 957     out[5] &= is_p;
 958     out[6] &= is_p;
 959     out[7] &= is_p;
 960     out[8] &= is_p;
 961
 962     /*
 963      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 964      * 57 is greater than zero as (2^521-1) + x >= 2^522
 965      */
 966     is_greater = out[8] >> 57;
 967     is_greater |= is_greater << 32;
 968     is_greater |= is_greater << 16;
 969     is_greater |= is_greater << 8;
 970     is_greater |= is_greater << 4;
 971     is_greater |= is_greater << 2;
 972     is_greater |= is_greater << 1;
 973     is_greater = 0 - (is_greater >> 63);
 974
 975     out[0] -= kPrime[0] & is_greater;
 976     out[1] -= kPrime[1] & is_greater;
 977     out[2] -= kPrime[2] & is_greater;
 978     out[3] -= kPrime[3] & is_greater;
 979     out[4] -= kPrime[4] & is_greater;
 980     out[5] -= kPrime[5] & is_greater;
 981     out[6] -= kPrime[6] & is_greater;
 982     out[7] -= kPrime[7] & is_greater;
 983     out[8] -= kPrime[8] & is_greater;
 984
 985     /* Eliminate negative coefficients */
 986     sign = -(out[0] >> 63);
 987     out[0] += (two58 & sign);
 988     out[1] -= (1 & sign);
 989     sign = -(out[1] >> 63);
 990     out[1] += (two58 & sign);
 991     out[2] -= (1 & sign);
 992     sign = -(out[2] >> 63);
 993     out[2] += (two58 & sign);
 994     out[3] -= (1 & sign);
 995     sign = -(out[3] >> 63);
 996     out[3] += (two58 & sign);
 997     out[4] -= (1 & sign);
 998     sign = -(out[4] >> 63);
 999     out[4] += (two58 & sign);
1000     out[5] -= (1 & sign);
1001     sign = -(out[0] >> 63);
1002     out[5] += (two58 & sign);
1003     out[6] -= (1 & sign);
1004     sign = -(out[6] >> 63);
1005     out[6] += (two58 & sign);
1006     out[7] -= (1 & sign);
1007     sign = -(out[7] >> 63);
1008     out[7] += (two58 & sign);
1009     out[8] -= (1 & sign);
1010     sign = -(out[5] >> 63);
1011     out[5] += (two58 & sign);
1012     out[6] -= (1 & sign);
1013     sign = -(out[6] >> 63);
1014     out[6] += (two58 & sign);
1015     out[7] -= (1 & sign);
1016     sign = -(out[7] >> 63);
1017     out[7] += (two58 & sign);
1018     out[8] -= (1 & sign);
1019 }
1020
1021 /*-
1022  * Group operations
1023  * ----------------
1024  *
1025  * Building on top of the field operations we have the operations on the
1026  * elliptic curve group itself. Points on the curve are represented in Jacobian
1027  * coordinates */
1028
1029 /*-
1030  * point_double calculates 2*(x_in, y_in, z_in)
1031  *
1032  * The method is taken from:
1033  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1034  *
1035  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1036  * while x_out == y_in is not (maybe this works, but it's not tested). */
1037 static void
1038 point_double(felem x_out, felem y_out, felem z_out,
1039              const felem x_in, const felem y_in, const felem z_in)
1040 {
1041     largefelem tmp, tmp2;
1042     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1043
1044     felem_assign(ftmp, x_in);
1045     felem_assign(ftmp2, x_in);
1046
1047     /* delta = z^2 */
1048     felem_square(tmp, z_in);
1049     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1050
1051     /* gamma = y^2 */
1052     felem_square(tmp, y_in);
1053     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1054
1055     /* beta = x*gamma */
1056     felem_mul(tmp, x_in, gamma);
1057     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1058
1059     /* alpha = 3*(x-delta)*(x+delta) */
1060     felem_diff64(ftmp, delta);
1061     /* ftmp[i] < 2^61 */
1062     felem_sum64(ftmp2, delta);
1063     /* ftmp2[i] < 2^60 + 2^15 */
1064     felem_scalar64(ftmp2, 3);
1065     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1066     felem_mul(tmp, ftmp, ftmp2);
1067     /*-
1068      * tmp[i] < 17(3*2^121 + 3*2^76)
1069      *        = 61*2^121 + 61*2^76
1070      *        < 64*2^121 + 64*2^76
1071      *        = 2^127 + 2^82
1072      *        < 2^128
1073      */
1074     felem_reduce(alpha, tmp);
1075
1076     /* x' = alpha^2 - 8*beta */
1077     felem_square(tmp, alpha);
1078     /*
1079      * tmp[i] < 17*2^120 < 2^125
1080      */
1081     felem_assign(ftmp, beta);
1082     felem_scalar64(ftmp, 8);
1083     /* ftmp[i] < 2^62 + 2^17 */
1084     felem_diff_128_64(tmp, ftmp);
1085     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1086     felem_reduce(x_out, tmp);
1087
1088     /* z' = (y + z)^2 - gamma - delta */
1089     felem_sum64(delta, gamma);
1090     /* delta[i] < 2^60 + 2^15 */
1091     felem_assign(ftmp, y_in);
1092     felem_sum64(ftmp, z_in);
1093     /* ftmp[i] < 2^60 + 2^15 */
1094     felem_square(tmp, ftmp);
1095     /*
1096      * tmp[i] < 17(2^122) < 2^127
1097      */
1098     felem_diff_128_64(tmp, delta);
1099     /* tmp[i] < 2^127 + 2^63 */
1100     felem_reduce(z_out, tmp);
1101
1102     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1103     felem_scalar64(beta, 4);
1104     /* beta[i] < 2^61 + 2^16 */
1105     felem_diff64(beta, x_out);
1106     /* beta[i] < 2^61 + 2^60 + 2^16 */
1107     felem_mul(tmp, alpha, beta);
1108     /*-
1109      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1110      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1111      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1112      *        < 2^128
1113      */
1114     felem_square(tmp2, gamma);
1115     /*-
1116      * tmp2[i] < 17*(2^59 + 2^14)^2
1117      *         = 17*(2^118 + 2^74 + 2^28)
1118      */
1119     felem_scalar128(tmp2, 8);
1120     /*-
1121      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1122      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1123      *         < 2^126
1124      */
1125     felem_diff128(tmp, tmp2);
1126     /*-
1127      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1128      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1129      *          2^74 + 2^69 + 2^34 + 2^30
1130      *        < 2^128
1131      */
1132     felem_reduce(y_out, tmp);
1133 }
1134
1135 /* copy_conditional copies in to out iff mask is all ones. */
1136 static void copy_conditional(felem out, const felem in, limb mask)
1137 {
1138     unsigned i;
1139     for (i = 0; i < NLIMBS; ++i) {
1140         const limb tmp = mask & (in[i] ^ out[i]);
1141         out[i] ^= tmp;
1142     }
1143 }
1144
1145 /*-
1146  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1147  *
1148  * The method is taken from
1149  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1150  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1151  *
1152  * This function includes a branch for checking whether the two input points
1153  * are equal (while not equal to the point at infinity). See comment below
1154  * on constant-time.
1155  */
1156 static void point_add(felem x3, felem y3, felem z3,
1157                       const felem x1, const felem y1, const felem z1,
1158                       const int mixed, const felem x2, const felem y2,
1159                       const felem z2)
1160 {
1161     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1162     largefelem tmp, tmp2;
1163     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1164     limb points_equal;
1165
1166     z1_is_zero = felem_is_zero(z1);
1167     z2_is_zero = felem_is_zero(z2);
1168
1169     /* ftmp = z1z1 = z1**2 */
1170     felem_square(tmp, z1);
1171     felem_reduce(ftmp, tmp);
1172
1173     if (!mixed) {
1174         /* ftmp2 = z2z2 = z2**2 */
1175         felem_square(tmp, z2);
1176         felem_reduce(ftmp2, tmp);
1177
1178         /* u1 = ftmp3 = x1*z2z2 */
1179         felem_mul(tmp, x1, ftmp2);
1180         felem_reduce(ftmp3, tmp);
1181
1182         /* ftmp5 = z1 + z2 */
1183         felem_assign(ftmp5, z1);
1184         felem_sum64(ftmp5, z2);
1185         /* ftmp5[i] < 2^61 */
1186
1187         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1188         felem_square(tmp, ftmp5);
1189         /* tmp[i] < 17*2^122 */
1190         felem_diff_128_64(tmp, ftmp);
1191         /* tmp[i] < 17*2^122 + 2^63 */
1192         felem_diff_128_64(tmp, ftmp2);
1193         /* tmp[i] < 17*2^122 + 2^64 */
1194         felem_reduce(ftmp5, tmp);
1195
1196         /* ftmp2 = z2 * z2z2 */
1197         felem_mul(tmp, ftmp2, z2);
1198         felem_reduce(ftmp2, tmp);
1199
1200         /* s1 = ftmp6 = y1 * z2**3 */
1201         felem_mul(tmp, y1, ftmp2);
1202         felem_reduce(ftmp6, tmp);
1203     } else {
1204         /*
1205          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1206          */
1207
1208         /* u1 = ftmp3 = x1*z2z2 */
1209         felem_assign(ftmp3, x1);
1210
1211         /* ftmp5 = 2*z1z2 */
1212         felem_scalar(ftmp5, z1, 2);
1213
1214         /* s1 = ftmp6 = y1 * z2**3 */
1215         felem_assign(ftmp6, y1);
1216     }
1217
1218     /* u2 = x2*z1z1 */
1219     felem_mul(tmp, x2, ftmp);
1220     /* tmp[i] < 17*2^120 */
1221
1222     /* h = ftmp4 = u2 - u1 */
1223     felem_diff_128_64(tmp, ftmp3);
1224     /* tmp[i] < 17*2^120 + 2^63 */
1225     felem_reduce(ftmp4, tmp);
1226
1227     x_equal = felem_is_zero(ftmp4);
1228
1229     /* z_out = ftmp5 * h */
1230     felem_mul(tmp, ftmp5, ftmp4);
1231     felem_reduce(z_out, tmp);
1232
1233     /* ftmp = z1 * z1z1 */
1234     felem_mul(tmp, ftmp, z1);
1235     felem_reduce(ftmp, tmp);
1236
1237     /* s2 = tmp = y2 * z1**3 */
1238     felem_mul(tmp, y2, ftmp);
1239     /* tmp[i] < 17*2^120 */
1240
1241     /* r = ftmp5 = (s2 - s1)*2 */
1242     felem_diff_128_64(tmp, ftmp6);
1243     /* tmp[i] < 17*2^120 + 2^63 */
1244     felem_reduce(ftmp5, tmp);
1245     y_equal = felem_is_zero(ftmp5);
1246     felem_scalar64(ftmp5, 2);
1247     /* ftmp5[i] < 2^61 */
1248
1249     /*
1250      * The formulae are incorrect if the points are equal, in affine coordinates
1251      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1252      * happens.
1253      *
1254      * We use bitwise operations to avoid potential side-channels introduced by
1255      * the short-circuiting behaviour of boolean operators.
1256      *
1257      * The special case of either point being the point at infinity (z1 and/or
1258      * z2 are zero), is handled separately later on in this function, so we
1259      * avoid jumping to point_double here in those special cases.
1260      *
1261      * Notice the comment below on the implications of this branching for timing
1262      * leaks and why it is considered practically irrelevant.
1263      */
1264     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1265
1266     if (points_equal) {
1267         /*
1268          * This is obviously not constant-time but it will almost-never happen
1269          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1270          * where the intermediate value gets very close to the group order.
1271          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1272          * the scalar, it's possible for the intermediate value to be a small
1273          * negative multiple of the base point, and for the final signed digit
1274          * to be the same value. We believe that this only occurs for the scalar
1275          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1276          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1277          * 71e913863f7, in that case the penultimate intermediate is -9G and
1278          * the final digit is also -9G. Since this only happens for a single
1279          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1280          * check whether a secret scalar was that exact value, can already do
1281          * so.)
1282          */
1283         point_double(x3, y3, z3, x1, y1, z1);
1284         return;
1285     }
1286
1287     /* I = ftmp = (2h)**2 */
1288     felem_assign(ftmp, ftmp4);
1289     felem_scalar64(ftmp, 2);
1290     /* ftmp[i] < 2^61 */
1291     felem_square(tmp, ftmp);
1292     /* tmp[i] < 17*2^122 */
1293     felem_reduce(ftmp, tmp);
1294
1295     /* J = ftmp2 = h * I */
1296     felem_mul(tmp, ftmp4, ftmp);
1297     felem_reduce(ftmp2, tmp);
1298
1299     /* V = ftmp4 = U1 * I */
1300     felem_mul(tmp, ftmp3, ftmp);
1301     felem_reduce(ftmp4, tmp);
1302
1303     /* x_out = r**2 - J - 2V */
1304     felem_square(tmp, ftmp5);
1305     /* tmp[i] < 17*2^122 */
1306     felem_diff_128_64(tmp, ftmp2);
1307     /* tmp[i] < 17*2^122 + 2^63 */
1308     felem_assign(ftmp3, ftmp4);
1309     felem_scalar64(ftmp4, 2);
1310     /* ftmp4[i] < 2^61 */
1311     felem_diff_128_64(tmp, ftmp4);
1312     /* tmp[i] < 17*2^122 + 2^64 */
1313     felem_reduce(x_out, tmp);
1314
1315     /* y_out = r(V-x_out) - 2 * s1 * J */
1316     felem_diff64(ftmp3, x_out);
1317     /*
1318      * ftmp3[i] < 2^60 + 2^60 = 2^61
1319      */
1320     felem_mul(tmp, ftmp5, ftmp3);
1321     /* tmp[i] < 17*2^122 */
1322     felem_mul(tmp2, ftmp6, ftmp2);
1323     /* tmp2[i] < 17*2^120 */
1324     felem_scalar128(tmp2, 2);
1325     /* tmp2[i] < 17*2^121 */
1326     felem_diff128(tmp, tmp2);
1327         /*-
1328          * tmp[i] < 2^127 - 2^69 + 17*2^122
1329          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1330          *        < 2^127
1331          */
1332     felem_reduce(y_out, tmp);
1333
1334     copy_conditional(x_out, x2, z1_is_zero);
1335     copy_conditional(x_out, x1, z2_is_zero);
1336     copy_conditional(y_out, y2, z1_is_zero);
1337     copy_conditional(y_out, y1, z2_is_zero);
1338     copy_conditional(z_out, z2, z1_is_zero);
1339     copy_conditional(z_out, z1, z2_is_zero);
1340     felem_assign(x3, x_out);
1341     felem_assign(y3, y_out);
1342     felem_assign(z3, z_out);
1343 }
1344
1345 /*-
1346  * Base point pre computation
1347  * --------------------------
1348  *
1349  * Two different sorts of precomputed tables are used in the following code.
1350  * Each contain various points on the curve, where each point is three field
1351  * elements (x, y, z).
1352  *
1353  * For the base point table, z is usually 1 (0 for the point at infinity).
1354  * This table has 16 elements:
1355  * index | bits    | point
1356  * ------+---------+------------------------------
1357  *     0 | 0 0 0 0 | 0G
1358  *     1 | 0 0 0 1 | 1G
1359  *     2 | 0 0 1 0 | 2^130G
1360  *     3 | 0 0 1 1 | (2^130 + 1)G
1361  *     4 | 0 1 0 0 | 2^260G
1362  *     5 | 0 1 0 1 | (2^260 + 1)G
1363  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1364  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1365  *     8 | 1 0 0 0 | 2^390G
1366  *     9 | 1 0 0 1 | (2^390 + 1)G
1367  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1368  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1369  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1370  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1371  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1372  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1373  *
1374  * The reason for this is so that we can clock bits into four different
1375  * locations when doing simple scalar multiplies against the base point.
1376  *
1377  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1378
1379 /* gmul is the table of precomputed base points */
1380 static const felem gmul[16][3] = {
1381 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1382  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1383  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1384 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1385   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1386   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1387  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1388   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1389   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1390  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1391 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1392   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1393   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1394  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1395   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1396   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1397  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1398 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1399   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1400   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1401  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1402   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1403   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1404  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1405 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1406   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1407   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1408  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1409   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1410   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1411  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1412 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1413   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1414   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1415  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1416   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1417   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1418  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1419 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1420   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1421   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1422  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1423   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1424   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1425  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1426 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1427   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1428   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1429  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1430   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1431   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1432  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1433 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1434   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1435   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1436  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1437   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1438   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1439  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1440 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1441   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1442   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1443  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1444   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1445   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1446  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1447 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1448   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1449   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1450  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1451   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1452   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1453  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1454 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1455   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1456   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1457  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1458   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1459   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1460  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1461 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1462   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1463   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1464  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1465   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1466   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1467  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1468 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1469   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1470   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1471  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1472   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1473   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1474  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1475 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1476   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1477   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1478  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1479   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1480   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1481  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1482 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1483   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1484   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1485  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1486   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1487   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1488  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1489 };
1490
1491 /*
1492  * select_point selects the |idx|th point from a precomputation table and
1493  * copies it to out.
1494  */
1495  /* pre_comp below is of the size provided in |size| */
1496 static void select_point(const limb idx, unsigned int size,
1497                          const felem pre_comp[][3], felem out[3])
1498 {
1499     unsigned i, j;
1500     limb *outlimbs = &out[0][0];
1501
1502     memset(out, 0, sizeof(*out) * 3);
1503
1504     for (i = 0; i < size; i++) {
1505         const limb *inlimbs = &pre_comp[i][0][0];
1506         limb mask = i ^ idx;
1507         mask |= mask >> 4;
1508         mask |= mask >> 2;
1509         mask |= mask >> 1;
1510         mask &= 1;
1511         mask--;
1512         for (j = 0; j < NLIMBS * 3; j++)
1513             outlimbs[j] |= inlimbs[j] & mask;
1514     }
1515 }
1516
1517 /* get_bit returns the |i|th bit in |in| */
1518 static char get_bit(const felem_bytearray in, int i)
1519 {
1520     if (i < 0)
1521         return 0;
1522     return (in[i >> 3] >> (i & 7)) & 1;
1523 }
1524
1525 /*
1526  * Interleaved point multiplication using precomputed point multiples: The
1527  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1528  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1529  * generator, using certain (large) precomputed multiples in g_pre_comp.
1530  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1531  */
1532 static void batch_mul(felem x_out, felem y_out, felem z_out,
1533                       const felem_bytearray scalars[],
1534                       const unsigned num_points, const u8 *g_scalar,
1535                       const int mixed, const felem pre_comp[][17][3],
1536                       const felem g_pre_comp[16][3])
1537 {
1538     int i, skip;
1539     unsigned num, gen_mul = (g_scalar != NULL);
1540     felem nq[3], tmp[4];
1541     limb bits;
1542     u8 sign, digit;
1543
1544     /* set nq to the point at infinity */
1545     memset(nq, 0, sizeof(nq));
1546
1547     /*
1548      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1549      * of the generator (last quarter of rounds) and additions of other
1550      * points multiples (every 5th round).
1551      */
1552     skip = 1;                   /* save two point operations in the first
1553                                  * round */
1554     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1555         /* double */
1556         if (!skip)
1557             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1558
1559         /* add multiples of the generator */
1560         if (gen_mul && (i <= 130)) {
1561             bits = get_bit(g_scalar, i + 390) << 3;
1562             if (i < 130) {
1563                 bits |= get_bit(g_scalar, i + 260) << 2;
1564                 bits |= get_bit(g_scalar, i + 130) << 1;
1565                 bits |= get_bit(g_scalar, i);
1566             }
1567             /* select the point to add, in constant time */
1568             select_point(bits, 16, g_pre_comp, tmp);
1569             if (!skip) {
1570                 /* The 1 argument below is for "mixed" */
1571                 point_add(nq[0], nq[1], nq[2],
1572                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1573             } else {
1574                 memcpy(nq, tmp, 3 * sizeof(felem));
1575                 skip = 0;
1576             }
1577         }
1578
1579         /* do other additions every 5 doublings */
1580         if (num_points && (i % 5 == 0)) {
1581             /* loop over all scalars */
1582             for (num = 0; num < num_points; ++num) {
1583                 bits = get_bit(scalars[num], i + 4) << 5;
1584                 bits |= get_bit(scalars[num], i + 3) << 4;
1585                 bits |= get_bit(scalars[num], i + 2) << 3;
1586                 bits |= get_bit(scalars[num], i + 1) << 2;
1587                 bits |= get_bit(scalars[num], i) << 1;
1588                 bits |= get_bit(scalars[num], i - 1);
1589                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1590
1591                 /*
1592                  * select the point to add or subtract, in constant time
1593                  */
1594                 select_point(digit, 17, pre_comp[num], tmp);
1595                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1596                                             * point */
1597                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1598
1599                 if (!skip) {
1600                     point_add(nq[0], nq[1], nq[2],
1601                               nq[0], nq[1], nq[2],
1602                               mixed, tmp[0], tmp[1], tmp[2]);
1603                 } else {
1604                     memcpy(nq, tmp, 3 * sizeof(felem));
1605                     skip = 0;
1606                 }
1607             }
1608         }
1609     }
1610     felem_assign(x_out, nq[0]);
1611     felem_assign(y_out, nq[1]);
1612     felem_assign(z_out, nq[2]);
1613 }
1614
1615 /* Precomputation for the group generator. */
1616 struct nistp521_pre_comp_st {
1617     felem g_pre_comp[16][3];
1618     CRYPTO_REF_COUNT references;
1619     CRYPTO_RWLOCK *lock;
1620 };
1621
1622 const EC_METHOD *EC_GFp_nistp521_method(void)
1623 {
1624     static const EC_METHOD ret = {
1625         EC_FLAGS_DEFAULT_OCT,
1626         NID_X9_62_prime_field,
1627         ec_GFp_nistp521_group_init,
1628         ec_GFp_simple_group_finish,
1629         ec_GFp_simple_group_clear_finish,
1630         ec_GFp_nist_group_copy,
1631         ec_GFp_nistp521_group_set_curve,
1632         ec_GFp_simple_group_get_curve,
1633         ec_GFp_simple_group_get_degree,
1634         ec_group_simple_order_bits,
1635         ec_GFp_simple_group_check_discriminant,
1636         ec_GFp_simple_point_init,
1637         ec_GFp_simple_point_finish,
1638         ec_GFp_simple_point_clear_finish,
1639         ec_GFp_simple_point_copy,
1640         ec_GFp_simple_point_set_to_infinity,
1641         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1642         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1643         ec_GFp_simple_point_set_affine_coordinates,
1644         ec_GFp_nistp521_point_get_affine_coordinates,
1645         0 /* point_set_compressed_coordinates */ ,
1646         0 /* point2oct */ ,
1647         0 /* oct2point */ ,
1648         ec_GFp_simple_add,
1649         ec_GFp_simple_dbl,
1650         ec_GFp_simple_invert,
1651         ec_GFp_simple_is_at_infinity,
1652         ec_GFp_simple_is_on_curve,
1653         ec_GFp_simple_cmp,
1654         ec_GFp_simple_make_affine,
1655         ec_GFp_simple_points_make_affine,
1656         ec_GFp_nistp521_points_mul,
1657         ec_GFp_nistp521_precompute_mult,
1658         ec_GFp_nistp521_have_precompute_mult,
1659         ec_GFp_nist_field_mul,
1660         ec_GFp_nist_field_sqr,
1661         0 /* field_div */ ,
1662         ec_GFp_simple_field_inv,
1663         0 /* field_encode */ ,
1664         0 /* field_decode */ ,
1665         0,                      /* field_set_to_one */
1666         ec_key_simple_priv2oct,
1667         ec_key_simple_oct2priv,
1668         0, /* set private */
1669         ec_key_simple_generate_key,
1670         ec_key_simple_check_key,
1671         ec_key_simple_generate_public_key,
1672         0, /* keycopy */
1673         0, /* keyfinish */
1674         ecdh_simple_compute_key,
1675         ecdsa_simple_sign_setup,
1676         ecdsa_simple_sign_sig,
1677         ecdsa_simple_verify_sig,
1678         0, /* field_inverse_mod_ord */
1679         0, /* blind_coordinates */
1680         0, /* ladder_pre */
1681         0, /* ladder_step */
1682         0  /* ladder_post */
1683     };
1684
1685     return &ret;
1686 }
1687
1688 /******************************************************************************/
1689 /*
1690  * FUNCTIONS TO MANAGE PRECOMPUTATION
1691  */
1692
1693 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1694 {
1695     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1696
1697     if (ret == NULL) {
1698         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1699         return ret;
1700     }
1701
1702     ret->references = 1;
1703
1704     ret->lock = CRYPTO_THREAD_lock_new();
1705     if (ret->lock == NULL) {
1706         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1707         OPENSSL_free(ret);
1708         return NULL;
1709     }
1710     return ret;
1711 }
1712
1713 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1714 {
1715     int i;
1716     if (p != NULL)
1717         CRYPTO_UP_REF(&p->references, &i, p->lock);
1718     return p;
1719 }
1720
1721 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1722 {
1723     int i;
1724
1725     if (p == NULL)
1726         return;
1727
1728     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1729     REF_PRINT_COUNT("EC_nistp521", x);
1730     if (i > 0)
1731         return;
1732     REF_ASSERT_ISNT(i < 0);
1733
1734     CRYPTO_THREAD_lock_free(p->lock);
1735     OPENSSL_free(p);
1736 }
1737
1738 /******************************************************************************/
1739 /*
1740  * OPENSSL EC_METHOD FUNCTIONS
1741  */
1742
1743 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1744 {
1745     int ret;
1746     ret = ec_GFp_simple_group_init(group);
1747     group->a_is_minus3 = 1;
1748     return ret;
1749 }
1750
1751 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1752                                     const BIGNUM *a, const BIGNUM *b,
1753                                     BN_CTX *ctx)
1754 {
1755     int ret = 0;
1756     BIGNUM *curve_p, *curve_a, *curve_b;
1757 #ifndef FIPS_MODE
1758     BN_CTX *new_ctx = NULL;
1759
1760     if (ctx == NULL)
1761         ctx = new_ctx = BN_CTX_new();
1762 #endif
1763     if (ctx == NULL)
1764         return 0;
1765
1766     BN_CTX_start(ctx);
1767     curve_p = BN_CTX_get(ctx);
1768     curve_a = BN_CTX_get(ctx);
1769     curve_b = BN_CTX_get(ctx);
1770     if (curve_b == NULL)
1771         goto err;
1772     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1773     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1774     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1775     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1776         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1777               EC_R_WRONG_CURVE_PARAMETERS);
1778         goto err;
1779     }
1780     group->field_mod_func = BN_nist_mod_521;
1781     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1782  err:
1783     BN_CTX_end(ctx);
1784 #ifndef FIPS_MODE
1785     BN_CTX_free(new_ctx);
1786 #endif
1787     return ret;
1788 }
1789
1790 /*
1791  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1792  * (X/Z^2, Y/Z^3)
1793  */
1794 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1795                                                  const EC_POINT *point,
1796                                                  BIGNUM *x, BIGNUM *y,
1797                                                  BN_CTX *ctx)
1798 {
1799     felem z1, z2, x_in, y_in, x_out, y_out;
1800     largefelem tmp;
1801
1802     if (EC_POINT_is_at_infinity(group, point)) {
1803         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1804               EC_R_POINT_AT_INFINITY);
1805         return 0;
1806     }
1807     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1808         (!BN_to_felem(z1, point->Z)))
1809         return 0;
1810     felem_inv(z2, z1);
1811     felem_square(tmp, z2);
1812     felem_reduce(z1, tmp);
1813     felem_mul(tmp, x_in, z1);
1814     felem_reduce(x_in, tmp);
1815     felem_contract(x_out, x_in);
1816     if (x != NULL) {
1817         if (!felem_to_BN(x, x_out)) {
1818             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1819                   ERR_R_BN_LIB);
1820             return 0;
1821         }
1822     }
1823     felem_mul(tmp, z1, z2);
1824     felem_reduce(z1, tmp);
1825     felem_mul(tmp, y_in, z1);
1826     felem_reduce(y_in, tmp);
1827     felem_contract(y_out, y_in);
1828     if (y != NULL) {
1829         if (!felem_to_BN(y, y_out)) {
1830             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1831                   ERR_R_BN_LIB);
1832             return 0;
1833         }
1834     }
1835     return 1;
1836 }
1837
1838 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1839 static void make_points_affine(size_t num, felem points[][3],
1840                                felem tmp_felems[])
1841 {
1842     /*
1843      * Runs in constant time, unless an input is the point at infinity (which
1844      * normally shouldn't happen).
1845      */
1846     ec_GFp_nistp_points_make_affine_internal(num,
1847                                              points,
1848                                              sizeof(felem),
1849                                              tmp_felems,
1850                                              (void (*)(void *))felem_one,
1851                                              felem_is_zero_int,
1852                                              (void (*)(void *, const void *))
1853                                              felem_assign,
1854                                              (void (*)(void *, const void *))
1855                                              felem_square_reduce, (void (*)
1856                                                                    (void *,
1857                                                                     const void
1858                                                                     *,
1859                                                                     const void
1860                                                                     *))
1861                                              felem_mul_reduce,
1862                                              (void (*)(void *, const void *))
1863                                              felem_inv,
1864                                              (void (*)(void *, const void *))
1865                                              felem_contract);
1866 }
1867
1868 /*
1869  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1870  * values Result is stored in r (r can equal one of the inputs).
1871  */
1872 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1873                                const BIGNUM *scalar, size_t num,
1874                                const EC_POINT *points[],
1875                                const BIGNUM *scalars[], BN_CTX *ctx)
1876 {
1877     int ret = 0;
1878     int j;
1879     int mixed = 0;
1880     BIGNUM *x, *y, *z, *tmp_scalar;
1881     felem_bytearray g_secret;
1882     felem_bytearray *secrets = NULL;
1883     felem (*pre_comp)[17][3] = NULL;
1884     felem *tmp_felems = NULL;
1885     unsigned i;
1886     int num_bytes;
1887     int have_pre_comp = 0;
1888     size_t num_points = num;
1889     felem x_in, y_in, z_in, x_out, y_out, z_out;
1890     NISTP521_PRE_COMP *pre = NULL;
1891     felem(*g_pre_comp)[3] = NULL;
1892     EC_POINT *generator = NULL;
1893     const EC_POINT *p = NULL;
1894     const BIGNUM *p_scalar = NULL;
1895
1896     BN_CTX_start(ctx);
1897     x = BN_CTX_get(ctx);
1898     y = BN_CTX_get(ctx);
1899     z = BN_CTX_get(ctx);
1900     tmp_scalar = BN_CTX_get(ctx);
1901     if (tmp_scalar == NULL)
1902         goto err;
1903
1904     if (scalar != NULL) {
1905         pre = group->pre_comp.nistp521;
1906         if (pre)
1907             /* we have precomputation, try to use it */
1908             g_pre_comp = &pre->g_pre_comp[0];
1909         else
1910             /* try to use the standard precomputation */
1911             g_pre_comp = (felem(*)[3]) gmul;
1912         generator = EC_POINT_new(group);
1913         if (generator == NULL)
1914             goto err;
1915         /* get the generator from precomputation */
1916         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1917             !felem_to_BN(y, g_pre_comp[1][1]) ||
1918             !felem_to_BN(z, g_pre_comp[1][2])) {
1919             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1920             goto err;
1921         }
1922         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1923                                                       generator, x, y, z,
1924                                                       ctx))
1925             goto err;
1926         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1927             /* precomputation matches generator */
1928             have_pre_comp = 1;
1929         else
1930             /*
1931              * we don't have valid precomputation: treat the generator as a
1932              * random point
1933              */
1934             num_points++;
1935     }
1936
1937     if (num_points > 0) {
1938         if (num_points >= 2) {
1939             /*
1940              * unless we precompute multiples for just one point, converting
1941              * those into affine form is time well spent
1942              */
1943             mixed = 1;
1944         }
1945         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1946         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1947         if (mixed)
1948             tmp_felems =
1949                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1950         if ((secrets == NULL) || (pre_comp == NULL)
1951             || (mixed && (tmp_felems == NULL))) {
1952             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1953             goto err;
1954         }
1955
1956         /*
1957          * we treat NULL scalars as 0, and NULL points as points at infinity,
1958          * i.e., they contribute nothing to the linear combination
1959          */
1960         for (i = 0; i < num_points; ++i) {
1961             if (i == num) {
1962                 /*
1963                  * we didn't have a valid precomputation, so we pick the
1964                  * generator
1965                  */
1966                 p = EC_GROUP_get0_generator(group);
1967                 p_scalar = scalar;
1968             } else {
1969                 /* the i^th point */
1970                 p = points[i];
1971                 p_scalar = scalars[i];
1972             }
1973             if ((p_scalar != NULL) && (p != NULL)) {
1974                 /* reduce scalar to 0 <= scalar < 2^521 */
1975                 if ((BN_num_bits(p_scalar) > 521)
1976                     || (BN_is_negative(p_scalar))) {
1977                     /*
1978                      * this is an unusual input, and we don't guarantee
1979                      * constant-timeness
1980                      */
1981                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1982                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1983                         goto err;
1984                     }
1985                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1986                                                secrets[i], sizeof(secrets[i]));
1987                 } else {
1988                     num_bytes = BN_bn2lebinpad(p_scalar,
1989                                                secrets[i], sizeof(secrets[i]));
1990                 }
1991                 if (num_bytes < 0) {
1992                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1993                     goto err;
1994                 }
1995                 /* precompute multiples */
1996                 if ((!BN_to_felem(x_out, p->X)) ||
1997                     (!BN_to_felem(y_out, p->Y)) ||
1998                     (!BN_to_felem(z_out, p->Z)))
1999                     goto err;
2000                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
2001                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
2002                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
2003                 for (j = 2; j <= 16; ++j) {
2004                     if (j & 1) {
2005                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
2006                                   pre_comp[i][j][2], pre_comp[i][1][0],
2007                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
2008                                   pre_comp[i][j - 1][0],
2009                                   pre_comp[i][j - 1][1],
2010                                   pre_comp[i][j - 1][2]);
2011                     } else {
2012                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2013                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2014                                      pre_comp[i][j / 2][1],
2015                                      pre_comp[i][j / 2][2]);
2016                     }
2017                 }
2018             }
2019         }
2020         if (mixed)
2021             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2022     }
2023
2024     /* the scalar for the generator */
2025     if ((scalar != NULL) && (have_pre_comp)) {
2026         memset(g_secret, 0, sizeof(g_secret));
2027         /* reduce scalar to 0 <= scalar < 2^521 */
2028         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2029             /*
2030              * this is an unusual input, and we don't guarantee
2031              * constant-timeness
2032              */
2033             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2034                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2035                 goto err;
2036             }
2037             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2038         } else {
2039             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2040         }
2041         /* do the multiplication with generator precomputation */
2042         batch_mul(x_out, y_out, z_out,
2043                   (const felem_bytearray(*))secrets, num_points,
2044                   g_secret,
2045                   mixed, (const felem(*)[17][3])pre_comp,
2046                   (const felem(*)[3])g_pre_comp);
2047     } else {
2048         /* do the multiplication without generator precomputation */
2049         batch_mul(x_out, y_out, z_out,
2050                   (const felem_bytearray(*))secrets, num_points,
2051                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2052     }
2053     /* reduce the output to its unique minimal representation */
2054     felem_contract(x_in, x_out);
2055     felem_contract(y_in, y_out);
2056     felem_contract(z_in, z_out);
2057     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2058         (!felem_to_BN(z, z_in))) {
2059         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2060         goto err;
2061     }
2062     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2063
2064  err:
2065     BN_CTX_end(ctx);
2066     EC_POINT_free(generator);
2067     OPENSSL_free(secrets);
2068     OPENSSL_free(pre_comp);
2069     OPENSSL_free(tmp_felems);
2070     return ret;
2071 }
2072
2073 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2074 {
2075     int ret = 0;
2076     NISTP521_PRE_COMP *pre = NULL;
2077     int i, j;
2078     BIGNUM *x, *y;
2079     EC_POINT *generator = NULL;
2080     felem tmp_felems[16];
2081 #ifndef FIPS_MODE
2082     BN_CTX *new_ctx = NULL;
2083 #endif
2084
2085     /* throw away old precomputation */
2086     EC_pre_comp_free(group);
2087
2088 #ifndef FIPS_MODE
2089     if (ctx == NULL)
2090         ctx = new_ctx = BN_CTX_new();
2091 #endif
2092     if (ctx == NULL)
2093         return 0;
2094
2095     BN_CTX_start(ctx);
2096     x = BN_CTX_get(ctx);
2097     y = BN_CTX_get(ctx);
2098     if (y == NULL)
2099         goto err;
2100     /* get the generator */
2101     if (group->generator == NULL)
2102         goto err;
2103     generator = EC_POINT_new(group);
2104     if (generator == NULL)
2105         goto err;
2106     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2107     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2108     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2109         goto err;
2110     if ((pre = nistp521_pre_comp_new()) == NULL)
2111         goto err;
2112     /*
2113      * if the generator is the standard one, use built-in precomputation
2114      */
2115     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2116         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2117         goto done;
2118     }
2119     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2120         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2121         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2122         goto err;
2123     /* compute 2^130*G, 2^260*G, 2^390*G */
2124     for (i = 1; i <= 4; i <<= 1) {
2125         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2126                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2127                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2128         for (j = 0; j < 129; ++j) {
2129             point_double(pre->g_pre_comp[2 * i][0],
2130                          pre->g_pre_comp[2 * i][1],
2131                          pre->g_pre_comp[2 * i][2],
2132                          pre->g_pre_comp[2 * i][0],
2133                          pre->g_pre_comp[2 * i][1],
2134                          pre->g_pre_comp[2 * i][2]);
2135         }
2136     }
2137     /* g_pre_comp[0] is the point at infinity */
2138     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2139     /* the remaining multiples */
2140     /* 2^130*G + 2^260*G */
2141     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2142               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2143               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2144               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2145               pre->g_pre_comp[2][2]);
2146     /* 2^130*G + 2^390*G */
2147     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2148               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2149               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2150               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2151               pre->g_pre_comp[2][2]);
2152     /* 2^260*G + 2^390*G */
2153     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2154               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2155               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2156               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2157               pre->g_pre_comp[4][2]);
2158     /* 2^130*G + 2^260*G + 2^390*G */
2159     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2160               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2161               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2162               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2163               pre->g_pre_comp[2][2]);
2164     for (i = 1; i < 8; ++i) {
2165         /* odd multiples: add G */
2166         point_add(pre->g_pre_comp[2 * i + 1][0],
2167                   pre->g_pre_comp[2 * i + 1][1],
2168                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2169                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2170                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2171                   pre->g_pre_comp[1][2]);
2172     }
2173     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2174
2175  done:
2176     SETPRECOMP(group, nistp521, pre);
2177     ret = 1;
2178     pre = NULL;
2179  err:
2180     BN_CTX_end(ctx);
2181     EC_POINT_free(generator);
2182 #ifndef FIPS_MODE
2183     BN_CTX_free(new_ctx);
2184 #endif
2185     EC_nistp521_pre_comp_free(pre);
2186     return ret;
2187 }
2188
2189 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2190 {
2191     return HAVEPRECOMP(group, nistp521);
2192 }