crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/e_os2.h>
  41
  42 #include <string.h>
  43 #include <openssl/err.h>
  44 #include "ec_local.h"
  45
  46 #include "internal/numbers.h"
  47
  48 #ifndef INT128_MAX
  49 # error "Your compiler doesn't appear to support 128-bit integer types"
  50 #endif
  51
  52 typedef uint8_t u8;
  53 typedef uint64_t u64;
  54
  55 /*
  56  * The underlying field. P521 operates over GF(2^521-1). We can serialize an
  57  * element of this field into 66 bytes where the most significant byte
  58  * contains only a single bit. We call this an felem_bytearray.
  59  */
  60
  61 typedef u8 felem_bytearray[66];
  62
  63 /*
  64  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  65  * These values are big-endian.
  66  */
  67 static const felem_bytearray nistp521_curve_params[5] = {
  68     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff},
  77     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xfc},
  86     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  87      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  88      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  89      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  90      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  91      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  92      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  93      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  94      0x3f, 0x00},
  95     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  96      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  97      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  98      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  99      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 100      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 101      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 102      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 103      0xbd, 0x66},
 104     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 105      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 106      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 107      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 108      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 109      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 110      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 111      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 112      0x66, 0x50}
 113 };
 114
 115 /*-
 116  * The representation of field elements.
 117  * ------------------------------------
 118  *
 119  * We represent field elements with nine values. These values are either 64 or
 120  * 128 bits and the field element represented is:
 121  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 122  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 123  * 58 bits apart, but are greater than 58 bits in length, the most significant
 124  * bits of each limb overlap with the least significant bits of the next.
 125  *
 126  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 127  * 'largefelem' */
 128
 129 #define NLIMBS 9
 130
 131 typedef uint64_t limb;
 132 typedef limb limb_aX __attribute((__aligned__(1)));
 133 typedef limb felem[NLIMBS];
 134 typedef uint128_t largefelem[NLIMBS];
 135
 136 static const limb bottom57bits = 0x1ffffffffffffff;
 137 static const limb bottom58bits = 0x3ffffffffffffff;
 138
 139 /*
 140  * bin66_to_felem takes a little-endian byte array and converts it into felem
 141  * form. This assumes that the CPU is little-endian.
 142  */
 143 static void bin66_to_felem(felem out, const u8 in[66])
 144 {
 145     out[0] = (*((limb *) & in[0])) & bottom58bits;
 146     out[1] = (*((limb_aX *) & in[7]) >> 2) & bottom58bits;
 147     out[2] = (*((limb_aX *) & in[14]) >> 4) & bottom58bits;
 148     out[3] = (*((limb_aX *) & in[21]) >> 6) & bottom58bits;
 149     out[4] = (*((limb_aX *) & in[29])) & bottom58bits;
 150     out[5] = (*((limb_aX *) & in[36]) >> 2) & bottom58bits;
 151     out[6] = (*((limb_aX *) & in[43]) >> 4) & bottom58bits;
 152     out[7] = (*((limb_aX *) & in[50]) >> 6) & bottom58bits;
 153     out[8] = (*((limb_aX *) & in[58])) & bottom57bits;
 154 }
 155
 156 /*
 157  * felem_to_bin66 takes an felem and serializes into a little endian, 66 byte
 158  * array. This assumes that the CPU is little-endian.
 159  */
 160 static void felem_to_bin66(u8 out[66], const felem in)
 161 {
 162     memset(out, 0, 66);
 163     (*((limb *) & out[0])) = in[0];
 164     (*((limb_aX *) & out[7])) |= in[1] << 2;
 165     (*((limb_aX *) & out[14])) |= in[2] << 4;
 166     (*((limb_aX *) & out[21])) |= in[3] << 6;
 167     (*((limb_aX *) & out[29])) = in[4];
 168     (*((limb_aX *) & out[36])) |= in[5] << 2;
 169     (*((limb_aX *) & out[43])) |= in[6] << 4;
 170     (*((limb_aX *) & out[50])) |= in[7] << 6;
 171     (*((limb_aX *) & out[58])) = in[8];
 172 }
 173
 174 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 175 static int BN_to_felem(felem out, const BIGNUM *bn)
 176 {
 177     felem_bytearray b_out;
 178     int num_bytes;
 179
 180     if (BN_is_negative(bn)) {
 181         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
 182         return 0;
 183     }
 184     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 185     if (num_bytes < 0) {
 186         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
 187         return 0;
 188     }
 189     bin66_to_felem(out, b_out);
 190     return 1;
 191 }
 192
 193 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 194 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 195 {
 196     felem_bytearray b_out;
 197     felem_to_bin66(b_out, in);
 198     return BN_lebin2bn(b_out, sizeof(b_out), out);
 199 }
 200
 201 /*-
 202  * Field operations
 203  * ----------------
 204  */
 205
 206 static void felem_one(felem out)
 207 {
 208     out[0] = 1;
 209     out[1] = 0;
 210     out[2] = 0;
 211     out[3] = 0;
 212     out[4] = 0;
 213     out[5] = 0;
 214     out[6] = 0;
 215     out[7] = 0;
 216     out[8] = 0;
 217 }
 218
 219 static void felem_assign(felem out, const felem in)
 220 {
 221     out[0] = in[0];
 222     out[1] = in[1];
 223     out[2] = in[2];
 224     out[3] = in[3];
 225     out[4] = in[4];
 226     out[5] = in[5];
 227     out[6] = in[6];
 228     out[7] = in[7];
 229     out[8] = in[8];
 230 }
 231
 232 /* felem_sum64 sets out = out + in. */
 233 static void felem_sum64(felem out, const felem in)
 234 {
 235     out[0] += in[0];
 236     out[1] += in[1];
 237     out[2] += in[2];
 238     out[3] += in[3];
 239     out[4] += in[4];
 240     out[5] += in[5];
 241     out[6] += in[6];
 242     out[7] += in[7];
 243     out[8] += in[8];
 244 }
 245
 246 /* felem_scalar sets out = in * scalar */
 247 static void felem_scalar(felem out, const felem in, limb scalar)
 248 {
 249     out[0] = in[0] * scalar;
 250     out[1] = in[1] * scalar;
 251     out[2] = in[2] * scalar;
 252     out[3] = in[3] * scalar;
 253     out[4] = in[4] * scalar;
 254     out[5] = in[5] * scalar;
 255     out[6] = in[6] * scalar;
 256     out[7] = in[7] * scalar;
 257     out[8] = in[8] * scalar;
 258 }
 259
 260 /* felem_scalar64 sets out = out * scalar */
 261 static void felem_scalar64(felem out, limb scalar)
 262 {
 263     out[0] *= scalar;
 264     out[1] *= scalar;
 265     out[2] *= scalar;
 266     out[3] *= scalar;
 267     out[4] *= scalar;
 268     out[5] *= scalar;
 269     out[6] *= scalar;
 270     out[7] *= scalar;
 271     out[8] *= scalar;
 272 }
 273
 274 /* felem_scalar128 sets out = out * scalar */
 275 static void felem_scalar128(largefelem out, limb scalar)
 276 {
 277     out[0] *= scalar;
 278     out[1] *= scalar;
 279     out[2] *= scalar;
 280     out[3] *= scalar;
 281     out[4] *= scalar;
 282     out[5] *= scalar;
 283     out[6] *= scalar;
 284     out[7] *= scalar;
 285     out[8] *= scalar;
 286 }
 287
 288 /*-
 289  * felem_neg sets |out| to |-in|
 290  * On entry:
 291  *   in[i] < 2^59 + 2^14
 292  * On exit:
 293  *   out[i] < 2^62
 294  */
 295 static void felem_neg(felem out, const felem in)
 296 {
 297     /* In order to prevent underflow, we subtract from 0 mod p. */
 298     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 299     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 300
 301     out[0] = two62m3 - in[0];
 302     out[1] = two62m2 - in[1];
 303     out[2] = two62m2 - in[2];
 304     out[3] = two62m2 - in[3];
 305     out[4] = two62m2 - in[4];
 306     out[5] = two62m2 - in[5];
 307     out[6] = two62m2 - in[6];
 308     out[7] = two62m2 - in[7];
 309     out[8] = two62m2 - in[8];
 310 }
 311
 312 /*-
 313  * felem_diff64 subtracts |in| from |out|
 314  * On entry:
 315  *   in[i] < 2^59 + 2^14
 316  * On exit:
 317  *   out[i] < out[i] + 2^62
 318  */
 319 static void felem_diff64(felem out, const felem in)
 320 {
 321     /*
 322      * In order to prevent underflow, we add 0 mod p before subtracting.
 323      */
 324     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 325     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 326
 327     out[0] += two62m3 - in[0];
 328     out[1] += two62m2 - in[1];
 329     out[2] += two62m2 - in[2];
 330     out[3] += two62m2 - in[3];
 331     out[4] += two62m2 - in[4];
 332     out[5] += two62m2 - in[5];
 333     out[6] += two62m2 - in[6];
 334     out[7] += two62m2 - in[7];
 335     out[8] += two62m2 - in[8];
 336 }
 337
 338 /*-
 339  * felem_diff_128_64 subtracts |in| from |out|
 340  * On entry:
 341  *   in[i] < 2^62 + 2^17
 342  * On exit:
 343  *   out[i] < out[i] + 2^63
 344  */
 345 static void felem_diff_128_64(largefelem out, const felem in)
 346 {
 347     /*
 348      * In order to prevent underflow, we add 64p mod p (which is equivalent
 349      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 350      * digit number with all bits set to 1. See "The representation of field
 351      * elements" comment above for a description of how limbs are used to
 352      * represent a number. 64p is represented with 8 limbs containing a number
 353      * with 58 bits set and one limb with a number with 57 bits set.
 354      */
 355     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 356     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 357
 358     out[0] += two63m6 - in[0];
 359     out[1] += two63m5 - in[1];
 360     out[2] += two63m5 - in[2];
 361     out[3] += two63m5 - in[3];
 362     out[4] += two63m5 - in[4];
 363     out[5] += two63m5 - in[5];
 364     out[6] += two63m5 - in[6];
 365     out[7] += two63m5 - in[7];
 366     out[8] += two63m5 - in[8];
 367 }
 368
 369 /*-
 370  * felem_diff_128_64 subtracts |in| from |out|
 371  * On entry:
 372  *   in[i] < 2^126
 373  * On exit:
 374  *   out[i] < out[i] + 2^127 - 2^69
 375  */
 376 static void felem_diff128(largefelem out, const largefelem in)
 377 {
 378     /*
 379      * In order to prevent underflow, we add 0 mod p before subtracting.
 380      */
 381     static const uint128_t two127m70 =
 382         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 383     static const uint128_t two127m69 =
 384         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 385
 386     out[0] += (two127m70 - in[0]);
 387     out[1] += (two127m69 - in[1]);
 388     out[2] += (two127m69 - in[2]);
 389     out[3] += (two127m69 - in[3]);
 390     out[4] += (two127m69 - in[4]);
 391     out[5] += (two127m69 - in[5]);
 392     out[6] += (two127m69 - in[6]);
 393     out[7] += (two127m69 - in[7]);
 394     out[8] += (two127m69 - in[8]);
 395 }
 396
 397 /*-
 398  * felem_square sets |out| = |in|^2
 399  * On entry:
 400  *   in[i] < 2^62
 401  * On exit:
 402  *   out[i] < 17 * max(in[i]) * max(in[i])
 403  */
 404 static void felem_square(largefelem out, const felem in)
 405 {
 406     felem inx2, inx4;
 407     felem_scalar(inx2, in, 2);
 408     felem_scalar(inx4, in, 4);
 409
 410     /*-
 411      * We have many cases were we want to do
 412      *   in[x] * in[y] +
 413      *   in[y] * in[x]
 414      * This is obviously just
 415      *   2 * in[x] * in[y]
 416      * However, rather than do the doubling on the 128 bit result, we
 417      * double one of the inputs to the multiplication by reading from
 418      * |inx2|
 419      */
 420
 421     out[0] = ((uint128_t) in[0]) * in[0];
 422     out[1] = ((uint128_t) in[0]) * inx2[1];
 423     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 424     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 425     out[4] = ((uint128_t) in[0]) * inx2[4] +
 426              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 427     out[5] = ((uint128_t) in[0]) * inx2[5] +
 428              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 429     out[6] = ((uint128_t) in[0]) * inx2[6] +
 430              ((uint128_t) in[1]) * inx2[5] +
 431              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 432     out[7] = ((uint128_t) in[0]) * inx2[7] +
 433              ((uint128_t) in[1]) * inx2[6] +
 434              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 435     out[8] = ((uint128_t) in[0]) * inx2[8] +
 436              ((uint128_t) in[1]) * inx2[7] +
 437              ((uint128_t) in[2]) * inx2[6] +
 438              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 439
 440     /*
 441      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 442      * They correspond to locations one bit up from the limbs produced above
 443      * so we would have to multiply by two to align them. Again, rather than
 444      * operate on the 128-bit result, we double one of the inputs to the
 445      * multiplication. If we want to double for both this reason, and the
 446      * reason above, then we end up multiplying by four.
 447      */
 448
 449     /* 9 */
 450     out[0] += ((uint128_t) in[1]) * inx4[8] +
 451               ((uint128_t) in[2]) * inx4[7] +
 452               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 453
 454     /* 10 */
 455     out[1] += ((uint128_t) in[2]) * inx4[8] +
 456               ((uint128_t) in[3]) * inx4[7] +
 457               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 458
 459     /* 11 */
 460     out[2] += ((uint128_t) in[3]) * inx4[8] +
 461               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 462
 463     /* 12 */
 464     out[3] += ((uint128_t) in[4]) * inx4[8] +
 465               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 466
 467     /* 13 */
 468     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 469
 470     /* 14 */
 471     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 472
 473     /* 15 */
 474     out[6] += ((uint128_t) in[7]) * inx4[8];
 475
 476     /* 16 */
 477     out[7] += ((uint128_t) in[8]) * inx2[8];
 478 }
 479
 480 /*-
 481  * felem_mul sets |out| = |in1| * |in2|
 482  * On entry:
 483  *   in1[i] < 2^64
 484  *   in2[i] < 2^63
 485  * On exit:
 486  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 487  */
 488 static void felem_mul(largefelem out, const felem in1, const felem in2)
 489 {
 490     felem in2x2;
 491     felem_scalar(in2x2, in2, 2);
 492
 493     out[0] = ((uint128_t) in1[0]) * in2[0];
 494
 495     out[1] = ((uint128_t) in1[0]) * in2[1] +
 496              ((uint128_t) in1[1]) * in2[0];
 497
 498     out[2] = ((uint128_t) in1[0]) * in2[2] +
 499              ((uint128_t) in1[1]) * in2[1] +
 500              ((uint128_t) in1[2]) * in2[0];
 501
 502     out[3] = ((uint128_t) in1[0]) * in2[3] +
 503              ((uint128_t) in1[1]) * in2[2] +
 504              ((uint128_t) in1[2]) * in2[1] +
 505              ((uint128_t) in1[3]) * in2[0];
 506
 507     out[4] = ((uint128_t) in1[0]) * in2[4] +
 508              ((uint128_t) in1[1]) * in2[3] +
 509              ((uint128_t) in1[2]) * in2[2] +
 510              ((uint128_t) in1[3]) * in2[1] +
 511              ((uint128_t) in1[4]) * in2[0];
 512
 513     out[5] = ((uint128_t) in1[0]) * in2[5] +
 514              ((uint128_t) in1[1]) * in2[4] +
 515              ((uint128_t) in1[2]) * in2[3] +
 516              ((uint128_t) in1[3]) * in2[2] +
 517              ((uint128_t) in1[4]) * in2[1] +
 518              ((uint128_t) in1[5]) * in2[0];
 519
 520     out[6] = ((uint128_t) in1[0]) * in2[6] +
 521              ((uint128_t) in1[1]) * in2[5] +
 522              ((uint128_t) in1[2]) * in2[4] +
 523              ((uint128_t) in1[3]) * in2[3] +
 524              ((uint128_t) in1[4]) * in2[2] +
 525              ((uint128_t) in1[5]) * in2[1] +
 526              ((uint128_t) in1[6]) * in2[0];
 527
 528     out[7] = ((uint128_t) in1[0]) * in2[7] +
 529              ((uint128_t) in1[1]) * in2[6] +
 530              ((uint128_t) in1[2]) * in2[5] +
 531              ((uint128_t) in1[3]) * in2[4] +
 532              ((uint128_t) in1[4]) * in2[3] +
 533              ((uint128_t) in1[5]) * in2[2] +
 534              ((uint128_t) in1[6]) * in2[1] +
 535              ((uint128_t) in1[7]) * in2[0];
 536
 537     out[8] = ((uint128_t) in1[0]) * in2[8] +
 538              ((uint128_t) in1[1]) * in2[7] +
 539              ((uint128_t) in1[2]) * in2[6] +
 540              ((uint128_t) in1[3]) * in2[5] +
 541              ((uint128_t) in1[4]) * in2[4] +
 542              ((uint128_t) in1[5]) * in2[3] +
 543              ((uint128_t) in1[6]) * in2[2] +
 544              ((uint128_t) in1[7]) * in2[1] +
 545              ((uint128_t) in1[8]) * in2[0];
 546
 547     /* See comment in felem_square about the use of in2x2 here */
 548
 549     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 550               ((uint128_t) in1[2]) * in2x2[7] +
 551               ((uint128_t) in1[3]) * in2x2[6] +
 552               ((uint128_t) in1[4]) * in2x2[5] +
 553               ((uint128_t) in1[5]) * in2x2[4] +
 554               ((uint128_t) in1[6]) * in2x2[3] +
 555               ((uint128_t) in1[7]) * in2x2[2] +
 556               ((uint128_t) in1[8]) * in2x2[1];
 557
 558     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 559               ((uint128_t) in1[3]) * in2x2[7] +
 560               ((uint128_t) in1[4]) * in2x2[6] +
 561               ((uint128_t) in1[5]) * in2x2[5] +
 562               ((uint128_t) in1[6]) * in2x2[4] +
 563               ((uint128_t) in1[7]) * in2x2[3] +
 564               ((uint128_t) in1[8]) * in2x2[2];
 565
 566     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 567               ((uint128_t) in1[4]) * in2x2[7] +
 568               ((uint128_t) in1[5]) * in2x2[6] +
 569               ((uint128_t) in1[6]) * in2x2[5] +
 570               ((uint128_t) in1[7]) * in2x2[4] +
 571               ((uint128_t) in1[8]) * in2x2[3];
 572
 573     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 574               ((uint128_t) in1[5]) * in2x2[7] +
 575               ((uint128_t) in1[6]) * in2x2[6] +
 576               ((uint128_t) in1[7]) * in2x2[5] +
 577               ((uint128_t) in1[8]) * in2x2[4];
 578
 579     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 580               ((uint128_t) in1[6]) * in2x2[7] +
 581               ((uint128_t) in1[7]) * in2x2[6] +
 582               ((uint128_t) in1[8]) * in2x2[5];
 583
 584     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 585               ((uint128_t) in1[7]) * in2x2[7] +
 586               ((uint128_t) in1[8]) * in2x2[6];
 587
 588     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 589               ((uint128_t) in1[8]) * in2x2[7];
 590
 591     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 592 }
 593
 594 static const limb bottom52bits = 0xfffffffffffff;
 595
 596 /*-
 597  * felem_reduce converts a largefelem to an felem.
 598  * On entry:
 599  *   in[i] < 2^128
 600  * On exit:
 601  *   out[i] < 2^59 + 2^14
 602  */
 603 static void felem_reduce(felem out, const largefelem in)
 604 {
 605     u64 overflow1, overflow2;
 606
 607     out[0] = ((limb) in[0]) & bottom58bits;
 608     out[1] = ((limb) in[1]) & bottom58bits;
 609     out[2] = ((limb) in[2]) & bottom58bits;
 610     out[3] = ((limb) in[3]) & bottom58bits;
 611     out[4] = ((limb) in[4]) & bottom58bits;
 612     out[5] = ((limb) in[5]) & bottom58bits;
 613     out[6] = ((limb) in[6]) & bottom58bits;
 614     out[7] = ((limb) in[7]) & bottom58bits;
 615     out[8] = ((limb) in[8]) & bottom58bits;
 616
 617     /* out[i] < 2^58 */
 618
 619     out[1] += ((limb) in[0]) >> 58;
 620     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 621     /*-
 622      * out[1] < 2^58 + 2^6 + 2^58
 623      *        = 2^59 + 2^6
 624      */
 625     out[2] += ((limb) (in[0] >> 64)) >> 52;
 626
 627     out[2] += ((limb) in[1]) >> 58;
 628     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 629     out[3] += ((limb) (in[1] >> 64)) >> 52;
 630
 631     out[3] += ((limb) in[2]) >> 58;
 632     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 633     out[4] += ((limb) (in[2] >> 64)) >> 52;
 634
 635     out[4] += ((limb) in[3]) >> 58;
 636     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 637     out[5] += ((limb) (in[3] >> 64)) >> 52;
 638
 639     out[5] += ((limb) in[4]) >> 58;
 640     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 641     out[6] += ((limb) (in[4] >> 64)) >> 52;
 642
 643     out[6] += ((limb) in[5]) >> 58;
 644     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 645     out[7] += ((limb) (in[5] >> 64)) >> 52;
 646
 647     out[7] += ((limb) in[6]) >> 58;
 648     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 649     out[8] += ((limb) (in[6] >> 64)) >> 52;
 650
 651     out[8] += ((limb) in[7]) >> 58;
 652     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 653     /*-
 654      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 655      *            < 2^59 + 2^13
 656      */
 657     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 658
 659     overflow1 += ((limb) in[8]) >> 58;
 660     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 661     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 662
 663     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 664     overflow2 <<= 1;            /* overflow2 < 2^13 */
 665
 666     out[0] += overflow1;        /* out[0] < 2^60 */
 667     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 668
 669     out[1] += out[0] >> 58;
 670     out[0] &= bottom58bits;
 671     /*-
 672      * out[0] < 2^58
 673      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 674      *        < 2^59 + 2^14
 675      */
 676 }
 677
 678 static void felem_square_reduce(felem out, const felem in)
 679 {
 680     largefelem tmp;
 681     felem_square(tmp, in);
 682     felem_reduce(out, tmp);
 683 }
 684
 685 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 686 {
 687     largefelem tmp;
 688     felem_mul(tmp, in1, in2);
 689     felem_reduce(out, tmp);
 690 }
 691
 692 /*-
 693  * felem_inv calculates |out| = |in|^{-1}
 694  *
 695  * Based on Fermat's Little Theorem:
 696  *   a^p = a (mod p)
 697  *   a^{p-1} = 1 (mod p)
 698  *   a^{p-2} = a^{-1} (mod p)
 699  */
 700 static void felem_inv(felem out, const felem in)
 701 {
 702     felem ftmp, ftmp2, ftmp3, ftmp4;
 703     largefelem tmp;
 704     unsigned i;
 705
 706     felem_square(tmp, in);
 707     felem_reduce(ftmp, tmp);    /* 2^1 */
 708     felem_mul(tmp, in, ftmp);
 709     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 710     felem_assign(ftmp2, ftmp);
 711     felem_square(tmp, ftmp);
 712     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 713     felem_mul(tmp, in, ftmp);
 714     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 715     felem_square(tmp, ftmp);
 716     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 717
 718     felem_square(tmp, ftmp2);
 719     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 720     felem_square(tmp, ftmp3);
 721     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 722     felem_mul(tmp, ftmp3, ftmp2);
 723     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 724
 725     felem_assign(ftmp2, ftmp3);
 726     felem_square(tmp, ftmp3);
 727     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 728     felem_square(tmp, ftmp3);
 729     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 730     felem_square(tmp, ftmp3);
 731     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 732     felem_square(tmp, ftmp3);
 733     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 734     felem_assign(ftmp4, ftmp3);
 735     felem_mul(tmp, ftmp3, ftmp);
 736     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 737     felem_square(tmp, ftmp4);
 738     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 739     felem_mul(tmp, ftmp3, ftmp2);
 740     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 741     felem_assign(ftmp2, ftmp3);
 742
 743     for (i = 0; i < 8; i++) {
 744         felem_square(tmp, ftmp3);
 745         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 746     }
 747     felem_mul(tmp, ftmp3, ftmp2);
 748     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 749     felem_assign(ftmp2, ftmp3);
 750
 751     for (i = 0; i < 16; i++) {
 752         felem_square(tmp, ftmp3);
 753         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 754     }
 755     felem_mul(tmp, ftmp3, ftmp2);
 756     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 757     felem_assign(ftmp2, ftmp3);
 758
 759     for (i = 0; i < 32; i++) {
 760         felem_square(tmp, ftmp3);
 761         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 762     }
 763     felem_mul(tmp, ftmp3, ftmp2);
 764     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 765     felem_assign(ftmp2, ftmp3);
 766
 767     for (i = 0; i < 64; i++) {
 768         felem_square(tmp, ftmp3);
 769         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 770     }
 771     felem_mul(tmp, ftmp3, ftmp2);
 772     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 773     felem_assign(ftmp2, ftmp3);
 774
 775     for (i = 0; i < 128; i++) {
 776         felem_square(tmp, ftmp3);
 777         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 778     }
 779     felem_mul(tmp, ftmp3, ftmp2);
 780     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 781     felem_assign(ftmp2, ftmp3);
 782
 783     for (i = 0; i < 256; i++) {
 784         felem_square(tmp, ftmp3);
 785         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 786     }
 787     felem_mul(tmp, ftmp3, ftmp2);
 788     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 789
 790     for (i = 0; i < 9; i++) {
 791         felem_square(tmp, ftmp3);
 792         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 793     }
 794     felem_mul(tmp, ftmp3, ftmp4);
 795     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 796     felem_mul(tmp, ftmp3, in);
 797     felem_reduce(out, tmp);     /* 2^512 - 3 */
 798 }
 799
 800 /* This is 2^521-1, expressed as an felem */
 801 static const felem kPrime = {
 802     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 803     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 804     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 805 };
 806
 807 /*-
 808  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 809  * otherwise.
 810  * On entry:
 811  *   in[i] < 2^59 + 2^14
 812  */
 813 static limb felem_is_zero(const felem in)
 814 {
 815     felem ftmp;
 816     limb is_zero, is_p;
 817     felem_assign(ftmp, in);
 818
 819     ftmp[0] += ftmp[8] >> 57;
 820     ftmp[8] &= bottom57bits;
 821     /* ftmp[8] < 2^57 */
 822     ftmp[1] += ftmp[0] >> 58;
 823     ftmp[0] &= bottom58bits;
 824     ftmp[2] += ftmp[1] >> 58;
 825     ftmp[1] &= bottom58bits;
 826     ftmp[3] += ftmp[2] >> 58;
 827     ftmp[2] &= bottom58bits;
 828     ftmp[4] += ftmp[3] >> 58;
 829     ftmp[3] &= bottom58bits;
 830     ftmp[5] += ftmp[4] >> 58;
 831     ftmp[4] &= bottom58bits;
 832     ftmp[6] += ftmp[5] >> 58;
 833     ftmp[5] &= bottom58bits;
 834     ftmp[7] += ftmp[6] >> 58;
 835     ftmp[6] &= bottom58bits;
 836     ftmp[8] += ftmp[7] >> 58;
 837     ftmp[7] &= bottom58bits;
 838     /* ftmp[8] < 2^57 + 4 */
 839
 840     /*
 841      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 842      * than our bound for ftmp[8]. Therefore we only have to check if the
 843      * zero is zero or 2^521-1.
 844      */
 845
 846     is_zero = 0;
 847     is_zero |= ftmp[0];
 848     is_zero |= ftmp[1];
 849     is_zero |= ftmp[2];
 850     is_zero |= ftmp[3];
 851     is_zero |= ftmp[4];
 852     is_zero |= ftmp[5];
 853     is_zero |= ftmp[6];
 854     is_zero |= ftmp[7];
 855     is_zero |= ftmp[8];
 856
 857     is_zero--;
 858     /*
 859      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 860      * can be set is if is_zero was 0 before the decrement.
 861      */
 862     is_zero = 0 - (is_zero >> 63);
 863
 864     is_p = ftmp[0] ^ kPrime[0];
 865     is_p |= ftmp[1] ^ kPrime[1];
 866     is_p |= ftmp[2] ^ kPrime[2];
 867     is_p |= ftmp[3] ^ kPrime[3];
 868     is_p |= ftmp[4] ^ kPrime[4];
 869     is_p |= ftmp[5] ^ kPrime[5];
 870     is_p |= ftmp[6] ^ kPrime[6];
 871     is_p |= ftmp[7] ^ kPrime[7];
 872     is_p |= ftmp[8] ^ kPrime[8];
 873
 874     is_p--;
 875     is_p = 0 - (is_p >> 63);
 876
 877     is_zero |= is_p;
 878     return is_zero;
 879 }
 880
 881 static int felem_is_zero_int(const void *in)
 882 {
 883     return (int)(felem_is_zero(in) & ((limb) 1));
 884 }
 885
 886 /*-
 887  * felem_contract converts |in| to its unique, minimal representation.
 888  * On entry:
 889  *   in[i] < 2^59 + 2^14
 890  */
 891 static void felem_contract(felem out, const felem in)
 892 {
 893     limb is_p, is_greater, sign;
 894     static const limb two58 = ((limb) 1) << 58;
 895
 896     felem_assign(out, in);
 897
 898     out[0] += out[8] >> 57;
 899     out[8] &= bottom57bits;
 900     /* out[8] < 2^57 */
 901     out[1] += out[0] >> 58;
 902     out[0] &= bottom58bits;
 903     out[2] += out[1] >> 58;
 904     out[1] &= bottom58bits;
 905     out[3] += out[2] >> 58;
 906     out[2] &= bottom58bits;
 907     out[4] += out[3] >> 58;
 908     out[3] &= bottom58bits;
 909     out[5] += out[4] >> 58;
 910     out[4] &= bottom58bits;
 911     out[6] += out[5] >> 58;
 912     out[5] &= bottom58bits;
 913     out[7] += out[6] >> 58;
 914     out[6] &= bottom58bits;
 915     out[8] += out[7] >> 58;
 916     out[7] &= bottom58bits;
 917     /* out[8] < 2^57 + 4 */
 918
 919     /*
 920      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 921      * out. See the comments in felem_is_zero regarding why we don't test for
 922      * other multiples of the prime.
 923      */
 924
 925     /*
 926      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 927      */
 928
 929     is_p = out[0] ^ kPrime[0];
 930     is_p |= out[1] ^ kPrime[1];
 931     is_p |= out[2] ^ kPrime[2];
 932     is_p |= out[3] ^ kPrime[3];
 933     is_p |= out[4] ^ kPrime[4];
 934     is_p |= out[5] ^ kPrime[5];
 935     is_p |= out[6] ^ kPrime[6];
 936     is_p |= out[7] ^ kPrime[7];
 937     is_p |= out[8] ^ kPrime[8];
 938
 939     is_p--;
 940     is_p &= is_p << 32;
 941     is_p &= is_p << 16;
 942     is_p &= is_p << 8;
 943     is_p &= is_p << 4;
 944     is_p &= is_p << 2;
 945     is_p &= is_p << 1;
 946     is_p = 0 - (is_p >> 63);
 947     is_p = ~is_p;
 948
 949     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 950
 951     out[0] &= is_p;
 952     out[1] &= is_p;
 953     out[2] &= is_p;
 954     out[3] &= is_p;
 955     out[4] &= is_p;
 956     out[5] &= is_p;
 957     out[6] &= is_p;
 958     out[7] &= is_p;
 959     out[8] &= is_p;
 960
 961     /*
 962      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 963      * 57 is greater than zero as (2^521-1) + x >= 2^522
 964      */
 965     is_greater = out[8] >> 57;
 966     is_greater |= is_greater << 32;
 967     is_greater |= is_greater << 16;
 968     is_greater |= is_greater << 8;
 969     is_greater |= is_greater << 4;
 970     is_greater |= is_greater << 2;
 971     is_greater |= is_greater << 1;
 972     is_greater = 0 - (is_greater >> 63);
 973
 974     out[0] -= kPrime[0] & is_greater;
 975     out[1] -= kPrime[1] & is_greater;
 976     out[2] -= kPrime[2] & is_greater;
 977     out[3] -= kPrime[3] & is_greater;
 978     out[4] -= kPrime[4] & is_greater;
 979     out[5] -= kPrime[5] & is_greater;
 980     out[6] -= kPrime[6] & is_greater;
 981     out[7] -= kPrime[7] & is_greater;
 982     out[8] -= kPrime[8] & is_greater;
 983
 984     /* Eliminate negative coefficients */
 985     sign = -(out[0] >> 63);
 986     out[0] += (two58 & sign);
 987     out[1] -= (1 & sign);
 988     sign = -(out[1] >> 63);
 989     out[1] += (two58 & sign);
 990     out[2] -= (1 & sign);
 991     sign = -(out[2] >> 63);
 992     out[2] += (two58 & sign);
 993     out[3] -= (1 & sign);
 994     sign = -(out[3] >> 63);
 995     out[3] += (two58 & sign);
 996     out[4] -= (1 & sign);
 997     sign = -(out[4] >> 63);
 998     out[4] += (two58 & sign);
 999     out[5] -= (1 & sign);
1000     sign = -(out[0] >> 63);
1001     out[5] += (two58 & sign);
1002     out[6] -= (1 & sign);
1003     sign = -(out[6] >> 63);
1004     out[6] += (two58 & sign);
1005     out[7] -= (1 & sign);
1006     sign = -(out[7] >> 63);
1007     out[7] += (two58 & sign);
1008     out[8] -= (1 & sign);
1009     sign = -(out[5] >> 63);
1010     out[5] += (two58 & sign);
1011     out[6] -= (1 & sign);
1012     sign = -(out[6] >> 63);
1013     out[6] += (two58 & sign);
1014     out[7] -= (1 & sign);
1015     sign = -(out[7] >> 63);
1016     out[7] += (two58 & sign);
1017     out[8] -= (1 & sign);
1018 }
1019
1020 /*-
1021  * Group operations
1022  * ----------------
1023  *
1024  * Building on top of the field operations we have the operations on the
1025  * elliptic curve group itself. Points on the curve are represented in Jacobian
1026  * coordinates */
1027
1028 /*-
1029  * point_double calculates 2*(x_in, y_in, z_in)
1030  *
1031  * The method is taken from:
1032  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1033  *
1034  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1035  * while x_out == y_in is not (maybe this works, but it's not tested). */
1036 static void
1037 point_double(felem x_out, felem y_out, felem z_out,
1038              const felem x_in, const felem y_in, const felem z_in)
1039 {
1040     largefelem tmp, tmp2;
1041     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1042
1043     felem_assign(ftmp, x_in);
1044     felem_assign(ftmp2, x_in);
1045
1046     /* delta = z^2 */
1047     felem_square(tmp, z_in);
1048     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1049
1050     /* gamma = y^2 */
1051     felem_square(tmp, y_in);
1052     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1053
1054     /* beta = x*gamma */
1055     felem_mul(tmp, x_in, gamma);
1056     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1057
1058     /* alpha = 3*(x-delta)*(x+delta) */
1059     felem_diff64(ftmp, delta);
1060     /* ftmp[i] < 2^61 */
1061     felem_sum64(ftmp2, delta);
1062     /* ftmp2[i] < 2^60 + 2^15 */
1063     felem_scalar64(ftmp2, 3);
1064     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1065     felem_mul(tmp, ftmp, ftmp2);
1066     /*-
1067      * tmp[i] < 17(3*2^121 + 3*2^76)
1068      *        = 61*2^121 + 61*2^76
1069      *        < 64*2^121 + 64*2^76
1070      *        = 2^127 + 2^82
1071      *        < 2^128
1072      */
1073     felem_reduce(alpha, tmp);
1074
1075     /* x' = alpha^2 - 8*beta */
1076     felem_square(tmp, alpha);
1077     /*
1078      * tmp[i] < 17*2^120 < 2^125
1079      */
1080     felem_assign(ftmp, beta);
1081     felem_scalar64(ftmp, 8);
1082     /* ftmp[i] < 2^62 + 2^17 */
1083     felem_diff_128_64(tmp, ftmp);
1084     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1085     felem_reduce(x_out, tmp);
1086
1087     /* z' = (y + z)^2 - gamma - delta */
1088     felem_sum64(delta, gamma);
1089     /* delta[i] < 2^60 + 2^15 */
1090     felem_assign(ftmp, y_in);
1091     felem_sum64(ftmp, z_in);
1092     /* ftmp[i] < 2^60 + 2^15 */
1093     felem_square(tmp, ftmp);
1094     /*
1095      * tmp[i] < 17(2^122) < 2^127
1096      */
1097     felem_diff_128_64(tmp, delta);
1098     /* tmp[i] < 2^127 + 2^63 */
1099     felem_reduce(z_out, tmp);
1100
1101     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1102     felem_scalar64(beta, 4);
1103     /* beta[i] < 2^61 + 2^16 */
1104     felem_diff64(beta, x_out);
1105     /* beta[i] < 2^61 + 2^60 + 2^16 */
1106     felem_mul(tmp, alpha, beta);
1107     /*-
1108      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1109      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1110      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1111      *        < 2^128
1112      */
1113     felem_square(tmp2, gamma);
1114     /*-
1115      * tmp2[i] < 17*(2^59 + 2^14)^2
1116      *         = 17*(2^118 + 2^74 + 2^28)
1117      */
1118     felem_scalar128(tmp2, 8);
1119     /*-
1120      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1121      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1122      *         < 2^126
1123      */
1124     felem_diff128(tmp, tmp2);
1125     /*-
1126      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1127      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1128      *          2^74 + 2^69 + 2^34 + 2^30
1129      *        < 2^128
1130      */
1131     felem_reduce(y_out, tmp);
1132 }
1133
1134 /* copy_conditional copies in to out iff mask is all ones. */
1135 static void copy_conditional(felem out, const felem in, limb mask)
1136 {
1137     unsigned i;
1138     for (i = 0; i < NLIMBS; ++i) {
1139         const limb tmp = mask & (in[i] ^ out[i]);
1140         out[i] ^= tmp;
1141     }
1142 }
1143
1144 /*-
1145  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1146  *
1147  * The method is taken from
1148  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1149  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1150  *
1151  * This function includes a branch for checking whether the two input points
1152  * are equal (while not equal to the point at infinity). See comment below
1153  * on constant-time.
1154  */
1155 static void point_add(felem x3, felem y3, felem z3,
1156                       const felem x1, const felem y1, const felem z1,
1157                       const int mixed, const felem x2, const felem y2,
1158                       const felem z2)
1159 {
1160     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1161     largefelem tmp, tmp2;
1162     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1163     limb points_equal;
1164
1165     z1_is_zero = felem_is_zero(z1);
1166     z2_is_zero = felem_is_zero(z2);
1167
1168     /* ftmp = z1z1 = z1**2 */
1169     felem_square(tmp, z1);
1170     felem_reduce(ftmp, tmp);
1171
1172     if (!mixed) {
1173         /* ftmp2 = z2z2 = z2**2 */
1174         felem_square(tmp, z2);
1175         felem_reduce(ftmp2, tmp);
1176
1177         /* u1 = ftmp3 = x1*z2z2 */
1178         felem_mul(tmp, x1, ftmp2);
1179         felem_reduce(ftmp3, tmp);
1180
1181         /* ftmp5 = z1 + z2 */
1182         felem_assign(ftmp5, z1);
1183         felem_sum64(ftmp5, z2);
1184         /* ftmp5[i] < 2^61 */
1185
1186         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1187         felem_square(tmp, ftmp5);
1188         /* tmp[i] < 17*2^122 */
1189         felem_diff_128_64(tmp, ftmp);
1190         /* tmp[i] < 17*2^122 + 2^63 */
1191         felem_diff_128_64(tmp, ftmp2);
1192         /* tmp[i] < 17*2^122 + 2^64 */
1193         felem_reduce(ftmp5, tmp);
1194
1195         /* ftmp2 = z2 * z2z2 */
1196         felem_mul(tmp, ftmp2, z2);
1197         felem_reduce(ftmp2, tmp);
1198
1199         /* s1 = ftmp6 = y1 * z2**3 */
1200         felem_mul(tmp, y1, ftmp2);
1201         felem_reduce(ftmp6, tmp);
1202     } else {
1203         /*
1204          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1205          */
1206
1207         /* u1 = ftmp3 = x1*z2z2 */
1208         felem_assign(ftmp3, x1);
1209
1210         /* ftmp5 = 2*z1z2 */
1211         felem_scalar(ftmp5, z1, 2);
1212
1213         /* s1 = ftmp6 = y1 * z2**3 */
1214         felem_assign(ftmp6, y1);
1215     }
1216
1217     /* u2 = x2*z1z1 */
1218     felem_mul(tmp, x2, ftmp);
1219     /* tmp[i] < 17*2^120 */
1220
1221     /* h = ftmp4 = u2 - u1 */
1222     felem_diff_128_64(tmp, ftmp3);
1223     /* tmp[i] < 17*2^120 + 2^63 */
1224     felem_reduce(ftmp4, tmp);
1225
1226     x_equal = felem_is_zero(ftmp4);
1227
1228     /* z_out = ftmp5 * h */
1229     felem_mul(tmp, ftmp5, ftmp4);
1230     felem_reduce(z_out, tmp);
1231
1232     /* ftmp = z1 * z1z1 */
1233     felem_mul(tmp, ftmp, z1);
1234     felem_reduce(ftmp, tmp);
1235
1236     /* s2 = tmp = y2 * z1**3 */
1237     felem_mul(tmp, y2, ftmp);
1238     /* tmp[i] < 17*2^120 */
1239
1240     /* r = ftmp5 = (s2 - s1)*2 */
1241     felem_diff_128_64(tmp, ftmp6);
1242     /* tmp[i] < 17*2^120 + 2^63 */
1243     felem_reduce(ftmp5, tmp);
1244     y_equal = felem_is_zero(ftmp5);
1245     felem_scalar64(ftmp5, 2);
1246     /* ftmp5[i] < 2^61 */
1247
1248     /*
1249      * The formulae are incorrect if the points are equal, in affine coordinates
1250      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1251      * happens.
1252      *
1253      * We use bitwise operations to avoid potential side-channels introduced by
1254      * the short-circuiting behaviour of boolean operators.
1255      *
1256      * The special case of either point being the point at infinity (z1 and/or
1257      * z2 are zero), is handled separately later on in this function, so we
1258      * avoid jumping to point_double here in those special cases.
1259      *
1260      * Notice the comment below on the implications of this branching for timing
1261      * leaks and why it is considered practically irrelevant.
1262      */
1263     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1264
1265     if (points_equal) {
1266         /*
1267          * This is obviously not constant-time but it will almost-never happen
1268          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1269          * where the intermediate value gets very close to the group order.
1270          * Since |ossl_ec_GFp_nistp_recode_scalar_bits| produces signed digits
1271          * for the scalar, it's possible for the intermediate value to be a small
1272          * negative multiple of the base point, and for the final signed digit
1273          * to be the same value. We believe that this only occurs for the scalar
1274          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1275          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1276          * 71e913863f7, in that case the penultimate intermediate is -9G and
1277          * the final digit is also -9G. Since this only happens for a single
1278          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1279          * check whether a secret scalar was that exact value, can already do
1280          * so.)
1281          */
1282         point_double(x3, y3, z3, x1, y1, z1);
1283         return;
1284     }
1285
1286     /* I = ftmp = (2h)**2 */
1287     felem_assign(ftmp, ftmp4);
1288     felem_scalar64(ftmp, 2);
1289     /* ftmp[i] < 2^61 */
1290     felem_square(tmp, ftmp);
1291     /* tmp[i] < 17*2^122 */
1292     felem_reduce(ftmp, tmp);
1293
1294     /* J = ftmp2 = h * I */
1295     felem_mul(tmp, ftmp4, ftmp);
1296     felem_reduce(ftmp2, tmp);
1297
1298     /* V = ftmp4 = U1 * I */
1299     felem_mul(tmp, ftmp3, ftmp);
1300     felem_reduce(ftmp4, tmp);
1301
1302     /* x_out = r**2 - J - 2V */
1303     felem_square(tmp, ftmp5);
1304     /* tmp[i] < 17*2^122 */
1305     felem_diff_128_64(tmp, ftmp2);
1306     /* tmp[i] < 17*2^122 + 2^63 */
1307     felem_assign(ftmp3, ftmp4);
1308     felem_scalar64(ftmp4, 2);
1309     /* ftmp4[i] < 2^61 */
1310     felem_diff_128_64(tmp, ftmp4);
1311     /* tmp[i] < 17*2^122 + 2^64 */
1312     felem_reduce(x_out, tmp);
1313
1314     /* y_out = r(V-x_out) - 2 * s1 * J */
1315     felem_diff64(ftmp3, x_out);
1316     /*
1317      * ftmp3[i] < 2^60 + 2^60 = 2^61
1318      */
1319     felem_mul(tmp, ftmp5, ftmp3);
1320     /* tmp[i] < 17*2^122 */
1321     felem_mul(tmp2, ftmp6, ftmp2);
1322     /* tmp2[i] < 17*2^120 */
1323     felem_scalar128(tmp2, 2);
1324     /* tmp2[i] < 17*2^121 */
1325     felem_diff128(tmp, tmp2);
1326         /*-
1327          * tmp[i] < 2^127 - 2^69 + 17*2^122
1328          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1329          *        < 2^127
1330          */
1331     felem_reduce(y_out, tmp);
1332
1333     copy_conditional(x_out, x2, z1_is_zero);
1334     copy_conditional(x_out, x1, z2_is_zero);
1335     copy_conditional(y_out, y2, z1_is_zero);
1336     copy_conditional(y_out, y1, z2_is_zero);
1337     copy_conditional(z_out, z2, z1_is_zero);
1338     copy_conditional(z_out, z1, z2_is_zero);
1339     felem_assign(x3, x_out);
1340     felem_assign(y3, y_out);
1341     felem_assign(z3, z_out);
1342 }
1343
1344 /*-
1345  * Base point pre computation
1346  * --------------------------
1347  *
1348  * Two different sorts of precomputed tables are used in the following code.
1349  * Each contain various points on the curve, where each point is three field
1350  * elements (x, y, z).
1351  *
1352  * For the base point table, z is usually 1 (0 for the point at infinity).
1353  * This table has 16 elements:
1354  * index | bits    | point
1355  * ------+---------+------------------------------
1356  *     0 | 0 0 0 0 | 0G
1357  *     1 | 0 0 0 1 | 1G
1358  *     2 | 0 0 1 0 | 2^130G
1359  *     3 | 0 0 1 1 | (2^130 + 1)G
1360  *     4 | 0 1 0 0 | 2^260G
1361  *     5 | 0 1 0 1 | (2^260 + 1)G
1362  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1363  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1364  *     8 | 1 0 0 0 | 2^390G
1365  *     9 | 1 0 0 1 | (2^390 + 1)G
1366  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1367  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1368  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1369  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1370  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1371  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1372  *
1373  * The reason for this is so that we can clock bits into four different
1374  * locations when doing simple scalar multiplies against the base point.
1375  *
1376  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1377
1378 /* gmul is the table of precomputed base points */
1379 static const felem gmul[16][3] = {
1380 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1381  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1382  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1383 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1384   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1385   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1386  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1387   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1388   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1389  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1390 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1391   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1392   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1393  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1394   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1395   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1396  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1397 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1398   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1399   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1400  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1401   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1402   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1403  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1404 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1405   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1406   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1407  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1408   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1409   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1410  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1411 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1412   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1413   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1414  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1415   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1416   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1417  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1418 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1419   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1420   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1421  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1422   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1423   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1424  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1425 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1426   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1427   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1428  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1429   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1430   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1431  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1432 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1433   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1434   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1435  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1436   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1437   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1438  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1439 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1440   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1441   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1442  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1443   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1444   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1445  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1446 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1447   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1448   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1449  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1450   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1451   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1452  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1453 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1454   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1455   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1456  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1457   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1458   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1459  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1460 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1461   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1462   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1463  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1464   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1465   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1466  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1467 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1468   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1469   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1470  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1471   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1472   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1473  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1474 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1475   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1476   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1477  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1478   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1479   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1480  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1481 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1482   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1483   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1484  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1485   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1486   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1487  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1488 };
1489
1490 /*
1491  * select_point selects the |idx|th point from a precomputation table and
1492  * copies it to out.
1493  */
1494  /* pre_comp below is of the size provided in |size| */
1495 static void select_point(const limb idx, unsigned int size,
1496                          const felem pre_comp[][3], felem out[3])
1497 {
1498     unsigned i, j;
1499     limb *outlimbs = &out[0][0];
1500
1501     memset(out, 0, sizeof(*out) * 3);
1502
1503     for (i = 0; i < size; i++) {
1504         const limb *inlimbs = &pre_comp[i][0][0];
1505         limb mask = i ^ idx;
1506         mask |= mask >> 4;
1507         mask |= mask >> 2;
1508         mask |= mask >> 1;
1509         mask &= 1;
1510         mask--;
1511         for (j = 0; j < NLIMBS * 3; j++)
1512             outlimbs[j] |= inlimbs[j] & mask;
1513     }
1514 }
1515
1516 /* get_bit returns the |i|th bit in |in| */
1517 static char get_bit(const felem_bytearray in, int i)
1518 {
1519     if (i < 0)
1520         return 0;
1521     return (in[i >> 3] >> (i & 7)) & 1;
1522 }
1523
1524 /*
1525  * Interleaved point multiplication using precomputed point multiples: The
1526  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1527  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1528  * generator, using certain (large) precomputed multiples in g_pre_comp.
1529  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1530  */
1531 static void batch_mul(felem x_out, felem y_out, felem z_out,
1532                       const felem_bytearray scalars[],
1533                       const unsigned num_points, const u8 *g_scalar,
1534                       const int mixed, const felem pre_comp[][17][3],
1535                       const felem g_pre_comp[16][3])
1536 {
1537     int i, skip;
1538     unsigned num, gen_mul = (g_scalar != NULL);
1539     felem nq[3], tmp[4];
1540     limb bits;
1541     u8 sign, digit;
1542
1543     /* set nq to the point at infinity */
1544     memset(nq, 0, sizeof(nq));
1545
1546     /*
1547      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1548      * of the generator (last quarter of rounds) and additions of other
1549      * points multiples (every 5th round).
1550      */
1551     skip = 1;                   /* save two point operations in the first
1552                                  * round */
1553     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1554         /* double */
1555         if (!skip)
1556             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1557
1558         /* add multiples of the generator */
1559         if (gen_mul && (i <= 130)) {
1560             bits = get_bit(g_scalar, i + 390) << 3;
1561             if (i < 130) {
1562                 bits |= get_bit(g_scalar, i + 260) << 2;
1563                 bits |= get_bit(g_scalar, i + 130) << 1;
1564                 bits |= get_bit(g_scalar, i);
1565             }
1566             /* select the point to add, in constant time */
1567             select_point(bits, 16, g_pre_comp, tmp);
1568             if (!skip) {
1569                 /* The 1 argument below is for "mixed" */
1570                 point_add(nq[0], nq[1], nq[2],
1571                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1572             } else {
1573                 memcpy(nq, tmp, 3 * sizeof(felem));
1574                 skip = 0;
1575             }
1576         }
1577
1578         /* do other additions every 5 doublings */
1579         if (num_points && (i % 5 == 0)) {
1580             /* loop over all scalars */
1581             for (num = 0; num < num_points; ++num) {
1582                 bits = get_bit(scalars[num], i + 4) << 5;
1583                 bits |= get_bit(scalars[num], i + 3) << 4;
1584                 bits |= get_bit(scalars[num], i + 2) << 3;
1585                 bits |= get_bit(scalars[num], i + 1) << 2;
1586                 bits |= get_bit(scalars[num], i) << 1;
1587                 bits |= get_bit(scalars[num], i - 1);
1588                 ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1589
1590                 /*
1591                  * select the point to add or subtract, in constant time
1592                  */
1593                 select_point(digit, 17, pre_comp[num], tmp);
1594                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1595                                             * point */
1596                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1597
1598                 if (!skip) {
1599                     point_add(nq[0], nq[1], nq[2],
1600                               nq[0], nq[1], nq[2],
1601                               mixed, tmp[0], tmp[1], tmp[2]);
1602                 } else {
1603                     memcpy(nq, tmp, 3 * sizeof(felem));
1604                     skip = 0;
1605                 }
1606             }
1607         }
1608     }
1609     felem_assign(x_out, nq[0]);
1610     felem_assign(y_out, nq[1]);
1611     felem_assign(z_out, nq[2]);
1612 }
1613
1614 /* Precomputation for the group generator. */
1615 struct nistp521_pre_comp_st {
1616     felem g_pre_comp[16][3];
1617     CRYPTO_REF_COUNT references;
1618     CRYPTO_RWLOCK *lock;
1619 };
1620
1621 const EC_METHOD *EC_GFp_nistp521_method(void)
1622 {
1623     static const EC_METHOD ret = {
1624         EC_FLAGS_DEFAULT_OCT,
1625         NID_X9_62_prime_field,
1626         ossl_ec_GFp_nistp521_group_init,
1627         ossl_ec_GFp_simple_group_finish,
1628         ossl_ec_GFp_simple_group_clear_finish,
1629         ossl_ec_GFp_nist_group_copy,
1630         ossl_ec_GFp_nistp521_group_set_curve,
1631         ossl_ec_GFp_simple_group_get_curve,
1632         ossl_ec_GFp_simple_group_get_degree,
1633         ossl_ec_group_simple_order_bits,
1634         ossl_ec_GFp_simple_group_check_discriminant,
1635         ossl_ec_GFp_simple_point_init,
1636         ossl_ec_GFp_simple_point_finish,
1637         ossl_ec_GFp_simple_point_clear_finish,
1638         ossl_ec_GFp_simple_point_copy,
1639         ossl_ec_GFp_simple_point_set_to_infinity,
1640         ossl_ec_GFp_simple_point_set_affine_coordinates,
1641         ossl_ec_GFp_nistp521_point_get_affine_coordinates,
1642         0 /* point_set_compressed_coordinates */ ,
1643         0 /* point2oct */ ,
1644         0 /* oct2point */ ,
1645         ossl_ec_GFp_simple_add,
1646         ossl_ec_GFp_simple_dbl,
1647         ossl_ec_GFp_simple_invert,
1648         ossl_ec_GFp_simple_is_at_infinity,
1649         ossl_ec_GFp_simple_is_on_curve,
1650         ossl_ec_GFp_simple_cmp,
1651         ossl_ec_GFp_simple_make_affine,
1652         ossl_ec_GFp_simple_points_make_affine,
1653         ossl_ec_GFp_nistp521_points_mul,
1654         ossl_ec_GFp_nistp521_precompute_mult,
1655         ossl_ec_GFp_nistp521_have_precompute_mult,
1656         ossl_ec_GFp_nist_field_mul,
1657         ossl_ec_GFp_nist_field_sqr,
1658         0 /* field_div */ ,
1659         ossl_ec_GFp_simple_field_inv,
1660         0 /* field_encode */ ,
1661         0 /* field_decode */ ,
1662         0,                      /* field_set_to_one */
1663         ossl_ec_key_simple_priv2oct,
1664         ossl_ec_key_simple_oct2priv,
1665         0, /* set private */
1666         ossl_ec_key_simple_generate_key,
1667         ossl_ec_key_simple_check_key,
1668         ossl_ec_key_simple_generate_public_key,
1669         0, /* keycopy */
1670         0, /* keyfinish */
1671         ossl_ecdh_simple_compute_key,
1672         ossl_ecdsa_simple_sign_setup,
1673         ossl_ecdsa_simple_sign_sig,
1674         ossl_ecdsa_simple_verify_sig,
1675         0, /* field_inverse_mod_ord */
1676         0, /* blind_coordinates */
1677         0, /* ladder_pre */
1678         0, /* ladder_step */
1679         0  /* ladder_post */
1680     };
1681
1682     return &ret;
1683 }
1684
1685 /******************************************************************************/
1686 /*
1687  * FUNCTIONS TO MANAGE PRECOMPUTATION
1688  */
1689
1690 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1691 {
1692     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1693
1694     if (ret == NULL) {
1695         ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1696         return ret;
1697     }
1698
1699     ret->references = 1;
1700
1701     ret->lock = CRYPTO_THREAD_lock_new();
1702     if (ret->lock == NULL) {
1703         ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1704         OPENSSL_free(ret);
1705         return NULL;
1706     }
1707     return ret;
1708 }
1709
1710 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1711 {
1712     int i;
1713     if (p != NULL)
1714         CRYPTO_UP_REF(&p->references, &i, p->lock);
1715     return p;
1716 }
1717
1718 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1719 {
1720     int i;
1721
1722     if (p == NULL)
1723         return;
1724
1725     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1726     REF_PRINT_COUNT("EC_nistp521", x);
1727     if (i > 0)
1728         return;
1729     REF_ASSERT_ISNT(i < 0);
1730
1731     CRYPTO_THREAD_lock_free(p->lock);
1732     OPENSSL_free(p);
1733 }
1734
1735 /******************************************************************************/
1736 /*
1737  * OPENSSL EC_METHOD FUNCTIONS
1738  */
1739
1740 int ossl_ec_GFp_nistp521_group_init(EC_GROUP *group)
1741 {
1742     int ret;
1743     ret = ossl_ec_GFp_simple_group_init(group);
1744     group->a_is_minus3 = 1;
1745     return ret;
1746 }
1747
1748 int ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1749                                          const BIGNUM *a, const BIGNUM *b,
1750                                          BN_CTX *ctx)
1751 {
1752     int ret = 0;
1753     BIGNUM *curve_p, *curve_a, *curve_b;
1754 #ifndef FIPS_MODULE
1755     BN_CTX *new_ctx = NULL;
1756
1757     if (ctx == NULL)
1758         ctx = new_ctx = BN_CTX_new();
1759 #endif
1760     if (ctx == NULL)
1761         return 0;
1762
1763     BN_CTX_start(ctx);
1764     curve_p = BN_CTX_get(ctx);
1765     curve_a = BN_CTX_get(ctx);
1766     curve_b = BN_CTX_get(ctx);
1767     if (curve_b == NULL)
1768         goto err;
1769     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1770     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1771     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1772     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1773         ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1774         goto err;
1775     }
1776     group->field_mod_func = BN_nist_mod_521;
1777     ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1778  err:
1779     BN_CTX_end(ctx);
1780 #ifndef FIPS_MODULE
1781     BN_CTX_free(new_ctx);
1782 #endif
1783     return ret;
1784 }
1785
1786 /*
1787  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1788  * (X/Z^2, Y/Z^3)
1789  */
1790 int ossl_ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1791                                                       const EC_POINT *point,
1792                                                       BIGNUM *x, BIGNUM *y,
1793                                                       BN_CTX *ctx)
1794 {
1795     felem z1, z2, x_in, y_in, x_out, y_out;
1796     largefelem tmp;
1797
1798     if (EC_POINT_is_at_infinity(group, point)) {
1799         ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1800         return 0;
1801     }
1802     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1803         (!BN_to_felem(z1, point->Z)))
1804         return 0;
1805     felem_inv(z2, z1);
1806     felem_square(tmp, z2);
1807     felem_reduce(z1, tmp);
1808     felem_mul(tmp, x_in, z1);
1809     felem_reduce(x_in, tmp);
1810     felem_contract(x_out, x_in);
1811     if (x != NULL) {
1812         if (!felem_to_BN(x, x_out)) {
1813             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1814             return 0;
1815         }
1816     }
1817     felem_mul(tmp, z1, z2);
1818     felem_reduce(z1, tmp);
1819     felem_mul(tmp, y_in, z1);
1820     felem_reduce(y_in, tmp);
1821     felem_contract(y_out, y_in);
1822     if (y != NULL) {
1823         if (!felem_to_BN(y, y_out)) {
1824             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1825             return 0;
1826         }
1827     }
1828     return 1;
1829 }
1830
1831 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1832 static void make_points_affine(size_t num, felem points[][3],
1833                                felem tmp_felems[])
1834 {
1835     /*
1836      * Runs in constant time, unless an input is the point at infinity (which
1837      * normally shouldn't happen).
1838      */
1839     ossl_ec_GFp_nistp_points_make_affine_internal(num,
1840                                                   points,
1841                                                   sizeof(felem),
1842                                                   tmp_felems,
1843                                                   (void (*)(void *))felem_one,
1844                                                   felem_is_zero_int,
1845                                                   (void (*)(void *, const void *))
1846                                                   felem_assign,
1847                                                   (void (*)(void *, const void *))
1848                                                   felem_square_reduce, (void (*)
1849                                                                         (void *,
1850                                                                          const void
1851                                                                          *,
1852                                                                          const void
1853                                                                          *))
1854                                                   felem_mul_reduce,
1855                                                   (void (*)(void *, const void *))
1856                                                   felem_inv,
1857                                                   (void (*)(void *, const void *))
1858                                                   felem_contract);
1859 }
1860
1861 /*
1862  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1863  * values Result is stored in r (r can equal one of the inputs).
1864  */
1865 int ossl_ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1866                                     const BIGNUM *scalar, size_t num,
1867                                     const EC_POINT *points[],
1868                                     const BIGNUM *scalars[], BN_CTX *ctx)
1869 {
1870     int ret = 0;
1871     int j;
1872     int mixed = 0;
1873     BIGNUM *x, *y, *z, *tmp_scalar;
1874     felem_bytearray g_secret;
1875     felem_bytearray *secrets = NULL;
1876     felem (*pre_comp)[17][3] = NULL;
1877     felem *tmp_felems = NULL;
1878     unsigned i;
1879     int num_bytes;
1880     int have_pre_comp = 0;
1881     size_t num_points = num;
1882     felem x_in, y_in, z_in, x_out, y_out, z_out;
1883     NISTP521_PRE_COMP *pre = NULL;
1884     felem(*g_pre_comp)[3] = NULL;
1885     EC_POINT *generator = NULL;
1886     const EC_POINT *p = NULL;
1887     const BIGNUM *p_scalar = NULL;
1888
1889     BN_CTX_start(ctx);
1890     x = BN_CTX_get(ctx);
1891     y = BN_CTX_get(ctx);
1892     z = BN_CTX_get(ctx);
1893     tmp_scalar = BN_CTX_get(ctx);
1894     if (tmp_scalar == NULL)
1895         goto err;
1896
1897     if (scalar != NULL) {
1898         pre = group->pre_comp.nistp521;
1899         if (pre)
1900             /* we have precomputation, try to use it */
1901             g_pre_comp = &pre->g_pre_comp[0];
1902         else
1903             /* try to use the standard precomputation */
1904             g_pre_comp = (felem(*)[3]) gmul;
1905         generator = EC_POINT_new(group);
1906         if (generator == NULL)
1907             goto err;
1908         /* get the generator from precomputation */
1909         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1910             !felem_to_BN(y, g_pre_comp[1][1]) ||
1911             !felem_to_BN(z, g_pre_comp[1][2])) {
1912             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1913             goto err;
1914         }
1915         if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1916                                                                 generator,
1917                                                                 x, y, z, ctx))
1918             goto err;
1919         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1920             /* precomputation matches generator */
1921             have_pre_comp = 1;
1922         else
1923             /*
1924              * we don't have valid precomputation: treat the generator as a
1925              * random point
1926              */
1927             num_points++;
1928     }
1929
1930     if (num_points > 0) {
1931         if (num_points >= 2) {
1932             /*
1933              * unless we precompute multiples for just one point, converting
1934              * those into affine form is time well spent
1935              */
1936             mixed = 1;
1937         }
1938         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1939         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1940         if (mixed)
1941             tmp_felems =
1942                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1943         if ((secrets == NULL) || (pre_comp == NULL)
1944             || (mixed && (tmp_felems == NULL))) {
1945             ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1946             goto err;
1947         }
1948
1949         /*
1950          * we treat NULL scalars as 0, and NULL points as points at infinity,
1951          * i.e., they contribute nothing to the linear combination
1952          */
1953         for (i = 0; i < num_points; ++i) {
1954             if (i == num) {
1955                 /*
1956                  * we didn't have a valid precomputation, so we pick the
1957                  * generator
1958                  */
1959                 p = EC_GROUP_get0_generator(group);
1960                 p_scalar = scalar;
1961             } else {
1962                 /* the i^th point */
1963                 p = points[i];
1964                 p_scalar = scalars[i];
1965             }
1966             if ((p_scalar != NULL) && (p != NULL)) {
1967                 /* reduce scalar to 0 <= scalar < 2^521 */
1968                 if ((BN_num_bits(p_scalar) > 521)
1969                     || (BN_is_negative(p_scalar))) {
1970                     /*
1971                      * this is an unusual input, and we don't guarantee
1972                      * constant-timeness
1973                      */
1974                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1975                         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1976                         goto err;
1977                     }
1978                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1979                                                secrets[i], sizeof(secrets[i]));
1980                 } else {
1981                     num_bytes = BN_bn2lebinpad(p_scalar,
1982                                                secrets[i], sizeof(secrets[i]));
1983                 }
1984                 if (num_bytes < 0) {
1985                     ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1986                     goto err;
1987                 }
1988                 /* precompute multiples */
1989                 if ((!BN_to_felem(x_out, p->X)) ||
1990                     (!BN_to_felem(y_out, p->Y)) ||
1991                     (!BN_to_felem(z_out, p->Z)))
1992                     goto err;
1993                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1994                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1995                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1996                 for (j = 2; j <= 16; ++j) {
1997                     if (j & 1) {
1998                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1999                                   pre_comp[i][j][2], pre_comp[i][1][0],
2000                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
2001                                   pre_comp[i][j - 1][0],
2002                                   pre_comp[i][j - 1][1],
2003                                   pre_comp[i][j - 1][2]);
2004                     } else {
2005                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2006                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2007                                      pre_comp[i][j / 2][1],
2008                                      pre_comp[i][j / 2][2]);
2009                     }
2010                 }
2011             }
2012         }
2013         if (mixed)
2014             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2015     }
2016
2017     /* the scalar for the generator */
2018     if ((scalar != NULL) && (have_pre_comp)) {
2019         memset(g_secret, 0, sizeof(g_secret));
2020         /* reduce scalar to 0 <= scalar < 2^521 */
2021         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2022             /*
2023              * this is an unusual input, and we don't guarantee
2024              * constant-timeness
2025              */
2026             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2027                 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2028                 goto err;
2029             }
2030             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2031         } else {
2032             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2033         }
2034         /* do the multiplication with generator precomputation */
2035         batch_mul(x_out, y_out, z_out,
2036                   (const felem_bytearray(*))secrets, num_points,
2037                   g_secret,
2038                   mixed, (const felem(*)[17][3])pre_comp,
2039                   (const felem(*)[3])g_pre_comp);
2040     } else {
2041         /* do the multiplication without generator precomputation */
2042         batch_mul(x_out, y_out, z_out,
2043                   (const felem_bytearray(*))secrets, num_points,
2044                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2045     }
2046     /* reduce the output to its unique minimal representation */
2047     felem_contract(x_in, x_out);
2048     felem_contract(y_in, y_out);
2049     felem_contract(z_in, z_out);
2050     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2051         (!felem_to_BN(z, z_in))) {
2052         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2053         goto err;
2054     }
2055     ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
2056                                                              ctx);
2057
2058  err:
2059     BN_CTX_end(ctx);
2060     EC_POINT_free(generator);
2061     OPENSSL_free(secrets);
2062     OPENSSL_free(pre_comp);
2063     OPENSSL_free(tmp_felems);
2064     return ret;
2065 }
2066
2067 int ossl_ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2068 {
2069     int ret = 0;
2070     NISTP521_PRE_COMP *pre = NULL;
2071     int i, j;
2072     BIGNUM *x, *y;
2073     EC_POINT *generator = NULL;
2074     felem tmp_felems[16];
2075 #ifndef FIPS_MODULE
2076     BN_CTX *new_ctx = NULL;
2077 #endif
2078
2079     /* throw away old precomputation */
2080     EC_pre_comp_free(group);
2081
2082 #ifndef FIPS_MODULE
2083     if (ctx == NULL)
2084         ctx = new_ctx = BN_CTX_new();
2085 #endif
2086     if (ctx == NULL)
2087         return 0;
2088
2089     BN_CTX_start(ctx);
2090     x = BN_CTX_get(ctx);
2091     y = BN_CTX_get(ctx);
2092     if (y == NULL)
2093         goto err;
2094     /* get the generator */
2095     if (group->generator == NULL)
2096         goto err;
2097     generator = EC_POINT_new(group);
2098     if (generator == NULL)
2099         goto err;
2100     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2101     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2102     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2103         goto err;
2104     if ((pre = nistp521_pre_comp_new()) == NULL)
2105         goto err;
2106     /*
2107      * if the generator is the standard one, use built-in precomputation
2108      */
2109     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2110         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2111         goto done;
2112     }
2113     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2114         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2115         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2116         goto err;
2117     /* compute 2^130*G, 2^260*G, 2^390*G */
2118     for (i = 1; i <= 4; i <<= 1) {
2119         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2120                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2121                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2122         for (j = 0; j < 129; ++j) {
2123             point_double(pre->g_pre_comp[2 * i][0],
2124                          pre->g_pre_comp[2 * i][1],
2125                          pre->g_pre_comp[2 * i][2],
2126                          pre->g_pre_comp[2 * i][0],
2127                          pre->g_pre_comp[2 * i][1],
2128                          pre->g_pre_comp[2 * i][2]);
2129         }
2130     }
2131     /* g_pre_comp[0] is the point at infinity */
2132     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2133     /* the remaining multiples */
2134     /* 2^130*G + 2^260*G */
2135     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2136               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2137               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2138               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2139               pre->g_pre_comp[2][2]);
2140     /* 2^130*G + 2^390*G */
2141     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2142               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2143               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2144               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2145               pre->g_pre_comp[2][2]);
2146     /* 2^260*G + 2^390*G */
2147     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2148               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2149               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2150               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2151               pre->g_pre_comp[4][2]);
2152     /* 2^130*G + 2^260*G + 2^390*G */
2153     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2154               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2155               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2156               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2157               pre->g_pre_comp[2][2]);
2158     for (i = 1; i < 8; ++i) {
2159         /* odd multiples: add G */
2160         point_add(pre->g_pre_comp[2 * i + 1][0],
2161                   pre->g_pre_comp[2 * i + 1][1],
2162                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2163                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2164                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2165                   pre->g_pre_comp[1][2]);
2166     }
2167     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2168
2169  done:
2170     SETPRECOMP(group, nistp521, pre);
2171     ret = 1;
2172     pre = NULL;
2173  err:
2174     BN_CTX_end(ctx);
2175     EC_POINT_free(generator);
2176 #ifndef FIPS_MODULE
2177     BN_CTX_free(new_ctx);
2178 #endif
2179     EC_nistp521_pre_comp_free(pre);
2180     return ret;
2181 }
2182
2183 int ossl_ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2184 {
2185     return HAVEPRECOMP(group, nistp521);
2186 }