crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/e_os2.h>
  41 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  42 NON_EMPTY_TRANSLATION_UNIT
  43 #else
  44
  45 # include <string.h>
  46 # include <openssl/err.h>
  47 # include "ec_local.h"
  48
  49 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  50   /* even with gcc, the typedef won't work for 32-bit platforms */
  51 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  52                                  * platforms */
  53 # else
  54 #  error "Your compiler doesn't appear to support 128-bit integer types"
  55 # endif
  56
  57 typedef uint8_t u8;
  58 typedef uint64_t u64;
  59
  60 /*
  61  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  62  * element of this field into 66 bytes where the most significant byte
  63  * contains only a single bit. We call this an felem_bytearray.
  64  */
  65
  66 typedef u8 felem_bytearray[66];
  67
  68 /*
  69  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  70  * These values are big-endian.
  71  */
  72 static const felem_bytearray nistp521_curve_params[5] = {
  73     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff},
  82     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  86      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  87      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  88      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  89      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  90      0xff, 0xfc},
  91     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  92      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  93      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  94      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  95      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  96      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  97      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  98      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  99      0x3f, 0x00},
 100     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
 101      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
 102      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
 103      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
 104      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 105      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 106      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 107      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 108      0xbd, 0x66},
 109     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 110      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 111      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 112      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 113      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 114      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 115      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 116      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 117      0x66, 0x50}
 118 };
 119
 120 /*-
 121  * The representation of field elements.
 122  * ------------------------------------
 123  *
 124  * We represent field elements with nine values. These values are either 64 or
 125  * 128 bits and the field element represented is:
 126  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 127  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 128  * 58 bits apart, but are greater than 58 bits in length, the most significant
 129  * bits of each limb overlap with the least significant bits of the next.
 130  *
 131  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 132  * 'largefelem' */
 133
 134 # define NLIMBS 9
 135
 136 typedef uint64_t limb;
 137 typedef limb felem[NLIMBS];
 138 typedef uint128_t largefelem[NLIMBS];
 139
 140 static const limb bottom57bits = 0x1ffffffffffffff;
 141 static const limb bottom58bits = 0x3ffffffffffffff;
 142
 143 /*
 144  * bin66_to_felem takes a little-endian byte array and converts it into felem
 145  * form. This assumes that the CPU is little-endian.
 146  */
 147 static void bin66_to_felem(felem out, const u8 in[66])
 148 {
 149     out[0] = (*((limb *) & in[0])) & bottom58bits;
 150     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 151     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 152     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 153     out[4] = (*((limb *) & in[29])) & bottom58bits;
 154     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 155     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 156     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 157     out[8] = (*((limb *) & in[58])) & bottom57bits;
 158 }
 159
 160 /*
 161  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 162  * array. This assumes that the CPU is little-endian.
 163  */
 164 static void felem_to_bin66(u8 out[66], const felem in)
 165 {
 166     memset(out, 0, 66);
 167     (*((limb *) & out[0])) = in[0];
 168     (*((limb *) & out[7])) |= in[1] << 2;
 169     (*((limb *) & out[14])) |= in[2] << 4;
 170     (*((limb *) & out[21])) |= in[3] << 6;
 171     (*((limb *) & out[29])) = in[4];
 172     (*((limb *) & out[36])) |= in[5] << 2;
 173     (*((limb *) & out[43])) |= in[6] << 4;
 174     (*((limb *) & out[50])) |= in[7] << 6;
 175     (*((limb *) & out[58])) = in[8];
 176 }
 177
 178 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 179 static int BN_to_felem(felem out, const BIGNUM *bn)
 180 {
 181     felem_bytearray b_out;
 182     int num_bytes;
 183
 184     if (BN_is_negative(bn)) {
 185         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 186         return 0;
 187     }
 188     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 189     if (num_bytes < 0) {
 190         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 191         return 0;
 192     }
 193     bin66_to_felem(out, b_out);
 194     return 1;
 195 }
 196
 197 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 198 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 199 {
 200     felem_bytearray b_out;
 201     felem_to_bin66(b_out, in);
 202     return BN_lebin2bn(b_out, sizeof(b_out), out);
 203 }
 204
 205 /*-
 206  * Field operations
 207  * ----------------
 208  */
 209
 210 static void felem_one(felem out)
 211 {
 212     out[0] = 1;
 213     out[1] = 0;
 214     out[2] = 0;
 215     out[3] = 0;
 216     out[4] = 0;
 217     out[5] = 0;
 218     out[6] = 0;
 219     out[7] = 0;
 220     out[8] = 0;
 221 }
 222
 223 static void felem_assign(felem out, const felem in)
 224 {
 225     out[0] = in[0];
 226     out[1] = in[1];
 227     out[2] = in[2];
 228     out[3] = in[3];
 229     out[4] = in[4];
 230     out[5] = in[5];
 231     out[6] = in[6];
 232     out[7] = in[7];
 233     out[8] = in[8];
 234 }
 235
 236 /* felem_sum64 sets out = out + in. */
 237 static void felem_sum64(felem out, const felem in)
 238 {
 239     out[0] += in[0];
 240     out[1] += in[1];
 241     out[2] += in[2];
 242     out[3] += in[3];
 243     out[4] += in[4];
 244     out[5] += in[5];
 245     out[6] += in[6];
 246     out[7] += in[7];
 247     out[8] += in[8];
 248 }
 249
 250 /* felem_scalar sets out = in * scalar */
 251 static void felem_scalar(felem out, const felem in, limb scalar)
 252 {
 253     out[0] = in[0] * scalar;
 254     out[1] = in[1] * scalar;
 255     out[2] = in[2] * scalar;
 256     out[3] = in[3] * scalar;
 257     out[4] = in[4] * scalar;
 258     out[5] = in[5] * scalar;
 259     out[6] = in[6] * scalar;
 260     out[7] = in[7] * scalar;
 261     out[8] = in[8] * scalar;
 262 }
 263
 264 /* felem_scalar64 sets out = out * scalar */
 265 static void felem_scalar64(felem out, limb scalar)
 266 {
 267     out[0] *= scalar;
 268     out[1] *= scalar;
 269     out[2] *= scalar;
 270     out[3] *= scalar;
 271     out[4] *= scalar;
 272     out[5] *= scalar;
 273     out[6] *= scalar;
 274     out[7] *= scalar;
 275     out[8] *= scalar;
 276 }
 277
 278 /* felem_scalar128 sets out = out * scalar */
 279 static void felem_scalar128(largefelem out, limb scalar)
 280 {
 281     out[0] *= scalar;
 282     out[1] *= scalar;
 283     out[2] *= scalar;
 284     out[3] *= scalar;
 285     out[4] *= scalar;
 286     out[5] *= scalar;
 287     out[6] *= scalar;
 288     out[7] *= scalar;
 289     out[8] *= scalar;
 290 }
 291
 292 /*-
 293  * felem_neg sets |out| to |-in|
 294  * On entry:
 295  *   in[i] < 2^59 + 2^14
 296  * On exit:
 297  *   out[i] < 2^62
 298  */
 299 static void felem_neg(felem out, const felem in)
 300 {
 301     /* In order to prevent underflow, we subtract from 0 mod p. */
 302     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 303     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 304
 305     out[0] = two62m3 - in[0];
 306     out[1] = two62m2 - in[1];
 307     out[2] = two62m2 - in[2];
 308     out[3] = two62m2 - in[3];
 309     out[4] = two62m2 - in[4];
 310     out[5] = two62m2 - in[5];
 311     out[6] = two62m2 - in[6];
 312     out[7] = two62m2 - in[7];
 313     out[8] = two62m2 - in[8];
 314 }
 315
 316 /*-
 317  * felem_diff64 subtracts |in| from |out|
 318  * On entry:
 319  *   in[i] < 2^59 + 2^14
 320  * On exit:
 321  *   out[i] < out[i] + 2^62
 322  */
 323 static void felem_diff64(felem out, const felem in)
 324 {
 325     /*
 326      * In order to prevent underflow, we add 0 mod p before subtracting.
 327      */
 328     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 329     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 330
 331     out[0] += two62m3 - in[0];
 332     out[1] += two62m2 - in[1];
 333     out[2] += two62m2 - in[2];
 334     out[3] += two62m2 - in[3];
 335     out[4] += two62m2 - in[4];
 336     out[5] += two62m2 - in[5];
 337     out[6] += two62m2 - in[6];
 338     out[7] += two62m2 - in[7];
 339     out[8] += two62m2 - in[8];
 340 }
 341
 342 /*-
 343  * felem_diff_128_64 subtracts |in| from |out|
 344  * On entry:
 345  *   in[i] < 2^62 + 2^17
 346  * On exit:
 347  *   out[i] < out[i] + 2^63
 348  */
 349 static void felem_diff_128_64(largefelem out, const felem in)
 350 {
 351     /*
 352      * In order to prevent underflow, we add 64p mod p (which is equivalent
 353      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 354      * digit number with all bits set to 1. See "The representation of field
 355      * elements" comment above for a description of how limbs are used to
 356      * represent a number. 64p is represented with 8 limbs containing a number
 357      * with 58 bits set and one limb with a number with 57 bits set.
 358      */
 359     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 360     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 361
 362     out[0] += two63m6 - in[0];
 363     out[1] += two63m5 - in[1];
 364     out[2] += two63m5 - in[2];
 365     out[3] += two63m5 - in[3];
 366     out[4] += two63m5 - in[4];
 367     out[5] += two63m5 - in[5];
 368     out[6] += two63m5 - in[6];
 369     out[7] += two63m5 - in[7];
 370     out[8] += two63m5 - in[8];
 371 }
 372
 373 /*-
 374  * felem_diff_128_64 subtracts |in| from |out|
 375  * On entry:
 376  *   in[i] < 2^126
 377  * On exit:
 378  *   out[i] < out[i] + 2^127 - 2^69
 379  */
 380 static void felem_diff128(largefelem out, const largefelem in)
 381 {
 382     /*
 383      * In order to prevent underflow, we add 0 mod p before subtracting.
 384      */
 385     static const uint128_t two127m70 =
 386         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 387     static const uint128_t two127m69 =
 388         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 389
 390     out[0] += (two127m70 - in[0]);
 391     out[1] += (two127m69 - in[1]);
 392     out[2] += (two127m69 - in[2]);
 393     out[3] += (two127m69 - in[3]);
 394     out[4] += (two127m69 - in[4]);
 395     out[5] += (two127m69 - in[5]);
 396     out[6] += (two127m69 - in[6]);
 397     out[7] += (two127m69 - in[7]);
 398     out[8] += (two127m69 - in[8]);
 399 }
 400
 401 /*-
 402  * felem_square sets |out| = |in|^2
 403  * On entry:
 404  *   in[i] < 2^62
 405  * On exit:
 406  *   out[i] < 17 * max(in[i]) * max(in[i])
 407  */
 408 static void felem_square(largefelem out, const felem in)
 409 {
 410     felem inx2, inx4;
 411     felem_scalar(inx2, in, 2);
 412     felem_scalar(inx4, in, 4);
 413
 414     /*-
 415      * We have many cases were we want to do
 416      *   in[x] * in[y] +
 417      *   in[y] * in[x]
 418      * This is obviously just
 419      *   2 * in[x] * in[y]
 420      * However, rather than do the doubling on the 128 bit result, we
 421      * double one of the inputs to the multiplication by reading from
 422      * |inx2|
 423      */
 424
 425     out[0] = ((uint128_t) in[0]) * in[0];
 426     out[1] = ((uint128_t) in[0]) * inx2[1];
 427     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 428     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 429     out[4] = ((uint128_t) in[0]) * inx2[4] +
 430              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 431     out[5] = ((uint128_t) in[0]) * inx2[5] +
 432              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 433     out[6] = ((uint128_t) in[0]) * inx2[6] +
 434              ((uint128_t) in[1]) * inx2[5] +
 435              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 436     out[7] = ((uint128_t) in[0]) * inx2[7] +
 437              ((uint128_t) in[1]) * inx2[6] +
 438              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 439     out[8] = ((uint128_t) in[0]) * inx2[8] +
 440              ((uint128_t) in[1]) * inx2[7] +
 441              ((uint128_t) in[2]) * inx2[6] +
 442              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 443
 444     /*
 445      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 446      * They correspond to locations one bit up from the limbs produced above
 447      * so we would have to multiply by two to align them. Again, rather than
 448      * operate on the 128-bit result, we double one of the inputs to the
 449      * multiplication. If we want to double for both this reason, and the
 450      * reason above, then we end up multiplying by four.
 451      */
 452
 453     /* 9 */
 454     out[0] += ((uint128_t) in[1]) * inx4[8] +
 455               ((uint128_t) in[2]) * inx4[7] +
 456               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 457
 458     /* 10 */
 459     out[1] += ((uint128_t) in[2]) * inx4[8] +
 460               ((uint128_t) in[3]) * inx4[7] +
 461               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 462
 463     /* 11 */
 464     out[2] += ((uint128_t) in[3]) * inx4[8] +
 465               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 466
 467     /* 12 */
 468     out[3] += ((uint128_t) in[4]) * inx4[8] +
 469               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 470
 471     /* 13 */
 472     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 473
 474     /* 14 */
 475     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 476
 477     /* 15 */
 478     out[6] += ((uint128_t) in[7]) * inx4[8];
 479
 480     /* 16 */
 481     out[7] += ((uint128_t) in[8]) * inx2[8];
 482 }
 483
 484 /*-
 485  * felem_mul sets |out| = |in1| * |in2|
 486  * On entry:
 487  *   in1[i] < 2^64
 488  *   in2[i] < 2^63
 489  * On exit:
 490  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 491  */
 492 static void felem_mul(largefelem out, const felem in1, const felem in2)
 493 {
 494     felem in2x2;
 495     felem_scalar(in2x2, in2, 2);
 496
 497     out[0] = ((uint128_t) in1[0]) * in2[0];
 498
 499     out[1] = ((uint128_t) in1[0]) * in2[1] +
 500              ((uint128_t) in1[1]) * in2[0];
 501
 502     out[2] = ((uint128_t) in1[0]) * in2[2] +
 503              ((uint128_t) in1[1]) * in2[1] +
 504              ((uint128_t) in1[2]) * in2[0];
 505
 506     out[3] = ((uint128_t) in1[0]) * in2[3] +
 507              ((uint128_t) in1[1]) * in2[2] +
 508              ((uint128_t) in1[2]) * in2[1] +
 509              ((uint128_t) in1[3]) * in2[0];
 510
 511     out[4] = ((uint128_t) in1[0]) * in2[4] +
 512              ((uint128_t) in1[1]) * in2[3] +
 513              ((uint128_t) in1[2]) * in2[2] +
 514              ((uint128_t) in1[3]) * in2[1] +
 515              ((uint128_t) in1[4]) * in2[0];
 516
 517     out[5] = ((uint128_t) in1[0]) * in2[5] +
 518              ((uint128_t) in1[1]) * in2[4] +
 519              ((uint128_t) in1[2]) * in2[3] +
 520              ((uint128_t) in1[3]) * in2[2] +
 521              ((uint128_t) in1[4]) * in2[1] +
 522              ((uint128_t) in1[5]) * in2[0];
 523
 524     out[6] = ((uint128_t) in1[0]) * in2[6] +
 525              ((uint128_t) in1[1]) * in2[5] +
 526              ((uint128_t) in1[2]) * in2[4] +
 527              ((uint128_t) in1[3]) * in2[3] +
 528              ((uint128_t) in1[4]) * in2[2] +
 529              ((uint128_t) in1[5]) * in2[1] +
 530              ((uint128_t) in1[6]) * in2[0];
 531
 532     out[7] = ((uint128_t) in1[0]) * in2[7] +
 533              ((uint128_t) in1[1]) * in2[6] +
 534              ((uint128_t) in1[2]) * in2[5] +
 535              ((uint128_t) in1[3]) * in2[4] +
 536              ((uint128_t) in1[4]) * in2[3] +
 537              ((uint128_t) in1[5]) * in2[2] +
 538              ((uint128_t) in1[6]) * in2[1] +
 539              ((uint128_t) in1[7]) * in2[0];
 540
 541     out[8] = ((uint128_t) in1[0]) * in2[8] +
 542              ((uint128_t) in1[1]) * in2[7] +
 543              ((uint128_t) in1[2]) * in2[6] +
 544              ((uint128_t) in1[3]) * in2[5] +
 545              ((uint128_t) in1[4]) * in2[4] +
 546              ((uint128_t) in1[5]) * in2[3] +
 547              ((uint128_t) in1[6]) * in2[2] +
 548              ((uint128_t) in1[7]) * in2[1] +
 549              ((uint128_t) in1[8]) * in2[0];
 550
 551     /* See comment in felem_square about the use of in2x2 here */
 552
 553     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 554               ((uint128_t) in1[2]) * in2x2[7] +
 555               ((uint128_t) in1[3]) * in2x2[6] +
 556               ((uint128_t) in1[4]) * in2x2[5] +
 557               ((uint128_t) in1[5]) * in2x2[4] +
 558               ((uint128_t) in1[6]) * in2x2[3] +
 559               ((uint128_t) in1[7]) * in2x2[2] +
 560               ((uint128_t) in1[8]) * in2x2[1];
 561
 562     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 563               ((uint128_t) in1[3]) * in2x2[7] +
 564               ((uint128_t) in1[4]) * in2x2[6] +
 565               ((uint128_t) in1[5]) * in2x2[5] +
 566               ((uint128_t) in1[6]) * in2x2[4] +
 567               ((uint128_t) in1[7]) * in2x2[3] +
 568               ((uint128_t) in1[8]) * in2x2[2];
 569
 570     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 571               ((uint128_t) in1[4]) * in2x2[7] +
 572               ((uint128_t) in1[5]) * in2x2[6] +
 573               ((uint128_t) in1[6]) * in2x2[5] +
 574               ((uint128_t) in1[7]) * in2x2[4] +
 575               ((uint128_t) in1[8]) * in2x2[3];
 576
 577     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 578               ((uint128_t) in1[5]) * in2x2[7] +
 579               ((uint128_t) in1[6]) * in2x2[6] +
 580               ((uint128_t) in1[7]) * in2x2[5] +
 581               ((uint128_t) in1[8]) * in2x2[4];
 582
 583     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 584               ((uint128_t) in1[6]) * in2x2[7] +
 585               ((uint128_t) in1[7]) * in2x2[6] +
 586               ((uint128_t) in1[8]) * in2x2[5];
 587
 588     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 589               ((uint128_t) in1[7]) * in2x2[7] +
 590               ((uint128_t) in1[8]) * in2x2[6];
 591
 592     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 593               ((uint128_t) in1[8]) * in2x2[7];
 594
 595     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 596 }
 597
 598 static const limb bottom52bits = 0xfffffffffffff;
 599
 600 /*-
 601  * felem_reduce converts a largefelem to an felem.
 602  * On entry:
 603  *   in[i] < 2^128
 604  * On exit:
 605  *   out[i] < 2^59 + 2^14
 606  */
 607 static void felem_reduce(felem out, const largefelem in)
 608 {
 609     u64 overflow1, overflow2;
 610
 611     out[0] = ((limb) in[0]) & bottom58bits;
 612     out[1] = ((limb) in[1]) & bottom58bits;
 613     out[2] = ((limb) in[2]) & bottom58bits;
 614     out[3] = ((limb) in[3]) & bottom58bits;
 615     out[4] = ((limb) in[4]) & bottom58bits;
 616     out[5] = ((limb) in[5]) & bottom58bits;
 617     out[6] = ((limb) in[6]) & bottom58bits;
 618     out[7] = ((limb) in[7]) & bottom58bits;
 619     out[8] = ((limb) in[8]) & bottom58bits;
 620
 621     /* out[i] < 2^58 */
 622
 623     out[1] += ((limb) in[0]) >> 58;
 624     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 625     /*-
 626      * out[1] < 2^58 + 2^6 + 2^58
 627      *        = 2^59 + 2^6
 628      */
 629     out[2] += ((limb) (in[0] >> 64)) >> 52;
 630
 631     out[2] += ((limb) in[1]) >> 58;
 632     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 633     out[3] += ((limb) (in[1] >> 64)) >> 52;
 634
 635     out[3] += ((limb) in[2]) >> 58;
 636     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 637     out[4] += ((limb) (in[2] >> 64)) >> 52;
 638
 639     out[4] += ((limb) in[3]) >> 58;
 640     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 641     out[5] += ((limb) (in[3] >> 64)) >> 52;
 642
 643     out[5] += ((limb) in[4]) >> 58;
 644     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 645     out[6] += ((limb) (in[4] >> 64)) >> 52;
 646
 647     out[6] += ((limb) in[5]) >> 58;
 648     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 649     out[7] += ((limb) (in[5] >> 64)) >> 52;
 650
 651     out[7] += ((limb) in[6]) >> 58;
 652     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 653     out[8] += ((limb) (in[6] >> 64)) >> 52;
 654
 655     out[8] += ((limb) in[7]) >> 58;
 656     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 657     /*-
 658      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 659      *            < 2^59 + 2^13
 660      */
 661     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 662
 663     overflow1 += ((limb) in[8]) >> 58;
 664     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 665     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 666
 667     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 668     overflow2 <<= 1;            /* overflow2 < 2^13 */
 669
 670     out[0] += overflow1;        /* out[0] < 2^60 */
 671     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 672
 673     out[1] += out[0] >> 58;
 674     out[0] &= bottom58bits;
 675     /*-
 676      * out[0] < 2^58
 677      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 678      *        < 2^59 + 2^14
 679      */
 680 }
 681
 682 static void felem_square_reduce(felem out, const felem in)
 683 {
 684     largefelem tmp;
 685     felem_square(tmp, in);
 686     felem_reduce(out, tmp);
 687 }
 688
 689 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 690 {
 691     largefelem tmp;
 692     felem_mul(tmp, in1, in2);
 693     felem_reduce(out, tmp);
 694 }
 695
 696 /*-
 697  * felem_inv calculates |out| = |in|^{-1}
 698  *
 699  * Based on Fermat's Little Theorem:
 700  *   a^p = a (mod p)
 701  *   a^{p-1} = 1 (mod p)
 702  *   a^{p-2} = a^{-1} (mod p)
 703  */
 704 static void felem_inv(felem out, const felem in)
 705 {
 706     felem ftmp, ftmp2, ftmp3, ftmp4;
 707     largefelem tmp;
 708     unsigned i;
 709
 710     felem_square(tmp, in);
 711     felem_reduce(ftmp, tmp);    /* 2^1 */
 712     felem_mul(tmp, in, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 714     felem_assign(ftmp2, ftmp);
 715     felem_square(tmp, ftmp);
 716     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 717     felem_mul(tmp, in, ftmp);
 718     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 719     felem_square(tmp, ftmp);
 720     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 721
 722     felem_square(tmp, ftmp2);
 723     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 724     felem_square(tmp, ftmp3);
 725     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 726     felem_mul(tmp, ftmp3, ftmp2);
 727     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 728
 729     felem_assign(ftmp2, ftmp3);
 730     felem_square(tmp, ftmp3);
 731     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 732     felem_square(tmp, ftmp3);
 733     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 734     felem_square(tmp, ftmp3);
 735     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 736     felem_square(tmp, ftmp3);
 737     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 738     felem_assign(ftmp4, ftmp3);
 739     felem_mul(tmp, ftmp3, ftmp);
 740     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 741     felem_square(tmp, ftmp4);
 742     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 743     felem_mul(tmp, ftmp3, ftmp2);
 744     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 745     felem_assign(ftmp2, ftmp3);
 746
 747     for (i = 0; i < 8; i++) {
 748         felem_square(tmp, ftmp3);
 749         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 750     }
 751     felem_mul(tmp, ftmp3, ftmp2);
 752     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 753     felem_assign(ftmp2, ftmp3);
 754
 755     for (i = 0; i < 16; i++) {
 756         felem_square(tmp, ftmp3);
 757         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 758     }
 759     felem_mul(tmp, ftmp3, ftmp2);
 760     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 761     felem_assign(ftmp2, ftmp3);
 762
 763     for (i = 0; i < 32; i++) {
 764         felem_square(tmp, ftmp3);
 765         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 766     }
 767     felem_mul(tmp, ftmp3, ftmp2);
 768     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 769     felem_assign(ftmp2, ftmp3);
 770
 771     for (i = 0; i < 64; i++) {
 772         felem_square(tmp, ftmp3);
 773         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 774     }
 775     felem_mul(tmp, ftmp3, ftmp2);
 776     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 777     felem_assign(ftmp2, ftmp3);
 778
 779     for (i = 0; i < 128; i++) {
 780         felem_square(tmp, ftmp3);
 781         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 782     }
 783     felem_mul(tmp, ftmp3, ftmp2);
 784     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 785     felem_assign(ftmp2, ftmp3);
 786
 787     for (i = 0; i < 256; i++) {
 788         felem_square(tmp, ftmp3);
 789         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 790     }
 791     felem_mul(tmp, ftmp3, ftmp2);
 792     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 793
 794     for (i = 0; i < 9; i++) {
 795         felem_square(tmp, ftmp3);
 796         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 797     }
 798     felem_mul(tmp, ftmp3, ftmp4);
 799     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 800     felem_mul(tmp, ftmp3, in);
 801     felem_reduce(out, tmp);     /* 2^512 - 3 */
 802 }
 803
 804 /* This is 2^521-1, expressed as an felem */
 805 static const felem kPrime = {
 806     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 807     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 808     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 809 };
 810
 811 /*-
 812  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 813  * otherwise.
 814  * On entry:
 815  *   in[i] < 2^59 + 2^14
 816  */
 817 static limb felem_is_zero(const felem in)
 818 {
 819     felem ftmp;
 820     limb is_zero, is_p;
 821     felem_assign(ftmp, in);
 822
 823     ftmp[0] += ftmp[8] >> 57;
 824     ftmp[8] &= bottom57bits;
 825     /* ftmp[8] < 2^57 */
 826     ftmp[1] += ftmp[0] >> 58;
 827     ftmp[0] &= bottom58bits;
 828     ftmp[2] += ftmp[1] >> 58;
 829     ftmp[1] &= bottom58bits;
 830     ftmp[3] += ftmp[2] >> 58;
 831     ftmp[2] &= bottom58bits;
 832     ftmp[4] += ftmp[3] >> 58;
 833     ftmp[3] &= bottom58bits;
 834     ftmp[5] += ftmp[4] >> 58;
 835     ftmp[4] &= bottom58bits;
 836     ftmp[6] += ftmp[5] >> 58;
 837     ftmp[5] &= bottom58bits;
 838     ftmp[7] += ftmp[6] >> 58;
 839     ftmp[6] &= bottom58bits;
 840     ftmp[8] += ftmp[7] >> 58;
 841     ftmp[7] &= bottom58bits;
 842     /* ftmp[8] < 2^57 + 4 */
 843
 844     /*
 845      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 846      * than our bound for ftmp[8]. Therefore we only have to check if the
 847      * zero is zero or 2^521-1.
 848      */
 849
 850     is_zero = 0;
 851     is_zero |= ftmp[0];
 852     is_zero |= ftmp[1];
 853     is_zero |= ftmp[2];
 854     is_zero |= ftmp[3];
 855     is_zero |= ftmp[4];
 856     is_zero |= ftmp[5];
 857     is_zero |= ftmp[6];
 858     is_zero |= ftmp[7];
 859     is_zero |= ftmp[8];
 860
 861     is_zero--;
 862     /*
 863      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 864      * can be set is if is_zero was 0 before the decrement.
 865      */
 866     is_zero = 0 - (is_zero >> 63);
 867
 868     is_p = ftmp[0] ^ kPrime[0];
 869     is_p |= ftmp[1] ^ kPrime[1];
 870     is_p |= ftmp[2] ^ kPrime[2];
 871     is_p |= ftmp[3] ^ kPrime[3];
 872     is_p |= ftmp[4] ^ kPrime[4];
 873     is_p |= ftmp[5] ^ kPrime[5];
 874     is_p |= ftmp[6] ^ kPrime[6];
 875     is_p |= ftmp[7] ^ kPrime[7];
 876     is_p |= ftmp[8] ^ kPrime[8];
 877
 878     is_p--;
 879     is_p = 0 - (is_p >> 63);
 880
 881     is_zero |= is_p;
 882     return is_zero;
 883 }
 884
 885 static int felem_is_zero_int(const void *in)
 886 {
 887     return (int)(felem_is_zero(in) & ((limb) 1));
 888 }
 889
 890 /*-
 891  * felem_contract converts |in| to its unique, minimal representation.
 892  * On entry:
 893  *   in[i] < 2^59 + 2^14
 894  */
 895 static void felem_contract(felem out, const felem in)
 896 {
 897     limb is_p, is_greater, sign;
 898     static const limb two58 = ((limb) 1) << 58;
 899
 900     felem_assign(out, in);
 901
 902     out[0] += out[8] >> 57;
 903     out[8] &= bottom57bits;
 904     /* out[8] < 2^57 */
 905     out[1] += out[0] >> 58;
 906     out[0] &= bottom58bits;
 907     out[2] += out[1] >> 58;
 908     out[1] &= bottom58bits;
 909     out[3] += out[2] >> 58;
 910     out[2] &= bottom58bits;
 911     out[4] += out[3] >> 58;
 912     out[3] &= bottom58bits;
 913     out[5] += out[4] >> 58;
 914     out[4] &= bottom58bits;
 915     out[6] += out[5] >> 58;
 916     out[5] &= bottom58bits;
 917     out[7] += out[6] >> 58;
 918     out[6] &= bottom58bits;
 919     out[8] += out[7] >> 58;
 920     out[7] &= bottom58bits;
 921     /* out[8] < 2^57 + 4 */
 922
 923     /*
 924      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 925      * out. See the comments in felem_is_zero regarding why we don't test for
 926      * other multiples of the prime.
 927      */
 928
 929     /*
 930      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 931      */
 932
 933     is_p = out[0] ^ kPrime[0];
 934     is_p |= out[1] ^ kPrime[1];
 935     is_p |= out[2] ^ kPrime[2];
 936     is_p |= out[3] ^ kPrime[3];
 937     is_p |= out[4] ^ kPrime[4];
 938     is_p |= out[5] ^ kPrime[5];
 939     is_p |= out[6] ^ kPrime[6];
 940     is_p |= out[7] ^ kPrime[7];
 941     is_p |= out[8] ^ kPrime[8];
 942
 943     is_p--;
 944     is_p &= is_p << 32;
 945     is_p &= is_p << 16;
 946     is_p &= is_p << 8;
 947     is_p &= is_p << 4;
 948     is_p &= is_p << 2;
 949     is_p &= is_p << 1;
 950     is_p = 0 - (is_p >> 63);
 951     is_p = ~is_p;
 952
 953     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 954
 955     out[0] &= is_p;
 956     out[1] &= is_p;
 957     out[2] &= is_p;
 958     out[3] &= is_p;
 959     out[4] &= is_p;
 960     out[5] &= is_p;
 961     out[6] &= is_p;
 962     out[7] &= is_p;
 963     out[8] &= is_p;
 964
 965     /*
 966      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 967      * 57 is greater than zero as (2^521-1) + x >= 2^522
 968      */
 969     is_greater = out[8] >> 57;
 970     is_greater |= is_greater << 32;
 971     is_greater |= is_greater << 16;
 972     is_greater |= is_greater << 8;
 973     is_greater |= is_greater << 4;
 974     is_greater |= is_greater << 2;
 975     is_greater |= is_greater << 1;
 976     is_greater = 0 - (is_greater >> 63);
 977
 978     out[0] -= kPrime[0] & is_greater;
 979     out[1] -= kPrime[1] & is_greater;
 980     out[2] -= kPrime[2] & is_greater;
 981     out[3] -= kPrime[3] & is_greater;
 982     out[4] -= kPrime[4] & is_greater;
 983     out[5] -= kPrime[5] & is_greater;
 984     out[6] -= kPrime[6] & is_greater;
 985     out[7] -= kPrime[7] & is_greater;
 986     out[8] -= kPrime[8] & is_greater;
 987
 988     /* Eliminate negative coefficients */
 989     sign = -(out[0] >> 63);
 990     out[0] += (two58 & sign);
 991     out[1] -= (1 & sign);
 992     sign = -(out[1] >> 63);
 993     out[1] += (two58 & sign);
 994     out[2] -= (1 & sign);
 995     sign = -(out[2] >> 63);
 996     out[2] += (two58 & sign);
 997     out[3] -= (1 & sign);
 998     sign = -(out[3] >> 63);
 999     out[3] += (two58 & sign);
1000     out[4] -= (1 & sign);
1001     sign = -(out[4] >> 63);
1002     out[4] += (two58 & sign);
1003     out[5] -= (1 & sign);
1004     sign = -(out[0] >> 63);
1005     out[5] += (two58 & sign);
1006     out[6] -= (1 & sign);
1007     sign = -(out[6] >> 63);
1008     out[6] += (two58 & sign);
1009     out[7] -= (1 & sign);
1010     sign = -(out[7] >> 63);
1011     out[7] += (two58 & sign);
1012     out[8] -= (1 & sign);
1013     sign = -(out[5] >> 63);
1014     out[5] += (two58 & sign);
1015     out[6] -= (1 & sign);
1016     sign = -(out[6] >> 63);
1017     out[6] += (two58 & sign);
1018     out[7] -= (1 & sign);
1019     sign = -(out[7] >> 63);
1020     out[7] += (two58 & sign);
1021     out[8] -= (1 & sign);
1022 }
1023
1024 /*-
1025  * Group operations
1026  * ----------------
1027  *
1028  * Building on top of the field operations we have the operations on the
1029  * elliptic curve group itself. Points on the curve are represented in Jacobian
1030  * coordinates */
1031
1032 /*-
1033  * point_double calculates 2*(x_in, y_in, z_in)
1034  *
1035  * The method is taken from:
1036  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1037  *
1038  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1039  * while x_out == y_in is not (maybe this works, but it's not tested). */
1040 static void
1041 point_double(felem x_out, felem y_out, felem z_out,
1042              const felem x_in, const felem y_in, const felem z_in)
1043 {
1044     largefelem tmp, tmp2;
1045     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1046
1047     felem_assign(ftmp, x_in);
1048     felem_assign(ftmp2, x_in);
1049
1050     /* delta = z^2 */
1051     felem_square(tmp, z_in);
1052     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1053
1054     /* gamma = y^2 */
1055     felem_square(tmp, y_in);
1056     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1057
1058     /* beta = x*gamma */
1059     felem_mul(tmp, x_in, gamma);
1060     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1061
1062     /* alpha = 3*(x-delta)*(x+delta) */
1063     felem_diff64(ftmp, delta);
1064     /* ftmp[i] < 2^61 */
1065     felem_sum64(ftmp2, delta);
1066     /* ftmp2[i] < 2^60 + 2^15 */
1067     felem_scalar64(ftmp2, 3);
1068     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1069     felem_mul(tmp, ftmp, ftmp2);
1070     /*-
1071      * tmp[i] < 17(3*2^121 + 3*2^76)
1072      *        = 61*2^121 + 61*2^76
1073      *        < 64*2^121 + 64*2^76
1074      *        = 2^127 + 2^82
1075      *        < 2^128
1076      */
1077     felem_reduce(alpha, tmp);
1078
1079     /* x' = alpha^2 - 8*beta */
1080     felem_square(tmp, alpha);
1081     /*
1082      * tmp[i] < 17*2^120 < 2^125
1083      */
1084     felem_assign(ftmp, beta);
1085     felem_scalar64(ftmp, 8);
1086     /* ftmp[i] < 2^62 + 2^17 */
1087     felem_diff_128_64(tmp, ftmp);
1088     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1089     felem_reduce(x_out, tmp);
1090
1091     /* z' = (y + z)^2 - gamma - delta */
1092     felem_sum64(delta, gamma);
1093     /* delta[i] < 2^60 + 2^15 */
1094     felem_assign(ftmp, y_in);
1095     felem_sum64(ftmp, z_in);
1096     /* ftmp[i] < 2^60 + 2^15 */
1097     felem_square(tmp, ftmp);
1098     /*
1099      * tmp[i] < 17(2^122) < 2^127
1100      */
1101     felem_diff_128_64(tmp, delta);
1102     /* tmp[i] < 2^127 + 2^63 */
1103     felem_reduce(z_out, tmp);
1104
1105     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1106     felem_scalar64(beta, 4);
1107     /* beta[i] < 2^61 + 2^16 */
1108     felem_diff64(beta, x_out);
1109     /* beta[i] < 2^61 + 2^60 + 2^16 */
1110     felem_mul(tmp, alpha, beta);
1111     /*-
1112      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1113      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1114      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1115      *        < 2^128
1116      */
1117     felem_square(tmp2, gamma);
1118     /*-
1119      * tmp2[i] < 17*(2^59 + 2^14)^2
1120      *         = 17*(2^118 + 2^74 + 2^28)
1121      */
1122     felem_scalar128(tmp2, 8);
1123     /*-
1124      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1125      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1126      *         < 2^126
1127      */
1128     felem_diff128(tmp, tmp2);
1129     /*-
1130      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1131      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1132      *          2^74 + 2^69 + 2^34 + 2^30
1133      *        < 2^128
1134      */
1135     felem_reduce(y_out, tmp);
1136 }
1137
1138 /* copy_conditional copies in to out iff mask is all ones. */
1139 static void copy_conditional(felem out, const felem in, limb mask)
1140 {
1141     unsigned i;
1142     for (i = 0; i < NLIMBS; ++i) {
1143         const limb tmp = mask & (in[i] ^ out[i]);
1144         out[i] ^= tmp;
1145     }
1146 }
1147
1148 /*-
1149  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1150  *
1151  * The method is taken from
1152  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1153  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1154  *
1155  * This function includes a branch for checking whether the two input points
1156  * are equal (while not equal to the point at infinity). See comment below
1157  * on constant-time.
1158  */
1159 static void point_add(felem x3, felem y3, felem z3,
1160                       const felem x1, const felem y1, const felem z1,
1161                       const int mixed, const felem x2, const felem y2,
1162                       const felem z2)
1163 {
1164     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1165     largefelem tmp, tmp2;
1166     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1167     limb points_equal;
1168
1169     z1_is_zero = felem_is_zero(z1);
1170     z2_is_zero = felem_is_zero(z2);
1171
1172     /* ftmp = z1z1 = z1**2 */
1173     felem_square(tmp, z1);
1174     felem_reduce(ftmp, tmp);
1175
1176     if (!mixed) {
1177         /* ftmp2 = z2z2 = z2**2 */
1178         felem_square(tmp, z2);
1179         felem_reduce(ftmp2, tmp);
1180
1181         /* u1 = ftmp3 = x1*z2z2 */
1182         felem_mul(tmp, x1, ftmp2);
1183         felem_reduce(ftmp3, tmp);
1184
1185         /* ftmp5 = z1 + z2 */
1186         felem_assign(ftmp5, z1);
1187         felem_sum64(ftmp5, z2);
1188         /* ftmp5[i] < 2^61 */
1189
1190         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1191         felem_square(tmp, ftmp5);
1192         /* tmp[i] < 17*2^122 */
1193         felem_diff_128_64(tmp, ftmp);
1194         /* tmp[i] < 17*2^122 + 2^63 */
1195         felem_diff_128_64(tmp, ftmp2);
1196         /* tmp[i] < 17*2^122 + 2^64 */
1197         felem_reduce(ftmp5, tmp);
1198
1199         /* ftmp2 = z2 * z2z2 */
1200         felem_mul(tmp, ftmp2, z2);
1201         felem_reduce(ftmp2, tmp);
1202
1203         /* s1 = ftmp6 = y1 * z2**3 */
1204         felem_mul(tmp, y1, ftmp2);
1205         felem_reduce(ftmp6, tmp);
1206     } else {
1207         /*
1208          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1209          */
1210
1211         /* u1 = ftmp3 = x1*z2z2 */
1212         felem_assign(ftmp3, x1);
1213
1214         /* ftmp5 = 2*z1z2 */
1215         felem_scalar(ftmp5, z1, 2);
1216
1217         /* s1 = ftmp6 = y1 * z2**3 */
1218         felem_assign(ftmp6, y1);
1219     }
1220
1221     /* u2 = x2*z1z1 */
1222     felem_mul(tmp, x2, ftmp);
1223     /* tmp[i] < 17*2^120 */
1224
1225     /* h = ftmp4 = u2 - u1 */
1226     felem_diff_128_64(tmp, ftmp3);
1227     /* tmp[i] < 17*2^120 + 2^63 */
1228     felem_reduce(ftmp4, tmp);
1229
1230     x_equal = felem_is_zero(ftmp4);
1231
1232     /* z_out = ftmp5 * h */
1233     felem_mul(tmp, ftmp5, ftmp4);
1234     felem_reduce(z_out, tmp);
1235
1236     /* ftmp = z1 * z1z1 */
1237     felem_mul(tmp, ftmp, z1);
1238     felem_reduce(ftmp, tmp);
1239
1240     /* s2 = tmp = y2 * z1**3 */
1241     felem_mul(tmp, y2, ftmp);
1242     /* tmp[i] < 17*2^120 */
1243
1244     /* r = ftmp5 = (s2 - s1)*2 */
1245     felem_diff_128_64(tmp, ftmp6);
1246     /* tmp[i] < 17*2^120 + 2^63 */
1247     felem_reduce(ftmp5, tmp);
1248     y_equal = felem_is_zero(ftmp5);
1249     felem_scalar64(ftmp5, 2);
1250     /* ftmp5[i] < 2^61 */
1251
1252     /*
1253      * The formulae are incorrect if the points are equal, in affine coordinates
1254      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1255      * happens.
1256      *
1257      * We use bitwise operations to avoid potential side-channels introduced by
1258      * the short-circuiting behaviour of boolean operators.
1259      *
1260      * The special case of either point being the point at infinity (z1 and/or
1261      * z2 are zero), is handled separately later on in this function, so we
1262      * avoid jumping to point_double here in those special cases.
1263      *
1264      * Notice the comment below on the implications of this branching for timing
1265      * leaks and why it is considered practically irrelevant.
1266      */
1267     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1268
1269     if (points_equal) {
1270         /*
1271          * This is obviously not constant-time but it will almost-never happen
1272          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1273          * where the intermediate value gets very close to the group order.
1274          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1275          * the scalar, it's possible for the intermediate value to be a small
1276          * negative multiple of the base point, and for the final signed digit
1277          * to be the same value. We believe that this only occurs for the scalar
1278          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1279          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1280          * 71e913863f7, in that case the penultimate intermediate is -9G and
1281          * the final digit is also -9G. Since this only happens for a single
1282          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1283          * check whether a secret scalar was that exact value, can already do
1284          * so.)
1285          */
1286         point_double(x3, y3, z3, x1, y1, z1);
1287         return;
1288     }
1289
1290     /* I = ftmp = (2h)**2 */
1291     felem_assign(ftmp, ftmp4);
1292     felem_scalar64(ftmp, 2);
1293     /* ftmp[i] < 2^61 */
1294     felem_square(tmp, ftmp);
1295     /* tmp[i] < 17*2^122 */
1296     felem_reduce(ftmp, tmp);
1297
1298     /* J = ftmp2 = h * I */
1299     felem_mul(tmp, ftmp4, ftmp);
1300     felem_reduce(ftmp2, tmp);
1301
1302     /* V = ftmp4 = U1 * I */
1303     felem_mul(tmp, ftmp3, ftmp);
1304     felem_reduce(ftmp4, tmp);
1305
1306     /* x_out = r**2 - J - 2V */
1307     felem_square(tmp, ftmp5);
1308     /* tmp[i] < 17*2^122 */
1309     felem_diff_128_64(tmp, ftmp2);
1310     /* tmp[i] < 17*2^122 + 2^63 */
1311     felem_assign(ftmp3, ftmp4);
1312     felem_scalar64(ftmp4, 2);
1313     /* ftmp4[i] < 2^61 */
1314     felem_diff_128_64(tmp, ftmp4);
1315     /* tmp[i] < 17*2^122 + 2^64 */
1316     felem_reduce(x_out, tmp);
1317
1318     /* y_out = r(V-x_out) - 2 * s1 * J */
1319     felem_diff64(ftmp3, x_out);
1320     /*
1321      * ftmp3[i] < 2^60 + 2^60 = 2^61
1322      */
1323     felem_mul(tmp, ftmp5, ftmp3);
1324     /* tmp[i] < 17*2^122 */
1325     felem_mul(tmp2, ftmp6, ftmp2);
1326     /* tmp2[i] < 17*2^120 */
1327     felem_scalar128(tmp2, 2);
1328     /* tmp2[i] < 17*2^121 */
1329     felem_diff128(tmp, tmp2);
1330         /*-
1331          * tmp[i] < 2^127 - 2^69 + 17*2^122
1332          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1333          *        < 2^127
1334          */
1335     felem_reduce(y_out, tmp);
1336
1337     copy_conditional(x_out, x2, z1_is_zero);
1338     copy_conditional(x_out, x1, z2_is_zero);
1339     copy_conditional(y_out, y2, z1_is_zero);
1340     copy_conditional(y_out, y1, z2_is_zero);
1341     copy_conditional(z_out, z2, z1_is_zero);
1342     copy_conditional(z_out, z1, z2_is_zero);
1343     felem_assign(x3, x_out);
1344     felem_assign(y3, y_out);
1345     felem_assign(z3, z_out);
1346 }
1347
1348 /*-
1349  * Base point pre computation
1350  * --------------------------
1351  *
1352  * Two different sorts of precomputed tables are used in the following code.
1353  * Each contain various points on the curve, where each point is three field
1354  * elements (x, y, z).
1355  *
1356  * For the base point table, z is usually 1 (0 for the point at infinity).
1357  * This table has 16 elements:
1358  * index | bits    | point
1359  * ------+---------+------------------------------
1360  *     0 | 0 0 0 0 | 0G
1361  *     1 | 0 0 0 1 | 1G
1362  *     2 | 0 0 1 0 | 2^130G
1363  *     3 | 0 0 1 1 | (2^130 + 1)G
1364  *     4 | 0 1 0 0 | 2^260G
1365  *     5 | 0 1 0 1 | (2^260 + 1)G
1366  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1367  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1368  *     8 | 1 0 0 0 | 2^390G
1369  *     9 | 1 0 0 1 | (2^390 + 1)G
1370  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1371  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1372  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1373  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1374  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1375  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1376  *
1377  * The reason for this is so that we can clock bits into four different
1378  * locations when doing simple scalar multiplies against the base point.
1379  *
1380  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1381
1382 /* gmul is the table of precomputed base points */
1383 static const felem gmul[16][3] = {
1384 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1385  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1386  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1387 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1388   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1389   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1390  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1391   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1392   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1393  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1394 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1395   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1396   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1397  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1398   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1399   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1400  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1401 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1402   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1403   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1404  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1405   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1406   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1407  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1408 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1409   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1410   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1411  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1412   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1413   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1414  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1415 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1416   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1417   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1418  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1419   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1420   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1421  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1422 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1423   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1424   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1425  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1426   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1427   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1428  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1429 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1430   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1431   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1432  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1433   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1434   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1435  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1436 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1437   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1438   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1439  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1440   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1441   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1442  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1443 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1444   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1445   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1446  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1447   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1448   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1449  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1450 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1451   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1452   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1453  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1454   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1455   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1456  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1457 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1458   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1459   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1460  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1461   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1462   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1463  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1464 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1465   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1466   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1467  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1468   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1469   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1470  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1471 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1472   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1473   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1474  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1475   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1476   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1477  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1478 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1479   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1480   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1481  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1482   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1483   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1484  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1485 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1486   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1487   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1488  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1489   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1490   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1491  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1492 };
1493
1494 /*
1495  * select_point selects the |idx|th point from a precomputation table and
1496  * copies it to out.
1497  */
1498  /* pre_comp below is of the size provided in |size| */
1499 static void select_point(const limb idx, unsigned int size,
1500                          const felem pre_comp[][3], felem out[3])
1501 {
1502     unsigned i, j;
1503     limb *outlimbs = &out[0][0];
1504
1505     memset(out, 0, sizeof(*out) * 3);
1506
1507     for (i = 0; i < size; i++) {
1508         const limb *inlimbs = &pre_comp[i][0][0];
1509         limb mask = i ^ idx;
1510         mask |= mask >> 4;
1511         mask |= mask >> 2;
1512         mask |= mask >> 1;
1513         mask &= 1;
1514         mask--;
1515         for (j = 0; j < NLIMBS * 3; j++)
1516             outlimbs[j] |= inlimbs[j] & mask;
1517     }
1518 }
1519
1520 /* get_bit returns the |i|th bit in |in| */
1521 static char get_bit(const felem_bytearray in, int i)
1522 {
1523     if (i < 0)
1524         return 0;
1525     return (in[i >> 3] >> (i & 7)) & 1;
1526 }
1527
1528 /*
1529  * Interleaved point multiplication using precomputed point multiples: The
1530  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1531  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1532  * generator, using certain (large) precomputed multiples in g_pre_comp.
1533  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1534  */
1535 static void batch_mul(felem x_out, felem y_out, felem z_out,
1536                       const felem_bytearray scalars[],
1537                       const unsigned num_points, const u8 *g_scalar,
1538                       const int mixed, const felem pre_comp[][17][3],
1539                       const felem g_pre_comp[16][3])
1540 {
1541     int i, skip;
1542     unsigned num, gen_mul = (g_scalar != NULL);
1543     felem nq[3], tmp[4];
1544     limb bits;
1545     u8 sign, digit;
1546
1547     /* set nq to the point at infinity */
1548     memset(nq, 0, sizeof(nq));
1549
1550     /*
1551      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1552      * of the generator (last quarter of rounds) and additions of other
1553      * points multiples (every 5th round).
1554      */
1555     skip = 1;                   /* save two point operations in the first
1556                                  * round */
1557     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1558         /* double */
1559         if (!skip)
1560             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1561
1562         /* add multiples of the generator */
1563         if (gen_mul && (i <= 130)) {
1564             bits = get_bit(g_scalar, i + 390) << 3;
1565             if (i < 130) {
1566                 bits |= get_bit(g_scalar, i + 260) << 2;
1567                 bits |= get_bit(g_scalar, i + 130) << 1;
1568                 bits |= get_bit(g_scalar, i);
1569             }
1570             /* select the point to add, in constant time */
1571             select_point(bits, 16, g_pre_comp, tmp);
1572             if (!skip) {
1573                 /* The 1 argument below is for "mixed" */
1574                 point_add(nq[0], nq[1], nq[2],
1575                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1576             } else {
1577                 memcpy(nq, tmp, 3 * sizeof(felem));
1578                 skip = 0;
1579             }
1580         }
1581
1582         /* do other additions every 5 doublings */
1583         if (num_points && (i % 5 == 0)) {
1584             /* loop over all scalars */
1585             for (num = 0; num < num_points; ++num) {
1586                 bits = get_bit(scalars[num], i + 4) << 5;
1587                 bits |= get_bit(scalars[num], i + 3) << 4;
1588                 bits |= get_bit(scalars[num], i + 2) << 3;
1589                 bits |= get_bit(scalars[num], i + 1) << 2;
1590                 bits |= get_bit(scalars[num], i) << 1;
1591                 bits |= get_bit(scalars[num], i - 1);
1592                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1593
1594                 /*
1595                  * select the point to add or subtract, in constant time
1596                  */
1597                 select_point(digit, 17, pre_comp[num], tmp);
1598                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1599                                             * point */
1600                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1601
1602                 if (!skip) {
1603                     point_add(nq[0], nq[1], nq[2],
1604                               nq[0], nq[1], nq[2],
1605                               mixed, tmp[0], tmp[1], tmp[2]);
1606                 } else {
1607                     memcpy(nq, tmp, 3 * sizeof(felem));
1608                     skip = 0;
1609                 }
1610             }
1611         }
1612     }
1613     felem_assign(x_out, nq[0]);
1614     felem_assign(y_out, nq[1]);
1615     felem_assign(z_out, nq[2]);
1616 }
1617
1618 /* Precomputation for the group generator. */
1619 struct nistp521_pre_comp_st {
1620     felem g_pre_comp[16][3];
1621     CRYPTO_REF_COUNT references;
1622     CRYPTO_RWLOCK *lock;
1623 };
1624
1625 const EC_METHOD *EC_GFp_nistp521_method(void)
1626 {
1627     static const EC_METHOD ret = {
1628         EC_FLAGS_DEFAULT_OCT,
1629         NID_X9_62_prime_field,
1630         ec_GFp_nistp521_group_init,
1631         ec_GFp_simple_group_finish,
1632         ec_GFp_simple_group_clear_finish,
1633         ec_GFp_nist_group_copy,
1634         ec_GFp_nistp521_group_set_curve,
1635         ec_GFp_simple_group_get_curve,
1636         ec_GFp_simple_group_get_degree,
1637         ec_group_simple_order_bits,
1638         ec_GFp_simple_group_check_discriminant,
1639         ec_GFp_simple_point_init,
1640         ec_GFp_simple_point_finish,
1641         ec_GFp_simple_point_clear_finish,
1642         ec_GFp_simple_point_copy,
1643         ec_GFp_simple_point_set_to_infinity,
1644         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1645         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1646         ec_GFp_simple_point_set_affine_coordinates,
1647         ec_GFp_nistp521_point_get_affine_coordinates,
1648         0 /* point_set_compressed_coordinates */ ,
1649         0 /* point2oct */ ,
1650         0 /* oct2point */ ,
1651         ec_GFp_simple_add,
1652         ec_GFp_simple_dbl,
1653         ec_GFp_simple_invert,
1654         ec_GFp_simple_is_at_infinity,
1655         ec_GFp_simple_is_on_curve,
1656         ec_GFp_simple_cmp,
1657         ec_GFp_simple_make_affine,
1658         ec_GFp_simple_points_make_affine,
1659         ec_GFp_nistp521_points_mul,
1660         ec_GFp_nistp521_precompute_mult,
1661         ec_GFp_nistp521_have_precompute_mult,
1662         ec_GFp_nist_field_mul,
1663         ec_GFp_nist_field_sqr,
1664         0 /* field_div */ ,
1665         ec_GFp_simple_field_inv,
1666         0 /* field_encode */ ,
1667         0 /* field_decode */ ,
1668         0,                      /* field_set_to_one */
1669         ec_key_simple_priv2oct,
1670         ec_key_simple_oct2priv,
1671         0, /* set private */
1672         ec_key_simple_generate_key,
1673         ec_key_simple_check_key,
1674         ec_key_simple_generate_public_key,
1675         0, /* keycopy */
1676         0, /* keyfinish */
1677         ecdh_simple_compute_key,
1678         ecdsa_simple_sign_setup,
1679         ecdsa_simple_sign_sig,
1680         ecdsa_simple_verify_sig,
1681         0, /* field_inverse_mod_ord */
1682         0, /* blind_coordinates */
1683         0, /* ladder_pre */
1684         0, /* ladder_step */
1685         0  /* ladder_post */
1686     };
1687
1688     return &ret;
1689 }
1690
1691 /******************************************************************************/
1692 /*
1693  * FUNCTIONS TO MANAGE PRECOMPUTATION
1694  */
1695
1696 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1697 {
1698     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1699
1700     if (ret == NULL) {
1701         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1702         return ret;
1703     }
1704
1705     ret->references = 1;
1706
1707     ret->lock = CRYPTO_THREAD_lock_new();
1708     if (ret->lock == NULL) {
1709         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1710         OPENSSL_free(ret);
1711         return NULL;
1712     }
1713     return ret;
1714 }
1715
1716 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1717 {
1718     int i;
1719     if (p != NULL)
1720         CRYPTO_UP_REF(&p->references, &i, p->lock);
1721     return p;
1722 }
1723
1724 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1725 {
1726     int i;
1727
1728     if (p == NULL)
1729         return;
1730
1731     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1732     REF_PRINT_COUNT("EC_nistp521", x);
1733     if (i > 0)
1734         return;
1735     REF_ASSERT_ISNT(i < 0);
1736
1737     CRYPTO_THREAD_lock_free(p->lock);
1738     OPENSSL_free(p);
1739 }
1740
1741 /******************************************************************************/
1742 /*
1743  * OPENSSL EC_METHOD FUNCTIONS
1744  */
1745
1746 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1747 {
1748     int ret;
1749     ret = ec_GFp_simple_group_init(group);
1750     group->a_is_minus3 = 1;
1751     return ret;
1752 }
1753
1754 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1755                                     const BIGNUM *a, const BIGNUM *b,
1756                                     BN_CTX *ctx)
1757 {
1758     int ret = 0;
1759     BIGNUM *curve_p, *curve_a, *curve_b;
1760 #ifndef FIPS_MODE
1761     BN_CTX *new_ctx = NULL;
1762
1763     if (ctx == NULL)
1764         ctx = new_ctx = BN_CTX_new();
1765 #endif
1766     if (ctx == NULL)
1767         return 0;
1768
1769     BN_CTX_start(ctx);
1770     curve_p = BN_CTX_get(ctx);
1771     curve_a = BN_CTX_get(ctx);
1772     curve_b = BN_CTX_get(ctx);
1773     if (curve_b == NULL)
1774         goto err;
1775     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1776     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1777     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1778     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1779         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1780               EC_R_WRONG_CURVE_PARAMETERS);
1781         goto err;
1782     }
1783     group->field_mod_func = BN_nist_mod_521;
1784     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1785  err:
1786     BN_CTX_end(ctx);
1787 #ifndef FIPS_MODE
1788     BN_CTX_free(new_ctx);
1789 #endif
1790     return ret;
1791 }
1792
1793 /*
1794  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1795  * (X/Z^2, Y/Z^3)
1796  */
1797 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1798                                                  const EC_POINT *point,
1799                                                  BIGNUM *x, BIGNUM *y,
1800                                                  BN_CTX *ctx)
1801 {
1802     felem z1, z2, x_in, y_in, x_out, y_out;
1803     largefelem tmp;
1804
1805     if (EC_POINT_is_at_infinity(group, point)) {
1806         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1807               EC_R_POINT_AT_INFINITY);
1808         return 0;
1809     }
1810     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1811         (!BN_to_felem(z1, point->Z)))
1812         return 0;
1813     felem_inv(z2, z1);
1814     felem_square(tmp, z2);
1815     felem_reduce(z1, tmp);
1816     felem_mul(tmp, x_in, z1);
1817     felem_reduce(x_in, tmp);
1818     felem_contract(x_out, x_in);
1819     if (x != NULL) {
1820         if (!felem_to_BN(x, x_out)) {
1821             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1822                   ERR_R_BN_LIB);
1823             return 0;
1824         }
1825     }
1826     felem_mul(tmp, z1, z2);
1827     felem_reduce(z1, tmp);
1828     felem_mul(tmp, y_in, z1);
1829     felem_reduce(y_in, tmp);
1830     felem_contract(y_out, y_in);
1831     if (y != NULL) {
1832         if (!felem_to_BN(y, y_out)) {
1833             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1834                   ERR_R_BN_LIB);
1835             return 0;
1836         }
1837     }
1838     return 1;
1839 }
1840
1841 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1842 static void make_points_affine(size_t num, felem points[][3],
1843                                felem tmp_felems[])
1844 {
1845     /*
1846      * Runs in constant time, unless an input is the point at infinity (which
1847      * normally shouldn't happen).
1848      */
1849     ec_GFp_nistp_points_make_affine_internal(num,
1850                                              points,
1851                                              sizeof(felem),
1852                                              tmp_felems,
1853                                              (void (*)(void *))felem_one,
1854                                              felem_is_zero_int,
1855                                              (void (*)(void *, const void *))
1856                                              felem_assign,
1857                                              (void (*)(void *, const void *))
1858                                              felem_square_reduce, (void (*)
1859                                                                    (void *,
1860                                                                     const void
1861                                                                     *,
1862                                                                     const void
1863                                                                     *))
1864                                              felem_mul_reduce,
1865                                              (void (*)(void *, const void *))
1866                                              felem_inv,
1867                                              (void (*)(void *, const void *))
1868                                              felem_contract);
1869 }
1870
1871 /*
1872  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1873  * values Result is stored in r (r can equal one of the inputs).
1874  */
1875 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1876                                const BIGNUM *scalar, size_t num,
1877                                const EC_POINT *points[],
1878                                const BIGNUM *scalars[], BN_CTX *ctx)
1879 {
1880     int ret = 0;
1881     int j;
1882     int mixed = 0;
1883     BIGNUM *x, *y, *z, *tmp_scalar;
1884     felem_bytearray g_secret;
1885     felem_bytearray *secrets = NULL;
1886     felem (*pre_comp)[17][3] = NULL;
1887     felem *tmp_felems = NULL;
1888     unsigned i;
1889     int num_bytes;
1890     int have_pre_comp = 0;
1891     size_t num_points = num;
1892     felem x_in, y_in, z_in, x_out, y_out, z_out;
1893     NISTP521_PRE_COMP *pre = NULL;
1894     felem(*g_pre_comp)[3] = NULL;
1895     EC_POINT *generator = NULL;
1896     const EC_POINT *p = NULL;
1897     const BIGNUM *p_scalar = NULL;
1898
1899     BN_CTX_start(ctx);
1900     x = BN_CTX_get(ctx);
1901     y = BN_CTX_get(ctx);
1902     z = BN_CTX_get(ctx);
1903     tmp_scalar = BN_CTX_get(ctx);
1904     if (tmp_scalar == NULL)
1905         goto err;
1906
1907     if (scalar != NULL) {
1908         pre = group->pre_comp.nistp521;
1909         if (pre)
1910             /* we have precomputation, try to use it */
1911             g_pre_comp = &pre->g_pre_comp[0];
1912         else
1913             /* try to use the standard precomputation */
1914             g_pre_comp = (felem(*)[3]) gmul;
1915         generator = EC_POINT_new(group);
1916         if (generator == NULL)
1917             goto err;
1918         /* get the generator from precomputation */
1919         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1920             !felem_to_BN(y, g_pre_comp[1][1]) ||
1921             !felem_to_BN(z, g_pre_comp[1][2])) {
1922             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1923             goto err;
1924         }
1925         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1926                                                       generator, x, y, z,
1927                                                       ctx))
1928             goto err;
1929         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1930             /* precomputation matches generator */
1931             have_pre_comp = 1;
1932         else
1933             /*
1934              * we don't have valid precomputation: treat the generator as a
1935              * random point
1936              */
1937             num_points++;
1938     }
1939
1940     if (num_points > 0) {
1941         if (num_points >= 2) {
1942             /*
1943              * unless we precompute multiples for just one point, converting
1944              * those into affine form is time well spent
1945              */
1946             mixed = 1;
1947         }
1948         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1949         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1950         if (mixed)
1951             tmp_felems =
1952                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1953         if ((secrets == NULL) || (pre_comp == NULL)
1954             || (mixed && (tmp_felems == NULL))) {
1955             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1956             goto err;
1957         }
1958
1959         /*
1960          * we treat NULL scalars as 0, and NULL points as points at infinity,
1961          * i.e., they contribute nothing to the linear combination
1962          */
1963         for (i = 0; i < num_points; ++i) {
1964             if (i == num) {
1965                 /*
1966                  * we didn't have a valid precomputation, so we pick the
1967                  * generator
1968                  */
1969                 p = EC_GROUP_get0_generator(group);
1970                 p_scalar = scalar;
1971             } else {
1972                 /* the i^th point */
1973                 p = points[i];
1974                 p_scalar = scalars[i];
1975             }
1976             if ((p_scalar != NULL) && (p != NULL)) {
1977                 /* reduce scalar to 0 <= scalar < 2^521 */
1978                 if ((BN_num_bits(p_scalar) > 521)
1979                     || (BN_is_negative(p_scalar))) {
1980                     /*
1981                      * this is an unusual input, and we don't guarantee
1982                      * constant-timeness
1983                      */
1984                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1985                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1986                         goto err;
1987                     }
1988                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1989                                                secrets[i], sizeof(secrets[i]));
1990                 } else {
1991                     num_bytes = BN_bn2lebinpad(p_scalar,
1992                                                secrets[i], sizeof(secrets[i]));
1993                 }
1994                 if (num_bytes < 0) {
1995                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1996                     goto err;
1997                 }
1998                 /* precompute multiples */
1999                 if ((!BN_to_felem(x_out, p->X)) ||
2000                     (!BN_to_felem(y_out, p->Y)) ||
2001                     (!BN_to_felem(z_out, p->Z)))
2002                     goto err;
2003                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
2004                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
2005                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
2006                 for (j = 2; j <= 16; ++j) {
2007                     if (j & 1) {
2008                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
2009                                   pre_comp[i][j][2], pre_comp[i][1][0],
2010                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
2011                                   pre_comp[i][j - 1][0],
2012                                   pre_comp[i][j - 1][1],
2013                                   pre_comp[i][j - 1][2]);
2014                     } else {
2015                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2016                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2017                                      pre_comp[i][j / 2][1],
2018                                      pre_comp[i][j / 2][2]);
2019                     }
2020                 }
2021             }
2022         }
2023         if (mixed)
2024             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2025     }
2026
2027     /* the scalar for the generator */
2028     if ((scalar != NULL) && (have_pre_comp)) {
2029         memset(g_secret, 0, sizeof(g_secret));
2030         /* reduce scalar to 0 <= scalar < 2^521 */
2031         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2032             /*
2033              * this is an unusual input, and we don't guarantee
2034              * constant-timeness
2035              */
2036             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2037                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2038                 goto err;
2039             }
2040             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2041         } else {
2042             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2043         }
2044         /* do the multiplication with generator precomputation */
2045         batch_mul(x_out, y_out, z_out,
2046                   (const felem_bytearray(*))secrets, num_points,
2047                   g_secret,
2048                   mixed, (const felem(*)[17][3])pre_comp,
2049                   (const felem(*)[3])g_pre_comp);
2050     } else {
2051         /* do the multiplication without generator precomputation */
2052         batch_mul(x_out, y_out, z_out,
2053                   (const felem_bytearray(*))secrets, num_points,
2054                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2055     }
2056     /* reduce the output to its unique minimal representation */
2057     felem_contract(x_in, x_out);
2058     felem_contract(y_in, y_out);
2059     felem_contract(z_in, z_out);
2060     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2061         (!felem_to_BN(z, z_in))) {
2062         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2063         goto err;
2064     }
2065     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2066
2067  err:
2068     BN_CTX_end(ctx);
2069     EC_POINT_free(generator);
2070     OPENSSL_free(secrets);
2071     OPENSSL_free(pre_comp);
2072     OPENSSL_free(tmp_felems);
2073     return ret;
2074 }
2075
2076 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2077 {
2078     int ret = 0;
2079     NISTP521_PRE_COMP *pre = NULL;
2080     int i, j;
2081     BIGNUM *x, *y;
2082     EC_POINT *generator = NULL;
2083     felem tmp_felems[16];
2084 #ifndef FIPS_MODE
2085     BN_CTX *new_ctx = NULL;
2086 #endif
2087
2088     /* throw away old precomputation */
2089     EC_pre_comp_free(group);
2090
2091 #ifndef FIPS_MODE
2092     if (ctx == NULL)
2093         ctx = new_ctx = BN_CTX_new();
2094 #endif
2095     if (ctx == NULL)
2096         return 0;
2097
2098     BN_CTX_start(ctx);
2099     x = BN_CTX_get(ctx);
2100     y = BN_CTX_get(ctx);
2101     if (y == NULL)
2102         goto err;
2103     /* get the generator */
2104     if (group->generator == NULL)
2105         goto err;
2106     generator = EC_POINT_new(group);
2107     if (generator == NULL)
2108         goto err;
2109     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2110     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2111     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2112         goto err;
2113     if ((pre = nistp521_pre_comp_new()) == NULL)
2114         goto err;
2115     /*
2116      * if the generator is the standard one, use built-in precomputation
2117      */
2118     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2119         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2120         goto done;
2121     }
2122     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2123         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2124         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2125         goto err;
2126     /* compute 2^130*G, 2^260*G, 2^390*G */
2127     for (i = 1; i <= 4; i <<= 1) {
2128         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2129                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2130                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2131         for (j = 0; j < 129; ++j) {
2132             point_double(pre->g_pre_comp[2 * i][0],
2133                          pre->g_pre_comp[2 * i][1],
2134                          pre->g_pre_comp[2 * i][2],
2135                          pre->g_pre_comp[2 * i][0],
2136                          pre->g_pre_comp[2 * i][1],
2137                          pre->g_pre_comp[2 * i][2]);
2138         }
2139     }
2140     /* g_pre_comp[0] is the point at infinity */
2141     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2142     /* the remaining multiples */
2143     /* 2^130*G + 2^260*G */
2144     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2145               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2146               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2147               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2148               pre->g_pre_comp[2][2]);
2149     /* 2^130*G + 2^390*G */
2150     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2151               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2152               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2153               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2154               pre->g_pre_comp[2][2]);
2155     /* 2^260*G + 2^390*G */
2156     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2157               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2158               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2159               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2160               pre->g_pre_comp[4][2]);
2161     /* 2^130*G + 2^260*G + 2^390*G */
2162     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2163               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2164               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2165               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2166               pre->g_pre_comp[2][2]);
2167     for (i = 1; i < 8; ++i) {
2168         /* odd multiples: add G */
2169         point_add(pre->g_pre_comp[2 * i + 1][0],
2170                   pre->g_pre_comp[2 * i + 1][1],
2171                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2172                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2173                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2174                   pre->g_pre_comp[1][2]);
2175     }
2176     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2177
2178  done:
2179     SETPRECOMP(group, nistp521, pre);
2180     ret = 1;
2181     pre = NULL;
2182  err:
2183     BN_CTX_end(ctx);
2184     EC_POINT_free(generator);
2185 #ifndef FIPS_MODE
2186     BN_CTX_free(new_ctx);
2187 #endif
2188     EC_nistp521_pre_comp_free(pre);
2189     return ret;
2190 }
2191
2192 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2193 {
2194     return HAVEPRECOMP(group, nistp521);
2195 }
2196
2197 #endif