crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/e_os2.h>
  41
  42 #include <string.h>
  43 #include <openssl/err.h>
  44 #include "ec_local.h"
  45
  46 #if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  47   /* even with gcc, the typedef won't work for 32-bit platforms */
  48 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  49                                  * platforms */
  50 #else
  51 # error "Your compiler doesn't appear to support 128-bit integer types"
  52 #endif
  53
     /*
      * uint128_t is the element type of |largefelem| and holds the 64x64-bit
      * products formed by the field multiplication and squaring routines, so
      * the build is refused (see the #error above) when it is unavailable.
      */

     /* Short aliases for an 8-bit byte and a 64-bit unsigned word. */
  54 typedef uint8_t u8;
  55 typedef uint64_t u64;
  56
  57 /*
  58  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  59  * element of this field into 66 bytes where the most significant byte
  60  * contains only a single bit. We call this an felem_bytearray.
      *
      * (521 bits are 65 whole bytes plus one extra bit, hence 66 bytes.)
  61  */
  62
  63 typedef u8 felem_bytearray[66];
  64
  65 /*
  66  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  67  * These values are big-endian.
      *
      * The five rows are, in order: the field prime p, the curve coefficient a
      * (stored as p - 3, i.e. -3 mod p), the curve coefficient b, and the two
      * coordinates x and y (presumably of the standard base point; confirm
      * against the cited FIPS section). Each row is one 66-byte
      * felem_bytearray.
  68  */
  69 static const felem_bytearray nistp521_curve_params[5] = {
  70     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff},
  79     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  86      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  87      0xff, 0xfc},
  88     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  89      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  90      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  91      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  92      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  93      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  94      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  95      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  96      0x3f, 0x00},
  97     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  98      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  99      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
 100      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
 101      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 102      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 103      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 104      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 105      0xbd, 0x66},
 106     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 107      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 108      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 109      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 110      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 111      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 112      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 113      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 114      0x66, 0x50}
 115 };
 116
 117 /*-
 118  * The representation of field elements.
 119  * ------------------------------------
 120  *
 121  * We represent field elements with nine values. These values are either 64 or
 122  * 128 bits and the field element represented is:
 123  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 124  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 125  * 58 bits apart, but are greater than 58 bits in length, the most significant
 126  * bits of each limb overlap with the least significant bits of the next.
 127  *
 128  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 129  * 'largefelem' */
 130
 131 #define NLIMBS 9
 132
 133 typedef uint64_t limb;
     /*
      * limb_aX is a limb declared with an alignment of 1 (GCC-style attribute);
      * the byte-array conversion routines use it to access 64-bit words at
      * arbitrary byte offsets inside a serialised element.
      */
 134 typedef limb limb_aX __attribute((__aligned__(1)));
 135 typedef limb felem[NLIMBS];
 136 typedef uint128_t largefelem[NLIMBS];
 137
     /*
      * Masks selecting the low 57 and low 58 bits of a limb. A fully reduced
      * element uses 58 bits in limbs 0..7 and 57 bits in limb 8
      * (8 * 58 + 57 = 521).
      */
 138 static const limb bottom57bits = 0x1ffffffffffffff;
 139 static const limb bottom58bits = 0x3ffffffffffffff;
 140
 141 /*
 142  * bin66_to_felem takes a little-endian byte array and converts it into felem
 143  * form. This assumes that the CPU is little-endian.
 144  */
 145 static void bin66_to_felem(felem out, const u8 in[66])
 146 {
 147     out[0] = (*((limb *) & in[0])) & bottom58bits;
 148     out[1] = (*((limb_aX *) & in[7]) >> 2) & bottom58bits;
 149     out[2] = (*((limb_aX *) & in[14]) >> 4) & bottom58bits;
 150     out[3] = (*((limb_aX *) & in[21]) >> 6) & bottom58bits;
 151     out[4] = (*((limb_aX *) & in[29])) & bottom58bits;
 152     out[5] = (*((limb_aX *) & in[36]) >> 2) & bottom58bits;
 153     out[6] = (*((limb_aX *) & in[43]) >> 4) & bottom58bits;
 154     out[7] = (*((limb_aX *) & in[50]) >> 6) & bottom58bits;
 155     out[8] = (*((limb_aX *) & in[58])) & bottom57bits;
 156 }
 157
 158 /*
 159  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 160  * array. This assumes that the CPU is little-endian.
 161  */
 162 static void felem_to_bin66(u8 out[66], const felem in)
 163 {
 164     memset(out, 0, 66);
 165     (*((limb *) & out[0])) = in[0];
 166     (*((limb_aX *) & out[7])) |= in[1] << 2;
 167     (*((limb_aX *) & out[14])) |= in[2] << 4;
 168     (*((limb_aX *) & out[21])) |= in[3] << 6;
 169     (*((limb_aX *) & out[29])) = in[4];
 170     (*((limb_aX *) & out[36])) |= in[5] << 2;
 171     (*((limb_aX *) & out[43])) |= in[6] << 4;
 172     (*((limb_aX *) & out[50])) |= in[7] << 6;
 173     (*((limb_aX *) & out[58])) = in[8];
 174 }
 175
 176 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 177 static int BN_to_felem(felem out, const BIGNUM *bn)
 178 {
 179     felem_bytearray b_out;
 180     int num_bytes;
 181
 182     if (BN_is_negative(bn)) {
 183         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 184         return 0;
 185     }
 186     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 187     if (num_bytes < 0) {
 188         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 189         return 0;
 190     }
 191     bin66_to_felem(out, b_out);
 192     return 1;
 193 }
 194
 195 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 196 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 197 {
 198     felem_bytearray b_out;
 199     felem_to_bin66(b_out, in);
 200     return BN_lebin2bn(b_out, sizeof(b_out), out);
 201 }
 202
 203 /*-
 204  * Field operations
 205  * ----------------
 206  */
 207
 208 static void felem_one(felem out)
 209 {
 210     out[0] = 1;
 211     out[1] = 0;
 212     out[2] = 0;
 213     out[3] = 0;
 214     out[4] = 0;
 215     out[5] = 0;
 216     out[6] = 0;
 217     out[7] = 0;
 218     out[8] = 0;
 219 }
 220
 221 static void felem_assign(felem out, const felem in)
 222 {
 223     out[0] = in[0];
 224     out[1] = in[1];
 225     out[2] = in[2];
 226     out[3] = in[3];
 227     out[4] = in[4];
 228     out[5] = in[5];
 229     out[6] = in[6];
 230     out[7] = in[7];
 231     out[8] = in[8];
 232 }
 233
 234 /* felem_sum64 sets out = out + in. */
 235 static void felem_sum64(felem out, const felem in)
 236 {
 237     out[0] += in[0];
 238     out[1] += in[1];
 239     out[2] += in[2];
 240     out[3] += in[3];
 241     out[4] += in[4];
 242     out[5] += in[5];
 243     out[6] += in[6];
 244     out[7] += in[7];
 245     out[8] += in[8];
 246 }
 247
 248 /* felem_scalar sets out = in * scalar */
 249 static void felem_scalar(felem out, const felem in, limb scalar)
 250 {
 251     out[0] = in[0] * scalar;
 252     out[1] = in[1] * scalar;
 253     out[2] = in[2] * scalar;
 254     out[3] = in[3] * scalar;
 255     out[4] = in[4] * scalar;
 256     out[5] = in[5] * scalar;
 257     out[6] = in[6] * scalar;
 258     out[7] = in[7] * scalar;
 259     out[8] = in[8] * scalar;
 260 }
 261
 262 /* felem_scalar64 sets out = out * scalar */
 263 static void felem_scalar64(felem out, limb scalar)
 264 {
 265     out[0] *= scalar;
 266     out[1] *= scalar;
 267     out[2] *= scalar;
 268     out[3] *= scalar;
 269     out[4] *= scalar;
 270     out[5] *= scalar;
 271     out[6] *= scalar;
 272     out[7] *= scalar;
 273     out[8] *= scalar;
 274 }
 275
 276 /* felem_scalar128 sets out = out * scalar */
 277 static void felem_scalar128(largefelem out, limb scalar)
 278 {
 279     out[0] *= scalar;
 280     out[1] *= scalar;
 281     out[2] *= scalar;
 282     out[3] *= scalar;
 283     out[4] *= scalar;
 284     out[5] *= scalar;
 285     out[6] *= scalar;
 286     out[7] *= scalar;
 287     out[8] *= scalar;
 288 }
 289
 290 /*-
 291  * felem_neg sets |out| to |-in|
 292  * On entry:
 293  *   in[i] < 2^59 + 2^14
 294  * On exit:
 295  *   out[i] < 2^62
 296  */
 297 static void felem_neg(felem out, const felem in)
 298 {
 299     /* In order to prevent underflow, we subtract from 0 mod p. */
 300     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 301     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 302
 303     out[0] = two62m3 - in[0];
 304     out[1] = two62m2 - in[1];
 305     out[2] = two62m2 - in[2];
 306     out[3] = two62m2 - in[3];
 307     out[4] = two62m2 - in[4];
 308     out[5] = two62m2 - in[5];
 309     out[6] = two62m2 - in[6];
 310     out[7] = two62m2 - in[7];
 311     out[8] = two62m2 - in[8];
 312 }
 313
 314 /*-
 315  * felem_diff64 subtracts |in| from |out|
 316  * On entry:
 317  *   in[i] < 2^59 + 2^14
 318  * On exit:
 319  *   out[i] < out[i] + 2^62
 320  */
 321 static void felem_diff64(felem out, const felem in)
 322 {
 323     /*
 324      * In order to prevent underflow, we add 0 mod p before subtracting.
 325      */
 326     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 327     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 328
 329     out[0] += two62m3 - in[0];
 330     out[1] += two62m2 - in[1];
 331     out[2] += two62m2 - in[2];
 332     out[3] += two62m2 - in[3];
 333     out[4] += two62m2 - in[4];
 334     out[5] += two62m2 - in[5];
 335     out[6] += two62m2 - in[6];
 336     out[7] += two62m2 - in[7];
 337     out[8] += two62m2 - in[8];
 338 }
 339
 340 /*-
 341  * felem_diff_128_64 subtracts |in| from |out|
 342  * On entry:
 343  *   in[i] < 2^62 + 2^17
 344  * On exit:
 345  *   out[i] < out[i] + 2^63
 346  */
 347 static void felem_diff_128_64(largefelem out, const felem in)
 348 {
 349     /*
 350      * In order to prevent underflow, we add 64p mod p (which is equivalent
 351      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 352      * digit number with all bits set to 1. See "The representation of field
 353      * elements" comment above for a description of how limbs are used to
 354      * represent a number. 64p is represented with 8 limbs containing a number
 355      * with 58 bits set and one limb with a number with 57 bits set.
 356      */
 357     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 358     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 359
 360     out[0] += two63m6 - in[0];
 361     out[1] += two63m5 - in[1];
 362     out[2] += two63m5 - in[2];
 363     out[3] += two63m5 - in[3];
 364     out[4] += two63m5 - in[4];
 365     out[5] += two63m5 - in[5];
 366     out[6] += two63m5 - in[6];
 367     out[7] += two63m5 - in[7];
 368     out[8] += two63m5 - in[8];
 369 }
 370
 371 /*-
 372  * felem_diff_128_64 subtracts |in| from |out|
 373  * On entry:
 374  *   in[i] < 2^126
 375  * On exit:
 376  *   out[i] < out[i] + 2^127 - 2^69
 377  */
 378 static void felem_diff128(largefelem out, const largefelem in)
 379 {
 380     /*
 381      * In order to prevent underflow, we add 0 mod p before subtracting.
 382      */
 383     static const uint128_t two127m70 =
 384         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 385     static const uint128_t two127m69 =
 386         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 387
 388     out[0] += (two127m70 - in[0]);
 389     out[1] += (two127m69 - in[1]);
 390     out[2] += (two127m69 - in[2]);
 391     out[3] += (two127m69 - in[3]);
 392     out[4] += (two127m69 - in[4]);
 393     out[5] += (two127m69 - in[5]);
 394     out[6] += (two127m69 - in[6]);
 395     out[7] += (two127m69 - in[7]);
 396     out[8] += (two127m69 - in[8]);
 397 }
 398
 399 /*-
 400  * felem_square sets |out| = |in|^2
      *
      * The result is left unreduced in 128-bit limbs; felem_reduce brings it
      * back to 64-bit limbs. The doubled and quadrupled copies of |in| are
      * computed in 64 bits, so the entry bound below also keeps 4 * in[i]
      * from wrapping. out[0] is the worst case for the exit bound: one plain
      * product plus four products scaled by four.
 401  * On entry:
 402  *   in[i] < 2^62
 403  * On exit:
 404  *   out[i] < 17 * max(in[i]) * max(in[i])
 405  */
 406 static void felem_square(largefelem out, const felem in)
 407 {
 408     felem inx2, inx4;
 409     felem_scalar(inx2, in, 2);
 410     felem_scalar(inx4, in, 4);
 411
 412     /*-
 413      * We have many cases where we want to do
 414      *   in[x] * in[y] +
 415      *   in[y] * in[x]
 416      * This is obviously just
 417      *   2 * in[x] * in[y]
 418      * However, rather than do the doubling on the 128 bit result, we
 419      * double one of the inputs to the multiplication by reading from
 420      * |inx2|
 421      */
 422
          /* Products in[i] * in[j] with i + j < 9 land directly in limb i + j. */
 423     out[0] = ((uint128_t) in[0]) * in[0];
 424     out[1] = ((uint128_t) in[0]) * inx2[1];
 425     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 426     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 427     out[4] = ((uint128_t) in[0]) * inx2[4] +
 428              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 429     out[5] = ((uint128_t) in[0]) * inx2[5] +
 430              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 431     out[6] = ((uint128_t) in[0]) * inx2[6] +
 432              ((uint128_t) in[1]) * inx2[5] +
 433              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 434     out[7] = ((uint128_t) in[0]) * inx2[7] +
 435              ((uint128_t) in[1]) * inx2[6] +
 436              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 437     out[8] = ((uint128_t) in[0]) * inx2[8] +
 438              ((uint128_t) in[1]) * inx2[7] +
 439              ((uint128_t) in[2]) * inx2[6] +
 440              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 441
 442     /*
 443      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 444      * They correspond to locations one bit up from the limbs produced above
 445      * so we would have to multiply by two to align them. Again, rather than
 446      * operate on the 128-bit result, we double one of the inputs to the
 447      * multiplication. If we want to double for both this reason, and the
 448      * reason above, then we end up multiplying by four.
 449      */
 450
 451     /* 9: products with i + j == 9, folded into limb 0 */
 452     out[0] += ((uint128_t) in[1]) * inx4[8] +
 453               ((uint128_t) in[2]) * inx4[7] +
 454               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 455
 456     /* 10: folded into limb 1; the square term in[5]^2 is only doubled */
 457     out[1] += ((uint128_t) in[2]) * inx4[8] +
 458               ((uint128_t) in[3]) * inx4[7] +
 459               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 460
 461     /* 11 */
 462     out[2] += ((uint128_t) in[3]) * inx4[8] +
 463               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 464
 465     /* 12 */
 466     out[3] += ((uint128_t) in[4]) * inx4[8] +
 467               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 468
 469     /* 13 */
 470     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 471
 472     /* 14 */
 473     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 474
 475     /* 15 */
 476     out[6] += ((uint128_t) in[7]) * inx4[8];
 477
 478     /* 16 */
 479     out[7] += ((uint128_t) in[8]) * inx2[8];
 480 }
 481
 482 /*-
 483  * felem_mul sets |out| = |in1| * |in2|
      *
      * The result is left unreduced in 128-bit limbs; felem_reduce brings it
      * back to 64-bit limbs. |in2| is doubled in 64 bits (in2x2), which is
      * why its entry bound is one bit tighter than that of |in1|. out[0] is
      * the worst case for the exit bound: one plain product plus eight
      * doubled products.
 484  * On entry:
 485  *   in1[i] < 2^64
 486  *   in2[i] < 2^63
 487  * On exit:
 488  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 489  */
 490 static void felem_mul(largefelem out, const felem in1, const felem in2)
 491 {
 492     felem in2x2;
 493     felem_scalar(in2x2, in2, 2);
 494
          /* Products in1[i] * in2[j] with i + j < 9 land directly in limb i + j. */
 495     out[0] = ((uint128_t) in1[0]) * in2[0];
 496
 497     out[1] = ((uint128_t) in1[0]) * in2[1] +
 498              ((uint128_t) in1[1]) * in2[0];
 499
 500     out[2] = ((uint128_t) in1[0]) * in2[2] +
 501              ((uint128_t) in1[1]) * in2[1] +
 502              ((uint128_t) in1[2]) * in2[0];
 503
 504     out[3] = ((uint128_t) in1[0]) * in2[3] +
 505              ((uint128_t) in1[1]) * in2[2] +
 506              ((uint128_t) in1[2]) * in2[1] +
 507              ((uint128_t) in1[3]) * in2[0];
 508
 509     out[4] = ((uint128_t) in1[0]) * in2[4] +
 510              ((uint128_t) in1[1]) * in2[3] +
 511              ((uint128_t) in1[2]) * in2[2] +
 512              ((uint128_t) in1[3]) * in2[1] +
 513              ((uint128_t) in1[4]) * in2[0];
 514
 515     out[5] = ((uint128_t) in1[0]) * in2[5] +
 516              ((uint128_t) in1[1]) * in2[4] +
 517              ((uint128_t) in1[2]) * in2[3] +
 518              ((uint128_t) in1[3]) * in2[2] +
 519              ((uint128_t) in1[4]) * in2[1] +
 520              ((uint128_t) in1[5]) * in2[0];
 521
 522     out[6] = ((uint128_t) in1[0]) * in2[6] +
 523              ((uint128_t) in1[1]) * in2[5] +
 524              ((uint128_t) in1[2]) * in2[4] +
 525              ((uint128_t) in1[3]) * in2[3] +
 526              ((uint128_t) in1[4]) * in2[2] +
 527              ((uint128_t) in1[5]) * in2[1] +
 528              ((uint128_t) in1[6]) * in2[0];
 529
 530     out[7] = ((uint128_t) in1[0]) * in2[7] +
 531              ((uint128_t) in1[1]) * in2[6] +
 532              ((uint128_t) in1[2]) * in2[5] +
 533              ((uint128_t) in1[3]) * in2[4] +
 534              ((uint128_t) in1[4]) * in2[3] +
 535              ((uint128_t) in1[5]) * in2[2] +
 536              ((uint128_t) in1[6]) * in2[1] +
 537              ((uint128_t) in1[7]) * in2[0];
 538
 539     out[8] = ((uint128_t) in1[0]) * in2[8] +
 540              ((uint128_t) in1[1]) * in2[7] +
 541              ((uint128_t) in1[2]) * in2[6] +
 542              ((uint128_t) in1[3]) * in2[5] +
 543              ((uint128_t) in1[4]) * in2[4] +
 544              ((uint128_t) in1[5]) * in2[3] +
 545              ((uint128_t) in1[6]) * in2[2] +
 546              ((uint128_t) in1[7]) * in2[1] +
 547              ((uint128_t) in1[8]) * in2[0];
 548
 549     /* See comment in felem_square about the use of in2x2 here */
          /*
           * Products with i + j >= 9 sit at 2^(58 * (i + j)), which is
           * 2^522 times the weight of limb i + j - 9. Since 2^522 = 2 (mod p),
           * they are added to limb i + j - 9 using the doubled |in2x2|.
           */
 550
 551     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 552               ((uint128_t) in1[2]) * in2x2[7] +
 553               ((uint128_t) in1[3]) * in2x2[6] +
 554               ((uint128_t) in1[4]) * in2x2[5] +
 555               ((uint128_t) in1[5]) * in2x2[4] +
 556               ((uint128_t) in1[6]) * in2x2[3] +
 557               ((uint128_t) in1[7]) * in2x2[2] +
 558               ((uint128_t) in1[8]) * in2x2[1];
 559
 560     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 561               ((uint128_t) in1[3]) * in2x2[7] +
 562               ((uint128_t) in1[4]) * in2x2[6] +
 563               ((uint128_t) in1[5]) * in2x2[5] +
 564               ((uint128_t) in1[6]) * in2x2[4] +
 565               ((uint128_t) in1[7]) * in2x2[3] +
 566               ((uint128_t) in1[8]) * in2x2[2];
 567
 568     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 569               ((uint128_t) in1[4]) * in2x2[7] +
 570               ((uint128_t) in1[5]) * in2x2[6] +
 571               ((uint128_t) in1[6]) * in2x2[5] +
 572               ((uint128_t) in1[7]) * in2x2[4] +
 573               ((uint128_t) in1[8]) * in2x2[3];
 574
 575     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 576               ((uint128_t) in1[5]) * in2x2[7] +
 577               ((uint128_t) in1[6]) * in2x2[6] +
 578               ((uint128_t) in1[7]) * in2x2[5] +
 579               ((uint128_t) in1[8]) * in2x2[4];
 580
 581     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 582               ((uint128_t) in1[6]) * in2x2[7] +
 583               ((uint128_t) in1[7]) * in2x2[6] +
 584               ((uint128_t) in1[8]) * in2x2[5];
 585
 586     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 587               ((uint128_t) in1[7]) * in2x2[7] +
 588               ((uint128_t) in1[8]) * in2x2[6];
 589
 590     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 591               ((uint128_t) in1[8]) * in2x2[7];
 592
 593     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 594 }
 595
 596 static const limb bottom52bits = 0xfffffffffffff;
     /*
      * bottom52bits selects bits 64..115 of a 128-bit limb once its upper
      * 64-bit half has been extracted. Together with bits 58..63 these are the
      * 58 bits that belong to the next limb (64 + 52 = 116 = 2 * 58); anything
      * above bit 115 belongs to the limb after that.
      */
 597
 598 /*-
 599  * felem_reduce converts a largefelem to an felem.
      *
      * Each 128-bit input limb is split into three pieces: bits 0..57 stay in
      * limb i, bits 58..115 are added to limb i + 1 and bits 116..127 are
      * added to limb i + 2. Pieces that would land in limb 9 or 10 are doubled
      * and added to limb 0 or 1 instead, because limb 9 has weight 2^522,
      * which is 2 (mod p).
 600  * On entry:
 601  *   in[i] < 2^128
 602  * On exit:
 603  *   out[i] < 2^59 + 2^14
 604  */
 605 static void felem_reduce(felem out, const largefelem in)
 606 {
 607     u64 overflow1, overflow2;
 608
 609     out[0] = ((limb) in[0]) & bottom58bits;
 610     out[1] = ((limb) in[1]) & bottom58bits;
 611     out[2] = ((limb) in[2]) & bottom58bits;
 612     out[3] = ((limb) in[3]) & bottom58bits;
 613     out[4] = ((limb) in[4]) & bottom58bits;
 614     out[5] = ((limb) in[5]) & bottom58bits;
 615     out[6] = ((limb) in[6]) & bottom58bits;
 616     out[7] = ((limb) in[7]) & bottom58bits;
 617     out[8] = ((limb) in[8]) & bottom58bits;
 618
 619     /* out[i] < 2^58 */
 620
 621     out[1] += ((limb) in[0]) >> 58;
 622     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 623     /*-
 624      * out[1] < 2^58 + 2^6 + 2^58
 625      *        = 2^59 + 2^6
 626      */
 627     out[2] += ((limb) (in[0] >> 64)) >> 52;
 628
 629     out[2] += ((limb) in[1]) >> 58;
 630     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 631     out[3] += ((limb) (in[1] >> 64)) >> 52;
 632
 633     out[3] += ((limb) in[2]) >> 58;
 634     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 635     out[4] += ((limb) (in[2] >> 64)) >> 52;
 636
 637     out[4] += ((limb) in[3]) >> 58;
 638     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 639     out[5] += ((limb) (in[3] >> 64)) >> 52;
 640
 641     out[5] += ((limb) in[4]) >> 58;
 642     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 643     out[6] += ((limb) (in[4] >> 64)) >> 52;
 644
 645     out[6] += ((limb) in[5]) >> 58;
 646     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 647     out[7] += ((limb) (in[5] >> 64)) >> 52;
 648
 649     out[7] += ((limb) in[6]) >> 58;
 650     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 651     out[8] += ((limb) (in[6] >> 64)) >> 52;
 652
 653     out[8] += ((limb) in[7]) >> 58;
 654     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 655     /*-
 656      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 657      *            < 2^59 + 2^13
 658      */
          /* overflow1 collects what would go to limb 9, overflow2 limb 10. */
 659     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 660
 661     overflow1 += ((limb) in[8]) >> 58;
 662     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 663     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 664
          /* Doubling accounts for 2^522 = 2 (mod p) before folding back. */
 665     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 666     overflow2 <<= 1;            /* overflow2 < 2^13 */
 667
 668     out[0] += overflow1;        /* out[0] < 2^60 */
 669     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 670
          /* One more carry from limb 0 restores out[0] < 2^58. */
 671     out[1] += out[0] >> 58;
 672     out[0] &= bottom58bits;
 673     /*-
 674      * out[0] < 2^58
 675      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 676      *        < 2^59 + 2^14
 677      */
 678 }
 679
 680 static void felem_square_reduce(felem out, const felem in)
 681 {
 682     largefelem tmp;
 683     felem_square(tmp, in);
 684     felem_reduce(out, tmp);
 685 }
 686
 687 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 688 {
 689     largefelem tmp;
 690     felem_mul(tmp, in1, in2);
 691     felem_reduce(out, tmp);
 692 }
 693
 694 /*-
 695  * felem_inv calculates |out| = |in|^{-1}
 696  *
 697  * Based on Fermat's Little Theorem:
 698  *   a^p = a (mod p)
 699  *   a^{p-1} = 1 (mod p)
 700  *   a^{p-2} = a^{-1} (mod p)
 701  */
 702 static void felem_inv(felem out, const felem in)
 703 {
 704     felem ftmp, ftmp2, ftmp3, ftmp4;
 705     largefelem tmp;
 706     unsigned i;
 707
 708     felem_square(tmp, in);
 709     felem_reduce(ftmp, tmp);    /* 2^1 */
 710     felem_mul(tmp, in, ftmp);
 711     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 712     felem_assign(ftmp2, ftmp);
 713     felem_square(tmp, ftmp);
 714     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 715     felem_mul(tmp, in, ftmp);
 716     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 717     felem_square(tmp, ftmp);
 718     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 719
 720     felem_square(tmp, ftmp2);
 721     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 722     felem_square(tmp, ftmp3);
 723     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 724     felem_mul(tmp, ftmp3, ftmp2);
 725     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 726
 727     felem_assign(ftmp2, ftmp3);
 728     felem_square(tmp, ftmp3);
 729     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 730     felem_square(tmp, ftmp3);
 731     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 732     felem_square(tmp, ftmp3);
 733     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 734     felem_square(tmp, ftmp3);
 735     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 736     felem_assign(ftmp4, ftmp3);
 737     felem_mul(tmp, ftmp3, ftmp);
 738     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 739     felem_square(tmp, ftmp4);
 740     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 741     felem_mul(tmp, ftmp3, ftmp2);
 742     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 743     felem_assign(ftmp2, ftmp3);
 744
 745     for (i = 0; i < 8; i++) {
 746         felem_square(tmp, ftmp3);
 747         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 748     }
 749     felem_mul(tmp, ftmp3, ftmp2);
 750     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 751     felem_assign(ftmp2, ftmp3);
 752
 753     for (i = 0; i < 16; i++) {
 754         felem_square(tmp, ftmp3);
 755         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 756     }
 757     felem_mul(tmp, ftmp3, ftmp2);
 758     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 759     felem_assign(ftmp2, ftmp3);
 760
 761     for (i = 0; i < 32; i++) {
 762         felem_square(tmp, ftmp3);
 763         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 764     }
 765     felem_mul(tmp, ftmp3, ftmp2);
 766     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 767     felem_assign(ftmp2, ftmp3);
 768
 769     for (i = 0; i < 64; i++) {
 770         felem_square(tmp, ftmp3);
 771         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 772     }
 773     felem_mul(tmp, ftmp3, ftmp2);
 774     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 775     felem_assign(ftmp2, ftmp3);
 776
 777     for (i = 0; i < 128; i++) {
 778         felem_square(tmp, ftmp3);
 779         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 780     }
 781     felem_mul(tmp, ftmp3, ftmp2);
 782     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 783     felem_assign(ftmp2, ftmp3);
 784
 785     for (i = 0; i < 256; i++) {
 786         felem_square(tmp, ftmp3);
 787         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 788     }
 789     felem_mul(tmp, ftmp3, ftmp2);
 790     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 791
 792     for (i = 0; i < 9; i++) {
 793         felem_square(tmp, ftmp3);
 794         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 795     }
 796     felem_mul(tmp, ftmp3, ftmp4);
 797     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 798     felem_mul(tmp, ftmp3, in);
 799     felem_reduce(out, tmp);     /* 2^512 - 3 */
 800 }
 801
 802 /* This is 2^521-1, expressed as an felem */
     /*
      * Limbs 0..7 have their low 58 bits set and limb 8 its low 57 bits
      * (8 * 58 + 57 = 521), i.e. the unique fully carried form of p.
      */
 803 static const felem kPrime = {
 804     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 805     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 806     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 807 };
 808
 809 /*-
 810  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 811  * otherwise.
 812  * On entry:
 813  *   in[i] < 2^59 + 2^14
 814  */
 815 static limb felem_is_zero(const felem in)
 816 {
 817     felem ftmp;
 818     limb is_zero, is_p;
 819     felem_assign(ftmp, in);
 820
 821     ftmp[0] += ftmp[8] >> 57;
 822     ftmp[8] &= bottom57bits;
 823     /* ftmp[8] < 2^57 */
 824     ftmp[1] += ftmp[0] >> 58;
 825     ftmp[0] &= bottom58bits;
 826     ftmp[2] += ftmp[1] >> 58;
 827     ftmp[1] &= bottom58bits;
 828     ftmp[3] += ftmp[2] >> 58;
 829     ftmp[2] &= bottom58bits;
 830     ftmp[4] += ftmp[3] >> 58;
 831     ftmp[3] &= bottom58bits;
 832     ftmp[5] += ftmp[4] >> 58;
 833     ftmp[4] &= bottom58bits;
 834     ftmp[6] += ftmp[5] >> 58;
 835     ftmp[5] &= bottom58bits;
 836     ftmp[7] += ftmp[6] >> 58;
 837     ftmp[6] &= bottom58bits;
 838     ftmp[8] += ftmp[7] >> 58;
 839     ftmp[7] &= bottom58bits;
 840     /* ftmp[8] < 2^57 + 4 */
 841
 842     /*
 843      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 844      * than our bound for ftmp[8]. Therefore we only have to check if the
 845      * zero is zero or 2^521-1.
 846      */
 847
 848     is_zero = 0;
 849     is_zero |= ftmp[0];
 850     is_zero |= ftmp[1];
 851     is_zero |= ftmp[2];
 852     is_zero |= ftmp[3];
 853     is_zero |= ftmp[4];
 854     is_zero |= ftmp[5];
 855     is_zero |= ftmp[6];
 856     is_zero |= ftmp[7];
 857     is_zero |= ftmp[8];
 858
 859     is_zero--;
 860     /*
 861      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 862      * can be set is if is_zero was 0 before the decrement.
 863      */
 864     is_zero = 0 - (is_zero >> 63);
 865
 866     is_p = ftmp[0] ^ kPrime[0];
 867     is_p |= ftmp[1] ^ kPrime[1];
 868     is_p |= ftmp[2] ^ kPrime[2];
 869     is_p |= ftmp[3] ^ kPrime[3];
 870     is_p |= ftmp[4] ^ kPrime[4];
 871     is_p |= ftmp[5] ^ kPrime[5];
 872     is_p |= ftmp[6] ^ kPrime[6];
 873     is_p |= ftmp[7] ^ kPrime[7];
 874     is_p |= ftmp[8] ^ kPrime[8];
 875
 876     is_p--;
 877     is_p = 0 - (is_p >> 63);
 878
 879     is_zero |= is_p;
 880     return is_zero;
 881 }
 882
 883 static int felem_is_zero_int(const void *in)
 884 {
 885     return (int)(felem_is_zero(in) & ((limb) 1));
 886 }
 887
/*-
 * felem_contract converts |in| to its unique, minimal representation.
 *
 * |in| is first copied into |out| and every later step operates on |out|.
 * The function body is straight-line mask arithmetic: it contains no
 * branches on limb values.
 *
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static void felem_contract(felem out, const felem in)
{
    limb is_p, is_greater, sign;
    static const limb two58 = ((limb) 1) << 58;

    felem_assign(out, in);

    /* Fold the bits of out[8] above bit 56 back into out[0]. */
    out[0] += out[8] >> 57;
    out[8] &= bottom57bits;
    /* out[8] < 2^57 */
    /* Carry upwards so that out[0]..out[7] are each masked to 58 bits. */
    out[1] += out[0] >> 58;
    out[0] &= bottom58bits;
    out[2] += out[1] >> 58;
    out[1] &= bottom58bits;
    out[3] += out[2] >> 58;
    out[2] &= bottom58bits;
    out[4] += out[3] >> 58;
    out[3] &= bottom58bits;
    out[5] += out[4] >> 58;
    out[4] &= bottom58bits;
    out[6] += out[5] >> 58;
    out[5] &= bottom58bits;
    out[7] += out[6] >> 58;
    out[6] &= bottom58bits;
    out[8] += out[7] >> 58;
    out[7] &= bottom58bits;
    /* out[8] < 2^57 + 4 */

    /*
     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
     * out. See the comments in felem_is_zero regarding why we don't test for
     * other multiples of the prime.
     */

    /*
     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
     */

    /* is_p is zero here iff every limb of |out| matches kPrime. */
    is_p = out[0] ^ kPrime[0];
    is_p |= out[1] ^ kPrime[1];
    is_p |= out[2] ^ kPrime[2];
    is_p |= out[3] ^ kPrime[3];
    is_p |= out[4] ^ kPrime[4];
    is_p |= out[5] ^ kPrime[5];
    is_p |= out[6] ^ kPrime[6];
    is_p |= out[7] ^ kPrime[7];
    is_p |= out[8] ^ kPrime[8];

    /*
     * The decrement turns 0 into all ones. The AND-folds then leave the top
     * bit set only if all 64 bits were set, i.e. only if is_p was 0.
     */
    is_p--;
    is_p &= is_p << 32;
    is_p &= is_p << 16;
    is_p &= is_p << 8;
    is_p &= is_p << 4;
    is_p &= is_p << 2;
    is_p &= is_p << 1;
    is_p = 0 - (is_p >> 63);
    is_p = ~is_p;

    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */

    out[0] &= is_p;
    out[1] &= is_p;
    out[2] &= is_p;
    out[3] &= is_p;
    out[4] &= is_p;
    out[5] &= is_p;
    out[6] &= is_p;
    out[7] &= is_p;
    out[8] &= is_p;

    /*
     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
     * 57 is greater than zero as (2^521-1) + x >= 2^522
     */
    /*
     * The OR-folds gather every bit of (out[8] >> 57) into the top bit, which
     * is then spread into an all-ones or all-zero mask.
     */
    is_greater = out[8] >> 57;
    is_greater |= is_greater << 32;
    is_greater |= is_greater << 16;
    is_greater |= is_greater << 8;
    is_greater |= is_greater << 4;
    is_greater |= is_greater << 2;
    is_greater |= is_greater << 1;
    is_greater = 0 - (is_greater >> 63);

    /* Subtract kPrime limb-wise, but only when is_greater is all ones. */
    out[0] -= kPrime[0] & is_greater;
    out[1] -= kPrime[1] & is_greater;
    out[2] -= kPrime[2] & is_greater;
    out[3] -= kPrime[3] & is_greater;
    out[4] -= kPrime[4] & is_greater;
    out[5] -= kPrime[5] & is_greater;
    out[6] -= kPrime[6] & is_greater;
    out[7] -= kPrime[7] & is_greater;
    out[8] -= kPrime[8] & is_greater;

    /* Eliminate negative coefficients */
    /*
     * Each step builds |sign| as all ones when the top bit of the tested limb
     * is set, then adds 2^58 to one limb and borrows 1 from the next.
     */
    sign = -(out[0] >> 63);
    out[0] += (two58 & sign);
    out[1] -= (1 & sign);
    sign = -(out[1] >> 63);
    out[1] += (two58 & sign);
    out[2] -= (1 & sign);
    sign = -(out[2] >> 63);
    out[2] += (two58 & sign);
    out[3] -= (1 & sign);
    sign = -(out[3] >> 63);
    out[3] += (two58 & sign);
    out[4] -= (1 & sign);
    sign = -(out[4] >> 63);
    out[4] += (two58 & sign);
    out[5] -= (1 & sign);
    /*
     * NOTE(review): the mask below is taken from out[0], not out[5]. out[0]
     * was already adjusted at the top of this chain, so this presumably
     * yields a zero mask and leaves any borrow in out[5] to the second
     * out[5]..out[7] pass at the end of the function - confirm.
     */
    sign = -(out[0] >> 63);
    out[5] += (two58 & sign);
    out[6] -= (1 & sign);
    sign = -(out[6] >> 63);
    out[6] += (two58 & sign);
    out[7] -= (1 & sign);
    sign = -(out[7] >> 63);
    out[7] += (two58 & sign);
    out[8] -= (1 & sign);
    /* Second pass over out[5]..out[7], this time testing out[5] itself. */
    sign = -(out[5] >> 63);
    out[5] += (two58 & sign);
    out[6] -= (1 & sign);
    sign = -(out[6] >> 63);
    out[6] += (two58 & sign);
    out[7] -= (1 & sign);
    sign = -(out[7] >> 63);
    out[7] += (two58 & sign);
    out[8] -= (1 & sign);
}
1021
1022 /*-
1023  * Group operations
1024  * ----------------
1025  *
1026  * Building on top of the field operations we have the operations on the
1027  * elliptic curve group itself. Points on the curve are represented in Jacobian
1028  * coordinates */
1029
/*-
 * point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * The result is written to (x_out, y_out, z_out), in the order x_out, z_out,
 * y_out; each output is produced by a felem_reduce call.
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    largefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* Two working copies of x: they become x - delta and x + delta below. */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);
    /* ftmp[i] < 2^61 */
    felem_sum64(ftmp2, delta);
    /* ftmp2[i] < 2^60 + 2^15 */
    felem_scalar64(ftmp2, 3);
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
    felem_mul(tmp, ftmp, ftmp2);
    /*-
     * tmp[i] < 17(3*2^121 + 3*2^76)
     *        = 61*2^121 + 61*2^76
     *        < 64*2^121 + 64*2^76
     *        = 2^127 + 2^82
     *        < 2^128
     */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /*
     * tmp[i] < 17*2^120 < 2^125
     */
    felem_assign(ftmp, beta);
    felem_scalar64(ftmp, 8);
    /* ftmp[i] < 2^62 + 2^17 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is overwritten with gamma + delta so one subtraction suffices. */
    felem_sum64(delta, gamma);
    /* delta[i] < 2^60 + 2^15 */
    felem_assign(ftmp, y_in);
    felem_sum64(ftmp, z_in);
    /* ftmp[i] < 2^60 + 2^15 */
    felem_square(tmp, ftmp);
    /*
     * tmp[i] < 17(2^122) < 2^127
     */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^127 + 2^63 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);
    /* beta[i] < 2^61 + 2^16 */
    felem_diff64(beta, x_out);
    /* beta[i] < 2^61 + 2^60 + 2^16 */
    felem_mul(tmp, alpha, beta);
    /*-
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        < 2^128
     */
    felem_square(tmp2, gamma);
    /*-
     * tmp2[i] < 17*(2^59 + 2^14)^2
     *         = 17*(2^118 + 2^74 + 2^28)
     */
    felem_scalar128(tmp2, 8);
    /*-
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
     *         < 2^126
     */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
     *          2^74 + 2^69 + 2^34 + 2^30
     *        < 2^128
     */
    felem_reduce(y_out, tmp);
}
1135
1136 /* copy_conditional copies in to out iff mask is all ones. */
1137 static void copy_conditional(felem out, const felem in, limb mask)
1138 {
1139     unsigned i;
1140     for (i = 0; i < NLIMBS; ++i) {
1141         const limb tmp = mask & (in[i] ^ out[i]);
1142         out[i] ^= tmp;
1143     }
1144 }
1145
/*-
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * When |mixed| is non-zero the z2-dependent products are skipped and z2 is
 * treated as 1; z2 itself is still tested for zero and used in the final
 * selection.
 *
 * The sum is built in local variables and copied to (x3, y3, z3) at the very
 * end, except on the doubling path, which hands (x3, y3, z3) straight to
 * point_double.
 *
 * This function includes a branch for checking whether the two input points
 * are equal (while not equal to the point at infinity). See comment below
 * on constant-time.
 */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const felem x2, const felem y2,
                      const felem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
    largefelem tmp, tmp2;
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
    limb points_equal;

    /* All-ones masks when the respective input is the point at infinity. */
    z1_is_zero = felem_is_zero(z1);
    z2_is_zero = felem_is_zero(z2);

    /* ftmp = z1z1 = z1**2 */
    felem_square(tmp, z1);
    felem_reduce(ftmp, tmp);

    if (!mixed) {
        /* ftmp2 = z2z2 = z2**2 */
        felem_square(tmp, z2);
        felem_reduce(ftmp2, tmp);

        /* u1 = ftmp3 = x1*z2z2 */
        felem_mul(tmp, x1, ftmp2);
        felem_reduce(ftmp3, tmp);

        /* ftmp5 = z1 + z2 */
        felem_assign(ftmp5, z1);
        felem_sum64(ftmp5, z2);
        /* ftmp5[i] < 2^61 */

        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
        felem_square(tmp, ftmp5);
        /* tmp[i] < 17*2^122 */
        felem_diff_128_64(tmp, ftmp);
        /* tmp[i] < 17*2^122 + 2^63 */
        felem_diff_128_64(tmp, ftmp2);
        /* tmp[i] < 17*2^122 + 2^64 */
        felem_reduce(ftmp5, tmp);

        /* ftmp2 = z2 * z2z2 */
        felem_mul(tmp, ftmp2, z2);
        felem_reduce(ftmp2, tmp);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_mul(tmp, y1, ftmp2);
        felem_reduce(ftmp6, tmp);
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
         */

        /* u1 = ftmp3 = x1*z2z2 */
        felem_assign(ftmp3, x1);

        /* ftmp5 = 2*z1z2 */
        felem_scalar(ftmp5, z1, 2);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_assign(ftmp6, y1);
    }

    /* u2 = x2*z1z1 */
    felem_mul(tmp, x2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* h = ftmp4 = u2 - u1 */
    felem_diff_128_64(tmp, ftmp3);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp4, tmp);

    /* h == 0 means the two x coordinates agree. */
    x_equal = felem_is_zero(ftmp4);

    /* z_out = ftmp5 * h */
    felem_mul(tmp, ftmp5, ftmp4);
    felem_reduce(z_out, tmp);

    /* ftmp = z1 * z1z1 */
    felem_mul(tmp, ftmp, z1);
    felem_reduce(ftmp, tmp);

    /* s2 = tmp = y2 * z1**3 */
    felem_mul(tmp, y2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* r = ftmp5 = (s2 - s1)*2 */
    felem_diff_128_64(tmp, ftmp6);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp5, tmp);
    /* The zero test is made on s2 - s1, before it is doubled. */
    y_equal = felem_is_zero(ftmp5);
    felem_scalar64(ftmp5, 2);
    /* ftmp5[i] < 2^61 */

    /*
     * The formulae are incorrect if the points are equal, in affine coordinates
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
     * happens.
     *
     * We use bitwise operations to avoid potential side-channels introduced by
     * the short-circuiting behaviour of boolean operators.
     *
     * The special case of either point being the point at infinity (z1 and/or
     * z2 are zero), is handled separately later on in this function, so we
     * avoid jumping to point_double here in those special cases.
     *
     * Notice the comment below on the implications of this branching for timing
     * leaks and why it is considered practically irrelevant.
     */
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));

    if (points_equal) {
        /*
         * This is obviously not constant-time but it will almost-never happen
         * for ECDH / ECDSA. The case where it can happen is during scalar-mult
         * where the intermediate value gets very close to the group order.
         * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
         * the scalar, it's possible for the intermediate value to be a small
         * negative multiple of the base point, and for the final signed digit
         * to be the same value. We believe that this only occurs for the scalar
         * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
         * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
         * 71e913863f7, in that case the penultimate intermediate is -9G and
         * the final digit is also -9G. Since this only happens for a single
         * scalar, the timing leak is irrelevant. (Any attacker who wanted to
         * check whether a secret scalar was that exact value, can already do
         * so.)
         */
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* I = ftmp = (2h)**2 */
    felem_assign(ftmp, ftmp4);
    felem_scalar64(ftmp, 2);
    /* ftmp[i] < 2^61 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 17*2^122 */
    felem_reduce(ftmp, tmp);

    /* J = ftmp2 = h * I */
    felem_mul(tmp, ftmp4, ftmp);
    felem_reduce(ftmp2, tmp);

    /* V = ftmp4 = U1 * I */
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);

    /* x_out = r**2 - J - 2V */
    felem_square(tmp, ftmp5);
    /* tmp[i] < 17*2^122 */
    felem_diff_128_64(tmp, ftmp2);
    /* tmp[i] < 17*2^122 + 2^63 */
    /* Keep an undoubled copy of V in ftmp3 for the y_out computation. */
    felem_assign(ftmp3, ftmp4);
    felem_scalar64(ftmp4, 2);
    /* ftmp4[i] < 2^61 */
    felem_diff_128_64(tmp, ftmp4);
    /* tmp[i] < 17*2^122 + 2^64 */
    felem_reduce(x_out, tmp);

    /* y_out = r(V-x_out) - 2 * s1 * J */
    felem_diff64(ftmp3, x_out);
    /*
     * ftmp3[i] < 2^60 + 2^60 = 2^61
     */
    felem_mul(tmp, ftmp5, ftmp3);
    /* tmp[i] < 17*2^122 */
    felem_mul(tmp2, ftmp6, ftmp2);
    /* tmp2[i] < 17*2^120 */
    felem_scalar128(tmp2, 2);
    /* tmp2[i] < 17*2^121 */
    felem_diff128(tmp, tmp2);
        /*-
         * tmp[i] < 2^127 - 2^69 + 17*2^122
         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
         *        < 2^127
         */
    felem_reduce(y_out, tmp);

    /*
     * Point-at-infinity handling: if z1 is zero the result becomes
     * (x2, y2, z2); if z2 is zero it becomes (x1, y1, z1). The z2 copy runs
     * second, so it is the one that sticks when both masks are set.
     */
    copy_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1345
1346 /*-
1347  * Base point pre computation
1348  * --------------------------
1349  *
1350  * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
1353  *
1354  * For the base point table, z is usually 1 (0 for the point at infinity).
1355  * This table has 16 elements:
1356  * index | bits    | point
1357  * ------+---------+------------------------------
1358  *     0 | 0 0 0 0 | 0G
1359  *     1 | 0 0 0 1 | 1G
1360  *     2 | 0 0 1 0 | 2^130G
1361  *     3 | 0 0 1 1 | (2^130 + 1)G
1362  *     4 | 0 1 0 0 | 2^260G
1363  *     5 | 0 1 0 1 | (2^260 + 1)G
1364  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1365  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1366  *     8 | 1 0 0 0 | 2^390G
1367  *     9 | 1 0 0 1 | (2^390 + 1)G
1368  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1369  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1370  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1371  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1372  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1373  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1374  *
1375  * The reason for this is so that we can clock bits into four different
1376  * locations when doing simple scalar multiplies against the base point.
1377  *
 * Tables for other points have table[i] = iP for i in 0 .. 16, where P is
 * the point the table was built for. */
1379
/*
 * gmul is the table of precomputed base points.
 *
 * It holds 16 entries of three field elements each, in the order x, y, z.
 * Every field element is written as nine limbs. Entry 0 is all zeros;
 * every other entry has z = {1, 0, ..., 0}.
 */
static const felem gmul[16][3] = {
{{0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
1491
1492 /*
1493  * select_point selects the |idx|th point from a precomputation table and
1494  * copies it to out.
1495  */
1496  /* pre_comp below is of the size provided in |size| */
1497 static void select_point(const limb idx, unsigned int size,
1498                          const felem pre_comp[][3], felem out[3])
1499 {
1500     unsigned i, j;
1501     limb *outlimbs = &out[0][0];
1502
1503     memset(out, 0, sizeof(*out) * 3);
1504
1505     for (i = 0; i < size; i++) {
1506         const limb *inlimbs = &pre_comp[i][0][0];
1507         limb mask = i ^ idx;
1508         mask |= mask >> 4;
1509         mask |= mask >> 2;
1510         mask |= mask >> 1;
1511         mask &= 1;
1512         mask--;
1513         for (j = 0; j < NLIMBS * 3; j++)
1514             outlimbs[j] |= inlimbs[j] & mask;
1515     }
1516 }
1517
1518 /* get_bit returns the |i|th bit in |in| */
1519 static char get_bit(const felem_bytearray in, int i)
1520 {
1521     if (i < 0)
1522         return 0;
1523     return (in[i >> 3] >> (i & 7)) & 1;
1524 }
1525
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * Parameters, as used below:
 *   scalars     one felem_bytearray per point, read bit by bit with get_bit
 *   num_points  number of entries walked in scalars[] and pre_comp[]; when
 *               it is zero only the rounds i = 130..0 are run
 *   g_scalar    scalar for the generator, or NULL to skip the generator
 *   mixed       forwarded to point_add for the additions of pre_comp points
 *   pre_comp    per-point tables of 17 entries, indexed by the recoded digit
 *   g_pre_comp  16-entry generator table, indexed by a 4-bit value
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[16][3])
{
    int i, skip;
    unsigned num, gen_mul = (g_scalar != NULL);
    /* tmp[0..2] hold the selected point; tmp[3] is scratch for its negated y */
    felem nq[3], tmp[4];
    limb bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (last quarter of rounds) and additions of other
     * points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 520 : 130); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 130)) {
            /*
             * Gather up to four bits of g_scalar, spaced 130 apart. In the
             * round i == 130 only bit 520 is used.
             */
            bits = get_bit(g_scalar, i + 390) << 3;
            if (i < 130) {
                bits |= get_bit(g_scalar, i + 260) << 2;
                bits |= get_bit(g_scalar, i + 130) << 1;
                bits |= get_bit(g_scalar, i);
            }
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp, tmp);
            if (!skip) {
                /* The 1 argument below is for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* First point seen: copy it instead of adding to infinity. */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Six-bit window: bits i+4 .. i-1 of the scalar (get_bit
                 * returns 0 for the position -1 in the last round).
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                /* The mask is all ones when sign is 1, zero when sign is 0. */
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1615
/*
 * Precomputation for the group generator.
 *
 * Instances are created by nistp521_pre_comp_new, shared through
 * EC_nistp521_pre_comp_dup and released by EC_nistp521_pre_comp_free.
 */
struct nistp521_pre_comp_st {
    felem g_pre_comp[16][3];    /* 16 points of three field elements each */
    CRYPTO_REF_COUNT references; /* starts at 1; changed via CRYPTO_UP_REF /
                                  * CRYPTO_DOWN_REF */
    CRYPTO_RWLOCK *lock;        /* passed to the reference-count calls */
};
1622
/*
 * EC_GFp_nistp521_method returns the EC_METHOD table for this
 * implementation.
 *
 * The table is a function-local static const object, so every call returns
 * the same address. The initializer is positional. Slots given as 0 carry a
 * comment naming the slot they fill.
 */
const EC_METHOD *EC_GFp_nistp521_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp521_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp521_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp521_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp521_points_mul,
        ec_GFp_nistp521_precompute_mult,
        ec_GFp_nistp521_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        ec_GFp_simple_field_inv,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key,
        ecdsa_simple_sign_setup,
        ecdsa_simple_sign_sig,
        ecdsa_simple_verify_sig,
        0, /* field_inverse_mod_ord */
        0, /* blind_coordinates */
        0, /* ladder_pre */
        0, /* ladder_step */
        0  /* ladder_post */
    };

    return &ret;
}
1686
1687 /******************************************************************************/
1688 /*
1689  * FUNCTIONS TO MANAGE PRECOMPUTATION
1690  */
1691
1692 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1693 {
1694     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1695
1696     if (ret == NULL) {
1697         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1698         return ret;
1699     }
1700
1701     ret->references = 1;
1702
1703     ret->lock = CRYPTO_THREAD_lock_new();
1704     if (ret->lock == NULL) {
1705         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1706         OPENSSL_free(ret);
1707         return NULL;
1708     }
1709     return ret;
1710 }
1711
1712 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1713 {
1714     int i;
1715     if (p != NULL)
1716         CRYPTO_UP_REF(&p->references, &i, p->lock);
1717     return p;
1718 }
1719
1720 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1721 {
1722     int i;
1723
1724     if (p == NULL)
1725         return;
1726
1727     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1728     REF_PRINT_COUNT("EC_nistp521", x);
1729     if (i > 0)
1730         return;
1731     REF_ASSERT_ISNT(i < 0);
1732
1733     CRYPTO_THREAD_lock_free(p->lock);
1734     OPENSSL_free(p);
1735 }
1736
1737 /******************************************************************************/
1738 /*
1739  * OPENSSL EC_METHOD FUNCTIONS
1740  */
1741
1742 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1743 {
1744     int ret;
1745     ret = ec_GFp_simple_group_init(group);
1746     group->a_is_minus3 = 1;
1747     return ret;
1748 }
1749
1750 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1751                                     const BIGNUM *a, const BIGNUM *b,
1752                                     BN_CTX *ctx)
1753 {
1754     int ret = 0;
1755     BIGNUM *curve_p, *curve_a, *curve_b;
1756 #ifndef FIPS_MODULE
1757     BN_CTX *new_ctx = NULL;
1758
1759     if (ctx == NULL)
1760         ctx = new_ctx = BN_CTX_new();
1761 #endif
1762     if (ctx == NULL)
1763         return 0;
1764
1765     BN_CTX_start(ctx);
1766     curve_p = BN_CTX_get(ctx);
1767     curve_a = BN_CTX_get(ctx);
1768     curve_b = BN_CTX_get(ctx);
1769     if (curve_b == NULL)
1770         goto err;
1771     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1772     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1773     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1774     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1775         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1776               EC_R_WRONG_CURVE_PARAMETERS);
1777         goto err;
1778     }
1779     group->field_mod_func = BN_nist_mod_521;
1780     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1781  err:
1782     BN_CTX_end(ctx);
1783 #ifndef FIPS_MODULE
1784     BN_CTX_free(new_ctx);
1785 #endif
1786     return ret;
1787 }
1788
1789 /*
1790  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1791  * (X/Z^2, Y/Z^3)
1792  */
1793 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1794                                                  const EC_POINT *point,
1795                                                  BIGNUM *x, BIGNUM *y,
1796                                                  BN_CTX *ctx)
1797 {
1798     felem z1, z2, x_in, y_in, x_out, y_out;
1799     largefelem tmp;
1800
1801     if (EC_POINT_is_at_infinity(group, point)) {
1802         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1803               EC_R_POINT_AT_INFINITY);
1804         return 0;
1805     }
1806     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1807         (!BN_to_felem(z1, point->Z)))
1808         return 0;
1809     felem_inv(z2, z1);
1810     felem_square(tmp, z2);
1811     felem_reduce(z1, tmp);
1812     felem_mul(tmp, x_in, z1);
1813     felem_reduce(x_in, tmp);
1814     felem_contract(x_out, x_in);
1815     if (x != NULL) {
1816         if (!felem_to_BN(x, x_out)) {
1817             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1818                   ERR_R_BN_LIB);
1819             return 0;
1820         }
1821     }
1822     felem_mul(tmp, z1, z2);
1823     felem_reduce(z1, tmp);
1824     felem_mul(tmp, y_in, z1);
1825     felem_reduce(y_in, tmp);
1826     felem_contract(y_out, y_in);
1827     if (y != NULL) {
1828         if (!felem_to_BN(y, y_out)) {
1829             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1830                   ERR_R_BN_LIB);
1831             return 0;
1832         }
1833     }
1834     return 1;
1835 }
1836
1837 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1838 static void make_points_affine(size_t num, felem points[][3],
1839                                felem tmp_felems[])
1840 {
1841     /*
1842      * Runs in constant time, unless an input is the point at infinity (which
1843      * normally shouldn't happen).
1844      */
1845     ec_GFp_nistp_points_make_affine_internal(num,
1846                                              points,
1847                                              sizeof(felem),
1848                                              tmp_felems,
1849                                              (void (*)(void *))felem_one,
1850                                              felem_is_zero_int,
1851                                              (void (*)(void *, const void *))
1852                                              felem_assign,
1853                                              (void (*)(void *, const void *))
1854                                              felem_square_reduce, (void (*)
1855                                                                    (void *,
1856                                                                     const void
1857                                                                     *,
1858                                                                     const void
1859                                                                     *))
1860                                              felem_mul_reduce,
1861                                              (void (*)(void *, const void *))
1862                                              felem_inv,
1863                                              (void (*)(void *, const void *))
1864                                              felem_contract);
1865 }
1866
1867 /*
1868  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1869  * values Result is stored in r (r can equal one of the inputs).
1870  */
1871 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1872                                const BIGNUM *scalar, size_t num,
1873                                const EC_POINT *points[],
1874                                const BIGNUM *scalars[], BN_CTX *ctx)
1875 {
1876     int ret = 0;
1877     int j;
1878     int mixed = 0;
1879     BIGNUM *x, *y, *z, *tmp_scalar;
1880     felem_bytearray g_secret;
1881     felem_bytearray *secrets = NULL;
1882     felem (*pre_comp)[17][3] = NULL;
1883     felem *tmp_felems = NULL;
1884     unsigned i;
1885     int num_bytes;
1886     int have_pre_comp = 0;
1887     size_t num_points = num;
1888     felem x_in, y_in, z_in, x_out, y_out, z_out;
1889     NISTP521_PRE_COMP *pre = NULL;
1890     felem(*g_pre_comp)[3] = NULL;
1891     EC_POINT *generator = NULL;
1892     const EC_POINT *p = NULL;
1893     const BIGNUM *p_scalar = NULL;
1894
1895     BN_CTX_start(ctx);
1896     x = BN_CTX_get(ctx);
1897     y = BN_CTX_get(ctx);
1898     z = BN_CTX_get(ctx);
1899     tmp_scalar = BN_CTX_get(ctx);
1900     if (tmp_scalar == NULL)
1901         goto err;
1902
1903     if (scalar != NULL) {
1904         pre = group->pre_comp.nistp521;
1905         if (pre)
1906             /* we have precomputation, try to use it */
1907             g_pre_comp = &pre->g_pre_comp[0];
1908         else
1909             /* try to use the standard precomputation */
1910             g_pre_comp = (felem(*)[3]) gmul;
1911         generator = EC_POINT_new(group);
1912         if (generator == NULL)
1913             goto err;
1914         /* get the generator from precomputation */
1915         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1916             !felem_to_BN(y, g_pre_comp[1][1]) ||
1917             !felem_to_BN(z, g_pre_comp[1][2])) {
1918             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1919             goto err;
1920         }
1921         if (!ec_GFp_simple_set_Jprojective_coordinates_GFp(group, generator, x,
1922                                                            y, z, ctx))
1923             goto err;
1924         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1925             /* precomputation matches generator */
1926             have_pre_comp = 1;
1927         else
1928             /*
1929              * we don't have valid precomputation: treat the generator as a
1930              * random point
1931              */
1932             num_points++;
1933     }
1934
1935     if (num_points > 0) {
1936         if (num_points >= 2) {
1937             /*
1938              * unless we precompute multiples for just one point, converting
1939              * those into affine form is time well spent
1940              */
1941             mixed = 1;
1942         }
1943         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1944         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1945         if (mixed)
1946             tmp_felems =
1947                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1948         if ((secrets == NULL) || (pre_comp == NULL)
1949             || (mixed && (tmp_felems == NULL))) {
1950             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1951             goto err;
1952         }
1953
1954         /*
1955          * we treat NULL scalars as 0, and NULL points as points at infinity,
1956          * i.e., they contribute nothing to the linear combination
1957          */
1958         for (i = 0; i < num_points; ++i) {
1959             if (i == num) {
1960                 /*
1961                  * we didn't have a valid precomputation, so we pick the
1962                  * generator
1963                  */
1964                 p = EC_GROUP_get0_generator(group);
1965                 p_scalar = scalar;
1966             } else {
1967                 /* the i^th point */
1968                 p = points[i];
1969                 p_scalar = scalars[i];
1970             }
1971             if ((p_scalar != NULL) && (p != NULL)) {
1972                 /* reduce scalar to 0 <= scalar < 2^521 */
1973                 if ((BN_num_bits(p_scalar) > 521)
1974                     || (BN_is_negative(p_scalar))) {
1975                     /*
1976                      * this is an unusual input, and we don't guarantee
1977                      * constant-timeness
1978                      */
1979                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1980                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1981                         goto err;
1982                     }
1983                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1984                                                secrets[i], sizeof(secrets[i]));
1985                 } else {
1986                     num_bytes = BN_bn2lebinpad(p_scalar,
1987                                                secrets[i], sizeof(secrets[i]));
1988                 }
1989                 if (num_bytes < 0) {
1990                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1991                     goto err;
1992                 }
1993                 /* precompute multiples */
1994                 if ((!BN_to_felem(x_out, p->X)) ||
1995                     (!BN_to_felem(y_out, p->Y)) ||
1996                     (!BN_to_felem(z_out, p->Z)))
1997                     goto err;
1998                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1999                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
2000                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
2001                 for (j = 2; j <= 16; ++j) {
2002                     if (j & 1) {
2003                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
2004                                   pre_comp[i][j][2], pre_comp[i][1][0],
2005                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
2006                                   pre_comp[i][j - 1][0],
2007                                   pre_comp[i][j - 1][1],
2008                                   pre_comp[i][j - 1][2]);
2009                     } else {
2010                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2011                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2012                                      pre_comp[i][j / 2][1],
2013                                      pre_comp[i][j / 2][2]);
2014                     }
2015                 }
2016             }
2017         }
2018         if (mixed)
2019             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2020     }
2021
2022     /* the scalar for the generator */
2023     if ((scalar != NULL) && (have_pre_comp)) {
2024         memset(g_secret, 0, sizeof(g_secret));
2025         /* reduce scalar to 0 <= scalar < 2^521 */
2026         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2027             /*
2028              * this is an unusual input, and we don't guarantee
2029              * constant-timeness
2030              */
2031             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2032                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2033                 goto err;
2034             }
2035             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2036         } else {
2037             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2038         }
2039         /* do the multiplication with generator precomputation */
2040         batch_mul(x_out, y_out, z_out,
2041                   (const felem_bytearray(*))secrets, num_points,
2042                   g_secret,
2043                   mixed, (const felem(*)[17][3])pre_comp,
2044                   (const felem(*)[3])g_pre_comp);
2045     } else {
2046         /* do the multiplication without generator precomputation */
2047         batch_mul(x_out, y_out, z_out,
2048                   (const felem_bytearray(*))secrets, num_points,
2049                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2050     }
2051     /* reduce the output to its unique minimal representation */
2052     felem_contract(x_in, x_out);
2053     felem_contract(y_in, y_out);
2054     felem_contract(z_in, z_out);
2055     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2056         (!felem_to_BN(z, z_in))) {
2057         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2058         goto err;
2059     }
2060     ret = ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2061
2062  err:
2063     BN_CTX_end(ctx);
2064     EC_POINT_free(generator);
2065     OPENSSL_free(secrets);
2066     OPENSSL_free(pre_comp);
2067     OPENSSL_free(tmp_felems);
2068     return ret;
2069 }
2070
2071 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2072 {
2073     int ret = 0;
2074     NISTP521_PRE_COMP *pre = NULL;
2075     int i, j;
2076     BIGNUM *x, *y;
2077     EC_POINT *generator = NULL;
2078     felem tmp_felems[16];
2079 #ifndef FIPS_MODULE
2080     BN_CTX *new_ctx = NULL;
2081 #endif
2082
2083     /* throw away old precomputation */
2084     EC_pre_comp_free(group);
2085
2086 #ifndef FIPS_MODULE
2087     if (ctx == NULL)
2088         ctx = new_ctx = BN_CTX_new();
2089 #endif
2090     if (ctx == NULL)
2091         return 0;
2092
2093     BN_CTX_start(ctx);
2094     x = BN_CTX_get(ctx);
2095     y = BN_CTX_get(ctx);
2096     if (y == NULL)
2097         goto err;
2098     /* get the generator */
2099     if (group->generator == NULL)
2100         goto err;
2101     generator = EC_POINT_new(group);
2102     if (generator == NULL)
2103         goto err;
2104     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2105     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2106     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2107         goto err;
2108     if ((pre = nistp521_pre_comp_new()) == NULL)
2109         goto err;
2110     /*
2111      * if the generator is the standard one, use built-in precomputation
2112      */
2113     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2114         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2115         goto done;
2116     }
2117     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2118         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2119         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2120         goto err;
2121     /* compute 2^130*G, 2^260*G, 2^390*G */
2122     for (i = 1; i <= 4; i <<= 1) {
2123         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2124                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2125                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2126         for (j = 0; j < 129; ++j) {
2127             point_double(pre->g_pre_comp[2 * i][0],
2128                          pre->g_pre_comp[2 * i][1],
2129                          pre->g_pre_comp[2 * i][2],
2130                          pre->g_pre_comp[2 * i][0],
2131                          pre->g_pre_comp[2 * i][1],
2132                          pre->g_pre_comp[2 * i][2]);
2133         }
2134     }
2135     /* g_pre_comp[0] is the point at infinity */
2136     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2137     /* the remaining multiples */
2138     /* 2^130*G + 2^260*G */
2139     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2140               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2141               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2142               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2143               pre->g_pre_comp[2][2]);
2144     /* 2^130*G + 2^390*G */
2145     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2146               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2147               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2148               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2149               pre->g_pre_comp[2][2]);
2150     /* 2^260*G + 2^390*G */
2151     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2152               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2153               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2154               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2155               pre->g_pre_comp[4][2]);
2156     /* 2^130*G + 2^260*G + 2^390*G */
2157     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2158               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2159               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2160               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2161               pre->g_pre_comp[2][2]);
2162     for (i = 1; i < 8; ++i) {
2163         /* odd multiples: add G */
2164         point_add(pre->g_pre_comp[2 * i + 1][0],
2165                   pre->g_pre_comp[2 * i + 1][1],
2166                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2167                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2168                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2169                   pre->g_pre_comp[1][2]);
2170     }
2171     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2172
2173  done:
2174     SETPRECOMP(group, nistp521, pre);
2175     ret = 1;
2176     pre = NULL;
2177  err:
2178     BN_CTX_end(ctx);
2179     EC_POINT_free(generator);
2180 #ifndef FIPS_MODULE
2181     BN_CTX_free(new_ctx);
2182 #endif
2183     EC_nistp521_pre_comp_free(pre);
2184     return ret;
2185 }
2186
2187 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2188 {
2189     return HAVEPRECOMP(group, nistp521);
2190 }