crypto/ec/ecp_nistp521.c

   1 /*
   2  * Written by Adam Langley (Google) for the OpenSSL project
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  22  *
  23  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  24  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  25  * work which got its smarts from Daniel J. Bernstein's work on the same.
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  30
  31 # ifndef OPENSSL_SYS_VMS
  32 #  include <stdint.h>
  33 # else
  34 #  include <inttypes.h>
  35 # endif
  36
  37 # include <string.h>
  38 # include <openssl/err.h>
  39 # include "ec_lcl.h"
  40
  41 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  42   /* even with gcc, the typedef won't work for 32-bit platforms */
  43 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  44                                  * platforms */
  45 # else
  46 #  error "Need GCC 3.1 or later to define type uint128_t"
  47 # endif
  48
  49 typedef uint8_t u8;
  50 typedef uint64_t u64;
  51 typedef int64_t s64;
  52
  53 /*
  54  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  55  * element of this field into 66 bytes where the most significant byte
  56  * contains only a single bit. We call this an felem_bytearray.
  57  */
  58
  59 typedef u8 felem_bytearray[66];
  60
  61 /*
  62  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  63  * These values are big-endian.
  64  */
  65 static const felem_bytearray nistp521_curve_params[5] = {
  66     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff},
  75     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xfc},
  84     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  85      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  86      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  87      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  88      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  89      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  90      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  91      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  92      0x3f, 0x00},
  93     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  94      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  95      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  96      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  97      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  98      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
  99      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 100      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 101      0xbd, 0x66},
 102     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 103      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 104      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 105      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 106      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 107      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 108      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 109      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 110      0x66, 0x50}
 111 };
 112
 113 /*-
 114  * The representation of field elements.
 115  * ------------------------------------
 116  *
 117  * We represent field elements with nine values. These values are either 64 or
 118  * 128 bits and the field element represented is:
 119  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 120  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 121  * 58 bits apart, but are greater than 58 bits in length, the most significant
 122  * bits of each limb overlap with the least significant bits of the next.
 123  *
 124  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 125  * 'largefelem' */
 126
 127 # define NLIMBS 9
 128
 129 typedef uint64_t limb;
 130 typedef limb felem[NLIMBS];
 131 typedef uint128_t largefelem[NLIMBS];
 132
 133 static const limb bottom57bits = 0x1ffffffffffffff;
 134 static const limb bottom58bits = 0x3ffffffffffffff;
 135
 136 /*
 137  * bin66_to_felem takes a little-endian byte array and converts it into felem
 138  * form. This assumes that the CPU is little-endian.
 139  */
 140 static void bin66_to_felem(felem out, const u8 in[66])
 141 {
 142     out[0] = (*((limb *) & in[0])) & bottom58bits;
 143     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 144     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 145     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 146     out[4] = (*((limb *) & in[29])) & bottom58bits;
 147     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 148     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 149     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 150     out[8] = (*((limb *) & in[58])) & bottom57bits;
 151 }
 152
 153 /*
 154  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 155  * array. This assumes that the CPU is little-endian.
 156  */
 157 static void felem_to_bin66(u8 out[66], const felem in)
 158 {
 159     memset(out, 0, 66);
 160     (*((limb *) & out[0])) = in[0];
 161     (*((limb *) & out[7])) |= in[1] << 2;
 162     (*((limb *) & out[14])) |= in[2] << 4;
 163     (*((limb *) & out[21])) |= in[3] << 6;
 164     (*((limb *) & out[29])) = in[4];
 165     (*((limb *) & out[36])) |= in[5] << 2;
 166     (*((limb *) & out[43])) |= in[6] << 4;
 167     (*((limb *) & out[50])) |= in[7] << 6;
 168     (*((limb *) & out[58])) = in[8];
 169 }
 170
 171 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 172 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 173 {
 174     unsigned i;
 175     for (i = 0; i < len; ++i)
 176         out[i] = in[len - 1 - i];
 177 }
 178
 179 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 180 static int BN_to_felem(felem out, const BIGNUM *bn)
 181 {
 182     felem_bytearray b_in;
 183     felem_bytearray b_out;
 184     unsigned num_bytes;
 185
 186     /* BN_bn2bin eats leading zeroes */
 187     memset(b_out, 0, sizeof(b_out));
 188     num_bytes = BN_num_bytes(bn);
 189     if (num_bytes > sizeof b_out) {
 190         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 191         return 0;
 192     }
 193     if (BN_is_negative(bn)) {
 194         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 195         return 0;
 196     }
 197     num_bytes = BN_bn2bin(bn, b_in);
 198     flip_endian(b_out, b_in, num_bytes);
 199     bin66_to_felem(out, b_out);
 200     return 1;
 201 }
 202
 203 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 204 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 205 {
 206     felem_bytearray b_in, b_out;
 207     felem_to_bin66(b_in, in);
 208     flip_endian(b_out, b_in, sizeof b_out);
 209     return BN_bin2bn(b_out, sizeof b_out, out);
 210 }
 211
 212 /*-
 213  * Field operations
 214  * ----------------
 215  */
 216
 217 static void felem_one(felem out)
 218 {
 219     out[0] = 1;
 220     out[1] = 0;
 221     out[2] = 0;
 222     out[3] = 0;
 223     out[4] = 0;
 224     out[5] = 0;
 225     out[6] = 0;
 226     out[7] = 0;
 227     out[8] = 0;
 228 }
 229
 230 static void felem_assign(felem out, const felem in)
 231 {
 232     out[0] = in[0];
 233     out[1] = in[1];
 234     out[2] = in[2];
 235     out[3] = in[3];
 236     out[4] = in[4];
 237     out[5] = in[5];
 238     out[6] = in[6];
 239     out[7] = in[7];
 240     out[8] = in[8];
 241 }
 242
 243 /* felem_sum64 sets out = out + in. */
 244 static void felem_sum64(felem out, const felem in)
 245 {
 246     out[0] += in[0];
 247     out[1] += in[1];
 248     out[2] += in[2];
 249     out[3] += in[3];
 250     out[4] += in[4];
 251     out[5] += in[5];
 252     out[6] += in[6];
 253     out[7] += in[7];
 254     out[8] += in[8];
 255 }
 256
 257 /* felem_scalar sets out = in * scalar */
 258 static void felem_scalar(felem out, const felem in, limb scalar)
 259 {
 260     out[0] = in[0] * scalar;
 261     out[1] = in[1] * scalar;
 262     out[2] = in[2] * scalar;
 263     out[3] = in[3] * scalar;
 264     out[4] = in[4] * scalar;
 265     out[5] = in[5] * scalar;
 266     out[6] = in[6] * scalar;
 267     out[7] = in[7] * scalar;
 268     out[8] = in[8] * scalar;
 269 }
 270
 271 /* felem_scalar64 sets out = out * scalar */
 272 static void felem_scalar64(felem out, limb scalar)
 273 {
 274     out[0] *= scalar;
 275     out[1] *= scalar;
 276     out[2] *= scalar;
 277     out[3] *= scalar;
 278     out[4] *= scalar;
 279     out[5] *= scalar;
 280     out[6] *= scalar;
 281     out[7] *= scalar;
 282     out[8] *= scalar;
 283 }
 284
 285 /* felem_scalar128 sets out = out * scalar */
 286 static void felem_scalar128(largefelem out, limb scalar)
 287 {
 288     out[0] *= scalar;
 289     out[1] *= scalar;
 290     out[2] *= scalar;
 291     out[3] *= scalar;
 292     out[4] *= scalar;
 293     out[5] *= scalar;
 294     out[6] *= scalar;
 295     out[7] *= scalar;
 296     out[8] *= scalar;
 297 }
 298
 299 /*-
 300  * felem_neg sets |out| to |-in|
 301  * On entry:
 302  *   in[i] < 2^59 + 2^14
 303  * On exit:
 304  *   out[i] < 2^62
 305  */
 306 static void felem_neg(felem out, const felem in)
 307 {
 308     /* In order to prevent underflow, we subtract from 0 mod p. */
 309     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 310     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 311
 312     out[0] = two62m3 - in[0];
 313     out[1] = two62m2 - in[1];
 314     out[2] = two62m2 - in[2];
 315     out[3] = two62m2 - in[3];
 316     out[4] = two62m2 - in[4];
 317     out[5] = two62m2 - in[5];
 318     out[6] = two62m2 - in[6];
 319     out[7] = two62m2 - in[7];
 320     out[8] = two62m2 - in[8];
 321 }
 322
 323 /*-
 324  * felem_diff64 subtracts |in| from |out|
 325  * On entry:
 326  *   in[i] < 2^59 + 2^14
 327  * On exit:
 328  *   out[i] < out[i] + 2^62
 329  */
 330 static void felem_diff64(felem out, const felem in)
 331 {
 332     /*
 333      * In order to prevent underflow, we add 0 mod p before subtracting.
 334      */
 335     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 336     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 337
 338     out[0] += two62m3 - in[0];
 339     out[1] += two62m2 - in[1];
 340     out[2] += two62m2 - in[2];
 341     out[3] += two62m2 - in[3];
 342     out[4] += two62m2 - in[4];
 343     out[5] += two62m2 - in[5];
 344     out[6] += two62m2 - in[6];
 345     out[7] += two62m2 - in[7];
 346     out[8] += two62m2 - in[8];
 347 }
 348
 349 /*-
 350  * felem_diff_128_64 subtracts |in| from |out|
 351  * On entry:
 352  *   in[i] < 2^62 + 2^17
 353  * On exit:
 354  *   out[i] < out[i] + 2^63
 355  */
 356 static void felem_diff_128_64(largefelem out, const felem in)
 357 {
 358     /*
 359      * In order to prevent underflow, we add 0 mod p before subtracting.
 360      */
 361     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
 362     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
 363
 364     out[0] += two63m6 - in[0];
 365     out[1] += two63m5 - in[1];
 366     out[2] += two63m5 - in[2];
 367     out[3] += two63m5 - in[3];
 368     out[4] += two63m5 - in[4];
 369     out[5] += two63m5 - in[5];
 370     out[6] += two63m5 - in[6];
 371     out[7] += two63m5 - in[7];
 372     out[8] += two63m5 - in[8];
 373 }
 374
 375 /*-
 376  * felem_diff_128_64 subtracts |in| from |out|
 377  * On entry:
 378  *   in[i] < 2^126
 379  * On exit:
 380  *   out[i] < out[i] + 2^127 - 2^69
 381  */
 382 static void felem_diff128(largefelem out, const largefelem in)
 383 {
 384     /*
 385      * In order to prevent underflow, we add 0 mod p before subtracting.
 386      */
 387     static const uint128_t two127m70 =
 388         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 389     static const uint128_t two127m69 =
 390         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 391
 392     out[0] += (two127m70 - in[0]);
 393     out[1] += (two127m69 - in[1]);
 394     out[2] += (two127m69 - in[2]);
 395     out[3] += (two127m69 - in[3]);
 396     out[4] += (two127m69 - in[4]);
 397     out[5] += (two127m69 - in[5]);
 398     out[6] += (two127m69 - in[6]);
 399     out[7] += (two127m69 - in[7]);
 400     out[8] += (two127m69 - in[8]);
 401 }
 402
 403 /*-
 404  * felem_square sets |out| = |in|^2
 405  * On entry:
 406  *   in[i] < 2^62
 407  * On exit:
 408  *   out[i] < 17 * max(in[i]) * max(in[i])
 409  */
 410 static void felem_square(largefelem out, const felem in)
 411 {
 412     felem inx2, inx4;
 413     felem_scalar(inx2, in, 2);
 414     felem_scalar(inx4, in, 4);
 415
 416     /*-
 417      * We have many cases were we want to do
 418      *   in[x] * in[y] +
 419      *   in[y] * in[x]
 420      * This is obviously just
 421      *   2 * in[x] * in[y]
 422      * However, rather than do the doubling on the 128 bit result, we
 423      * double one of the inputs to the multiplication by reading from
 424      * |inx2|
 425      */
 426
 427     out[0] = ((uint128_t) in[0]) * in[0];
 428     out[1] = ((uint128_t) in[0]) * inx2[1];
 429     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 430     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 431     out[4] = ((uint128_t) in[0]) * inx2[4] +
 432              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 433     out[5] = ((uint128_t) in[0]) * inx2[5] +
 434              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 435     out[6] = ((uint128_t) in[0]) * inx2[6] +
 436              ((uint128_t) in[1]) * inx2[5] +
 437              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 438     out[7] = ((uint128_t) in[0]) * inx2[7] +
 439              ((uint128_t) in[1]) * inx2[6] +
 440              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 441     out[8] = ((uint128_t) in[0]) * inx2[8] +
 442              ((uint128_t) in[1]) * inx2[7] +
 443              ((uint128_t) in[2]) * inx2[6] +
 444              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 445
 446     /*
 447      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 448      * They correspond to locations one bit up from the limbs produced above
 449      * so we would have to multiply by two to align them. Again, rather than
 450      * operate on the 128-bit result, we double one of the inputs to the
 451      * multiplication. If we want to double for both this reason, and the
 452      * reason above, then we end up multiplying by four.
 453      */
 454
 455     /* 9 */
 456     out[0] += ((uint128_t) in[1]) * inx4[8] +
 457               ((uint128_t) in[2]) * inx4[7] +
 458               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 459
 460     /* 10 */
 461     out[1] += ((uint128_t) in[2]) * inx4[8] +
 462               ((uint128_t) in[3]) * inx4[7] +
 463               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 464
 465     /* 11 */
 466     out[2] += ((uint128_t) in[3]) * inx4[8] +
 467               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 468
 469     /* 12 */
 470     out[3] += ((uint128_t) in[4]) * inx4[8] +
 471               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 472
 473     /* 13 */
 474     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 475
 476     /* 14 */
 477     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 478
 479     /* 15 */
 480     out[6] += ((uint128_t) in[7]) * inx4[8];
 481
 482     /* 16 */
 483     out[7] += ((uint128_t) in[8]) * inx2[8];
 484 }
 485
 486 /*-
 487  * felem_mul sets |out| = |in1| * |in2|
 488  * On entry:
 489  *   in1[i] < 2^64
 490  *   in2[i] < 2^63
 491  * On exit:
 492  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 493  */
 494 static void felem_mul(largefelem out, const felem in1, const felem in2)
 495 {
 496     felem in2x2;
 497     felem_scalar(in2x2, in2, 2);
 498
 499     out[0] = ((uint128_t) in1[0]) * in2[0];
 500
 501     out[1] = ((uint128_t) in1[0]) * in2[1] +
 502              ((uint128_t) in1[1]) * in2[0];
 503
 504     out[2] = ((uint128_t) in1[0]) * in2[2] +
 505              ((uint128_t) in1[1]) * in2[1] +
 506              ((uint128_t) in1[2]) * in2[0];
 507
 508     out[3] = ((uint128_t) in1[0]) * in2[3] +
 509              ((uint128_t) in1[1]) * in2[2] +
 510              ((uint128_t) in1[2]) * in2[1] +
 511              ((uint128_t) in1[3]) * in2[0];
 512
 513     out[4] = ((uint128_t) in1[0]) * in2[4] +
 514              ((uint128_t) in1[1]) * in2[3] +
 515              ((uint128_t) in1[2]) * in2[2] +
 516              ((uint128_t) in1[3]) * in2[1] +
 517              ((uint128_t) in1[4]) * in2[0];
 518
 519     out[5] = ((uint128_t) in1[0]) * in2[5] +
 520              ((uint128_t) in1[1]) * in2[4] +
 521              ((uint128_t) in1[2]) * in2[3] +
 522              ((uint128_t) in1[3]) * in2[2] +
 523              ((uint128_t) in1[4]) * in2[1] +
 524              ((uint128_t) in1[5]) * in2[0];
 525
 526     out[6] = ((uint128_t) in1[0]) * in2[6] +
 527              ((uint128_t) in1[1]) * in2[5] +
 528              ((uint128_t) in1[2]) * in2[4] +
 529              ((uint128_t) in1[3]) * in2[3] +
 530              ((uint128_t) in1[4]) * in2[2] +
 531              ((uint128_t) in1[5]) * in2[1] +
 532              ((uint128_t) in1[6]) * in2[0];
 533
 534     out[7] = ((uint128_t) in1[0]) * in2[7] +
 535              ((uint128_t) in1[1]) * in2[6] +
 536              ((uint128_t) in1[2]) * in2[5] +
 537              ((uint128_t) in1[3]) * in2[4] +
 538              ((uint128_t) in1[4]) * in2[3] +
 539              ((uint128_t) in1[5]) * in2[2] +
 540              ((uint128_t) in1[6]) * in2[1] +
 541              ((uint128_t) in1[7]) * in2[0];
 542
 543     out[8] = ((uint128_t) in1[0]) * in2[8] +
 544              ((uint128_t) in1[1]) * in2[7] +
 545              ((uint128_t) in1[2]) * in2[6] +
 546              ((uint128_t) in1[3]) * in2[5] +
 547              ((uint128_t) in1[4]) * in2[4] +
 548              ((uint128_t) in1[5]) * in2[3] +
 549              ((uint128_t) in1[6]) * in2[2] +
 550              ((uint128_t) in1[7]) * in2[1] +
 551              ((uint128_t) in1[8]) * in2[0];
 552
 553     /* See comment in felem_square about the use of in2x2 here */
 554
 555     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 556               ((uint128_t) in1[2]) * in2x2[7] +
 557               ((uint128_t) in1[3]) * in2x2[6] +
 558               ((uint128_t) in1[4]) * in2x2[5] +
 559               ((uint128_t) in1[5]) * in2x2[4] +
 560               ((uint128_t) in1[6]) * in2x2[3] +
 561               ((uint128_t) in1[7]) * in2x2[2] +
 562               ((uint128_t) in1[8]) * in2x2[1];
 563
 564     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 565               ((uint128_t) in1[3]) * in2x2[7] +
 566               ((uint128_t) in1[4]) * in2x2[6] +
 567               ((uint128_t) in1[5]) * in2x2[5] +
 568               ((uint128_t) in1[6]) * in2x2[4] +
 569               ((uint128_t) in1[7]) * in2x2[3] +
 570               ((uint128_t) in1[8]) * in2x2[2];
 571
 572     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 573               ((uint128_t) in1[4]) * in2x2[7] +
 574               ((uint128_t) in1[5]) * in2x2[6] +
 575               ((uint128_t) in1[6]) * in2x2[5] +
 576               ((uint128_t) in1[7]) * in2x2[4] +
 577               ((uint128_t) in1[8]) * in2x2[3];
 578
 579     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 580               ((uint128_t) in1[5]) * in2x2[7] +
 581               ((uint128_t) in1[6]) * in2x2[6] +
 582               ((uint128_t) in1[7]) * in2x2[5] +
 583               ((uint128_t) in1[8]) * in2x2[4];
 584
 585     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 586               ((uint128_t) in1[6]) * in2x2[7] +
 587               ((uint128_t) in1[7]) * in2x2[6] +
 588               ((uint128_t) in1[8]) * in2x2[5];
 589
 590     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 591               ((uint128_t) in1[7]) * in2x2[7] +
 592               ((uint128_t) in1[8]) * in2x2[6];
 593
 594     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 595               ((uint128_t) in1[8]) * in2x2[7];
 596
 597     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 598 }
 599
 600 static const limb bottom52bits = 0xfffffffffffff;
 601
 602 /*-
 603  * felem_reduce converts a largefelem to an felem.
 604  * On entry:
 605  *   in[i] < 2^128
 606  * On exit:
 607  *   out[i] < 2^59 + 2^14
 608  */
 609 static void felem_reduce(felem out, const largefelem in)
 610 {
 611     u64 overflow1, overflow2;
 612
 613     out[0] = ((limb) in[0]) & bottom58bits;
 614     out[1] = ((limb) in[1]) & bottom58bits;
 615     out[2] = ((limb) in[2]) & bottom58bits;
 616     out[3] = ((limb) in[3]) & bottom58bits;
 617     out[4] = ((limb) in[4]) & bottom58bits;
 618     out[5] = ((limb) in[5]) & bottom58bits;
 619     out[6] = ((limb) in[6]) & bottom58bits;
 620     out[7] = ((limb) in[7]) & bottom58bits;
 621     out[8] = ((limb) in[8]) & bottom58bits;
 622
 623     /* out[i] < 2^58 */
 624
 625     out[1] += ((limb) in[0]) >> 58;
 626     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 627     /*-
 628      * out[1] < 2^58 + 2^6 + 2^58
 629      *        = 2^59 + 2^6
 630      */
 631     out[2] += ((limb) (in[0] >> 64)) >> 52;
 632
 633     out[2] += ((limb) in[1]) >> 58;
 634     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 635     out[3] += ((limb) (in[1] >> 64)) >> 52;
 636
 637     out[3] += ((limb) in[2]) >> 58;
 638     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 639     out[4] += ((limb) (in[2] >> 64)) >> 52;
 640
 641     out[4] += ((limb) in[3]) >> 58;
 642     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 643     out[5] += ((limb) (in[3] >> 64)) >> 52;
 644
 645     out[5] += ((limb) in[4]) >> 58;
 646     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 647     out[6] += ((limb) (in[4] >> 64)) >> 52;
 648
 649     out[6] += ((limb) in[5]) >> 58;
 650     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 651     out[7] += ((limb) (in[5] >> 64)) >> 52;
 652
 653     out[7] += ((limb) in[6]) >> 58;
 654     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 655     out[8] += ((limb) (in[6] >> 64)) >> 52;
 656
 657     out[8] += ((limb) in[7]) >> 58;
 658     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 659     /*-
 660      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 661      *            < 2^59 + 2^13
 662      */
 663     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 664
 665     overflow1 += ((limb) in[8]) >> 58;
 666     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 667     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 668
 669     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 670     overflow2 <<= 1;            /* overflow2 < 2^13 */
 671
 672     out[0] += overflow1;        /* out[0] < 2^60 */
 673     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 674
 675     out[1] += out[0] >> 58;
 676     out[0] &= bottom58bits;
 677     /*-
 678      * out[0] < 2^58
 679      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 680      *        < 2^59 + 2^14
 681      */
 682 }
 683
 684 static void felem_square_reduce(felem out, const felem in)
 685 {
 686     largefelem tmp;
 687     felem_square(tmp, in);
 688     felem_reduce(out, tmp);
 689 }
 690
 691 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 692 {
 693     largefelem tmp;
 694     felem_mul(tmp, in1, in2);
 695     felem_reduce(out, tmp);
 696 }
 697
 698 /*-
 699  * felem_inv calculates |out| = |in|^{-1}
 700  *
 701  * Based on Fermat's Little Theorem:
 702  *   a^p = a (mod p)
 703  *   a^{p-1} = 1 (mod p)
 704  *   a^{p-2} = a^{-1} (mod p)
 705  */
 706 static void felem_inv(felem out, const felem in)
 707 {
 708     felem ftmp, ftmp2, ftmp3, ftmp4;
 709     largefelem tmp;
 710     unsigned i;
 711
 712     felem_square(tmp, in);
 713     felem_reduce(ftmp, tmp);    /* 2^1 */
 714     felem_mul(tmp, in, ftmp);
 715     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 716     felem_assign(ftmp2, ftmp);
 717     felem_square(tmp, ftmp);
 718     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 719     felem_mul(tmp, in, ftmp);
 720     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 721     felem_square(tmp, ftmp);
 722     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 723
 724     felem_square(tmp, ftmp2);
 725     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 726     felem_square(tmp, ftmp3);
 727     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 728     felem_mul(tmp, ftmp3, ftmp2);
 729     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 730
 731     felem_assign(ftmp2, ftmp3);
 732     felem_square(tmp, ftmp3);
 733     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 734     felem_square(tmp, ftmp3);
 735     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 736     felem_square(tmp, ftmp3);
 737     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 738     felem_square(tmp, ftmp3);
 739     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 740     felem_assign(ftmp4, ftmp3);
 741     felem_mul(tmp, ftmp3, ftmp);
 742     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 743     felem_square(tmp, ftmp4);
 744     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 745     felem_mul(tmp, ftmp3, ftmp2);
 746     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 747     felem_assign(ftmp2, ftmp3);
 748
 749     for (i = 0; i < 8; i++) {
 750         felem_square(tmp, ftmp3);
 751         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 752     }
 753     felem_mul(tmp, ftmp3, ftmp2);
 754     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 755     felem_assign(ftmp2, ftmp3);
 756
 757     for (i = 0; i < 16; i++) {
 758         felem_square(tmp, ftmp3);
 759         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 760     }
 761     felem_mul(tmp, ftmp3, ftmp2);
 762     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 763     felem_assign(ftmp2, ftmp3);
 764
 765     for (i = 0; i < 32; i++) {
 766         felem_square(tmp, ftmp3);
 767         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 768     }
 769     felem_mul(tmp, ftmp3, ftmp2);
 770     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 771     felem_assign(ftmp2, ftmp3);
 772
 773     for (i = 0; i < 64; i++) {
 774         felem_square(tmp, ftmp3);
 775         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 776     }
 777     felem_mul(tmp, ftmp3, ftmp2);
 778     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 779     felem_assign(ftmp2, ftmp3);
 780
 781     for (i = 0; i < 128; i++) {
 782         felem_square(tmp, ftmp3);
 783         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 784     }
 785     felem_mul(tmp, ftmp3, ftmp2);
 786     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 787     felem_assign(ftmp2, ftmp3);
 788
 789     for (i = 0; i < 256; i++) {
 790         felem_square(tmp, ftmp3);
 791         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 792     }
 793     felem_mul(tmp, ftmp3, ftmp2);
 794     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 795
 796     for (i = 0; i < 9; i++) {
 797         felem_square(tmp, ftmp3);
 798         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 799     }
 800     felem_mul(tmp, ftmp3, ftmp4);
 801     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 802     felem_mul(tmp, ftmp3, in);
 803     felem_reduce(out, tmp);     /* 2^512 - 3 */
 804 }
 805
 806 /* This is 2^521-1, expressed as an felem */
 807 static const felem kPrime = {
 808     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 809     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 810     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 811 };
 812
 813 /*-
 814  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 815  * otherwise.
 816  * On entry:
 817  *   in[i] < 2^59 + 2^14
 818  */
 819 static limb felem_is_zero(const felem in)
 820 {
 821     felem ftmp;
 822     limb is_zero, is_p;
 823     felem_assign(ftmp, in);
 824
 825     ftmp[0] += ftmp[8] >> 57;
 826     ftmp[8] &= bottom57bits;
 827     /* ftmp[8] < 2^57 */
 828     ftmp[1] += ftmp[0] >> 58;
 829     ftmp[0] &= bottom58bits;
 830     ftmp[2] += ftmp[1] >> 58;
 831     ftmp[1] &= bottom58bits;
 832     ftmp[3] += ftmp[2] >> 58;
 833     ftmp[2] &= bottom58bits;
 834     ftmp[4] += ftmp[3] >> 58;
 835     ftmp[3] &= bottom58bits;
 836     ftmp[5] += ftmp[4] >> 58;
 837     ftmp[4] &= bottom58bits;
 838     ftmp[6] += ftmp[5] >> 58;
 839     ftmp[5] &= bottom58bits;
 840     ftmp[7] += ftmp[6] >> 58;
 841     ftmp[6] &= bottom58bits;
 842     ftmp[8] += ftmp[7] >> 58;
 843     ftmp[7] &= bottom58bits;
 844     /* ftmp[8] < 2^57 + 4 */
 845
 846     /*
 847      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 848      * than our bound for ftmp[8]. Therefore we only have to check if the
 849      * zero is zero or 2^521-1.
 850      */
 851
 852     is_zero = 0;
 853     is_zero |= ftmp[0];
 854     is_zero |= ftmp[1];
 855     is_zero |= ftmp[2];
 856     is_zero |= ftmp[3];
 857     is_zero |= ftmp[4];
 858     is_zero |= ftmp[5];
 859     is_zero |= ftmp[6];
 860     is_zero |= ftmp[7];
 861     is_zero |= ftmp[8];
 862
 863     is_zero--;
 864     /*
 865      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 866      * can be set is if is_zero was 0 before the decrement.
 867      */
 868     is_zero = ((s64) is_zero) >> 63;
 869
 870     is_p = ftmp[0] ^ kPrime[0];
 871     is_p |= ftmp[1] ^ kPrime[1];
 872     is_p |= ftmp[2] ^ kPrime[2];
 873     is_p |= ftmp[3] ^ kPrime[3];
 874     is_p |= ftmp[4] ^ kPrime[4];
 875     is_p |= ftmp[5] ^ kPrime[5];
 876     is_p |= ftmp[6] ^ kPrime[6];
 877     is_p |= ftmp[7] ^ kPrime[7];
 878     is_p |= ftmp[8] ^ kPrime[8];
 879
 880     is_p--;
 881     is_p = ((s64) is_p) >> 63;
 882
 883     is_zero |= is_p;
 884     return is_zero;
 885 }
 886
 887 static int felem_is_zero_int(const felem in)
 888 {
 889     return (int)(felem_is_zero(in) & ((limb) 1));
 890 }
 891
 892 /*-
 893  * felem_contract converts |in| to its unique, minimal representation.
 894  * On entry:
 895  *   in[i] < 2^59 + 2^14
 896  */
 897 static void felem_contract(felem out, const felem in)
 898 {
 899     limb is_p, is_greater, sign;
 900     static const limb two58 = ((limb) 1) << 58;
 901
 902     felem_assign(out, in);
 903
 904     out[0] += out[8] >> 57;
 905     out[8] &= bottom57bits;
 906     /* out[8] < 2^57 */
 907     out[1] += out[0] >> 58;
 908     out[0] &= bottom58bits;
 909     out[2] += out[1] >> 58;
 910     out[1] &= bottom58bits;
 911     out[3] += out[2] >> 58;
 912     out[2] &= bottom58bits;
 913     out[4] += out[3] >> 58;
 914     out[3] &= bottom58bits;
 915     out[5] += out[4] >> 58;
 916     out[4] &= bottom58bits;
 917     out[6] += out[5] >> 58;
 918     out[5] &= bottom58bits;
 919     out[7] += out[6] >> 58;
 920     out[6] &= bottom58bits;
 921     out[8] += out[7] >> 58;
 922     out[7] &= bottom58bits;
 923     /* out[8] < 2^57 + 4 */
 924
 925     /*
 926      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 927      * out. See the comments in felem_is_zero regarding why we don't test for
 928      * other multiples of the prime.
 929      */
 930
 931     /*
 932      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 933      */
 934
 935     is_p = out[0] ^ kPrime[0];
 936     is_p |= out[1] ^ kPrime[1];
 937     is_p |= out[2] ^ kPrime[2];
 938     is_p |= out[3] ^ kPrime[3];
 939     is_p |= out[4] ^ kPrime[4];
 940     is_p |= out[5] ^ kPrime[5];
 941     is_p |= out[6] ^ kPrime[6];
 942     is_p |= out[7] ^ kPrime[7];
 943     is_p |= out[8] ^ kPrime[8];
 944
 945     is_p--;
 946     is_p &= is_p << 32;
 947     is_p &= is_p << 16;
 948     is_p &= is_p << 8;
 949     is_p &= is_p << 4;
 950     is_p &= is_p << 2;
 951     is_p &= is_p << 1;
 952     is_p = ((s64) is_p) >> 63;
 953     is_p = ~is_p;
 954
 955     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 956
 957     out[0] &= is_p;
 958     out[1] &= is_p;
 959     out[2] &= is_p;
 960     out[3] &= is_p;
 961     out[4] &= is_p;
 962     out[5] &= is_p;
 963     out[6] &= is_p;
 964     out[7] &= is_p;
 965     out[8] &= is_p;
 966
 967     /*
 968      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 969      * 57 is greater than zero as (2^521-1) + x >= 2^522
 970      */
 971     is_greater = out[8] >> 57;
 972     is_greater |= is_greater << 32;
 973     is_greater |= is_greater << 16;
 974     is_greater |= is_greater << 8;
 975     is_greater |= is_greater << 4;
 976     is_greater |= is_greater << 2;
 977     is_greater |= is_greater << 1;
 978     is_greater = ((s64) is_greater) >> 63;
 979
 980     out[0] -= kPrime[0] & is_greater;
 981     out[1] -= kPrime[1] & is_greater;
 982     out[2] -= kPrime[2] & is_greater;
 983     out[3] -= kPrime[3] & is_greater;
 984     out[4] -= kPrime[4] & is_greater;
 985     out[5] -= kPrime[5] & is_greater;
 986     out[6] -= kPrime[6] & is_greater;
 987     out[7] -= kPrime[7] & is_greater;
 988     out[8] -= kPrime[8] & is_greater;
 989
 990     /* Eliminate negative coefficients */
 991     sign = -(out[0] >> 63);
 992     out[0] += (two58 & sign);
 993     out[1] -= (1 & sign);
 994     sign = -(out[1] >> 63);
 995     out[1] += (two58 & sign);
 996     out[2] -= (1 & sign);
 997     sign = -(out[2] >> 63);
 998     out[2] += (two58 & sign);
 999     out[3] -= (1 & sign);
1000     sign = -(out[3] >> 63);
1001     out[3] += (two58 & sign);
1002     out[4] -= (1 & sign);
1003     sign = -(out[4] >> 63);
1004     out[4] += (two58 & sign);
1005     out[5] -= (1 & sign);
1006     sign = -(out[0] >> 63);
1007     out[5] += (two58 & sign);
1008     out[6] -= (1 & sign);
1009     sign = -(out[6] >> 63);
1010     out[6] += (two58 & sign);
1011     out[7] -= (1 & sign);
1012     sign = -(out[7] >> 63);
1013     out[7] += (two58 & sign);
1014     out[8] -= (1 & sign);
1015     sign = -(out[5] >> 63);
1016     out[5] += (two58 & sign);
1017     out[6] -= (1 & sign);
1018     sign = -(out[6] >> 63);
1019     out[6] += (two58 & sign);
1020     out[7] -= (1 & sign);
1021     sign = -(out[7] >> 63);
1022     out[7] += (two58 & sign);
1023     out[8] -= (1 & sign);
1024 }
1025
1026 /*-
1027  * Group operations
1028  * ----------------
1029  *
1030  * Building on top of the field operations we have the operations on the
1031  * elliptic curve group itself. Points on the curve are represented in Jacobian
1032  * coordinates */
1033
1034 /*-
1035  * point_double calcuates 2*(x_in, y_in, z_in)
1036  *
1037  * The method is taken from:
1038  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1039  *
1040  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1041  * while x_out == y_in is not (maybe this works, but it's not tested). */
1042 static void
1043 point_double(felem x_out, felem y_out, felem z_out,
1044              const felem x_in, const felem y_in, const felem z_in)
1045 {
1046     largefelem tmp, tmp2;
1047     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1048
1049     felem_assign(ftmp, x_in);
1050     felem_assign(ftmp2, x_in);
1051
1052     /* delta = z^2 */
1053     felem_square(tmp, z_in);
1054     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1055
1056     /* gamma = y^2 */
1057     felem_square(tmp, y_in);
1058     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1059
1060     /* beta = x*gamma */
1061     felem_mul(tmp, x_in, gamma);
1062     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1063
1064     /* alpha = 3*(x-delta)*(x+delta) */
1065     felem_diff64(ftmp, delta);
1066     /* ftmp[i] < 2^61 */
1067     felem_sum64(ftmp2, delta);
1068     /* ftmp2[i] < 2^60 + 2^15 */
1069     felem_scalar64(ftmp2, 3);
1070     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1071     felem_mul(tmp, ftmp, ftmp2);
1072     /*-
1073      * tmp[i] < 17(3*2^121 + 3*2^76)
1074      *        = 61*2^121 + 61*2^76
1075      *        < 64*2^121 + 64*2^76
1076      *        = 2^127 + 2^82
1077      *        < 2^128
1078      */
1079     felem_reduce(alpha, tmp);
1080
1081     /* x' = alpha^2 - 8*beta */
1082     felem_square(tmp, alpha);
1083     /*
1084      * tmp[i] < 17*2^120 < 2^125
1085      */
1086     felem_assign(ftmp, beta);
1087     felem_scalar64(ftmp, 8);
1088     /* ftmp[i] < 2^62 + 2^17 */
1089     felem_diff_128_64(tmp, ftmp);
1090     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1091     felem_reduce(x_out, tmp);
1092
1093     /* z' = (y + z)^2 - gamma - delta */
1094     felem_sum64(delta, gamma);
1095     /* delta[i] < 2^60 + 2^15 */
1096     felem_assign(ftmp, y_in);
1097     felem_sum64(ftmp, z_in);
1098     /* ftmp[i] < 2^60 + 2^15 */
1099     felem_square(tmp, ftmp);
1100     /*
1101      * tmp[i] < 17(2^122) < 2^127
1102      */
1103     felem_diff_128_64(tmp, delta);
1104     /* tmp[i] < 2^127 + 2^63 */
1105     felem_reduce(z_out, tmp);
1106
1107     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1108     felem_scalar64(beta, 4);
1109     /* beta[i] < 2^61 + 2^16 */
1110     felem_diff64(beta, x_out);
1111     /* beta[i] < 2^61 + 2^60 + 2^16 */
1112     felem_mul(tmp, alpha, beta);
1113     /*-
1114      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1115      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1116      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1117      *        < 2^128
1118      */
1119     felem_square(tmp2, gamma);
1120     /*-
1121      * tmp2[i] < 17*(2^59 + 2^14)^2
1122      *         = 17*(2^118 + 2^74 + 2^28)
1123      */
1124     felem_scalar128(tmp2, 8);
1125     /*-
1126      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1127      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1128      *         < 2^126
1129      */
1130     felem_diff128(tmp, tmp2);
1131     /*-
1132      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1133      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1134      *          2^74 + 2^69 + 2^34 + 2^30
1135      *        < 2^128
1136      */
1137     felem_reduce(y_out, tmp);
1138 }
1139
1140 /* copy_conditional copies in to out iff mask is all ones. */
1141 static void copy_conditional(felem out, const felem in, limb mask)
1142 {
1143     unsigned i;
1144     for (i = 0; i < NLIMBS; ++i) {
1145         const limb tmp = mask & (in[i] ^ out[i]);
1146         out[i] ^= tmp;
1147     }
1148 }
1149
1150 /*-
1151  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1152  *
1153  * The method is taken from
1154  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1155  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1156  *
1157  * This function includes a branch for checking whether the two input points
1158  * are equal (while not equal to the point at infinity). This case never
1159  * happens during single point multiplication, so there is no timing leak for
1160  * ECDH or ECDSA signing. */
1161 static void point_add(felem x3, felem y3, felem z3,
1162                       const felem x1, const felem y1, const felem z1,
1163                       const int mixed, const felem x2, const felem y2,
1164                       const felem z2)
1165 {
1166     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1167     largefelem tmp, tmp2;
1168     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1169
1170     z1_is_zero = felem_is_zero(z1);
1171     z2_is_zero = felem_is_zero(z2);
1172
1173     /* ftmp = z1z1 = z1**2 */
1174     felem_square(tmp, z1);
1175     felem_reduce(ftmp, tmp);
1176
1177     if (!mixed) {
1178         /* ftmp2 = z2z2 = z2**2 */
1179         felem_square(tmp, z2);
1180         felem_reduce(ftmp2, tmp);
1181
1182         /* u1 = ftmp3 = x1*z2z2 */
1183         felem_mul(tmp, x1, ftmp2);
1184         felem_reduce(ftmp3, tmp);
1185
1186         /* ftmp5 = z1 + z2 */
1187         felem_assign(ftmp5, z1);
1188         felem_sum64(ftmp5, z2);
1189         /* ftmp5[i] < 2^61 */
1190
1191         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1192         felem_square(tmp, ftmp5);
1193         /* tmp[i] < 17*2^122 */
1194         felem_diff_128_64(tmp, ftmp);
1195         /* tmp[i] < 17*2^122 + 2^63 */
1196         felem_diff_128_64(tmp, ftmp2);
1197         /* tmp[i] < 17*2^122 + 2^64 */
1198         felem_reduce(ftmp5, tmp);
1199
1200         /* ftmp2 = z2 * z2z2 */
1201         felem_mul(tmp, ftmp2, z2);
1202         felem_reduce(ftmp2, tmp);
1203
1204         /* s1 = ftmp6 = y1 * z2**3 */
1205         felem_mul(tmp, y1, ftmp2);
1206         felem_reduce(ftmp6, tmp);
1207     } else {
1208         /*
1209          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1210          */
1211
1212         /* u1 = ftmp3 = x1*z2z2 */
1213         felem_assign(ftmp3, x1);
1214
1215         /* ftmp5 = 2*z1z2 */
1216         felem_scalar(ftmp5, z1, 2);
1217
1218         /* s1 = ftmp6 = y1 * z2**3 */
1219         felem_assign(ftmp6, y1);
1220     }
1221
1222     /* u2 = x2*z1z1 */
1223     felem_mul(tmp, x2, ftmp);
1224     /* tmp[i] < 17*2^120 */
1225
1226     /* h = ftmp4 = u2 - u1 */
1227     felem_diff_128_64(tmp, ftmp3);
1228     /* tmp[i] < 17*2^120 + 2^63 */
1229     felem_reduce(ftmp4, tmp);
1230
1231     x_equal = felem_is_zero(ftmp4);
1232
1233     /* z_out = ftmp5 * h */
1234     felem_mul(tmp, ftmp5, ftmp4);
1235     felem_reduce(z_out, tmp);
1236
1237     /* ftmp = z1 * z1z1 */
1238     felem_mul(tmp, ftmp, z1);
1239     felem_reduce(ftmp, tmp);
1240
1241     /* s2 = tmp = y2 * z1**3 */
1242     felem_mul(tmp, y2, ftmp);
1243     /* tmp[i] < 17*2^120 */
1244
1245     /* r = ftmp5 = (s2 - s1)*2 */
1246     felem_diff_128_64(tmp, ftmp6);
1247     /* tmp[i] < 17*2^120 + 2^63 */
1248     felem_reduce(ftmp5, tmp);
1249     y_equal = felem_is_zero(ftmp5);
1250     felem_scalar64(ftmp5, 2);
1251     /* ftmp5[i] < 2^61 */
1252
1253     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1254         point_double(x3, y3, z3, x1, y1, z1);
1255         return;
1256     }
1257
1258     /* I = ftmp = (2h)**2 */
1259     felem_assign(ftmp, ftmp4);
1260     felem_scalar64(ftmp, 2);
1261     /* ftmp[i] < 2^61 */
1262     felem_square(tmp, ftmp);
1263     /* tmp[i] < 17*2^122 */
1264     felem_reduce(ftmp, tmp);
1265
1266     /* J = ftmp2 = h * I */
1267     felem_mul(tmp, ftmp4, ftmp);
1268     felem_reduce(ftmp2, tmp);
1269
1270     /* V = ftmp4 = U1 * I */
1271     felem_mul(tmp, ftmp3, ftmp);
1272     felem_reduce(ftmp4, tmp);
1273
1274     /* x_out = r**2 - J - 2V */
1275     felem_square(tmp, ftmp5);
1276     /* tmp[i] < 17*2^122 */
1277     felem_diff_128_64(tmp, ftmp2);
1278     /* tmp[i] < 17*2^122 + 2^63 */
1279     felem_assign(ftmp3, ftmp4);
1280     felem_scalar64(ftmp4, 2);
1281     /* ftmp4[i] < 2^61 */
1282     felem_diff_128_64(tmp, ftmp4);
1283     /* tmp[i] < 17*2^122 + 2^64 */
1284     felem_reduce(x_out, tmp);
1285
1286     /* y_out = r(V-x_out) - 2 * s1 * J */
1287     felem_diff64(ftmp3, x_out);
1288     /*
1289      * ftmp3[i] < 2^60 + 2^60 = 2^61
1290      */
1291     felem_mul(tmp, ftmp5, ftmp3);
1292     /* tmp[i] < 17*2^122 */
1293     felem_mul(tmp2, ftmp6, ftmp2);
1294     /* tmp2[i] < 17*2^120 */
1295     felem_scalar128(tmp2, 2);
1296     /* tmp2[i] < 17*2^121 */
1297     felem_diff128(tmp, tmp2);
1298         /*-
1299          * tmp[i] < 2^127 - 2^69 + 17*2^122
1300          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1301          *        < 2^127
1302          */
1303     felem_reduce(y_out, tmp);
1304
1305     copy_conditional(x_out, x2, z1_is_zero);
1306     copy_conditional(x_out, x1, z2_is_zero);
1307     copy_conditional(y_out, y2, z1_is_zero);
1308     copy_conditional(y_out, y1, z2_is_zero);
1309     copy_conditional(z_out, z2, z1_is_zero);
1310     copy_conditional(z_out, z1, z2_is_zero);
1311     felem_assign(x3, x_out);
1312     felem_assign(y3, y_out);
1313     felem_assign(z3, z_out);
1314 }
1315
1316 /*-
1317  * Base point pre computation
1318  * --------------------------
1319  *
1320  * Two different sorts of precomputed tables are used in the following code.
1321  * Each contain various points on the curve, where each point is three field
1322  * elements (x, y, z).
1323  *
1324  * For the base point table, z is usually 1 (0 for the point at infinity).
1325  * This table has 16 elements:
1326  * index | bits    | point
1327  * ------+---------+------------------------------
1328  *     0 | 0 0 0 0 | 0G
1329  *     1 | 0 0 0 1 | 1G
1330  *     2 | 0 0 1 0 | 2^130G
1331  *     3 | 0 0 1 1 | (2^130 + 1)G
1332  *     4 | 0 1 0 0 | 2^260G
1333  *     5 | 0 1 0 1 | (2^260 + 1)G
1334  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1335  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1336  *     8 | 1 0 0 0 | 2^390G
1337  *     9 | 1 0 0 1 | (2^390 + 1)G
1338  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1339  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1340  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1341  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1342  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1343  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1344  *
1345  * The reason for this is so that we can clock bits into four different
1346  * locations when doing simple scalar multiplies against the base point.
1347  *
1348  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1349
1350 /* gmul is the table of precomputed base points */
1351 static const felem gmul[16][3] = {
1352 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1353  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1354  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1355 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1356   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1357   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1358  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1359   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1360   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1361  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1362 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1363   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1364   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1365  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1366   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1367   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1368  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1369 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1370   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1371   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1372  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1373   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1374   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1375  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1376 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1377   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1378   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1379  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1380   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1381   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1382  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1383 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1384   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1385   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1386  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1387   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1388   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1389  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1390 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1391   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1392   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1393  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1394   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1395   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1396  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1397 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1398   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1399   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1400  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1401   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1402   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1403  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1404 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1405   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1406   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1407  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1408   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1409   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1410  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1411 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1412   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1413   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1414  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1415   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1416   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1417  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1418 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1419   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1420   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1421  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1422   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1423   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1424  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1425 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1426   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1427   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1428  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1429   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1430   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1431  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1432 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1433   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1434   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1435  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1436   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1437   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1438  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1439 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1440   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1441   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1442  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1443   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1444   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1445  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1446 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1447   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1448   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1449  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1450   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1451   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1452  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1453 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1454   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1455   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1456  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1457   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1458   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1459  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1460 };
1461
1462 /*
1463  * select_point selects the |idx|th point from a precomputation table and
1464  * copies it to out.
1465  */
1466  /* pre_comp below is of the size provided in |size| */
1467 static void select_point(const limb idx, unsigned int size,
1468                          const felem pre_comp[][3], felem out[3])
1469 {
1470     unsigned i, j;
1471     limb *outlimbs = &out[0][0];
1472
1473     memset(out, 0, sizeof(*out) * 3);
1474
1475     for (i = 0; i < size; i++) {
1476         const limb *inlimbs = &pre_comp[i][0][0];
1477         limb mask = i ^ idx;
1478         mask |= mask >> 4;
1479         mask |= mask >> 2;
1480         mask |= mask >> 1;
1481         mask &= 1;
1482         mask--;
1483         for (j = 0; j < NLIMBS * 3; j++)
1484             outlimbs[j] |= inlimbs[j] & mask;
1485     }
1486 }
1487
1488 /* get_bit returns the |i|th bit in |in| */
1489 static char get_bit(const felem_bytearray in, int i)
1490 {
1491     if (i < 0)
1492         return 0;
1493     return (in[i >> 3] >> (i & 7)) & 1;
1494 }
1495
1496 /*
1497  * Interleaved point multiplication using precomputed point multiples: The
1498  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1499  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1500  * generator, using certain (large) precomputed multiples in g_pre_comp.
1501  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1502  */
1503 static void batch_mul(felem x_out, felem y_out, felem z_out,
1504                       const felem_bytearray scalars[],
1505                       const unsigned num_points, const u8 *g_scalar,
1506                       const int mixed, const felem pre_comp[][17][3],
1507                       const felem g_pre_comp[16][3])
1508 {
1509     int i, skip;
1510     unsigned num, gen_mul = (g_scalar != NULL);
1511     felem nq[3], tmp[4];
1512     limb bits;
1513     u8 sign, digit;
1514
1515     /* set nq to the point at infinity */
1516     memset(nq, 0, sizeof(nq));
1517
1518     /*
1519      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1520      * of the generator (last quarter of rounds) and additions of other
1521      * points multiples (every 5th round).
1522      */
1523     skip = 1;                   /* save two point operations in the first
1524                                  * round */
1525     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1526         /* double */
1527         if (!skip)
1528             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1529
1530         /* add multiples of the generator */
1531         if (gen_mul && (i <= 130)) {
1532             bits = get_bit(g_scalar, i + 390) << 3;
1533             if (i < 130) {
1534                 bits |= get_bit(g_scalar, i + 260) << 2;
1535                 bits |= get_bit(g_scalar, i + 130) << 1;
1536                 bits |= get_bit(g_scalar, i);
1537             }
1538             /* select the point to add, in constant time */
1539             select_point(bits, 16, g_pre_comp, tmp);
1540             if (!skip) {
1541                 /* The 1 argument below is for "mixed" */
1542                 point_add(nq[0], nq[1], nq[2],
1543                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1544             } else {
1545                 memcpy(nq, tmp, 3 * sizeof(felem));
1546                 skip = 0;
1547             }
1548         }
1549
1550         /* do other additions every 5 doublings */
1551         if (num_points && (i % 5 == 0)) {
1552             /* loop over all scalars */
1553             for (num = 0; num < num_points; ++num) {
1554                 bits = get_bit(scalars[num], i + 4) << 5;
1555                 bits |= get_bit(scalars[num], i + 3) << 4;
1556                 bits |= get_bit(scalars[num], i + 2) << 3;
1557                 bits |= get_bit(scalars[num], i + 1) << 2;
1558                 bits |= get_bit(scalars[num], i) << 1;
1559                 bits |= get_bit(scalars[num], i - 1);
1560                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1561
1562                 /*
1563                  * select the point to add or subtract, in constant time
1564                  */
1565                 select_point(digit, 17, pre_comp[num], tmp);
1566                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1567                                             * point */
1568                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1569
1570                 if (!skip) {
1571                     point_add(nq[0], nq[1], nq[2],
1572                               nq[0], nq[1], nq[2],
1573                               mixed, tmp[0], tmp[1], tmp[2]);
1574                 } else {
1575                     memcpy(nq, tmp, 3 * sizeof(felem));
1576                     skip = 0;
1577                 }
1578             }
1579         }
1580     }
1581     felem_assign(x_out, nq[0]);
1582     felem_assign(y_out, nq[1]);
1583     felem_assign(z_out, nq[2]);
1584 }
1585
1586 /* Precomputation for the group generator. */
1587 struct nistp521_pre_comp_st {
1588     felem g_pre_comp[16][3];
1589     int references;
1590 };
1591
1592 const EC_METHOD *EC_GFp_nistp521_method(void)
1593 {
1594     static const EC_METHOD ret = {
1595         EC_FLAGS_DEFAULT_OCT,
1596         NID_X9_62_prime_field,
1597         ec_GFp_nistp521_group_init,
1598         ec_GFp_simple_group_finish,
1599         ec_GFp_simple_group_clear_finish,
1600         ec_GFp_nist_group_copy,
1601         ec_GFp_nistp521_group_set_curve,
1602         ec_GFp_simple_group_get_curve,
1603         ec_GFp_simple_group_get_degree,
1604         ec_GFp_simple_group_check_discriminant,
1605         ec_GFp_simple_point_init,
1606         ec_GFp_simple_point_finish,
1607         ec_GFp_simple_point_clear_finish,
1608         ec_GFp_simple_point_copy,
1609         ec_GFp_simple_point_set_to_infinity,
1610         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1611         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1612         ec_GFp_simple_point_set_affine_coordinates,
1613         ec_GFp_nistp521_point_get_affine_coordinates,
1614         0 /* point_set_compressed_coordinates */ ,
1615         0 /* point2oct */ ,
1616         0 /* oct2point */ ,
1617         ec_GFp_simple_add,
1618         ec_GFp_simple_dbl,
1619         ec_GFp_simple_invert,
1620         ec_GFp_simple_is_at_infinity,
1621         ec_GFp_simple_is_on_curve,
1622         ec_GFp_simple_cmp,
1623         ec_GFp_simple_make_affine,
1624         ec_GFp_simple_points_make_affine,
1625         ec_GFp_nistp521_points_mul,
1626         ec_GFp_nistp521_precompute_mult,
1627         ec_GFp_nistp521_have_precompute_mult,
1628         ec_GFp_nist_field_mul,
1629         ec_GFp_nist_field_sqr,
1630         0 /* field_div */ ,
1631         0 /* field_encode */ ,
1632         0 /* field_decode */ ,
1633         0                       /* field_set_to_one */
1634     };
1635
1636     return &ret;
1637 }
1638
1639 /******************************************************************************/
1640 /*
1641  * FUNCTIONS TO MANAGE PRECOMPUTATION
1642  */
1643
1644 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1645 {
1646     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1647
1648     if (ret == NULL) {
1649         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1650         return ret;
1651     }
1652     ret->references = 1;
1653     return ret;
1654 }
1655
1656 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1657 {
1658     if (p != NULL)
1659         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1660     return p;
1661 }
1662
1663 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1664 {
1665     if (p == NULL
1666             || CRYPTO_add(&p->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1667         return;
1668     OPENSSL_free(p);
1669 }
1670
1671 /******************************************************************************/
1672 /*
1673  * OPENSSL EC_METHOD FUNCTIONS
1674  */
1675
1676 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1677 {
1678     int ret;
1679     ret = ec_GFp_simple_group_init(group);
1680     group->a_is_minus3 = 1;
1681     return ret;
1682 }
1683
1684 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1685                                     const BIGNUM *a, const BIGNUM *b,
1686                                     BN_CTX *ctx)
1687 {
1688     int ret = 0;
1689     BN_CTX *new_ctx = NULL;
1690     BIGNUM *curve_p, *curve_a, *curve_b;
1691
1692     if (ctx == NULL)
1693         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1694             return 0;
1695     BN_CTX_start(ctx);
1696     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1697         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1698         ((curve_b = BN_CTX_get(ctx)) == NULL))
1699         goto err;
1700     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1701     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1702     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1703     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1704         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1705               EC_R_WRONG_CURVE_PARAMETERS);
1706         goto err;
1707     }
1708     group->field_mod_func = BN_nist_mod_521;
1709     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1710  err:
1711     BN_CTX_end(ctx);
1712     BN_CTX_free(new_ctx);
1713     return ret;
1714 }
1715
1716 /*
1717  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1718  * (X/Z^2, Y/Z^3)
1719  */
1720 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1721                                                  const EC_POINT *point,
1722                                                  BIGNUM *x, BIGNUM *y,
1723                                                  BN_CTX *ctx)
1724 {
1725     felem z1, z2, x_in, y_in, x_out, y_out;
1726     largefelem tmp;
1727
1728     if (EC_POINT_is_at_infinity(group, point)) {
1729         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1730               EC_R_POINT_AT_INFINITY);
1731         return 0;
1732     }
1733     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1734         (!BN_to_felem(z1, point->Z)))
1735         return 0;
1736     felem_inv(z2, z1);
1737     felem_square(tmp, z2);
1738     felem_reduce(z1, tmp);
1739     felem_mul(tmp, x_in, z1);
1740     felem_reduce(x_in, tmp);
1741     felem_contract(x_out, x_in);
1742     if (x != NULL) {
1743         if (!felem_to_BN(x, x_out)) {
1744             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1745                   ERR_R_BN_LIB);
1746             return 0;
1747         }
1748     }
1749     felem_mul(tmp, z1, z2);
1750     felem_reduce(z1, tmp);
1751     felem_mul(tmp, y_in, z1);
1752     felem_reduce(y_in, tmp);
1753     felem_contract(y_out, y_in);
1754     if (y != NULL) {
1755         if (!felem_to_BN(y, y_out)) {
1756             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1757                   ERR_R_BN_LIB);
1758             return 0;
1759         }
1760     }
1761     return 1;
1762 }
1763
1764 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1765 static void make_points_affine(size_t num, felem points[][3],
1766                                felem tmp_felems[])
1767 {
1768     /*
1769      * Runs in constant time, unless an input is the point at infinity (which
1770      * normally shouldn't happen).
1771      */
1772     ec_GFp_nistp_points_make_affine_internal(num,
1773                                              points,
1774                                              sizeof(felem),
1775                                              tmp_felems,
1776                                              (void (*)(void *))felem_one,
1777                                              (int (*)(const void *))
1778                                              felem_is_zero_int,
1779                                              (void (*)(void *, const void *))
1780                                              felem_assign,
1781                                              (void (*)(void *, const void *))
1782                                              felem_square_reduce, (void (*)
1783                                                                    (void *,
1784                                                                     const void
1785                                                                     *,
1786                                                                     const void
1787                                                                     *))
1788                                              felem_mul_reduce,
1789                                              (void (*)(void *, const void *))
1790                                              felem_inv,
1791                                              (void (*)(void *, const void *))
1792                                              felem_contract);
1793 }
1794
1795 /*
1796  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1797  * values Result is stored in r (r can equal one of the inputs).
1798  */
1799 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1800                                const BIGNUM *scalar, size_t num,
1801                                const EC_POINT *points[],
1802                                const BIGNUM *scalars[], BN_CTX *ctx)
1803 {
1804     int ret = 0;
1805     int j;
1806     int mixed = 0;
1807     BN_CTX *new_ctx = NULL;
1808     BIGNUM *x, *y, *z, *tmp_scalar;
1809     felem_bytearray g_secret;
1810     felem_bytearray *secrets = NULL;
1811     felem (*pre_comp)[17][3] = NULL;
1812     felem *tmp_felems = NULL;
1813     felem_bytearray tmp;
1814     unsigned i, num_bytes;
1815     int have_pre_comp = 0;
1816     size_t num_points = num;
1817     felem x_in, y_in, z_in, x_out, y_out, z_out;
1818     NISTP521_PRE_COMP *pre = NULL;
1819     felem(*g_pre_comp)[3] = NULL;
1820     EC_POINT *generator = NULL;
1821     const EC_POINT *p = NULL;
1822     const BIGNUM *p_scalar = NULL;
1823
1824     if (ctx == NULL)
1825         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1826             return 0;
1827     BN_CTX_start(ctx);
1828     if (((x = BN_CTX_get(ctx)) == NULL) ||
1829         ((y = BN_CTX_get(ctx)) == NULL) ||
1830         ((z = BN_CTX_get(ctx)) == NULL) ||
1831         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1832         goto err;
1833
1834     if (scalar != NULL) {
1835         pre = group->pre_comp.nistp521;
1836         if (pre)
1837             /* we have precomputation, try to use it */
1838             g_pre_comp = &pre->g_pre_comp[0];
1839         else
1840             /* try to use the standard precomputation */
1841             g_pre_comp = (felem(*)[3]) gmul;
1842         generator = EC_POINT_new(group);
1843         if (generator == NULL)
1844             goto err;
1845         /* get the generator from precomputation */
1846         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1847             !felem_to_BN(y, g_pre_comp[1][1]) ||
1848             !felem_to_BN(z, g_pre_comp[1][2])) {
1849             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1850             goto err;
1851         }
1852         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1853                                                       generator, x, y, z,
1854                                                       ctx))
1855             goto err;
1856         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1857             /* precomputation matches generator */
1858             have_pre_comp = 1;
1859         else
1860             /*
1861              * we don't have valid precomputation: treat the generator as a
1862              * random point
1863              */
1864             num_points++;
1865     }
1866
1867     if (num_points > 0) {
1868         if (num_points >= 2) {
1869             /*
1870              * unless we precompute multiples for just one point, converting
1871              * those into affine form is time well spent
1872              */
1873             mixed = 1;
1874         }
1875         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1876         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1877         if (mixed)
1878             tmp_felems =
1879                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1880         if ((secrets == NULL) || (pre_comp == NULL)
1881             || (mixed && (tmp_felems == NULL))) {
1882             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1883             goto err;
1884         }
1885
1886         /*
1887          * we treat NULL scalars as 0, and NULL points as points at infinity,
1888          * i.e., they contribute nothing to the linear combination
1889          */
1890         for (i = 0; i < num_points; ++i) {
1891             if (i == num)
1892                 /*
1893                  * we didn't have a valid precomputation, so we pick the
1894                  * generator
1895                  */
1896             {
1897                 p = EC_GROUP_get0_generator(group);
1898                 p_scalar = scalar;
1899             } else
1900                 /* the i^th point */
1901             {
1902                 p = points[i];
1903                 p_scalar = scalars[i];
1904             }
1905             if ((p_scalar != NULL) && (p != NULL)) {
1906                 /* reduce scalar to 0 <= scalar < 2^521 */
1907                 if ((BN_num_bits(p_scalar) > 521)
1908                     || (BN_is_negative(p_scalar))) {
1909                     /*
1910                      * this is an unusual input, and we don't guarantee
1911                      * constant-timeness
1912                      */
1913                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1914                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1915                         goto err;
1916                     }
1917                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1918                 } else
1919                     num_bytes = BN_bn2bin(p_scalar, tmp);
1920                 flip_endian(secrets[i], tmp, num_bytes);
1921                 /* precompute multiples */
1922                 if ((!BN_to_felem(x_out, p->X)) ||
1923                     (!BN_to_felem(y_out, p->Y)) ||
1924                     (!BN_to_felem(z_out, p->Z)))
1925                     goto err;
1926                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1927                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1928                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1929                 for (j = 2; j <= 16; ++j) {
1930                     if (j & 1) {
1931                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1932                                   pre_comp[i][j][2], pre_comp[i][1][0],
1933                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1934                                   pre_comp[i][j - 1][0],
1935                                   pre_comp[i][j - 1][1],
1936                                   pre_comp[i][j - 1][2]);
1937                     } else {
1938                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1939                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1940                                      pre_comp[i][j / 2][1],
1941                                      pre_comp[i][j / 2][2]);
1942                     }
1943                 }
1944             }
1945         }
1946         if (mixed)
1947             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1948     }
1949
1950     /* the scalar for the generator */
1951     if ((scalar != NULL) && (have_pre_comp)) {
1952         memset(g_secret, 0, sizeof(g_secret));
1953         /* reduce scalar to 0 <= scalar < 2^521 */
1954         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1955             /*
1956              * this is an unusual input, and we don't guarantee
1957              * constant-timeness
1958              */
1959             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1960                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1961                 goto err;
1962             }
1963             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1964         } else
1965             num_bytes = BN_bn2bin(scalar, tmp);
1966         flip_endian(g_secret, tmp, num_bytes);
1967         /* do the multiplication with generator precomputation */
1968         batch_mul(x_out, y_out, z_out,
1969                   (const felem_bytearray(*))secrets, num_points,
1970                   g_secret,
1971                   mixed, (const felem(*)[17][3])pre_comp,
1972                   (const felem(*)[3])g_pre_comp);
1973     } else
1974         /* do the multiplication without generator precomputation */
1975         batch_mul(x_out, y_out, z_out,
1976                   (const felem_bytearray(*))secrets, num_points,
1977                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1978     /* reduce the output to its unique minimal representation */
1979     felem_contract(x_in, x_out);
1980     felem_contract(y_in, y_out);
1981     felem_contract(z_in, z_out);
1982     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1983         (!felem_to_BN(z, z_in))) {
1984         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1985         goto err;
1986     }
1987     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1988
1989  err:
1990     BN_CTX_end(ctx);
1991     EC_POINT_free(generator);
1992     BN_CTX_free(new_ctx);
1993     OPENSSL_free(secrets);
1994     OPENSSL_free(pre_comp);
1995     OPENSSL_free(tmp_felems);
1996     return ret;
1997 }
1998
1999 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2000 {
2001     int ret = 0;
2002     NISTP521_PRE_COMP *pre = NULL;
2003     int i, j;
2004     BN_CTX *new_ctx = NULL;
2005     BIGNUM *x, *y;
2006     EC_POINT *generator = NULL;
2007     felem tmp_felems[16];
2008
2009     /* throw away old precomputation */
2010     EC_pre_comp_free(group);
2011     if (ctx == NULL)
2012         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2013             return 0;
2014     BN_CTX_start(ctx);
2015     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2016         goto err;
2017     /* get the generator */
2018     if (group->generator == NULL)
2019         goto err;
2020     generator = EC_POINT_new(group);
2021     if (generator == NULL)
2022         goto err;
2023     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2024     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2025     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2026         goto err;
2027     if ((pre = nistp521_pre_comp_new()) == NULL)
2028         goto err;
2029     /*
2030      * if the generator is the standard one, use built-in precomputation
2031      */
2032     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2033         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2034         ret = 1;
2035         goto err;
2036     }
2037     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2038         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2039         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2040         goto err;
2041     /* compute 2^130*G, 2^260*G, 2^390*G */
2042     for (i = 1; i <= 4; i <<= 1) {
2043         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2044                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2045                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2046         for (j = 0; j < 129; ++j) {
2047             point_double(pre->g_pre_comp[2 * i][0],
2048                          pre->g_pre_comp[2 * i][1],
2049                          pre->g_pre_comp[2 * i][2],
2050                          pre->g_pre_comp[2 * i][0],
2051                          pre->g_pre_comp[2 * i][1],
2052                          pre->g_pre_comp[2 * i][2]);
2053         }
2054     }
2055     /* g_pre_comp[0] is the point at infinity */
2056     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2057     /* the remaining multiples */
2058     /* 2^130*G + 2^260*G */
2059     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2060               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2061               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2062               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2063               pre->g_pre_comp[2][2]);
2064     /* 2^130*G + 2^390*G */
2065     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2066               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2067               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2068               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2069               pre->g_pre_comp[2][2]);
2070     /* 2^260*G + 2^390*G */
2071     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2072               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2073               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2074               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2075               pre->g_pre_comp[4][2]);
2076     /* 2^130*G + 2^260*G + 2^390*G */
2077     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2078               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2079               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2080               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2081               pre->g_pre_comp[2][2]);
2082     for (i = 1; i < 8; ++i) {
2083         /* odd multiples: add G */
2084         point_add(pre->g_pre_comp[2 * i + 1][0],
2085                   pre->g_pre_comp[2 * i + 1][1],
2086                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2087                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2088                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2089                   pre->g_pre_comp[1][2]);
2090     }
2091     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2092
2093     SETPRECOMP(group, nistp521, pre);
2094     ret = 1;
2095     pre = NULL;
2096  err:
2097     BN_CTX_end(ctx);
2098     EC_POINT_free(generator);
2099     BN_CTX_free(new_ctx);
2100     EC_nistp521_pre_comp_free(pre);
2101     return ret;
2102 }
2103
2104 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2105 {
2106     return HAVEPRECOMP(group, nistp521);
2107 }
2108
2109 #else
2110 static void *dummy = &dummy;
2111 #endif