crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 #include <stdint.h>
  33 #include <string.h>
  34 #include <openssl/err.h>
  35 #include "ec_lcl.h"
  36
  37 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39   typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
  40 #else
  41   #error "Need GCC 3.1 or later to define type uint128_t"
  42 #endif
  43
  44 typedef uint8_t u8;
  45 typedef uint64_t u64;
  46 typedef int64_t s64;
  47
  48 /* The underlying field.
  49  *
  50  * P521 operates over GF(2^521-1). We can serialise an element of this field
  51  * into 66 bytes where the most significant byte contains only a single bit. We
  52  * call this an felem_bytearray. */
  53
  54 typedef u8 felem_bytearray[66];
  55
  56 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  57  * These values are big-endian. */
  58 static const felem_bytearray nistp521_curve_params[5] =
  59         {
  60         {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
  61          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  62          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  63          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  64          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  65          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  66          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  67          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  68          0xff, 0xff},
  69         {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
  70          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77          0xff, 0xfc},
  78         {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
  79          0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  80          0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  81          0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  82          0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  83          0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  84          0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  85          0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  86          0x3f, 0x00},
  87         {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
  88          0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  89          0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  90          0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  91          0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  92          0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
  93          0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
  94          0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
  95          0xbd, 0x66},
  96         {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
  97          0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
  98          0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
  99          0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 100          0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 101          0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 102          0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 103          0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 104          0x66, 0x50}
 105         };
 106
 107 /* The representation of field elements.
 108  * ------------------------------------
 109  *
 110  * We represent field elements with nine values. These values are either 64 or
 111  * 128 bits and the field element represented is:
 112  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 113  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 114  * 58 bits apart, but are greater than 58 bits in length, the most significant
 115  * bits of each limb overlap with the least significant bits of the next.
 116  *
 117  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 118  * 'largefelem' */
 119
 120 #define NLIMBS 9
 121
 122 typedef uint64_t limb;
 123 typedef limb felem[NLIMBS];
 124 typedef uint128_t largefelem[NLIMBS];
 125
 126 static const limb bottom57bits = 0x1ffffffffffffff;
 127 static const limb bottom58bits = 0x3ffffffffffffff;
 128
 129 /* bin66_to_felem takes a little-endian byte array and converts it into felem
 130  * form. This assumes that the CPU is little-endian. */
 131 static void bin66_to_felem(felem out, const u8 in[66])
 132         {
 133         out[0] = (*((limb*) &in[0])) & bottom58bits;
 134         out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
 135         out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
 136         out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
 137         out[4] = (*((limb*) &in[29])) & bottom58bits;
 138         out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
 139         out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
 140         out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
 141         out[8] = (*((limb*) &in[58])) & bottom57bits;
 142         }
 143
 144 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 145  * array. This assumes that the CPU is little-endian. */
 146 static void felem_to_bin66(u8 out[66], const felem in)
 147         {
 148         memset(out, 0, 66);
 149         (*((limb*) &out[0])) = in[0];
 150         (*((limb*) &out[7])) |= in[1] << 2;
 151         (*((limb*) &out[14])) |= in[2] << 4;
 152         (*((limb*) &out[21])) |= in[3] << 6;
 153         (*((limb*) &out[29])) = in[4];
 154         (*((limb*) &out[36])) |= in[5] << 2;
 155         (*((limb*) &out[43])) |= in[6] << 4;
 156         (*((limb*) &out[50])) |= in[7] << 6;
 157         (*((limb*) &out[58])) = in[8];
 158         }
 159
 160 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 161 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 162         {
 163         unsigned i;
 164         for (i = 0; i < len; ++i)
 165                 out[i] = in[len-1-i];
 166         }
 167
 168 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 169 static int BN_to_felem(felem out, const BIGNUM *bn)
 170         {
 171         felem_bytearray b_in;
 172         felem_bytearray b_out;
 173         unsigned num_bytes;
 174
 175         /* BN_bn2bin eats leading zeroes */
 176         memset(b_out, 0, sizeof b_out);
 177         num_bytes = BN_num_bytes(bn);
 178         if (num_bytes > sizeof b_out)
 179                 {
 180                 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 181                 return 0;
 182                 }
 183         if (BN_is_negative(bn))
 184                 {
 185                 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 186                 return 0;
 187                 }
 188         num_bytes = BN_bn2bin(bn, b_in);
 189         flip_endian(b_out, b_in, num_bytes);
 190         bin66_to_felem(out, b_out);
 191         return 1;
 192         }
 193
 194 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 195 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 196         {
 197         felem_bytearray b_in, b_out;
 198         felem_to_bin66(b_in, in);
 199         flip_endian(b_out, b_in, sizeof b_out);
 200         return BN_bin2bn(b_out, sizeof b_out, out);
 201         }
 202
 203
 204 /* Field operations
 205  * ---------------- */
 206
 207 static void felem_one(felem out)
 208         {
 209         out[0] = 1;
 210         out[1] = 0;
 211         out[2] = 0;
 212         out[3] = 0;
 213         out[4] = 0;
 214         out[5] = 0;
 215         out[6] = 0;
 216         out[7] = 0;
 217         out[8] = 0;
 218         }
 219
 220 static void felem_assign(felem out, const felem in)
 221         {
 222         out[0] = in[0];
 223         out[1] = in[1];
 224         out[2] = in[2];
 225         out[3] = in[3];
 226         out[4] = in[4];
 227         out[5] = in[5];
 228         out[6] = in[6];
 229         out[7] = in[7];
 230         out[8] = in[8];
 231         }
 232
 233 /* felem_sum64 sets out = out + in. */
 234 static void felem_sum64(felem out, const felem in)
 235         {
 236         out[0] += in[0];
 237         out[1] += in[1];
 238         out[2] += in[2];
 239         out[3] += in[3];
 240         out[4] += in[4];
 241         out[5] += in[5];
 242         out[6] += in[6];
 243         out[7] += in[7];
 244         out[8] += in[8];
 245         }
 246
 247 /* felem_scalar sets out = in * scalar */
 248 static void felem_scalar(felem out, const felem in, limb scalar)
 249         {
 250         out[0] = in[0] * scalar;
 251         out[1] = in[1] * scalar;
 252         out[2] = in[2] * scalar;
 253         out[3] = in[3] * scalar;
 254         out[4] = in[4] * scalar;
 255         out[5] = in[5] * scalar;
 256         out[6] = in[6] * scalar;
 257         out[7] = in[7] * scalar;
 258         out[8] = in[8] * scalar;
 259         }
 260
 261 /* felem_scalar64 sets out = out * scalar */
 262 static void felem_scalar64(felem out, limb scalar)
 263         {
 264         out[0] *= scalar;
 265         out[1] *= scalar;
 266         out[2] *= scalar;
 267         out[3] *= scalar;
 268         out[4] *= scalar;
 269         out[5] *= scalar;
 270         out[6] *= scalar;
 271         out[7] *= scalar;
 272         out[8] *= scalar;
 273         }
 274
 275 /* felem_scalar128 sets out = out * scalar */
 276 static void felem_scalar128(largefelem out, limb scalar)
 277         {
 278         out[0] *= scalar;
 279         out[1] *= scalar;
 280         out[2] *= scalar;
 281         out[3] *= scalar;
 282         out[4] *= scalar;
 283         out[5] *= scalar;
 284         out[6] *= scalar;
 285         out[7] *= scalar;
 286         out[8] *= scalar;
 287         }
 288
 289 /* felem_neg sets |out| to |-in|
 290  * On entry:
 291  *   in[i] < 2^59 + 2^14
 292  * On exit:
 293  *   out[i] < 2^62
 294  */
 295 static void felem_neg(felem out, const felem in)
 296         {
 297         /* In order to prevent underflow, we subtract from 0 mod p. */
 298         static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
 299         static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
 300
 301         out[0] = two62m3 - in[0];
 302         out[1] = two62m2 - in[1];
 303         out[2] = two62m2 - in[2];
 304         out[3] = two62m2 - in[3];
 305         out[4] = two62m2 - in[4];
 306         out[5] = two62m2 - in[5];
 307         out[6] = two62m2 - in[6];
 308         out[7] = two62m2 - in[7];
 309         out[8] = two62m2 - in[8];
 310         }
 311
 312 /* felem_diff64 subtracts |in| from |out|
 313  * On entry:
 314  *   in[i] < 2^59 + 2^14
 315  * On exit:
 316  *   out[i] < out[i] + 2^62
 317  */
 318 static void felem_diff64(felem out, const felem in)
 319         {
 320         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 321         static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
 322         static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
 323
 324         out[0] += two62m3 - in[0];
 325         out[1] += two62m2 - in[1];
 326         out[2] += two62m2 - in[2];
 327         out[3] += two62m2 - in[3];
 328         out[4] += two62m2 - in[4];
 329         out[5] += two62m2 - in[5];
 330         out[6] += two62m2 - in[6];
 331         out[7] += two62m2 - in[7];
 332         out[8] += two62m2 - in[8];
 333         }
 334
 335 /* felem_diff_128_64 subtracts |in| from |out|
 336  * On entry:
 337  *   in[i] < 2^62 + 2^17
 338  * On exit:
 339  *   out[i] < out[i] + 2^63
 340  */
 341 static void felem_diff_128_64(largefelem out, const felem in)
 342         {
 343         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 344         static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
 345         static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
 346
 347         out[0] += two63m6 - in[0];
 348         out[1] += two63m5 - in[1];
 349         out[2] += two63m5 - in[2];
 350         out[3] += two63m5 - in[3];
 351         out[4] += two63m5 - in[4];
 352         out[5] += two63m5 - in[5];
 353         out[6] += two63m5 - in[6];
 354         out[7] += two63m5 - in[7];
 355         out[8] += two63m5 - in[8];
 356         }
 357
 358 /* felem_diff_128_64 subtracts |in| from |out|
 359  * On entry:
 360  *   in[i] < 2^126
 361  * On exit:
 362  *   out[i] < out[i] + 2^127 - 2^69
 363  */
 364 static void felem_diff128(largefelem out, const largefelem in)
 365         {
 366         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 367         static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
 368         static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
 369
 370         out[0] += (two127m70 - in[0]);
 371         out[1] += (two127m69 - in[1]);
 372         out[2] += (two127m69 - in[2]);
 373         out[3] += (two127m69 - in[3]);
 374         out[4] += (two127m69 - in[4]);
 375         out[5] += (two127m69 - in[5]);
 376         out[6] += (two127m69 - in[6]);
 377         out[7] += (two127m69 - in[7]);
 378         out[8] += (two127m69 - in[8]);
 379         }
 380
 381 /* felem_square sets |out| = |in|^2
 382  * On entry:
 383  *   in[i] < 2^62
 384  * On exit:
 385  *   out[i] < 17 * max(in[i]) * max(in[i])
 386  */
 387 static void felem_square(largefelem out, const felem in)
 388         {
 389         felem inx2, inx4;
 390         felem_scalar(inx2, in, 2);
 391         felem_scalar(inx4, in, 4);
 392
 393         /* We have many cases were we want to do
 394          *   in[x] * in[y] +
 395          *   in[y] * in[x]
 396          * This is obviously just
 397          *   2 * in[x] * in[y]
 398          * However, rather than do the doubling on the 128 bit result, we
 399          * double one of the inputs to the multiplication by reading from
 400          * |inx2| */
 401
 402         out[0] = ((uint128_t) in[0]) * in[0];
 403         out[1] = ((uint128_t) in[0]) * inx2[1];
 404         out[2] = ((uint128_t) in[0]) * inx2[2] +
 405                  ((uint128_t) in[1]) * in[1];
 406         out[3] = ((uint128_t) in[0]) * inx2[3] +
 407                  ((uint128_t) in[1]) * inx2[2];
 408         out[4] = ((uint128_t) in[0]) * inx2[4] +
 409                  ((uint128_t) in[1]) * inx2[3] +
 410                  ((uint128_t) in[2]) * in[2];
 411         out[5] = ((uint128_t) in[0]) * inx2[5] +
 412                  ((uint128_t) in[1]) * inx2[4] +
 413                  ((uint128_t) in[2]) * inx2[3];
 414         out[6] = ((uint128_t) in[0]) * inx2[6] +
 415                  ((uint128_t) in[1]) * inx2[5] +
 416                  ((uint128_t) in[2]) * inx2[4] +
 417                  ((uint128_t) in[3]) * in[3];
 418         out[7] = ((uint128_t) in[0]) * inx2[7] +
 419                  ((uint128_t) in[1]) * inx2[6] +
 420                  ((uint128_t) in[2]) * inx2[5] +
 421                  ((uint128_t) in[3]) * inx2[4];
 422         out[8] = ((uint128_t) in[0]) * inx2[8] +
 423                  ((uint128_t) in[1]) * inx2[7] +
 424                  ((uint128_t) in[2]) * inx2[6] +
 425                  ((uint128_t) in[3]) * inx2[5] +
 426                  ((uint128_t) in[4]) * in[4];
 427
 428         /* The remaining limbs fall above 2^521, with the first falling at
 429          * 2^522. They correspond to locations one bit up from the limbs
 430          * produced above so we would have to multiply by two to align them.
 431          * Again, rather than operate on the 128-bit result, we double one of
 432          * the inputs to the multiplication. If we want to double for both this
 433          * reason, and the reason above, then we end up multiplying by four. */
 434
 435         /* 9 */
 436         out[0] += ((uint128_t) in[1]) * inx4[8] +
 437                   ((uint128_t) in[2]) * inx4[7] +
 438                   ((uint128_t) in[3]) * inx4[6] +
 439                   ((uint128_t) in[4]) * inx4[5];
 440
 441         /* 10 */
 442         out[1] += ((uint128_t) in[2]) * inx4[8] +
 443                   ((uint128_t) in[3]) * inx4[7] +
 444                   ((uint128_t) in[4]) * inx4[6] +
 445                   ((uint128_t) in[5]) * inx2[5];
 446
 447         /* 11 */
 448         out[2] += ((uint128_t) in[3]) * inx4[8] +
 449                   ((uint128_t) in[4]) * inx4[7] +
 450                   ((uint128_t) in[5]) * inx4[6];
 451
 452         /* 12 */
 453         out[3] += ((uint128_t) in[4]) * inx4[8] +
 454                   ((uint128_t) in[5]) * inx4[7] +
 455                   ((uint128_t) in[6]) * inx2[6];
 456
 457         /* 13 */
 458         out[4] += ((uint128_t) in[5]) * inx4[8] +
 459                   ((uint128_t) in[6]) * inx4[7];
 460
 461         /* 14 */
 462         out[5] += ((uint128_t) in[6]) * inx4[8] +
 463                   ((uint128_t) in[7]) * inx2[7];
 464
 465         /* 15 */
 466         out[6] += ((uint128_t) in[7]) * inx4[8];
 467
 468         /* 16 */
 469         out[7] += ((uint128_t) in[8]) * inx2[8];
 470         }
 471
 472 /* felem_mul sets |out| = |in1| * |in2|
 473  * On entry:
 474  *   in1[i] < 2^64
 475  *   in2[i] < 2^63
 476  * On exit:
 477  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 478  */
 479 static void felem_mul(largefelem out, const felem in1, const felem in2)
 480         {
 481         felem in2x2;
 482         felem_scalar(in2x2, in2, 2);
 483
 484         out[0] = ((uint128_t) in1[0]) * in2[0];
 485
 486         out[1] = ((uint128_t) in1[0]) * in2[1] +
 487                  ((uint128_t) in1[1]) * in2[0];
 488
 489         out[2] = ((uint128_t) in1[0]) * in2[2] +
 490                  ((uint128_t) in1[1]) * in2[1] +
 491                  ((uint128_t) in1[2]) * in2[0];
 492
 493         out[3] = ((uint128_t) in1[0]) * in2[3] +
 494                  ((uint128_t) in1[1]) * in2[2] +
 495                  ((uint128_t) in1[2]) * in2[1] +
 496                  ((uint128_t) in1[3]) * in2[0];
 497
 498         out[4] = ((uint128_t) in1[0]) * in2[4] +
 499                  ((uint128_t) in1[1]) * in2[3] +
 500                  ((uint128_t) in1[2]) * in2[2] +
 501                  ((uint128_t) in1[3]) * in2[1] +
 502                  ((uint128_t) in1[4]) * in2[0];
 503
 504         out[5] = ((uint128_t) in1[0]) * in2[5] +
 505                  ((uint128_t) in1[1]) * in2[4] +
 506                  ((uint128_t) in1[2]) * in2[3] +
 507                  ((uint128_t) in1[3]) * in2[2] +
 508                  ((uint128_t) in1[4]) * in2[1] +
 509                  ((uint128_t) in1[5]) * in2[0];
 510
 511         out[6] = ((uint128_t) in1[0]) * in2[6] +
 512                  ((uint128_t) in1[1]) * in2[5] +
 513                  ((uint128_t) in1[2]) * in2[4] +
 514                  ((uint128_t) in1[3]) * in2[3] +
 515                  ((uint128_t) in1[4]) * in2[2] +
 516                  ((uint128_t) in1[5]) * in2[1] +
 517                  ((uint128_t) in1[6]) * in2[0];
 518
 519         out[7] = ((uint128_t) in1[0]) * in2[7] +
 520                  ((uint128_t) in1[1]) * in2[6] +
 521                  ((uint128_t) in1[2]) * in2[5] +
 522                  ((uint128_t) in1[3]) * in2[4] +
 523                  ((uint128_t) in1[4]) * in2[3] +
 524                  ((uint128_t) in1[5]) * in2[2] +
 525                  ((uint128_t) in1[6]) * in2[1] +
 526                  ((uint128_t) in1[7]) * in2[0];
 527
 528         out[8] = ((uint128_t) in1[0]) * in2[8] +
 529                  ((uint128_t) in1[1]) * in2[7] +
 530                  ((uint128_t) in1[2]) * in2[6] +
 531                  ((uint128_t) in1[3]) * in2[5] +
 532                  ((uint128_t) in1[4]) * in2[4] +
 533                  ((uint128_t) in1[5]) * in2[3] +
 534                  ((uint128_t) in1[6]) * in2[2] +
 535                  ((uint128_t) in1[7]) * in2[1] +
 536                  ((uint128_t) in1[8]) * in2[0];
 537
 538         /* See comment in felem_square about the use of in2x2 here */
 539
 540         out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 541                   ((uint128_t) in1[2]) * in2x2[7] +
 542                   ((uint128_t) in1[3]) * in2x2[6] +
 543                   ((uint128_t) in1[4]) * in2x2[5] +
 544                   ((uint128_t) in1[5]) * in2x2[4] +
 545                   ((uint128_t) in1[6]) * in2x2[3] +
 546                   ((uint128_t) in1[7]) * in2x2[2] +
 547                   ((uint128_t) in1[8]) * in2x2[1];
 548
 549         out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 550                   ((uint128_t) in1[3]) * in2x2[7] +
 551                   ((uint128_t) in1[4]) * in2x2[6] +
 552                   ((uint128_t) in1[5]) * in2x2[5] +
 553                   ((uint128_t) in1[6]) * in2x2[4] +
 554                   ((uint128_t) in1[7]) * in2x2[3] +
 555                   ((uint128_t) in1[8]) * in2x2[2];
 556
 557         out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 558                   ((uint128_t) in1[4]) * in2x2[7] +
 559                   ((uint128_t) in1[5]) * in2x2[6] +
 560                   ((uint128_t) in1[6]) * in2x2[5] +
 561                   ((uint128_t) in1[7]) * in2x2[4] +
 562                   ((uint128_t) in1[8]) * in2x2[3];
 563
 564         out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 565                   ((uint128_t) in1[5]) * in2x2[7] +
 566                   ((uint128_t) in1[6]) * in2x2[6] +
 567                   ((uint128_t) in1[7]) * in2x2[5] +
 568                   ((uint128_t) in1[8]) * in2x2[4];
 569
 570         out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 571                   ((uint128_t) in1[6]) * in2x2[7] +
 572                   ((uint128_t) in1[7]) * in2x2[6] +
 573                   ((uint128_t) in1[8]) * in2x2[5];
 574
 575         out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 576                   ((uint128_t) in1[7]) * in2x2[7] +
 577                   ((uint128_t) in1[8]) * in2x2[6];
 578
 579         out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 580                   ((uint128_t) in1[8]) * in2x2[7];
 581
 582         out[7] += ((uint128_t) in1[8]) * in2x2[8];
 583         }
 584
 585 static const limb bottom52bits = 0xfffffffffffff;
 586
 587 /* felem_reduce converts a largefelem to an felem.
 588  * On entry:
 589  *   in[i] < 2^128
 590  * On exit:
 591  *   out[i] < 2^59 + 2^14
 592  */
 593 static void felem_reduce(felem out, const largefelem in)
 594         {
 595         u64 overflow1, overflow2;
 596
 597         out[0] = ((limb) in[0]) & bottom58bits;
 598         out[1] = ((limb) in[1]) & bottom58bits;
 599         out[2] = ((limb) in[2]) & bottom58bits;
 600         out[3] = ((limb) in[3]) & bottom58bits;
 601         out[4] = ((limb) in[4]) & bottom58bits;
 602         out[5] = ((limb) in[5]) & bottom58bits;
 603         out[6] = ((limb) in[6]) & bottom58bits;
 604         out[7] = ((limb) in[7]) & bottom58bits;
 605         out[8] = ((limb) in[8]) & bottom58bits;
 606
 607         /* out[i] < 2^58 */
 608
 609         out[1] += ((limb) in[0]) >> 58;
 610         out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 611         /* out[1] < 2^58 + 2^6 + 2^58
 612          *        = 2^59 + 2^6 */
 613         out[2] += ((limb) (in[0] >> 64)) >> 52;
 614
 615         out[2] += ((limb) in[1]) >> 58;
 616         out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 617         out[3] += ((limb) (in[1] >> 64)) >> 52;
 618
 619         out[3] += ((limb) in[2]) >> 58;
 620         out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 621         out[4] += ((limb) (in[2] >> 64)) >> 52;
 622
 623         out[4] += ((limb) in[3]) >> 58;
 624         out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 625         out[5] += ((limb) (in[3] >> 64)) >> 52;
 626
 627         out[5] += ((limb) in[4]) >> 58;
 628         out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 629         out[6] += ((limb) (in[4] >> 64)) >> 52;
 630
 631         out[6] += ((limb) in[5]) >> 58;
 632         out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 633         out[7] += ((limb) (in[5] >> 64)) >> 52;
 634
 635         out[7] += ((limb) in[6]) >> 58;
 636         out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 637         out[8] += ((limb) (in[6] >> 64)) >> 52;
 638
 639         out[8] += ((limb) in[7]) >> 58;
 640         out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 641         /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 642          *            < 2^59 + 2^13 */
 643         overflow1 = ((limb) (in[7] >> 64)) >> 52;
 644
 645         overflow1 += ((limb) in[8]) >> 58;
 646         overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 647         overflow2 = ((limb) (in[8] >> 64)) >> 52;
 648
 649         overflow1 <<= 1;  /* overflow1 < 2^13 + 2^7 + 2^59 */
 650         overflow2 <<= 1;  /* overflow2 < 2^13 */
 651
 652         out[0] += overflow1;  /* out[0] < 2^60 */
 653         out[1] += overflow2;  /* out[1] < 2^59 + 2^6 + 2^13 */
 654
 655         out[1] += out[0] >> 58; out[0] &= bottom58bits;
 656         /* out[0] < 2^58
 657          * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 658          *        < 2^59 + 2^14 */
 659         }
 660
 661 static void felem_square_reduce(felem out, const felem in)
 662         {
 663         largefelem tmp;
 664         felem_square(tmp, in);
 665         felem_reduce(out, tmp);
 666         }
 667
 668 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 669         {
 670         largefelem tmp;
 671         felem_mul(tmp, in1, in2);
 672         felem_reduce(out, tmp);
 673         }
 674
 675 /* felem_inv calculates |out| = |in|^{-1}
 676  *
 677  * Based on Fermat's Little Theorem:
 678  *   a^p = a (mod p)
 679  *   a^{p-1} = 1 (mod p)
 680  *   a^{p-2} = a^{-1} (mod p)
 681  */
 682 static void felem_inv(felem out, const felem in)
 683         {
 684         felem ftmp, ftmp2, ftmp3, ftmp4;
 685         largefelem tmp;
 686         unsigned i;
 687
 688         felem_square(tmp, in); felem_reduce(ftmp, tmp);         /* 2^1 */
 689         felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);      /* 2^2 - 2^0 */
 690         felem_assign(ftmp2, ftmp);
 691         felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);       /* 2^3 - 2^1 */
 692         felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);      /* 2^3 - 2^0 */
 693         felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);       /* 2^4 - 2^1 */
 694
 695         felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^3 - 2^1 */
 696         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^4 - 2^2 */
 697         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
 698
 699         felem_assign(ftmp2, ftmp3);
 700         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^5 - 2^1 */
 701         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^6 - 2^2 */
 702         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^7 - 2^3 */
 703         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^8 - 2^4 */
 704         felem_assign(ftmp4, ftmp3);
 705         felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);  /* 2^8 - 2^1 */
 706         felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);     /* 2^9 - 2^2 */
 707         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
 708         felem_assign(ftmp2, ftmp3);
 709
 710         for (i = 0; i < 8; i++)
 711                 {
 712                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^16 - 2^8 */
 713                 }
 714         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
 715         felem_assign(ftmp2, ftmp3);
 716
 717         for (i = 0; i < 16; i++)
 718                 {
 719                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^32 - 2^16 */
 720                 }
 721         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
 722         felem_assign(ftmp2, ftmp3);
 723
 724         for (i = 0; i < 32; i++)
 725                 {
 726                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^64 - 2^32 */
 727                 }
 728         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
 729         felem_assign(ftmp2, ftmp3);
 730
 731         for (i = 0; i < 64; i++)
 732                 {
 733                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^128 - 2^64 */
 734                 }
 735         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
 736         felem_assign(ftmp2, ftmp3);
 737
 738         for (i = 0; i < 128; i++)
 739                 {
 740                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^256 - 2^128 */
 741                 }
 742         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
 743         felem_assign(ftmp2, ftmp3);
 744
 745         for (i = 0; i < 256; i++)
 746                 {
 747                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^512 - 2^256 */
 748                 }
 749         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
 750
 751         for (i = 0; i < 9; i++)
 752                 {
 753                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^521 - 2^9 */
 754                 }
 755         felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
 756         felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);      /* 2^512 - 3 */
 757 }
 758
 759 /* This is 2^521-1, expressed as an felem */
 760 static const felem kPrime =
 761         {
 762         0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 763         0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 764         0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 765         };
 766
 767 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 768  * otherwise.
 769  * On entry:
 770  *   in[i] < 2^59 + 2^14
 771  */
 772 static limb felem_is_zero(const felem in)
 773         {
 774         felem ftmp;
 775         limb is_zero, is_p;
 776         felem_assign(ftmp, in);
 777
 778         ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
 779         /* ftmp[8] < 2^57 */
 780         ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
 781         ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
 782         ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
 783         ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
 784         ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
 785         ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
 786         ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
 787         ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
 788         /* ftmp[8] < 2^57 + 4 */
 789
 790         /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
 791          * greater than our bound for ftmp[8]. Therefore we only have to check
 792          * if the zero is zero or 2^521-1. */
 793
 794         is_zero = 0;
 795         is_zero |= ftmp[0];
 796         is_zero |= ftmp[1];
 797         is_zero |= ftmp[2];
 798         is_zero |= ftmp[3];
 799         is_zero |= ftmp[4];
 800         is_zero |= ftmp[5];
 801         is_zero |= ftmp[6];
 802         is_zero |= ftmp[7];
 803         is_zero |= ftmp[8];
 804
 805         is_zero--;
 806         /* We know that ftmp[i] < 2^63, therefore the only way that the top bit
 807          * can be set is if is_zero was 0 before the decrement. */
 808         is_zero = ((s64) is_zero) >> 63;
 809
 810         is_p = ftmp[0] ^ kPrime[0];
 811         is_p |= ftmp[1] ^ kPrime[1];
 812         is_p |= ftmp[2] ^ kPrime[2];
 813         is_p |= ftmp[3] ^ kPrime[3];
 814         is_p |= ftmp[4] ^ kPrime[4];
 815         is_p |= ftmp[5] ^ kPrime[5];
 816         is_p |= ftmp[6] ^ kPrime[6];
 817         is_p |= ftmp[7] ^ kPrime[7];
 818         is_p |= ftmp[8] ^ kPrime[8];
 819
 820         is_p--;
 821         is_p = ((s64) is_p) >> 63;
 822
 823         is_zero |= is_p;
 824         return is_zero;
 825         }
 826
 827 static int felem_is_zero_int(const felem in)
 828         {
 829         return (int) (felem_is_zero(in) & ((limb)1));
 830         }
 831
 832 /* felem_contract converts |in| to its unique, minimal representation.
 833  * On entry:
 834  *   in[i] < 2^59 + 2^14
 835  */
 836 static void felem_contract(felem out, const felem in)
 837         {
 838         limb is_p, is_greater, sign;
 839         static const limb two58 = ((limb)1) << 58;
 840
 841         felem_assign(out, in);
 842
 843         out[0] += out[8] >> 57; out[8] &= bottom57bits;
 844         /* out[8] < 2^57 */
 845         out[1] += out[0] >> 58; out[0] &= bottom58bits;
 846         out[2] += out[1] >> 58; out[1] &= bottom58bits;
 847         out[3] += out[2] >> 58; out[2] &= bottom58bits;
 848         out[4] += out[3] >> 58; out[3] &= bottom58bits;
 849         out[5] += out[4] >> 58; out[4] &= bottom58bits;
 850         out[6] += out[5] >> 58; out[5] &= bottom58bits;
 851         out[7] += out[6] >> 58; out[6] &= bottom58bits;
 852         out[8] += out[7] >> 58; out[7] &= bottom58bits;
 853         /* out[8] < 2^57 + 4 */
 854
 855         /* If the value is greater than 2^521-1 then we have to subtract
 856          * 2^521-1 out. See the comments in felem_is_zero regarding why we
 857          * don't test for other multiples of the prime. */
 858
 859         /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
 860
 861         is_p = out[0] ^ kPrime[0];
 862         is_p |= out[1] ^ kPrime[1];
 863         is_p |= out[2] ^ kPrime[2];
 864         is_p |= out[3] ^ kPrime[3];
 865         is_p |= out[4] ^ kPrime[4];
 866         is_p |= out[5] ^ kPrime[5];
 867         is_p |= out[6] ^ kPrime[6];
 868         is_p |= out[7] ^ kPrime[7];
 869         is_p |= out[8] ^ kPrime[8];
 870
 871         is_p--;
 872         is_p &= is_p << 32;
 873         is_p &= is_p << 16;
 874         is_p &= is_p << 8;
 875         is_p &= is_p << 4;
 876         is_p &= is_p << 2;
 877         is_p &= is_p << 1;
 878         is_p = ((s64) is_p) >> 63;
 879         is_p = ~is_p;
 880
 881         /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 882
 883         out[0] &= is_p;
 884         out[1] &= is_p;
 885         out[2] &= is_p;
 886         out[3] &= is_p;
 887         out[4] &= is_p;
 888         out[5] &= is_p;
 889         out[6] &= is_p;
 890         out[7] &= is_p;
 891         out[8] &= is_p;
 892
 893         /* In order to test that |out| >= 2^521-1 we need only test if out[8]
 894          * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
 895         is_greater = out[8] >> 57;
 896         is_greater |= is_greater << 32;
 897         is_greater |= is_greater << 16;
 898         is_greater |= is_greater << 8;
 899         is_greater |= is_greater << 4;
 900         is_greater |= is_greater << 2;
 901         is_greater |= is_greater << 1;
 902         is_greater = ((s64) is_greater) >> 63;
 903
 904         out[0] -= kPrime[0] & is_greater;
 905         out[1] -= kPrime[1] & is_greater;
 906         out[2] -= kPrime[2] & is_greater;
 907         out[3] -= kPrime[3] & is_greater;
 908         out[4] -= kPrime[4] & is_greater;
 909         out[5] -= kPrime[5] & is_greater;
 910         out[6] -= kPrime[6] & is_greater;
 911         out[7] -= kPrime[7] & is_greater;
 912         out[8] -= kPrime[8] & is_greater;
 913
 914         /* Eliminate negative coefficients */
 915         sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
 916         sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
 917         sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
 918         sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
 919         sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
 920         sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
 921         sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
 922         sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
 923         sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
 924         sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
 925         sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
 926         }
 927
 928 /* Group operations
 929  * ----------------
 930  *
 931  * Building on top of the field operations we have the operations on the
 932  * elliptic curve group itself. Points on the curve are represented in Jacobian
 933  * coordinates */
 934
 935 /* point_double calcuates 2*(x_in, y_in, z_in)
 936  *
 937  * The method is taken from:
 938  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 939  *
 940  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 941  * while x_out == y_in is not (maybe this works, but it's not tested). */
 942 static void
 943 point_double(felem x_out, felem y_out, felem z_out,
 944              const felem x_in, const felem y_in, const felem z_in)
 945         {
 946         largefelem tmp, tmp2;
 947         felem delta, gamma, beta, alpha, ftmp, ftmp2;
 948
 949         felem_assign(ftmp, x_in);
 950         felem_assign(ftmp2, x_in);
 951
 952         /* delta = z^2 */
 953         felem_square(tmp, z_in);
 954         felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */
 955
 956         /* gamma = y^2 */
 957         felem_square(tmp, y_in);
 958         felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */
 959
 960         /* beta = x*gamma */
 961         felem_mul(tmp, x_in, gamma);
 962         felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */
 963
 964         /* alpha = 3*(x-delta)*(x+delta) */
 965         felem_diff64(ftmp, delta);
 966         /* ftmp[i] < 2^61 */
 967         felem_sum64(ftmp2, delta);
 968         /* ftmp2[i] < 2^60 + 2^15 */
 969         felem_scalar64(ftmp2, 3);
 970         /* ftmp2[i] < 3*2^60 + 3*2^15 */
 971         felem_mul(tmp, ftmp, ftmp2);
 972         /* tmp[i] < 17(3*2^121 + 3*2^76)
 973          *        = 61*2^121 + 61*2^76
 974          *        < 64*2^121 + 64*2^76
 975          *        = 2^127 + 2^82
 976          *        < 2^128 */
 977         felem_reduce(alpha, tmp);
 978
 979         /* x' = alpha^2 - 8*beta */
 980         felem_square(tmp, alpha);
 981         /* tmp[i] < 17*2^120
 982          *        < 2^125 */
 983         felem_assign(ftmp, beta);
 984         felem_scalar64(ftmp, 8);
 985         /* ftmp[i] < 2^62 + 2^17 */
 986         felem_diff_128_64(tmp, ftmp);
 987         /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
 988         felem_reduce(x_out, tmp);
 989
 990         /* z' = (y + z)^2 - gamma - delta */
 991         felem_sum64(delta, gamma);
 992         /* delta[i] < 2^60 + 2^15 */
 993         felem_assign(ftmp, y_in);
 994         felem_sum64(ftmp, z_in);
 995         /* ftmp[i] < 2^60 + 2^15 */
 996         felem_square(tmp, ftmp);
 997         /* tmp[i] < 17(2^122)
 998          *        < 2^127 */
 999         felem_diff_128_64(tmp, delta);
1000         /* tmp[i] < 2^127 + 2^63 */
1001         felem_reduce(z_out, tmp);
1002
1003         /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1004         felem_scalar64(beta, 4);
1005         /* beta[i] < 2^61 + 2^16 */
1006         felem_diff64(beta, x_out);
1007         /* beta[i] < 2^61 + 2^60 + 2^16 */
1008         felem_mul(tmp, alpha, beta);
1009         /* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1010          *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1011          *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1012          *        < 2^128 */
1013         felem_square(tmp2, gamma);
1014         /* tmp2[i] < 17*(2^59 + 2^14)^2
1015          *         = 17*(2^118 + 2^74 + 2^28) */
1016         felem_scalar128(tmp2, 8);
1017         /* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1018          *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1019          *         < 2^126 */
1020         felem_diff128(tmp, tmp2);
1021         /* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1022          *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1023          *          2^74 + 2^69 + 2^34 + 2^30
1024          *        < 2^128 */
1025         felem_reduce(y_out, tmp);
1026         }
1027
1028 /* copy_conditional copies in to out iff mask is all ones. */
1029 static void
1030 copy_conditional(felem out, const felem in, limb mask)
1031         {
1032         unsigned i;
1033         for (i = 0; i < NLIMBS; ++i)
1034                 {
1035                 const limb tmp = mask & (in[i] ^ out[i]);
1036                 out[i] ^= tmp;
1037                 }
1038         }
1039
1040 /* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1041  *
1042  * The method is taken from
1043  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1044  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1045  *
1046  * This function includes a branch for checking whether the two input points
1047  * are equal (while not equal to the point at infinity). This case never
1048  * happens during single point multiplication, so there is no timing leak for
1049  * ECDH or ECDSA signing. */
1050 static void point_add(felem x3, felem y3, felem z3,
1051         const felem x1, const felem y1, const felem z1,
1052         const int mixed, const felem x2, const felem y2, const felem z2)
1053         {
1054         felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1055         largefelem tmp, tmp2;
1056         limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1057
1058         z1_is_zero = felem_is_zero(z1);
1059         z2_is_zero = felem_is_zero(z2);
1060
1061         /* ftmp = z1z1 = z1**2 */
1062         felem_square(tmp, z1);
1063         felem_reduce(ftmp, tmp);
1064
1065         if (!mixed)
1066                 {
1067                 /* ftmp2 = z2z2 = z2**2 */
1068                 felem_square(tmp, z2);
1069                 felem_reduce(ftmp2, tmp);
1070
1071                 /* u1 = ftmp3 = x1*z2z2 */
1072                 felem_mul(tmp, x1, ftmp2);
1073                 felem_reduce(ftmp3, tmp);
1074
1075                 /* ftmp5 = z1 + z2 */
1076                 felem_assign(ftmp5, z1);
1077                 felem_sum64(ftmp5, z2);
1078                 /* ftmp5[i] < 2^61 */
1079
1080                 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1081                 felem_square(tmp, ftmp5);
1082                 /* tmp[i] < 17*2^122 */
1083                 felem_diff_128_64(tmp, ftmp);
1084                 /* tmp[i] < 17*2^122 + 2^63 */
1085                 felem_diff_128_64(tmp, ftmp2);
1086                 /* tmp[i] < 17*2^122 + 2^64 */
1087                 felem_reduce(ftmp5, tmp);
1088
1089                 /* ftmp2 = z2 * z2z2 */
1090                 felem_mul(tmp, ftmp2, z2);
1091                 felem_reduce(ftmp2, tmp);
1092
1093                 /* s1 = ftmp6 = y1 * z2**3 */
1094                 felem_mul(tmp, y1, ftmp2);
1095                 felem_reduce(ftmp6, tmp);
1096                 }
1097         else
1098                 {
1099                 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1100
1101                 /* u1 = ftmp3 = x1*z2z2 */
1102                 felem_assign(ftmp3, x1);
1103
1104                 /* ftmp5 = 2*z1z2 */
1105                 felem_scalar(ftmp5, z1, 2);
1106
1107                 /* s1 = ftmp6 = y1 * z2**3 */
1108                 felem_assign(ftmp6, y1);
1109                 }
1110
1111         /* u2 = x2*z1z1 */
1112         felem_mul(tmp, x2, ftmp);
1113         /* tmp[i] < 17*2^120 */
1114
1115         /* h = ftmp4 = u2 - u1 */
1116         felem_diff_128_64(tmp, ftmp3);
1117         /* tmp[i] < 17*2^120 + 2^63 */
1118         felem_reduce(ftmp4, tmp);
1119
1120         x_equal = felem_is_zero(ftmp4);
1121
1122         /* z_out = ftmp5 * h */
1123         felem_mul(tmp, ftmp5, ftmp4);
1124         felem_reduce(z_out, tmp);
1125
1126         /* ftmp = z1 * z1z1 */
1127         felem_mul(tmp, ftmp, z1);
1128         felem_reduce(ftmp, tmp);
1129
1130         /* s2 = tmp = y2 * z1**3 */
1131         felem_mul(tmp, y2, ftmp);
1132         /* tmp[i] < 17*2^120 */
1133
1134         /* r = ftmp5 = (s2 - s1)*2 */
1135         felem_diff_128_64(tmp, ftmp6);
1136         /* tmp[i] < 17*2^120 + 2^63 */
1137         felem_reduce(ftmp5, tmp);
1138         y_equal = felem_is_zero(ftmp5);
1139         felem_scalar64(ftmp5, 2);
1140         /* ftmp5[i] < 2^61 */
1141
1142         if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1143                 {
1144                 point_double(x3, y3, z3, x1, y1, z1);
1145                 return;
1146                 }
1147
1148         /* I = ftmp = (2h)**2 */
1149         felem_assign(ftmp, ftmp4);
1150         felem_scalar64(ftmp, 2);
1151         /* ftmp[i] < 2^61 */
1152         felem_square(tmp, ftmp);
1153         /* tmp[i] < 17*2^122 */
1154         felem_reduce(ftmp, tmp);
1155
1156         /* J = ftmp2 = h * I */
1157         felem_mul(tmp, ftmp4, ftmp);
1158         felem_reduce(ftmp2, tmp);
1159
1160         /* V = ftmp4 = U1 * I */
1161         felem_mul(tmp, ftmp3, ftmp);
1162         felem_reduce(ftmp4, tmp);
1163
1164         /* x_out = r**2 - J - 2V */
1165         felem_square(tmp, ftmp5);
1166         /* tmp[i] < 17*2^122 */
1167         felem_diff_128_64(tmp, ftmp2);
1168         /* tmp[i] < 17*2^122 + 2^63 */
1169         felem_assign(ftmp3, ftmp4);
1170         felem_scalar64(ftmp4, 2);
1171         /* ftmp4[i] < 2^61 */
1172         felem_diff_128_64(tmp, ftmp4);
1173         /* tmp[i] < 17*2^122 + 2^64 */
1174         felem_reduce(x_out, tmp);
1175
1176         /* y_out = r(V-x_out) - 2 * s1 * J */
1177         felem_diff64(ftmp3, x_out);
1178         /* ftmp3[i] < 2^60 + 2^60
1179          *          = 2^61 */
1180         felem_mul(tmp, ftmp5, ftmp3);
1181         /* tmp[i] < 17*2^122 */
1182         felem_mul(tmp2, ftmp6, ftmp2);
1183         /* tmp2[i] < 17*2^120 */
1184         felem_scalar128(tmp2, 2);
1185         /* tmp2[i] < 17*2^121 */
1186         felem_diff128(tmp, tmp2);
1187         /* tmp[i] < 2^127 - 2^69 + 17*2^122
1188          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1189          *        < 2^127 */
1190         felem_reduce(y_out, tmp);
1191
1192         copy_conditional(x_out, x2, z1_is_zero);
1193         copy_conditional(x_out, x1, z2_is_zero);
1194         copy_conditional(y_out, y2, z1_is_zero);
1195         copy_conditional(y_out, y1, z2_is_zero);
1196         copy_conditional(z_out, z2, z1_is_zero);
1197         copy_conditional(z_out, z1, z2_is_zero);
1198         felem_assign(x3, x_out);
1199         felem_assign(y3, y_out);
1200         felem_assign(z3, z_out);
1201         }
1202
1203 /* Base point pre computation
1204  * --------------------------
1205  *
1206  * Two different sorts of precomputed tables are used in the following code.
1207  * Each contain various points on the curve, where each point is three field
1208  * elements (x, y, z).
1209  *
1210  * For the base point table, z is usually 1 (0 for the point at infinity).
1211  * This table has 16 elements:
1212  * index | bits    | point
1213  * ------+---------+------------------------------
1214  *     0 | 0 0 0 0 | 0G
1215  *     1 | 0 0 0 1 | 1G
1216  *     2 | 0 0 1 0 | 2^130G
1217  *     3 | 0 0 1 1 | (2^130 + 1)G
1218  *     4 | 0 1 0 0 | 2^260G
1219  *     5 | 0 1 0 1 | (2^260 + 1)G
1220  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1221  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1222  *     8 | 1 0 0 0 | 2^390G
1223  *     9 | 1 0 0 1 | (2^390 + 1)G
1224  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1225  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1226  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1227  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1228  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1229  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1230  *
1231  * The reason for this is so that we can clock bits into four different
1232  * locations when doing simple scalar multiplies against the base point.
1233  *
1234  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1235
1236 /* gmul is the table of precomputed base points */
1237 static const felem gmul[16][3] =
1238         {{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1239           {0, 0, 0, 0, 0, 0, 0, 0, 0},
1240           {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1241          {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1242            0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1243            0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1244           {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1245            0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1246            0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1247           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1248          {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1249            0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1250            0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1251           {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1252            0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1253            0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1254           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1255          {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1256            0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1257            0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1258           {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1259            0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1260            0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1261           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1262          {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1263            0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1264            0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1265           {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1266            0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1267            0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1268           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1269          {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1270            0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1271            0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1272           {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1273            0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1274            0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1275           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1276          {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1277            0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1278            0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1279           {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1280            0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1281            0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1282           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1283          {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1284            0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1285            0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1286           {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1287            0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1288            0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1289           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1290          {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1291            0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1292            0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1293           {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1294            0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1295            0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1296           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1297          {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1298            0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1299            0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1300           {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1301            0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1302            0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1303           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1304          {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1305            0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1306            0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1307           {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1308            0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1309            0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1310           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1311          {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1312            0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1313            0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1314           {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1315            0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1316            0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1317           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1318          {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1319            0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1320            0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1321           {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1322            0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1323            0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1324           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1325          {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1326            0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1327            0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1328           {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1329            0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1330            0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1331           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1332          {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1333            0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1334            0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1335           {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1336            0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1337            0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1338           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1339          {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1340            0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1341            0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1342           {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1343            0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1344            0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1345          {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1346
1347 /* select_point selects the |idx|th point from a precomputation table and
1348  * copies it to out. */
1349 static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
1350                          felem out[3])
1351         {
1352         unsigned i, j;
1353         limb *outlimbs = &out[0][0];
1354         memset(outlimbs, 0, 3 * sizeof(felem));
1355
1356         for (i = 0; i < size; i++)
1357                 {
1358                 const limb *inlimbs = &pre_comp[i][0][0];
1359                 limb mask = i ^ idx;
1360                 mask |= mask >> 4;
1361                 mask |= mask >> 2;
1362                 mask |= mask >> 1;
1363                 mask &= 1;
1364                 mask--;
1365                 for (j = 0; j < NLIMBS * 3; j++)
1366                         outlimbs[j] |= inlimbs[j] & mask;
1367                 }
1368         }
1369
1370 /* get_bit returns the |i|th bit in |in| */
1371 static char get_bit(const felem_bytearray in, int i)
1372         {
1373         if (i < 0)
1374                 return 0;
1375         return (in[i >> 3] >> (i & 7)) & 1;
1376         }
1377
1378 /* Interleaved point multiplication using precomputed point multiples:
1379  * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1380  * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1381  * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1382  * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1383 static void batch_mul(felem x_out, felem y_out, felem z_out,
1384         const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1385         const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1386         {
1387         int i, skip;
1388         unsigned num, gen_mul = (g_scalar != NULL);
1389         felem nq[3], tmp[4];
1390         limb bits;
1391         u8 sign, digit;
1392
1393         /* set nq to the point at infinity */
1394         memset(nq, 0, 3 * sizeof(felem));
1395
1396         /* Loop over all scalars msb-to-lsb, interleaving additions
1397          * of multiples of the generator (last quarter of rounds)
1398          * and additions of other points multiples (every 5th round).
1399          */
1400         skip = 1; /* save two point operations in the first round */
1401         for (i = (num_points ? 520 : 130); i >= 0; --i)
1402                 {
1403                 /* double */
1404                 if (!skip)
1405                         point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1406
1407                 /* add multiples of the generator */
1408                 if (gen_mul && (i <= 130))
1409                         {
1410                         bits = get_bit(g_scalar, i + 390) << 3;
1411                         if (i < 130)
1412                                 {
1413                                 bits |= get_bit(g_scalar, i + 260) << 2;
1414                                 bits |= get_bit(g_scalar, i + 130) << 1;
1415                                 bits |= get_bit(g_scalar, i);
1416                                 }
1417                         /* select the point to add, in constant time */
1418                         select_point(bits, 16, g_pre_comp, tmp);
1419                         if (!skip)
1420                                 {
1421                                 point_add(nq[0], nq[1], nq[2],
1422                                         nq[0], nq[1], nq[2],
1423                                         1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1424                                 }
1425                         else
1426                                 {
1427                                 memcpy(nq, tmp, 3 * sizeof(felem));
1428                                 skip = 0;
1429                                 }
1430                         }
1431
1432                 /* do other additions every 5 doublings */
1433                 if (num_points && (i % 5 == 0))
1434                         {
1435                         /* loop over all scalars */
1436                         for (num = 0; num < num_points; ++num)
1437                                 {
1438                                 bits = get_bit(scalars[num], i + 4) << 5;
1439                                 bits |= get_bit(scalars[num], i + 3) << 4;
1440                                 bits |= get_bit(scalars[num], i + 2) << 3;
1441                                 bits |= get_bit(scalars[num], i + 1) << 2;
1442                                 bits |= get_bit(scalars[num], i) << 1;
1443                                 bits |= get_bit(scalars[num], i - 1);
1444                                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1445
1446                                 /* select the point to add or subtract, in constant time */
1447                                 select_point(digit, 17, pre_comp[num], tmp);
1448                                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
1449                                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1450
1451                                 if (!skip)
1452                                         {
1453                                         point_add(nq[0], nq[1], nq[2],
1454                                                 nq[0], nq[1], nq[2],
1455                                                 mixed, tmp[0], tmp[1], tmp[2]);
1456                                         }
1457                                 else
1458                                         {
1459                                         memcpy(nq, tmp, 3 * sizeof(felem));
1460                                         skip = 0;
1461                                         }
1462                                 }
1463                         }
1464                 }
1465         felem_assign(x_out, nq[0]);
1466         felem_assign(y_out, nq[1]);
1467         felem_assign(z_out, nq[2]);
1468         }
1469
1470
1471 /* Precomputation for the group generator. */
1472 typedef struct {
1473         felem g_pre_comp[16][3];
1474         int references;
1475 } NISTP521_PRE_COMP;
1476
1477 const EC_METHOD *EC_GFp_nistp521_method(void)
1478         {
1479         static const EC_METHOD ret = {
1480                 EC_FLAGS_DEFAULT_OCT,
1481                 NID_X9_62_prime_field,
1482                 ec_GFp_nistp521_group_init,
1483                 ec_GFp_simple_group_finish,
1484                 ec_GFp_simple_group_clear_finish,
1485                 ec_GFp_nist_group_copy,
1486                 ec_GFp_nistp521_group_set_curve,
1487                 ec_GFp_simple_group_get_curve,
1488                 ec_GFp_simple_group_get_degree,
1489                 ec_GFp_simple_group_check_discriminant,
1490                 ec_GFp_simple_point_init,
1491                 ec_GFp_simple_point_finish,
1492                 ec_GFp_simple_point_clear_finish,
1493                 ec_GFp_simple_point_copy,
1494                 ec_GFp_simple_point_set_to_infinity,
1495                 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1496                 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1497                 ec_GFp_simple_point_set_affine_coordinates,
1498                 ec_GFp_nistp521_point_get_affine_coordinates,
1499                 0 /* point_set_compressed_coordinates */,
1500                 0 /* point2oct */,
1501                 0 /* oct2point */,
1502                 ec_GFp_simple_add,
1503                 ec_GFp_simple_dbl,
1504                 ec_GFp_simple_invert,
1505                 ec_GFp_simple_is_at_infinity,
1506                 ec_GFp_simple_is_on_curve,
1507                 ec_GFp_simple_cmp,
1508                 ec_GFp_simple_make_affine,
1509                 ec_GFp_simple_points_make_affine,
1510                 ec_GFp_nistp521_points_mul,
1511                 ec_GFp_nistp521_precompute_mult,
1512                 ec_GFp_nistp521_have_precompute_mult,
1513                 ec_GFp_nist_field_mul,
1514                 ec_GFp_nist_field_sqr,
1515                 0 /* field_div */,
1516                 0 /* field_encode */,
1517                 0 /* field_decode */,
1518                 0 /* field_set_to_one */ };
1519
1520         return &ret;
1521         }
1522
1523
1524 /******************************************************************************/
1525 /*                     FUNCTIONS TO MANAGE PRECOMPUTATION
1526  */
1527
1528 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1529         {
1530         NISTP521_PRE_COMP *ret = NULL;
1531         ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1532         if (!ret)
1533                 {
1534                 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1535                 return ret;
1536                 }
1537         memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1538         ret->references = 1;
1539         return ret;
1540         }
1541
1542 static void *nistp521_pre_comp_dup(void *src_)
1543         {
1544         NISTP521_PRE_COMP *src = src_;
1545
1546         /* no need to actually copy, these objects never change! */
1547         CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1548
1549         return src_;
1550         }
1551
1552 static void nistp521_pre_comp_free(void *pre_)
1553         {
1554         int i;
1555         NISTP521_PRE_COMP *pre = pre_;
1556
1557         if (!pre)
1558                 return;
1559
1560         i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1561         if (i > 0)
1562                 return;
1563
1564         OPENSSL_free(pre);
1565         }
1566
1567 static void nistp521_pre_comp_clear_free(void *pre_)
1568         {
1569         int i;
1570         NISTP521_PRE_COMP *pre = pre_;
1571
1572         if (!pre)
1573                 return;
1574
1575         i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1576         if (i > 0)
1577                 return;
1578
1579         OPENSSL_cleanse(pre, sizeof(*pre));
1580         OPENSSL_free(pre);
1581         }
1582
1583 /******************************************************************************/
1584 /*                         OPENSSL EC_METHOD FUNCTIONS
1585  */
1586
1587 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1588         {
1589         int ret;
1590         ret = ec_GFp_simple_group_init(group);
1591         group->a_is_minus3 = 1;
1592         return ret;
1593         }
1594
1595 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1596         const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1597         {
1598         int ret = 0;
1599         BN_CTX *new_ctx = NULL;
1600         BIGNUM *curve_p, *curve_a, *curve_b;
1601
1602         if (ctx == NULL)
1603                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1604         BN_CTX_start(ctx);
1605         if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1606                 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1607                 ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1608         BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1609         BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1610         BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1611         if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1612                 (BN_cmp(curve_b, b)))
1613                 {
1614                 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1615                         EC_R_WRONG_CURVE_PARAMETERS);
1616                 goto err;
1617                 }
1618         group->field_mod_func = BN_nist_mod_521;
1619         ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1620 err:
1621         BN_CTX_end(ctx);
1622         if (new_ctx != NULL)
1623                 BN_CTX_free(new_ctx);
1624         return ret;
1625         }
1626
1627 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1628  * (X', Y') = (X/Z^2, Y/Z^3) */
1629 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1630         const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1631         {
1632         felem z1, z2, x_in, y_in, x_out, y_out;
1633         largefelem tmp;
1634
1635         if (EC_POINT_is_at_infinity(group, point))
1636                 {
1637                 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1638                         EC_R_POINT_AT_INFINITY);
1639                 return 0;
1640                 }
1641         if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1642                 (!BN_to_felem(z1, &point->Z))) return 0;
1643         felem_inv(z2, z1);
1644         felem_square(tmp, z2); felem_reduce(z1, tmp);
1645         felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1646         felem_contract(x_out, x_in);
1647         if (x != NULL)
1648                 {
1649                 if (!felem_to_BN(x, x_out))
1650                         {
1651                         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1652                         return 0;
1653                         }
1654                 }
1655         felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1656         felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1657         felem_contract(y_out, y_in);
1658         if (y != NULL)
1659                 {
1660                 if (!felem_to_BN(y, y_out))
1661                         {
1662                         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1663                         return 0;
1664                         }
1665                 }
1666         return 1;
1667         }
1668
1669 static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
1670         {
1671         /* Runs in constant time, unless an input is the point at infinity
1672          * (which normally shouldn't happen). */
1673         ec_GFp_nistp_points_make_affine_internal(
1674                 num,
1675                 points,
1676                 sizeof(felem),
1677                 tmp_felems,
1678                 (void (*)(void *)) felem_one,
1679                 (int (*)(const void *)) felem_is_zero_int,
1680                 (void (*)(void *, const void *)) felem_assign,
1681                 (void (*)(void *, const void *)) felem_square_reduce,
1682                 (void (*)(void *, const void *, const void *)) felem_mul_reduce,
1683                 (void (*)(void *, const void *)) felem_inv,
1684                 (void (*)(void *, const void *)) felem_contract);
1685         }
1686
1687 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1688  * Result is stored in r (r can equal one of the inputs). */
1689 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1690         const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1691         const BIGNUM *scalars[], BN_CTX *ctx)
1692         {
1693         int ret = 0;
1694         int j;
1695         int mixed = 0;
1696         BN_CTX *new_ctx = NULL;
1697         BIGNUM *x, *y, *z, *tmp_scalar;
1698         felem_bytearray g_secret;
1699         felem_bytearray *secrets = NULL;
1700         felem (*pre_comp)[17][3] = NULL;
1701         felem *tmp_felems = NULL;
1702         felem_bytearray tmp;
1703         unsigned i, num_bytes;
1704         int have_pre_comp = 0;
1705         size_t num_points = num;
1706         felem x_in, y_in, z_in, x_out, y_out, z_out;
1707         NISTP521_PRE_COMP *pre = NULL;
1708         felem (*g_pre_comp)[3] = NULL;
1709         EC_POINT *generator = NULL;
1710         const EC_POINT *p = NULL;
1711         const BIGNUM *p_scalar = NULL;
1712
1713         if (ctx == NULL)
1714                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1715         BN_CTX_start(ctx);
1716         if (((x = BN_CTX_get(ctx)) == NULL) ||
1717                 ((y = BN_CTX_get(ctx)) == NULL) ||
1718                 ((z = BN_CTX_get(ctx)) == NULL) ||
1719                 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1720                 goto err;
1721
1722         if (scalar != NULL)
1723                 {
1724                 pre = EC_EX_DATA_get_data(group->extra_data,
1725                         nistp521_pre_comp_dup, nistp521_pre_comp_free,
1726                         nistp521_pre_comp_clear_free);
1727                 if (pre)
1728                         /* we have precomputation, try to use it */
1729                         g_pre_comp = &pre->g_pre_comp[0];
1730                 else
1731                         /* try to use the standard precomputation */
1732                         g_pre_comp = (felem (*)[3]) gmul;
1733                 generator = EC_POINT_new(group);
1734                 if (generator == NULL)
1735                         goto err;
1736                 /* get the generator from precomputation */
1737                 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1738                         !felem_to_BN(y, g_pre_comp[1][1]) ||
1739                         !felem_to_BN(z, g_pre_comp[1][2]))
1740                         {
1741                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1742                         goto err;
1743                         }
1744                 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1745                                 generator, x, y, z, ctx))
1746                         goto err;
1747                 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1748                         /* precomputation matches generator */
1749                         have_pre_comp = 1;
1750                 else
1751                         /* we don't have valid precomputation:
1752                          * treat the generator as a random point */
1753                         num_points++;
1754                 }
1755
1756         if (num_points > 0)
1757                 {
1758                 if (num_points >= 2)
1759                         {
1760                         /* unless we precompute multiples for just one point,
1761                          * converting those into affine form is time well spent  */
1762                         mixed = 1;
1763                         }
1764                 secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1765                 pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1766                 if (mixed)
1767                         tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1768                 if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1769                         {
1770                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1771                         goto err;
1772                         }
1773
1774                 /* we treat NULL scalars as 0, and NULL points as points at infinity,
1775                  * i.e., they contribute nothing to the linear combination */
1776                 memset(secrets, 0, num_points * sizeof(felem_bytearray));
1777                 memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1778                 for (i = 0; i < num_points; ++i)
1779                         {
1780                         if (i == num)
1781                                 /* we didn't have a valid precomputation, so we pick
1782                                  * the generator */
1783                                 {
1784                                 p = EC_GROUP_get0_generator(group);
1785                                 p_scalar = scalar;
1786                                 }
1787                         else
1788                                 /* the i^th point */
1789                                 {
1790                                 p = points[i];
1791                                 p_scalar = scalars[i];
1792                                 }
1793                         if ((p_scalar != NULL) && (p != NULL))
1794                                 {
1795                                 /* reduce scalar to 0 <= scalar < 2^521 */
1796                                 if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1797                                         {
1798                                         /* this is an unusual input, and we don't guarantee
1799                                          * constant-timeness */
1800                                         if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1801                                                 {
1802                                                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1803                                                 goto err;
1804                                                 }
1805                                         num_bytes = BN_bn2bin(tmp_scalar, tmp);
1806                                         }
1807                                 else
1808                                         num_bytes = BN_bn2bin(p_scalar, tmp);
1809                                 flip_endian(secrets[i], tmp, num_bytes);
1810                                 /* precompute multiples */
1811                                 if ((!BN_to_felem(x_out, &p->X)) ||
1812                                         (!BN_to_felem(y_out, &p->Y)) ||
1813                                         (!BN_to_felem(z_out, &p->Z))) goto err;
1814                                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1815                                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1816                                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1817                                 for (j = 2; j <= 16; ++j)
1818                                         {
1819                                         if (j & 1)
1820                                                 {
1821                                                 point_add(
1822                                                         pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1823                                                         pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1824                                                         0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1825                                                 }
1826                                         else
1827                                                 {
1828                                                 point_double(
1829                                                         pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1830                                                         pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1831                                                 }
1832                                         }
1833                                 }
1834                         }
1835                 if (mixed)
1836                         make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1837                 }
1838
1839         /* the scalar for the generator */
1840         if ((scalar != NULL) && (have_pre_comp))
1841                 {
1842                 memset(g_secret, 0, sizeof(g_secret));
1843                 /* reduce scalar to 0 <= scalar < 2^521 */
1844                 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1845                         {
1846                         /* this is an unusual input, and we don't guarantee
1847                          * constant-timeness */
1848                         if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1849                                 {
1850                                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1851                                 goto err;
1852                                 }
1853                         num_bytes = BN_bn2bin(tmp_scalar, tmp);
1854                         }
1855                 else
1856                         num_bytes = BN_bn2bin(scalar, tmp);
1857                 flip_endian(g_secret, tmp, num_bytes);
1858                 /* do the multiplication with generator precomputation*/
1859                 batch_mul(x_out, y_out, z_out,
1860                         (const felem_bytearray (*)) secrets, num_points,
1861                         g_secret,
1862                         mixed, (const felem (*)[17][3]) pre_comp,
1863                         (const felem (*)[3]) g_pre_comp);
1864                 }
1865         else
1866                 /* do the multiplication without generator precomputation */
1867                 batch_mul(x_out, y_out, z_out,
1868                         (const felem_bytearray (*)) secrets, num_points,
1869                         NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1870         /* reduce the output to its unique minimal representation */
1871         felem_contract(x_in, x_out);
1872         felem_contract(y_in, y_out);
1873         felem_contract(z_in, z_out);
1874         if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1875                 (!felem_to_BN(z, z_in)))
1876                 {
1877                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1878                 goto err;
1879                 }
1880         ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1881
1882 err:
1883         BN_CTX_end(ctx);
1884         if (generator != NULL)
1885                 EC_POINT_free(generator);
1886         if (new_ctx != NULL)
1887                 BN_CTX_free(new_ctx);
1888         if (secrets != NULL)
1889                 OPENSSL_free(secrets);
1890         if (pre_comp != NULL)
1891                 OPENSSL_free(pre_comp);
1892         if (tmp_felems != NULL)
1893                 OPENSSL_free(tmp_felems);
1894         return ret;
1895         }
1896
1897 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1898         {
1899         int ret = 0;
1900         NISTP521_PRE_COMP *pre = NULL;
1901         int i, j;
1902         BN_CTX *new_ctx = NULL;
1903         BIGNUM *x, *y;
1904         EC_POINT *generator = NULL;
1905         felem tmp_felems[16];
1906
1907         /* throw away old precomputation */
1908         EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1909                 nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1910         if (ctx == NULL)
1911                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1912         BN_CTX_start(ctx);
1913         if (((x = BN_CTX_get(ctx)) == NULL) ||
1914                 ((y = BN_CTX_get(ctx)) == NULL))
1915                 goto err;
1916         /* get the generator */
1917         if (group->generator == NULL) goto err;
1918         generator = EC_POINT_new(group);
1919         if (generator == NULL)
1920                 goto err;
1921         BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
1922         BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
1923         if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1924                 goto err;
1925         if ((pre = nistp521_pre_comp_new()) == NULL)
1926                 goto err;
1927         /* if the generator is the standard one, use built-in precomputation */
1928         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1929                 {
1930                 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1931                 ret = 1;
1932                 goto err;
1933                 }
1934         if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1935                 (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1936                 (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1937                 goto err;
1938         /* compute 2^130*G, 2^260*G, 2^390*G */
1939         for (i = 1; i <= 4; i <<= 1)
1940                 {
1941                 point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1942                         pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1943                         pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1944                 for (j = 0; j < 129; ++j)
1945                         {
1946                         point_double(pre->g_pre_comp[2*i][0],
1947                                 pre->g_pre_comp[2*i][1],
1948                                 pre->g_pre_comp[2*i][2],
1949                                 pre->g_pre_comp[2*i][0],
1950                                 pre->g_pre_comp[2*i][1],
1951                                 pre->g_pre_comp[2*i][2]);
1952                         }
1953                 }
1954         /* g_pre_comp[0] is the point at infinity */
1955         memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1956         /* the remaining multiples */
1957         /* 2^130*G + 2^260*G */
1958         point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1959                 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1960                 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1961                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1962                 pre->g_pre_comp[2][2]);
1963         /* 2^130*G + 2^390*G */
1964         point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1965                 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1966                 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1967                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1968                 pre->g_pre_comp[2][2]);
1969         /* 2^260*G + 2^390*G */
1970         point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
1971                 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
1972                 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1973                 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
1974                 pre->g_pre_comp[4][2]);
1975         /* 2^130*G + 2^260*G + 2^390*G */
1976         point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
1977                 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
1978                 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1979                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1980                 pre->g_pre_comp[2][2]);
1981         for (i = 1; i < 8; ++i)
1982                 {
1983                 /* odd multiples: add G */
1984                 point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
1985                         pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
1986                         pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
1987                         0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
1988                         pre->g_pre_comp[1][2]);
1989                 }
1990         make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1991
1992         if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
1993                         nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
1994                 goto err;
1995         ret = 1;
1996         pre = NULL;
1997  err:
1998         BN_CTX_end(ctx);
1999         if (generator != NULL)
2000                 EC_POINT_free(generator);
2001         if (new_ctx != NULL)
2002                 BN_CTX_free(new_ctx);
2003         if (pre)
2004                 nistp521_pre_comp_free(pre);
2005         return ret;
2006         }
2007
2008 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2009         {
2010         if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2011                         nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2012                 != NULL)
2013                 return 1;
2014         else
2015                 return 0;
2016         }
2017
2018 #else
2019 static void *dummy=&dummy;
2020 #endif