crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 #ifndef OPENSSL_SYS_VMS
  33 #include <stdint.h>
  34 #else
  35 #include <inttypes.h>
  36 #endif
  37
  38 #include <string.h>
  39 #include <openssl/err.h>
  40 #include "ec_lcl.h"
  41
  42 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  43   /* even with gcc, the typedef won't work for 32-bit platforms */
  44   typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
  45 #else
  46   #error "Need GCC 3.1 or later to define type uint128_t"
  47 #endif
  48
  49 typedef uint8_t u8;
  50 typedef uint64_t u64;
  51 typedef int64_t s64;
  52
  53 /* The underlying field.
  54  *
  55  * P521 operates over GF(2^521-1). We can serialise an element of this field
  56  * into 66 bytes where the most significant byte contains only a single bit. We
  57  * call this an felem_bytearray. */
  58
  59 typedef u8 felem_bytearray[66];
  60
  61 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  62  * These values are big-endian. */
  63 static const felem_bytearray nistp521_curve_params[5] =
  64         {
  65         {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
  66          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  67          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  68          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73          0xff, 0xff},
  74         {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
  75          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82          0xff, 0xfc},
  83         {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
  84          0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  85          0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  86          0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  87          0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  88          0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  89          0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  90          0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  91          0x3f, 0x00},
  92         {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
  93          0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  94          0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  95          0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  96          0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  97          0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
  98          0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
  99          0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 100          0xbd, 0x66},
 101         {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
 102          0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 103          0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 104          0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 105          0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 106          0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 107          0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 108          0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 109          0x66, 0x50}
 110         };
 111
 112 /*-
 113  * The representation of field elements.
 114  * ------------------------------------
 115  *
 116  * We represent field elements with nine values. These values are either 64 or
 117  * 128 bits and the field element represented is:
 118  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 119  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 120  * 58 bits apart, but are greater than 58 bits in length, the most significant
 121  * bits of each limb overlap with the least significant bits of the next.
 122  *
 123  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 124  * 'largefelem' */
 125
 126 #define NLIMBS 9
 127
 128 typedef uint64_t limb;
 129 typedef limb felem[NLIMBS];
 130 typedef uint128_t largefelem[NLIMBS];
 131
 132 static const limb bottom57bits = 0x1ffffffffffffff;
 133 static const limb bottom58bits = 0x3ffffffffffffff;
 134
 135 /* bin66_to_felem takes a little-endian byte array and converts it into felem
 136  * form. This assumes that the CPU is little-endian. */
 137 static void bin66_to_felem(felem out, const u8 in[66])
 138         {
 139         out[0] = (*((limb*) &in[0])) & bottom58bits;
 140         out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
 141         out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
 142         out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
 143         out[4] = (*((limb*) &in[29])) & bottom58bits;
 144         out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
 145         out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
 146         out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
 147         out[8] = (*((limb*) &in[58])) & bottom57bits;
 148         }
 149
 150 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 151  * array. This assumes that the CPU is little-endian. */
 152 static void felem_to_bin66(u8 out[66], const felem in)
 153         {
 154         memset(out, 0, 66);
 155         (*((limb*) &out[0])) = in[0];
 156         (*((limb*) &out[7])) |= in[1] << 2;
 157         (*((limb*) &out[14])) |= in[2] << 4;
 158         (*((limb*) &out[21])) |= in[3] << 6;
 159         (*((limb*) &out[29])) = in[4];
 160         (*((limb*) &out[36])) |= in[5] << 2;
 161         (*((limb*) &out[43])) |= in[6] << 4;
 162         (*((limb*) &out[50])) |= in[7] << 6;
 163         (*((limb*) &out[58])) = in[8];
 164         }
 165
 166 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 167 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 168         {
 169         unsigned i;
 170         for (i = 0; i < len; ++i)
 171                 out[i] = in[len-1-i];
 172         }
 173
 174 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 175 static int BN_to_felem(felem out, const BIGNUM *bn)
 176         {
 177         felem_bytearray b_in;
 178         felem_bytearray b_out;
 179         unsigned num_bytes;
 180
 181         /* BN_bn2bin eats leading zeroes */
 182         memset(b_out, 0, sizeof b_out);
 183         num_bytes = BN_num_bytes(bn);
 184         if (num_bytes > sizeof b_out)
 185                 {
 186                 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 187                 return 0;
 188                 }
 189         if (BN_is_negative(bn))
 190                 {
 191                 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 192                 return 0;
 193                 }
 194         num_bytes = BN_bn2bin(bn, b_in);
 195         flip_endian(b_out, b_in, num_bytes);
 196         bin66_to_felem(out, b_out);
 197         return 1;
 198         }
 199
 200 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 201 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 202         {
 203         felem_bytearray b_in, b_out;
 204         felem_to_bin66(b_in, in);
 205         flip_endian(b_out, b_in, sizeof b_out);
 206         return BN_bin2bn(b_out, sizeof b_out, out);
 207         }
 208
 209
 210 /*-
 211  * Field operations
 212  * ----------------
 213  */
 214
 215 static void felem_one(felem out)
 216         {
 217         out[0] = 1;
 218         out[1] = 0;
 219         out[2] = 0;
 220         out[3] = 0;
 221         out[4] = 0;
 222         out[5] = 0;
 223         out[6] = 0;
 224         out[7] = 0;
 225         out[8] = 0;
 226         }
 227
 228 static void felem_assign(felem out, const felem in)
 229         {
 230         out[0] = in[0];
 231         out[1] = in[1];
 232         out[2] = in[2];
 233         out[3] = in[3];
 234         out[4] = in[4];
 235         out[5] = in[5];
 236         out[6] = in[6];
 237         out[7] = in[7];
 238         out[8] = in[8];
 239         }
 240
 241 /* felem_sum64 sets out = out + in. */
 242 static void felem_sum64(felem out, const felem in)
 243         {
 244         out[0] += in[0];
 245         out[1] += in[1];
 246         out[2] += in[2];
 247         out[3] += in[3];
 248         out[4] += in[4];
 249         out[5] += in[5];
 250         out[6] += in[6];
 251         out[7] += in[7];
 252         out[8] += in[8];
 253         }
 254
 255 /* felem_scalar sets out = in * scalar */
 256 static void felem_scalar(felem out, const felem in, limb scalar)
 257         {
 258         out[0] = in[0] * scalar;
 259         out[1] = in[1] * scalar;
 260         out[2] = in[2] * scalar;
 261         out[3] = in[3] * scalar;
 262         out[4] = in[4] * scalar;
 263         out[5] = in[5] * scalar;
 264         out[6] = in[6] * scalar;
 265         out[7] = in[7] * scalar;
 266         out[8] = in[8] * scalar;
 267         }
 268
 269 /* felem_scalar64 sets out = out * scalar */
 270 static void felem_scalar64(felem out, limb scalar)
 271         {
 272         out[0] *= scalar;
 273         out[1] *= scalar;
 274         out[2] *= scalar;
 275         out[3] *= scalar;
 276         out[4] *= scalar;
 277         out[5] *= scalar;
 278         out[6] *= scalar;
 279         out[7] *= scalar;
 280         out[8] *= scalar;
 281         }
 282
 283 /* felem_scalar128 sets out = out * scalar */
 284 static void felem_scalar128(largefelem out, limb scalar)
 285         {
 286         out[0] *= scalar;
 287         out[1] *= scalar;
 288         out[2] *= scalar;
 289         out[3] *= scalar;
 290         out[4] *= scalar;
 291         out[5] *= scalar;
 292         out[6] *= scalar;
 293         out[7] *= scalar;
 294         out[8] *= scalar;
 295         }
 296
 297 /*-
 298  * felem_neg sets |out| to |-in|
 299  * On entry:
 300  *   in[i] < 2^59 + 2^14
 301  * On exit:
 302  *   out[i] < 2^62
 303  */
 304 static void felem_neg(felem out, const felem in)
 305         {
 306         /* In order to prevent underflow, we subtract from 0 mod p. */
 307         static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
 308         static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
 309
 310         out[0] = two62m3 - in[0];
 311         out[1] = two62m2 - in[1];
 312         out[2] = two62m2 - in[2];
 313         out[3] = two62m2 - in[3];
 314         out[4] = two62m2 - in[4];
 315         out[5] = two62m2 - in[5];
 316         out[6] = two62m2 - in[6];
 317         out[7] = two62m2 - in[7];
 318         out[8] = two62m2 - in[8];
 319         }
 320
 321 /*-
 322  * felem_diff64 subtracts |in| from |out|
 323  * On entry:
 324  *   in[i] < 2^59 + 2^14
 325  * On exit:
 326  *   out[i] < out[i] + 2^62
 327  */
 328 static void felem_diff64(felem out, const felem in)
 329         {
 330         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 331         static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
 332         static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
 333
 334         out[0] += two62m3 - in[0];
 335         out[1] += two62m2 - in[1];
 336         out[2] += two62m2 - in[2];
 337         out[3] += two62m2 - in[3];
 338         out[4] += two62m2 - in[4];
 339         out[5] += two62m2 - in[5];
 340         out[6] += two62m2 - in[6];
 341         out[7] += two62m2 - in[7];
 342         out[8] += two62m2 - in[8];
 343         }
 344
 345 /*-
 346  * felem_diff_128_64 subtracts |in| from |out|
 347  * On entry:
 348  *   in[i] < 2^62 + 2^17
 349  * On exit:
 350  *   out[i] < out[i] + 2^63
 351  */
 352 static void felem_diff_128_64(largefelem out, const felem in)
 353         {
 354         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 355         static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
 356         static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
 357
 358         out[0] += two63m6 - in[0];
 359         out[1] += two63m5 - in[1];
 360         out[2] += two63m5 - in[2];
 361         out[3] += two63m5 - in[3];
 362         out[4] += two63m5 - in[4];
 363         out[5] += two63m5 - in[5];
 364         out[6] += two63m5 - in[6];
 365         out[7] += two63m5 - in[7];
 366         out[8] += two63m5 - in[8];
 367         }
 368
 369 /*-
 370  * felem_diff_128_64 subtracts |in| from |out|
 371  * On entry:
 372  *   in[i] < 2^126
 373  * On exit:
 374  *   out[i] < out[i] + 2^127 - 2^69
 375  */
 376 static void felem_diff128(largefelem out, const largefelem in)
 377         {
 378         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 379         static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
 380         static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
 381
 382         out[0] += (two127m70 - in[0]);
 383         out[1] += (two127m69 - in[1]);
 384         out[2] += (two127m69 - in[2]);
 385         out[3] += (two127m69 - in[3]);
 386         out[4] += (two127m69 - in[4]);
 387         out[5] += (two127m69 - in[5]);
 388         out[6] += (two127m69 - in[6]);
 389         out[7] += (two127m69 - in[7]);
 390         out[8] += (two127m69 - in[8]);
 391         }
 392
 393 /*-
 394  * felem_square sets |out| = |in|^2
 395  * On entry:
 396  *   in[i] < 2^62
 397  * On exit:
 398  *   out[i] < 17 * max(in[i]) * max(in[i])
 399  */
 400 static void felem_square(largefelem out, const felem in)
 401         {
 402         felem inx2, inx4;
 403         felem_scalar(inx2, in, 2);
 404         felem_scalar(inx4, in, 4);
 405
 406         /*-
 407          * We have many cases were we want to do
 408          *   in[x] * in[y] +
 409          *   in[y] * in[x]
 410          * This is obviously just
 411          *   2 * in[x] * in[y]
 412          * However, rather than do the doubling on the 128 bit result, we
 413          * double one of the inputs to the multiplication by reading from
 414          * |inx2| */
 415
 416         out[0] = ((uint128_t) in[0]) * in[0];
 417         out[1] = ((uint128_t) in[0]) * inx2[1];
 418         out[2] = ((uint128_t) in[0]) * inx2[2] +
 419                  ((uint128_t) in[1]) * in[1];
 420         out[3] = ((uint128_t) in[0]) * inx2[3] +
 421                  ((uint128_t) in[1]) * inx2[2];
 422         out[4] = ((uint128_t) in[0]) * inx2[4] +
 423                  ((uint128_t) in[1]) * inx2[3] +
 424                  ((uint128_t) in[2]) * in[2];
 425         out[5] = ((uint128_t) in[0]) * inx2[5] +
 426                  ((uint128_t) in[1]) * inx2[4] +
 427                  ((uint128_t) in[2]) * inx2[3];
 428         out[6] = ((uint128_t) in[0]) * inx2[6] +
 429                  ((uint128_t) in[1]) * inx2[5] +
 430                  ((uint128_t) in[2]) * inx2[4] +
 431                  ((uint128_t) in[3]) * in[3];
 432         out[7] = ((uint128_t) in[0]) * inx2[7] +
 433                  ((uint128_t) in[1]) * inx2[6] +
 434                  ((uint128_t) in[2]) * inx2[5] +
 435                  ((uint128_t) in[3]) * inx2[4];
 436         out[8] = ((uint128_t) in[0]) * inx2[8] +
 437                  ((uint128_t) in[1]) * inx2[7] +
 438                  ((uint128_t) in[2]) * inx2[6] +
 439                  ((uint128_t) in[3]) * inx2[5] +
 440                  ((uint128_t) in[4]) * in[4];
 441
 442         /* The remaining limbs fall above 2^521, with the first falling at
 443          * 2^522. They correspond to locations one bit up from the limbs
 444          * produced above so we would have to multiply by two to align them.
 445          * Again, rather than operate on the 128-bit result, we double one of
 446          * the inputs to the multiplication. If we want to double for both this
 447          * reason, and the reason above, then we end up multiplying by four. */
 448
 449         /* 9 */
 450         out[0] += ((uint128_t) in[1]) * inx4[8] +
 451                   ((uint128_t) in[2]) * inx4[7] +
 452                   ((uint128_t) in[3]) * inx4[6] +
 453                   ((uint128_t) in[4]) * inx4[5];
 454
 455         /* 10 */
 456         out[1] += ((uint128_t) in[2]) * inx4[8] +
 457                   ((uint128_t) in[3]) * inx4[7] +
 458                   ((uint128_t) in[4]) * inx4[6] +
 459                   ((uint128_t) in[5]) * inx2[5];
 460
 461         /* 11 */
 462         out[2] += ((uint128_t) in[3]) * inx4[8] +
 463                   ((uint128_t) in[4]) * inx4[7] +
 464                   ((uint128_t) in[5]) * inx4[6];
 465
 466         /* 12 */
 467         out[3] += ((uint128_t) in[4]) * inx4[8] +
 468                   ((uint128_t) in[5]) * inx4[7] +
 469                   ((uint128_t) in[6]) * inx2[6];
 470
 471         /* 13 */
 472         out[4] += ((uint128_t) in[5]) * inx4[8] +
 473                   ((uint128_t) in[6]) * inx4[7];
 474
 475         /* 14 */
 476         out[5] += ((uint128_t) in[6]) * inx4[8] +
 477                   ((uint128_t) in[7]) * inx2[7];
 478
 479         /* 15 */
 480         out[6] += ((uint128_t) in[7]) * inx4[8];
 481
 482         /* 16 */
 483         out[7] += ((uint128_t) in[8]) * inx2[8];
 484         }
 485
 486 /*-
 487  * felem_mul sets |out| = |in1| * |in2|
 488  * On entry:
 489  *   in1[i] < 2^64
 490  *   in2[i] < 2^63
 491  * On exit:
 492  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 493  */
 494 static void felem_mul(largefelem out, const felem in1, const felem in2)
 495         {
 496         felem in2x2;
 497         felem_scalar(in2x2, in2, 2);
 498
 499         out[0] = ((uint128_t) in1[0]) * in2[0];
 500
 501         out[1] = ((uint128_t) in1[0]) * in2[1] +
 502                  ((uint128_t) in1[1]) * in2[0];
 503
 504         out[2] = ((uint128_t) in1[0]) * in2[2] +
 505                  ((uint128_t) in1[1]) * in2[1] +
 506                  ((uint128_t) in1[2]) * in2[0];
 507
 508         out[3] = ((uint128_t) in1[0]) * in2[3] +
 509                  ((uint128_t) in1[1]) * in2[2] +
 510                  ((uint128_t) in1[2]) * in2[1] +
 511                  ((uint128_t) in1[3]) * in2[0];
 512
 513         out[4] = ((uint128_t) in1[0]) * in2[4] +
 514                  ((uint128_t) in1[1]) * in2[3] +
 515                  ((uint128_t) in1[2]) * in2[2] +
 516                  ((uint128_t) in1[3]) * in2[1] +
 517                  ((uint128_t) in1[4]) * in2[0];
 518
 519         out[5] = ((uint128_t) in1[0]) * in2[5] +
 520                  ((uint128_t) in1[1]) * in2[4] +
 521                  ((uint128_t) in1[2]) * in2[3] +
 522                  ((uint128_t) in1[3]) * in2[2] +
 523                  ((uint128_t) in1[4]) * in2[1] +
 524                  ((uint128_t) in1[5]) * in2[0];
 525
 526         out[6] = ((uint128_t) in1[0]) * in2[6] +
 527                  ((uint128_t) in1[1]) * in2[5] +
 528                  ((uint128_t) in1[2]) * in2[4] +
 529                  ((uint128_t) in1[3]) * in2[3] +
 530                  ((uint128_t) in1[4]) * in2[2] +
 531                  ((uint128_t) in1[5]) * in2[1] +
 532                  ((uint128_t) in1[6]) * in2[0];
 533
 534         out[7] = ((uint128_t) in1[0]) * in2[7] +
 535                  ((uint128_t) in1[1]) * in2[6] +
 536                  ((uint128_t) in1[2]) * in2[5] +
 537                  ((uint128_t) in1[3]) * in2[4] +
 538                  ((uint128_t) in1[4]) * in2[3] +
 539                  ((uint128_t) in1[5]) * in2[2] +
 540                  ((uint128_t) in1[6]) * in2[1] +
 541                  ((uint128_t) in1[7]) * in2[0];
 542
 543         out[8] = ((uint128_t) in1[0]) * in2[8] +
 544                  ((uint128_t) in1[1]) * in2[7] +
 545                  ((uint128_t) in1[2]) * in2[6] +
 546                  ((uint128_t) in1[3]) * in2[5] +
 547                  ((uint128_t) in1[4]) * in2[4] +
 548                  ((uint128_t) in1[5]) * in2[3] +
 549                  ((uint128_t) in1[6]) * in2[2] +
 550                  ((uint128_t) in1[7]) * in2[1] +
 551                  ((uint128_t) in1[8]) * in2[0];
 552
 553         /* See comment in felem_square about the use of in2x2 here */
 554
 555         out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 556                   ((uint128_t) in1[2]) * in2x2[7] +
 557                   ((uint128_t) in1[3]) * in2x2[6] +
 558                   ((uint128_t) in1[4]) * in2x2[5] +
 559                   ((uint128_t) in1[5]) * in2x2[4] +
 560                   ((uint128_t) in1[6]) * in2x2[3] +
 561                   ((uint128_t) in1[7]) * in2x2[2] +
 562                   ((uint128_t) in1[8]) * in2x2[1];
 563
 564         out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 565                   ((uint128_t) in1[3]) * in2x2[7] +
 566                   ((uint128_t) in1[4]) * in2x2[6] +
 567                   ((uint128_t) in1[5]) * in2x2[5] +
 568                   ((uint128_t) in1[6]) * in2x2[4] +
 569                   ((uint128_t) in1[7]) * in2x2[3] +
 570                   ((uint128_t) in1[8]) * in2x2[2];
 571
 572         out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 573                   ((uint128_t) in1[4]) * in2x2[7] +
 574                   ((uint128_t) in1[5]) * in2x2[6] +
 575                   ((uint128_t) in1[6]) * in2x2[5] +
 576                   ((uint128_t) in1[7]) * in2x2[4] +
 577                   ((uint128_t) in1[8]) * in2x2[3];
 578
 579         out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 580                   ((uint128_t) in1[5]) * in2x2[7] +
 581                   ((uint128_t) in1[6]) * in2x2[6] +
 582                   ((uint128_t) in1[7]) * in2x2[5] +
 583                   ((uint128_t) in1[8]) * in2x2[4];
 584
 585         out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 586                   ((uint128_t) in1[6]) * in2x2[7] +
 587                   ((uint128_t) in1[7]) * in2x2[6] +
 588                   ((uint128_t) in1[8]) * in2x2[5];
 589
 590         out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 591                   ((uint128_t) in1[7]) * in2x2[7] +
 592                   ((uint128_t) in1[8]) * in2x2[6];
 593
 594         out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 595                   ((uint128_t) in1[8]) * in2x2[7];
 596
 597         out[7] += ((uint128_t) in1[8]) * in2x2[8];
 598         }
 599
 600 static const limb bottom52bits = 0xfffffffffffff;
 601
 602 /*-
 603  * felem_reduce converts a largefelem to an felem.
 604  * On entry:
 605  *   in[i] < 2^128
 606  * On exit:
 607  *   out[i] < 2^59 + 2^14
 608  */
 609 static void felem_reduce(felem out, const largefelem in)
 610         {
 611         u64 overflow1, overflow2;
 612
 613         out[0] = ((limb) in[0]) & bottom58bits;
 614         out[1] = ((limb) in[1]) & bottom58bits;
 615         out[2] = ((limb) in[2]) & bottom58bits;
 616         out[3] = ((limb) in[3]) & bottom58bits;
 617         out[4] = ((limb) in[4]) & bottom58bits;
 618         out[5] = ((limb) in[5]) & bottom58bits;
 619         out[6] = ((limb) in[6]) & bottom58bits;
 620         out[7] = ((limb) in[7]) & bottom58bits;
 621         out[8] = ((limb) in[8]) & bottom58bits;
 622
 623         /* out[i] < 2^58 */
 624
 625         out[1] += ((limb) in[0]) >> 58;
 626         out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 627         /* out[1] < 2^58 + 2^6 + 2^58
 628          *        = 2^59 + 2^6 */
 629         out[2] += ((limb) (in[0] >> 64)) >> 52;
 630
 631         out[2] += ((limb) in[1]) >> 58;
 632         out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 633         out[3] += ((limb) (in[1] >> 64)) >> 52;
 634
 635         out[3] += ((limb) in[2]) >> 58;
 636         out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 637         out[4] += ((limb) (in[2] >> 64)) >> 52;
 638
 639         out[4] += ((limb) in[3]) >> 58;
 640         out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 641         out[5] += ((limb) (in[3] >> 64)) >> 52;
 642
 643         out[5] += ((limb) in[4]) >> 58;
 644         out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 645         out[6] += ((limb) (in[4] >> 64)) >> 52;
 646
 647         out[6] += ((limb) in[5]) >> 58;
 648         out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 649         out[7] += ((limb) (in[5] >> 64)) >> 52;
 650
 651         out[7] += ((limb) in[6]) >> 58;
 652         out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 653         out[8] += ((limb) (in[6] >> 64)) >> 52;
 654
 655         out[8] += ((limb) in[7]) >> 58;
 656         out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 657         /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 658          *            < 2^59 + 2^13 */
 659         overflow1 = ((limb) (in[7] >> 64)) >> 52;
 660
 661         overflow1 += ((limb) in[8]) >> 58;
 662         overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 663         overflow2 = ((limb) (in[8] >> 64)) >> 52;
 664
 665         overflow1 <<= 1;  /* overflow1 < 2^13 + 2^7 + 2^59 */
 666         overflow2 <<= 1;  /* overflow2 < 2^13 */
 667
 668         out[0] += overflow1;  /* out[0] < 2^60 */
 669         out[1] += overflow2;  /* out[1] < 2^59 + 2^6 + 2^13 */
 670
 671         out[1] += out[0] >> 58; out[0] &= bottom58bits;
 672         /* out[0] < 2^58
 673          * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 674          *        < 2^59 + 2^14 */
 675         }
 676
 677 static void felem_square_reduce(felem out, const felem in)
 678         {
 679         largefelem tmp;
 680         felem_square(tmp, in);
 681         felem_reduce(out, tmp);
 682         }
 683
 684 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 685         {
 686         largefelem tmp;
 687         felem_mul(tmp, in1, in2);
 688         felem_reduce(out, tmp);
 689         }
 690
 691 /*-
 692  * felem_inv calculates |out| = |in|^{-1}
 693  *
 694  * Based on Fermat's Little Theorem:
 695  *   a^p = a (mod p)
 696  *   a^{p-1} = 1 (mod p)
 697  *   a^{p-2} = a^{-1} (mod p)
 698  */
 699 static void felem_inv(felem out, const felem in)
 700         {
 701         felem ftmp, ftmp2, ftmp3, ftmp4;
 702         largefelem tmp;
 703         unsigned i;
 704
 705         felem_square(tmp, in); felem_reduce(ftmp, tmp);         /* 2^1 */
 706         felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);      /* 2^2 - 2^0 */
 707         felem_assign(ftmp2, ftmp);
 708         felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);       /* 2^3 - 2^1 */
 709         felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);      /* 2^3 - 2^0 */
 710         felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);       /* 2^4 - 2^1 */
 711
 712         felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^3 - 2^1 */
 713         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^4 - 2^2 */
 714         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
 715
 716         felem_assign(ftmp2, ftmp3);
 717         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^5 - 2^1 */
 718         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^6 - 2^2 */
 719         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^7 - 2^3 */
 720         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^8 - 2^4 */
 721         felem_assign(ftmp4, ftmp3);
 722         felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);  /* 2^8 - 2^1 */
 723         felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);     /* 2^9 - 2^2 */
 724         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
 725         felem_assign(ftmp2, ftmp3);
 726
 727         for (i = 0; i < 8; i++)
 728                 {
 729                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^16 - 2^8 */
 730                 }
 731         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
 732         felem_assign(ftmp2, ftmp3);
 733
 734         for (i = 0; i < 16; i++)
 735                 {
 736                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^32 - 2^16 */
 737                 }
 738         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
 739         felem_assign(ftmp2, ftmp3);
 740
 741         for (i = 0; i < 32; i++)
 742                 {
 743                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^64 - 2^32 */
 744                 }
 745         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
 746         felem_assign(ftmp2, ftmp3);
 747
 748         for (i = 0; i < 64; i++)
 749                 {
 750                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^128 - 2^64 */
 751                 }
 752         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
 753         felem_assign(ftmp2, ftmp3);
 754
 755         for (i = 0; i < 128; i++)
 756                 {
 757                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^256 - 2^128 */
 758                 }
 759         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
 760         felem_assign(ftmp2, ftmp3);
 761
 762         for (i = 0; i < 256; i++)
 763                 {
 764                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^512 - 2^256 */
 765                 }
 766         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
 767
 768         for (i = 0; i < 9; i++)
 769                 {
 770                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^521 - 2^9 */
 771                 }
 772         felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
 773         felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);      /* 2^512 - 3 */
 774 }
 775
 776 /* This is 2^521-1, expressed as an felem */
 777 static const felem kPrime =
 778         {
 779         0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 780         0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 781         0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 782         };
 783
 784 /*-
 785  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 786  * otherwise.
 787  * On entry:
 788  *   in[i] < 2^59 + 2^14
 789  */
 790 static limb felem_is_zero(const felem in)
 791         {
 792         felem ftmp;
 793         limb is_zero, is_p;
 794         felem_assign(ftmp, in);
 795
 796         ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
 797         /* ftmp[8] < 2^57 */
 798         ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
 799         ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
 800         ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
 801         ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
 802         ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
 803         ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
 804         ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
 805         ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
 806         /* ftmp[8] < 2^57 + 4 */
 807
 808         /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
 809          * greater than our bound for ftmp[8]. Therefore we only have to check
 810          * if the zero is zero or 2^521-1. */
 811
 812         is_zero = 0;
 813         is_zero |= ftmp[0];
 814         is_zero |= ftmp[1];
 815         is_zero |= ftmp[2];
 816         is_zero |= ftmp[3];
 817         is_zero |= ftmp[4];
 818         is_zero |= ftmp[5];
 819         is_zero |= ftmp[6];
 820         is_zero |= ftmp[7];
 821         is_zero |= ftmp[8];
 822
 823         is_zero--;
 824         /* We know that ftmp[i] < 2^63, therefore the only way that the top bit
 825          * can be set is if is_zero was 0 before the decrement. */
 826         is_zero = ((s64) is_zero) >> 63;
 827
 828         is_p = ftmp[0] ^ kPrime[0];
 829         is_p |= ftmp[1] ^ kPrime[1];
 830         is_p |= ftmp[2] ^ kPrime[2];
 831         is_p |= ftmp[3] ^ kPrime[3];
 832         is_p |= ftmp[4] ^ kPrime[4];
 833         is_p |= ftmp[5] ^ kPrime[5];
 834         is_p |= ftmp[6] ^ kPrime[6];
 835         is_p |= ftmp[7] ^ kPrime[7];
 836         is_p |= ftmp[8] ^ kPrime[8];
 837
 838         is_p--;
 839         is_p = ((s64) is_p) >> 63;
 840
 841         is_zero |= is_p;
 842         return is_zero;
 843         }
 844
 845 static int felem_is_zero_int(const felem in)
 846         {
 847         return (int) (felem_is_zero(in) & ((limb)1));
 848         }
 849
 850 /*-
 851  * felem_contract converts |in| to its unique, minimal representation.
 852  * On entry:
 853  *   in[i] < 2^59 + 2^14
 854  */
 855 static void felem_contract(felem out, const felem in)
 856         {
 857         limb is_p, is_greater, sign;
 858         static const limb two58 = ((limb)1) << 58;
 859
 860         felem_assign(out, in);
 861
 862         out[0] += out[8] >> 57; out[8] &= bottom57bits;
 863         /* out[8] < 2^57 */
 864         out[1] += out[0] >> 58; out[0] &= bottom58bits;
 865         out[2] += out[1] >> 58; out[1] &= bottom58bits;
 866         out[3] += out[2] >> 58; out[2] &= bottom58bits;
 867         out[4] += out[3] >> 58; out[3] &= bottom58bits;
 868         out[5] += out[4] >> 58; out[4] &= bottom58bits;
 869         out[6] += out[5] >> 58; out[5] &= bottom58bits;
 870         out[7] += out[6] >> 58; out[6] &= bottom58bits;
 871         out[8] += out[7] >> 58; out[7] &= bottom58bits;
 872         /* out[8] < 2^57 + 4 */
 873
 874         /* If the value is greater than 2^521-1 then we have to subtract
 875          * 2^521-1 out. See the comments in felem_is_zero regarding why we
 876          * don't test for other multiples of the prime. */
 877
 878         /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
 879
 880         is_p = out[0] ^ kPrime[0];
 881         is_p |= out[1] ^ kPrime[1];
 882         is_p |= out[2] ^ kPrime[2];
 883         is_p |= out[3] ^ kPrime[3];
 884         is_p |= out[4] ^ kPrime[4];
 885         is_p |= out[5] ^ kPrime[5];
 886         is_p |= out[6] ^ kPrime[6];
 887         is_p |= out[7] ^ kPrime[7];
 888         is_p |= out[8] ^ kPrime[8];
 889
 890         is_p--;
 891         is_p &= is_p << 32;
 892         is_p &= is_p << 16;
 893         is_p &= is_p << 8;
 894         is_p &= is_p << 4;
 895         is_p &= is_p << 2;
 896         is_p &= is_p << 1;
 897         is_p = ((s64) is_p) >> 63;
 898         is_p = ~is_p;
 899
 900         /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 901
 902         out[0] &= is_p;
 903         out[1] &= is_p;
 904         out[2] &= is_p;
 905         out[3] &= is_p;
 906         out[4] &= is_p;
 907         out[5] &= is_p;
 908         out[6] &= is_p;
 909         out[7] &= is_p;
 910         out[8] &= is_p;
 911
 912         /* In order to test that |out| >= 2^521-1 we need only test if out[8]
 913          * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
 914         is_greater = out[8] >> 57;
 915         is_greater |= is_greater << 32;
 916         is_greater |= is_greater << 16;
 917         is_greater |= is_greater << 8;
 918         is_greater |= is_greater << 4;
 919         is_greater |= is_greater << 2;
 920         is_greater |= is_greater << 1;
 921         is_greater = ((s64) is_greater) >> 63;
 922
 923         out[0] -= kPrime[0] & is_greater;
 924         out[1] -= kPrime[1] & is_greater;
 925         out[2] -= kPrime[2] & is_greater;
 926         out[3] -= kPrime[3] & is_greater;
 927         out[4] -= kPrime[4] & is_greater;
 928         out[5] -= kPrime[5] & is_greater;
 929         out[6] -= kPrime[6] & is_greater;
 930         out[7] -= kPrime[7] & is_greater;
 931         out[8] -= kPrime[8] & is_greater;
 932
 933         /* Eliminate negative coefficients */
 934         sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
 935         sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
 936         sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
 937         sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
 938         sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
 939         sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
 940         sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
 941         sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
 942         sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
 943         sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
 944         sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
 945         }
 946
 947 /*-
 948  * Group operations
 949  * ----------------
 950  *
 951  * Building on top of the field operations we have the operations on the
 952  * elliptic curve group itself. Points on the curve are represented in Jacobian
 953  * coordinates */
 954
 955 /*-
 956  * point_double calcuates 2*(x_in, y_in, z_in)
 957  *
 958  * The method is taken from:
 959  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 960  *
 961  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 962  * while x_out == y_in is not (maybe this works, but it's not tested). */
 963 static void
 964 point_double(felem x_out, felem y_out, felem z_out,
 965              const felem x_in, const felem y_in, const felem z_in)
 966         {
 967         largefelem tmp, tmp2;
 968         felem delta, gamma, beta, alpha, ftmp, ftmp2;
 969
 970         felem_assign(ftmp, x_in);
 971         felem_assign(ftmp2, x_in);
 972
 973         /* delta = z^2 */
 974         felem_square(tmp, z_in);
 975         felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */
 976
 977         /* gamma = y^2 */
 978         felem_square(tmp, y_in);
 979         felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */
 980
 981         /* beta = x*gamma */
 982         felem_mul(tmp, x_in, gamma);
 983         felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */
 984
 985         /* alpha = 3*(x-delta)*(x+delta) */
 986         felem_diff64(ftmp, delta);
 987         /* ftmp[i] < 2^61 */
 988         felem_sum64(ftmp2, delta);
 989         /* ftmp2[i] < 2^60 + 2^15 */
 990         felem_scalar64(ftmp2, 3);
 991         /* ftmp2[i] < 3*2^60 + 3*2^15 */
 992         felem_mul(tmp, ftmp, ftmp2);
 993         /*-
 994          * tmp[i] < 17(3*2^121 + 3*2^76)
 995          *        = 61*2^121 + 61*2^76
 996          *        < 64*2^121 + 64*2^76
 997          *        = 2^127 + 2^82
 998          *        < 2^128
 999          */
1000         felem_reduce(alpha, tmp);
1001
1002         /* x' = alpha^2 - 8*beta */
1003         felem_square(tmp, alpha);
1004         /* tmp[i] < 17*2^120
1005          *        < 2^125 */
1006         felem_assign(ftmp, beta);
1007         felem_scalar64(ftmp, 8);
1008         /* ftmp[i] < 2^62 + 2^17 */
1009         felem_diff_128_64(tmp, ftmp);
1010         /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1011         felem_reduce(x_out, tmp);
1012
1013         /* z' = (y + z)^2 - gamma - delta */
1014         felem_sum64(delta, gamma);
1015         /* delta[i] < 2^60 + 2^15 */
1016         felem_assign(ftmp, y_in);
1017         felem_sum64(ftmp, z_in);
1018         /* ftmp[i] < 2^60 + 2^15 */
1019         felem_square(tmp, ftmp);
1020         /* tmp[i] < 17(2^122)
1021          *        < 2^127 */
1022         felem_diff_128_64(tmp, delta);
1023         /* tmp[i] < 2^127 + 2^63 */
1024         felem_reduce(z_out, tmp);
1025
1026         /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1027         felem_scalar64(beta, 4);
1028         /* beta[i] < 2^61 + 2^16 */
1029         felem_diff64(beta, x_out);
1030         /* beta[i] < 2^61 + 2^60 + 2^16 */
1031         felem_mul(tmp, alpha, beta);
1032         /*-
1033          * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1034          *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1035          *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1036          *        < 2^128
1037          */
1038         felem_square(tmp2, gamma);
1039         /*-
1040          * tmp2[i] < 17*(2^59 + 2^14)^2
1041          *         = 17*(2^118 + 2^74 + 2^28)
1042          */
1043         felem_scalar128(tmp2, 8);
1044         /*-
1045          * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1046          *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1047          *         < 2^126
1048          */
1049         felem_diff128(tmp, tmp2);
1050         /*-
1051          * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1052          *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1053          *          2^74 + 2^69 + 2^34 + 2^30
1054          *        < 2^128
1055          */
1056         felem_reduce(y_out, tmp);
1057         }
1058
1059 /* copy_conditional copies in to out iff mask is all ones. */
1060 static void
1061 copy_conditional(felem out, const felem in, limb mask)
1062         {
1063         unsigned i;
1064         for (i = 0; i < NLIMBS; ++i)
1065                 {
1066                 const limb tmp = mask & (in[i] ^ out[i]);
1067                 out[i] ^= tmp;
1068                 }
1069         }
1070
1071 /*-
1072  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1073  *
1074  * The method is taken from
1075  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1076  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1077  *
1078  * This function includes a branch for checking whether the two input points
1079  * are equal (while not equal to the point at infinity). This case never
1080  * happens during single point multiplication, so there is no timing leak for
1081  * ECDH or ECDSA signing. */
1082 static void point_add(felem x3, felem y3, felem z3,
1083         const felem x1, const felem y1, const felem z1,
1084         const int mixed, const felem x2, const felem y2, const felem z2)
1085         {
1086         felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1087         largefelem tmp, tmp2;
1088         limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1089
1090         z1_is_zero = felem_is_zero(z1);
1091         z2_is_zero = felem_is_zero(z2);
1092
1093         /* ftmp = z1z1 = z1**2 */
1094         felem_square(tmp, z1);
1095         felem_reduce(ftmp, tmp);
1096
1097         if (!mixed)
1098                 {
1099                 /* ftmp2 = z2z2 = z2**2 */
1100                 felem_square(tmp, z2);
1101                 felem_reduce(ftmp2, tmp);
1102
1103                 /* u1 = ftmp3 = x1*z2z2 */
1104                 felem_mul(tmp, x1, ftmp2);
1105                 felem_reduce(ftmp3, tmp);
1106
1107                 /* ftmp5 = z1 + z2 */
1108                 felem_assign(ftmp5, z1);
1109                 felem_sum64(ftmp5, z2);
1110                 /* ftmp5[i] < 2^61 */
1111
1112                 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1113                 felem_square(tmp, ftmp5);
1114                 /* tmp[i] < 17*2^122 */
1115                 felem_diff_128_64(tmp, ftmp);
1116                 /* tmp[i] < 17*2^122 + 2^63 */
1117                 felem_diff_128_64(tmp, ftmp2);
1118                 /* tmp[i] < 17*2^122 + 2^64 */
1119                 felem_reduce(ftmp5, tmp);
1120
1121                 /* ftmp2 = z2 * z2z2 */
1122                 felem_mul(tmp, ftmp2, z2);
1123                 felem_reduce(ftmp2, tmp);
1124
1125                 /* s1 = ftmp6 = y1 * z2**3 */
1126                 felem_mul(tmp, y1, ftmp2);
1127                 felem_reduce(ftmp6, tmp);
1128                 }
1129         else
1130                 {
1131                 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1132
1133                 /* u1 = ftmp3 = x1*z2z2 */
1134                 felem_assign(ftmp3, x1);
1135
1136                 /* ftmp5 = 2*z1z2 */
1137                 felem_scalar(ftmp5, z1, 2);
1138
1139                 /* s1 = ftmp6 = y1 * z2**3 */
1140                 felem_assign(ftmp6, y1);
1141                 }
1142
1143         /* u2 = x2*z1z1 */
1144         felem_mul(tmp, x2, ftmp);
1145         /* tmp[i] < 17*2^120 */
1146
1147         /* h = ftmp4 = u2 - u1 */
1148         felem_diff_128_64(tmp, ftmp3);
1149         /* tmp[i] < 17*2^120 + 2^63 */
1150         felem_reduce(ftmp4, tmp);
1151
1152         x_equal = felem_is_zero(ftmp4);
1153
1154         /* z_out = ftmp5 * h */
1155         felem_mul(tmp, ftmp5, ftmp4);
1156         felem_reduce(z_out, tmp);
1157
1158         /* ftmp = z1 * z1z1 */
1159         felem_mul(tmp, ftmp, z1);
1160         felem_reduce(ftmp, tmp);
1161
1162         /* s2 = tmp = y2 * z1**3 */
1163         felem_mul(tmp, y2, ftmp);
1164         /* tmp[i] < 17*2^120 */
1165
1166         /* r = ftmp5 = (s2 - s1)*2 */
1167         felem_diff_128_64(tmp, ftmp6);
1168         /* tmp[i] < 17*2^120 + 2^63 */
1169         felem_reduce(ftmp5, tmp);
1170         y_equal = felem_is_zero(ftmp5);
1171         felem_scalar64(ftmp5, 2);
1172         /* ftmp5[i] < 2^61 */
1173
1174         if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1175                 {
1176                 point_double(x3, y3, z3, x1, y1, z1);
1177                 return;
1178                 }
1179
1180         /* I = ftmp = (2h)**2 */
1181         felem_assign(ftmp, ftmp4);
1182         felem_scalar64(ftmp, 2);
1183         /* ftmp[i] < 2^61 */
1184         felem_square(tmp, ftmp);
1185         /* tmp[i] < 17*2^122 */
1186         felem_reduce(ftmp, tmp);
1187
1188         /* J = ftmp2 = h * I */
1189         felem_mul(tmp, ftmp4, ftmp);
1190         felem_reduce(ftmp2, tmp);
1191
1192         /* V = ftmp4 = U1 * I */
1193         felem_mul(tmp, ftmp3, ftmp);
1194         felem_reduce(ftmp4, tmp);
1195
1196         /* x_out = r**2 - J - 2V */
1197         felem_square(tmp, ftmp5);
1198         /* tmp[i] < 17*2^122 */
1199         felem_diff_128_64(tmp, ftmp2);
1200         /* tmp[i] < 17*2^122 + 2^63 */
1201         felem_assign(ftmp3, ftmp4);
1202         felem_scalar64(ftmp4, 2);
1203         /* ftmp4[i] < 2^61 */
1204         felem_diff_128_64(tmp, ftmp4);
1205         /* tmp[i] < 17*2^122 + 2^64 */
1206         felem_reduce(x_out, tmp);
1207
1208         /* y_out = r(V-x_out) - 2 * s1 * J */
1209         felem_diff64(ftmp3, x_out);
1210         /* ftmp3[i] < 2^60 + 2^60
1211          *          = 2^61 */
1212         felem_mul(tmp, ftmp5, ftmp3);
1213         /* tmp[i] < 17*2^122 */
1214         felem_mul(tmp2, ftmp6, ftmp2);
1215         /* tmp2[i] < 17*2^120 */
1216         felem_scalar128(tmp2, 2);
1217         /* tmp2[i] < 17*2^121 */
1218         felem_diff128(tmp, tmp2);
1219         /* tmp[i] < 2^127 - 2^69 + 17*2^122
1220          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1221          *        < 2^127 */
1222         felem_reduce(y_out, tmp);
1223
1224         copy_conditional(x_out, x2, z1_is_zero);
1225         copy_conditional(x_out, x1, z2_is_zero);
1226         copy_conditional(y_out, y2, z1_is_zero);
1227         copy_conditional(y_out, y1, z2_is_zero);
1228         copy_conditional(z_out, z2, z1_is_zero);
1229         copy_conditional(z_out, z1, z2_is_zero);
1230         felem_assign(x3, x_out);
1231         felem_assign(y3, y_out);
1232         felem_assign(z3, z_out);
1233         }
1234
1235 /*-
1236  * Base point pre computation
1237  * --------------------------
1238  *
1239  * Two different sorts of precomputed tables are used in the following code.
1240  * Each contain various points on the curve, where each point is three field
1241  * elements (x, y, z).
1242  *
1243  * For the base point table, z is usually 1 (0 for the point at infinity).
1244  * This table has 16 elements:
1245  * index | bits    | point
1246  * ------+---------+------------------------------
1247  *     0 | 0 0 0 0 | 0G
1248  *     1 | 0 0 0 1 | 1G
1249  *     2 | 0 0 1 0 | 2^130G
1250  *     3 | 0 0 1 1 | (2^130 + 1)G
1251  *     4 | 0 1 0 0 | 2^260G
1252  *     5 | 0 1 0 1 | (2^260 + 1)G
1253  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1254  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1255  *     8 | 1 0 0 0 | 2^390G
1256  *     9 | 1 0 0 1 | (2^390 + 1)G
1257  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1258  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1259  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1260  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1261  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1262  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1263  *
1264  * The reason for this is so that we can clock bits into four different
1265  * locations when doing simple scalar multiplies against the base point.
1266  *
1267  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1268
1269 /* gmul is the table of precomputed base points */
1270 static const felem gmul[16][3] =
1271         {{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1272           {0, 0, 0, 0, 0, 0, 0, 0, 0},
1273           {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1274          {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1275            0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1276            0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1277           {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1278            0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1279            0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1280           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1281          {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1282            0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1283            0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1284           {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1285            0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1286            0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1287           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1288          {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1289            0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1290            0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1291           {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1292            0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1293            0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1294           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1295          {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1296            0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1297            0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1298           {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1299            0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1300            0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1301           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1302          {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1303            0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1304            0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1305           {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1306            0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1307            0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1308           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1309          {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1310            0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1311            0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1312           {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1313            0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1314            0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1315           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1316          {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1317            0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1318            0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1319           {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1320            0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1321            0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1322           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1323          {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1324            0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1325            0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1326           {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1327            0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1328            0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1329           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1330          {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1331            0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1332            0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1333           {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1334            0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1335            0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1336           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1337          {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1338            0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1339            0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1340           {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1341            0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1342            0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1343           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1344          {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1345            0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1346            0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1347           {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1348            0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1349            0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1350           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1351          {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1352            0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1353            0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1354           {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1355            0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1356            0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1357           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1358          {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1359            0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1360            0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1361           {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1362            0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1363            0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1364           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1365          {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1366            0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1367            0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1368           {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1369            0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1370            0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1371           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1372          {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1373            0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1374            0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1375           {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1376            0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1377            0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1378          {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1379
1380 /* select_point selects the |idx|th point from a precomputation table and
1381  * copies it to out. */
1382 static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
1383                          felem out[3])
1384         {
1385         unsigned i, j;
1386         limb *outlimbs = &out[0][0];
1387         memset(outlimbs, 0, 3 * sizeof(felem));
1388
1389         for (i = 0; i < size; i++)
1390                 {
1391                 const limb *inlimbs = &pre_comp[i][0][0];
1392                 limb mask = i ^ idx;
1393                 mask |= mask >> 4;
1394                 mask |= mask >> 2;
1395                 mask |= mask >> 1;
1396                 mask &= 1;
1397                 mask--;
1398                 for (j = 0; j < NLIMBS * 3; j++)
1399                         outlimbs[j] |= inlimbs[j] & mask;
1400                 }
1401         }
1402
1403 /* get_bit returns the |i|th bit in |in| */
1404 static char get_bit(const felem_bytearray in, int i)
1405         {
1406         if (i < 0)
1407                 return 0;
1408         return (in[i >> 3] >> (i & 7)) & 1;
1409         }
1410
1411 /* Interleaved point multiplication using precomputed point multiples:
1412  * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1413  * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1414  * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1415  * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1416 static void batch_mul(felem x_out, felem y_out, felem z_out,
1417         const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1418         const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1419         {
1420         int i, skip;
1421         unsigned num, gen_mul = (g_scalar != NULL);
1422         felem nq[3], tmp[4];
1423         limb bits;
1424         u8 sign, digit;
1425
1426         /* set nq to the point at infinity */
1427         memset(nq, 0, 3 * sizeof(felem));
1428
1429         /* Loop over all scalars msb-to-lsb, interleaving additions
1430          * of multiples of the generator (last quarter of rounds)
1431          * and additions of other points multiples (every 5th round).
1432          */
1433         skip = 1; /* save two point operations in the first round */
1434         for (i = (num_points ? 520 : 130); i >= 0; --i)
1435                 {
1436                 /* double */
1437                 if (!skip)
1438                         point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1439
1440                 /* add multiples of the generator */
1441                 if (gen_mul && (i <= 130))
1442                         {
1443                         bits = get_bit(g_scalar, i + 390) << 3;
1444                         if (i < 130)
1445                                 {
1446                                 bits |= get_bit(g_scalar, i + 260) << 2;
1447                                 bits |= get_bit(g_scalar, i + 130) << 1;
1448                                 bits |= get_bit(g_scalar, i);
1449                                 }
1450                         /* select the point to add, in constant time */
1451                         select_point(bits, 16, g_pre_comp, tmp);
1452                         if (!skip)
1453                                 {
1454                                 point_add(nq[0], nq[1], nq[2],
1455                                         nq[0], nq[1], nq[2],
1456                                         1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1457                                 }
1458                         else
1459                                 {
1460                                 memcpy(nq, tmp, 3 * sizeof(felem));
1461                                 skip = 0;
1462                                 }
1463                         }
1464
1465                 /* do other additions every 5 doublings */
1466                 if (num_points && (i % 5 == 0))
1467                         {
1468                         /* loop over all scalars */
1469                         for (num = 0; num < num_points; ++num)
1470                                 {
1471                                 bits = get_bit(scalars[num], i + 4) << 5;
1472                                 bits |= get_bit(scalars[num], i + 3) << 4;
1473                                 bits |= get_bit(scalars[num], i + 2) << 3;
1474                                 bits |= get_bit(scalars[num], i + 1) << 2;
1475                                 bits |= get_bit(scalars[num], i) << 1;
1476                                 bits |= get_bit(scalars[num], i - 1);
1477                                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1478
1479                                 /* select the point to add or subtract, in constant time */
1480                                 select_point(digit, 17, pre_comp[num], tmp);
1481                                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
1482                                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1483
1484                                 if (!skip)
1485                                         {
1486                                         point_add(nq[0], nq[1], nq[2],
1487                                                 nq[0], nq[1], nq[2],
1488                                                 mixed, tmp[0], tmp[1], tmp[2]);
1489                                         }
1490                                 else
1491                                         {
1492                                         memcpy(nq, tmp, 3 * sizeof(felem));
1493                                         skip = 0;
1494                                         }
1495                                 }
1496                         }
1497                 }
1498         felem_assign(x_out, nq[0]);
1499         felem_assign(y_out, nq[1]);
1500         felem_assign(z_out, nq[2]);
1501         }
1502
1503
1504 /* Precomputation for the group generator. */
1505 typedef struct {
1506         felem g_pre_comp[16][3];
1507         int references;
1508 } NISTP521_PRE_COMP;
1509
1510 const EC_METHOD *EC_GFp_nistp521_method(void)
1511         {
1512         static const EC_METHOD ret = {
1513                 EC_FLAGS_DEFAULT_OCT,
1514                 NID_X9_62_prime_field,
1515                 ec_GFp_nistp521_group_init,
1516                 ec_GFp_simple_group_finish,
1517                 ec_GFp_simple_group_clear_finish,
1518                 ec_GFp_nist_group_copy,
1519                 ec_GFp_nistp521_group_set_curve,
1520                 ec_GFp_simple_group_get_curve,
1521                 ec_GFp_simple_group_get_degree,
1522                 ec_GFp_simple_group_check_discriminant,
1523                 ec_GFp_simple_point_init,
1524                 ec_GFp_simple_point_finish,
1525                 ec_GFp_simple_point_clear_finish,
1526                 ec_GFp_simple_point_copy,
1527                 ec_GFp_simple_point_set_to_infinity,
1528                 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1529                 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1530                 ec_GFp_simple_point_set_affine_coordinates,
1531                 ec_GFp_nistp521_point_get_affine_coordinates,
1532                 0 /* point_set_compressed_coordinates */,
1533                 0 /* point2oct */,
1534                 0 /* oct2point */,
1535                 ec_GFp_simple_add,
1536                 ec_GFp_simple_dbl,
1537                 ec_GFp_simple_invert,
1538                 ec_GFp_simple_is_at_infinity,
1539                 ec_GFp_simple_is_on_curve,
1540                 ec_GFp_simple_cmp,
1541                 ec_GFp_simple_make_affine,
1542                 ec_GFp_simple_points_make_affine,
1543                 ec_GFp_nistp521_points_mul,
1544                 ec_GFp_nistp521_precompute_mult,
1545                 ec_GFp_nistp521_have_precompute_mult,
1546                 ec_GFp_nist_field_mul,
1547                 ec_GFp_nist_field_sqr,
1548                 0 /* field_div */,
1549                 0 /* field_encode */,
1550                 0 /* field_decode */,
1551                 0 /* field_set_to_one */ };
1552
1553         return &ret;
1554         }
1555
1556
1557 /******************************************************************************/
1558 /*                     FUNCTIONS TO MANAGE PRECOMPUTATION
1559  */
1560
1561 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1562         {
1563         NISTP521_PRE_COMP *ret = NULL;
1564         ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1565         if (!ret)
1566                 {
1567                 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1568                 return ret;
1569                 }
1570         memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1571         ret->references = 1;
1572         return ret;
1573         }
1574
1575 static void *nistp521_pre_comp_dup(void *src_)
1576         {
1577         NISTP521_PRE_COMP *src = src_;
1578
1579         /* no need to actually copy, these objects never change! */
1580         CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1581
1582         return src_;
1583         }
1584
1585 static void nistp521_pre_comp_free(void *pre_)
1586         {
1587         int i;
1588         NISTP521_PRE_COMP *pre = pre_;
1589
1590         if (!pre)
1591                 return;
1592
1593         i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1594         if (i > 0)
1595                 return;
1596
1597         OPENSSL_free(pre);
1598         }
1599
1600 static void nistp521_pre_comp_clear_free(void *pre_)
1601         {
1602         int i;
1603         NISTP521_PRE_COMP *pre = pre_;
1604
1605         if (!pre)
1606                 return;
1607
1608         i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1609         if (i > 0)
1610                 return;
1611
1612         OPENSSL_cleanse(pre, sizeof(*pre));
1613         OPENSSL_free(pre);
1614         }
1615
1616 /******************************************************************************/
1617 /*                         OPENSSL EC_METHOD FUNCTIONS
1618  */
1619
1620 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1621         {
1622         int ret;
1623         ret = ec_GFp_simple_group_init(group);
1624         group->a_is_minus3 = 1;
1625         return ret;
1626         }
1627
1628 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1629         const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1630         {
1631         int ret = 0;
1632         BN_CTX *new_ctx = NULL;
1633         BIGNUM *curve_p, *curve_a, *curve_b;
1634
1635         if (ctx == NULL)
1636                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1637         BN_CTX_start(ctx);
1638         if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1639                 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1640                 ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1641         BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1642         BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1643         BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1644         if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1645                 (BN_cmp(curve_b, b)))
1646                 {
1647                 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1648                         EC_R_WRONG_CURVE_PARAMETERS);
1649                 goto err;
1650                 }
1651         group->field_mod_func = BN_nist_mod_521;
1652         ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1653 err:
1654         BN_CTX_end(ctx);
1655         if (new_ctx != NULL)
1656                 BN_CTX_free(new_ctx);
1657         return ret;
1658         }
1659
1660 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1661  * (X', Y') = (X/Z^2, Y/Z^3) */
1662 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1663         const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1664         {
1665         felem z1, z2, x_in, y_in, x_out, y_out;
1666         largefelem tmp;
1667
1668         if (EC_POINT_is_at_infinity(group, point))
1669                 {
1670                 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1671                         EC_R_POINT_AT_INFINITY);
1672                 return 0;
1673                 }
1674         if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1675                 (!BN_to_felem(z1, &point->Z))) return 0;
1676         felem_inv(z2, z1);
1677         felem_square(tmp, z2); felem_reduce(z1, tmp);
1678         felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1679         felem_contract(x_out, x_in);
1680         if (x != NULL)
1681                 {
1682                 if (!felem_to_BN(x, x_out))
1683                         {
1684                         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1685                         return 0;
1686                         }
1687                 }
1688         felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1689         felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1690         felem_contract(y_out, y_in);
1691         if (y != NULL)
1692                 {
1693                 if (!felem_to_BN(y, y_out))
1694                         {
1695                         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1696                         return 0;
1697                         }
1698                 }
1699         return 1;
1700         }
1701
1702 static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
1703         {
1704         /* Runs in constant time, unless an input is the point at infinity
1705          * (which normally shouldn't happen). */
1706         ec_GFp_nistp_points_make_affine_internal(
1707                 num,
1708                 points,
1709                 sizeof(felem),
1710                 tmp_felems,
1711                 (void (*)(void *)) felem_one,
1712                 (int (*)(const void *)) felem_is_zero_int,
1713                 (void (*)(void *, const void *)) felem_assign,
1714                 (void (*)(void *, const void *)) felem_square_reduce,
1715                 (void (*)(void *, const void *, const void *)) felem_mul_reduce,
1716                 (void (*)(void *, const void *)) felem_inv,
1717                 (void (*)(void *, const void *)) felem_contract);
1718         }
1719
1720 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1721  * Result is stored in r (r can equal one of the inputs). */
1722 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1723         const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1724         const BIGNUM *scalars[], BN_CTX *ctx)
1725         {
1726         int ret = 0;
1727         int j;
1728         int mixed = 0;
1729         BN_CTX *new_ctx = NULL;
1730         BIGNUM *x, *y, *z, *tmp_scalar;
1731         felem_bytearray g_secret;
1732         felem_bytearray *secrets = NULL;
1733         felem (*pre_comp)[17][3] = NULL;
1734         felem *tmp_felems = NULL;
1735         felem_bytearray tmp;
1736         unsigned i, num_bytes;
1737         int have_pre_comp = 0;
1738         size_t num_points = num;
1739         felem x_in, y_in, z_in, x_out, y_out, z_out;
1740         NISTP521_PRE_COMP *pre = NULL;
1741         felem (*g_pre_comp)[3] = NULL;
1742         EC_POINT *generator = NULL;
1743         const EC_POINT *p = NULL;
1744         const BIGNUM *p_scalar = NULL;
1745
1746         if (ctx == NULL)
1747                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1748         BN_CTX_start(ctx);
1749         if (((x = BN_CTX_get(ctx)) == NULL) ||
1750                 ((y = BN_CTX_get(ctx)) == NULL) ||
1751                 ((z = BN_CTX_get(ctx)) == NULL) ||
1752                 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1753                 goto err;
1754
1755         if (scalar != NULL)
1756                 {
1757                 pre = EC_EX_DATA_get_data(group->extra_data,
1758                         nistp521_pre_comp_dup, nistp521_pre_comp_free,
1759                         nistp521_pre_comp_clear_free);
1760                 if (pre)
1761                         /* we have precomputation, try to use it */
1762                         g_pre_comp = &pre->g_pre_comp[0];
1763                 else
1764                         /* try to use the standard precomputation */
1765                         g_pre_comp = (felem (*)[3]) gmul;
1766                 generator = EC_POINT_new(group);
1767                 if (generator == NULL)
1768                         goto err;
1769                 /* get the generator from precomputation */
1770                 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1771                         !felem_to_BN(y, g_pre_comp[1][1]) ||
1772                         !felem_to_BN(z, g_pre_comp[1][2]))
1773                         {
1774                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1775                         goto err;
1776                         }
1777                 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1778                                 generator, x, y, z, ctx))
1779                         goto err;
1780                 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1781                         /* precomputation matches generator */
1782                         have_pre_comp = 1;
1783                 else
1784                         /* we don't have valid precomputation:
1785                          * treat the generator as a random point */
1786                         num_points++;
1787                 }
1788
1789         if (num_points > 0)
1790                 {
1791                 if (num_points >= 2)
1792                         {
1793                         /* unless we precompute multiples for just one point,
1794                          * converting those into affine form is time well spent  */
1795                         mixed = 1;
1796                         }
1797                 secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1798                 pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1799                 if (mixed)
1800                         tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1801                 if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1802                         {
1803                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1804                         goto err;
1805                         }
1806
1807                 /* we treat NULL scalars as 0, and NULL points as points at infinity,
1808                  * i.e., they contribute nothing to the linear combination */
1809                 memset(secrets, 0, num_points * sizeof(felem_bytearray));
1810                 memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1811                 for (i = 0; i < num_points; ++i)
1812                         {
1813                         if (i == num)
1814                                 /* we didn't have a valid precomputation, so we pick
1815                                  * the generator */
1816                                 {
1817                                 p = EC_GROUP_get0_generator(group);
1818                                 p_scalar = scalar;
1819                                 }
1820                         else
1821                                 /* the i^th point */
1822                                 {
1823                                 p = points[i];
1824                                 p_scalar = scalars[i];
1825                                 }
1826                         if ((p_scalar != NULL) && (p != NULL))
1827                                 {
1828                                 /* reduce scalar to 0 <= scalar < 2^521 */
1829                                 if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1830                                         {
1831                                         /* this is an unusual input, and we don't guarantee
1832                                          * constant-timeness */
1833                                         if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1834                                                 {
1835                                                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1836                                                 goto err;
1837                                                 }
1838                                         num_bytes = BN_bn2bin(tmp_scalar, tmp);
1839                                         }
1840                                 else
1841                                         num_bytes = BN_bn2bin(p_scalar, tmp);
1842                                 flip_endian(secrets[i], tmp, num_bytes);
1843                                 /* precompute multiples */
1844                                 if ((!BN_to_felem(x_out, &p->X)) ||
1845                                         (!BN_to_felem(y_out, &p->Y)) ||
1846                                         (!BN_to_felem(z_out, &p->Z))) goto err;
1847                                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1848                                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1849                                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1850                                 for (j = 2; j <= 16; ++j)
1851                                         {
1852                                         if (j & 1)
1853                                                 {
1854                                                 point_add(
1855                                                         pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1856                                                         pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1857                                                         0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1858                                                 }
1859                                         else
1860                                                 {
1861                                                 point_double(
1862                                                         pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1863                                                         pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1864                                                 }
1865                                         }
1866                                 }
1867                         }
1868                 if (mixed)
1869                         make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1870                 }
1871
1872         /* the scalar for the generator */
1873         if ((scalar != NULL) && (have_pre_comp))
1874                 {
1875                 memset(g_secret, 0, sizeof(g_secret));
1876                 /* reduce scalar to 0 <= scalar < 2^521 */
1877                 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1878                         {
1879                         /* this is an unusual input, and we don't guarantee
1880                          * constant-timeness */
1881                         if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1882                                 {
1883                                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1884                                 goto err;
1885                                 }
1886                         num_bytes = BN_bn2bin(tmp_scalar, tmp);
1887                         }
1888                 else
1889                         num_bytes = BN_bn2bin(scalar, tmp);
1890                 flip_endian(g_secret, tmp, num_bytes);
1891                 /* do the multiplication with generator precomputation*/
1892                 batch_mul(x_out, y_out, z_out,
1893                         (const felem_bytearray (*)) secrets, num_points,
1894                         g_secret,
1895                         mixed, (const felem (*)[17][3]) pre_comp,
1896                         (const felem (*)[3]) g_pre_comp);
1897                 }
1898         else
1899                 /* do the multiplication without generator precomputation */
1900                 batch_mul(x_out, y_out, z_out,
1901                         (const felem_bytearray (*)) secrets, num_points,
1902                         NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1903         /* reduce the output to its unique minimal representation */
1904         felem_contract(x_in, x_out);
1905         felem_contract(y_in, y_out);
1906         felem_contract(z_in, z_out);
1907         if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1908                 (!felem_to_BN(z, z_in)))
1909                 {
1910                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1911                 goto err;
1912                 }
1913         ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1914
1915 err:
1916         BN_CTX_end(ctx);
1917         if (generator != NULL)
1918                 EC_POINT_free(generator);
1919         if (new_ctx != NULL)
1920                 BN_CTX_free(new_ctx);
1921         if (secrets != NULL)
1922                 OPENSSL_free(secrets);
1923         if (pre_comp != NULL)
1924                 OPENSSL_free(pre_comp);
1925         if (tmp_felems != NULL)
1926                 OPENSSL_free(tmp_felems);
1927         return ret;
1928         }
1929
1930 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1931         {
1932         int ret = 0;
1933         NISTP521_PRE_COMP *pre = NULL;
1934         int i, j;
1935         BN_CTX *new_ctx = NULL;
1936         BIGNUM *x, *y;
1937         EC_POINT *generator = NULL;
1938         felem tmp_felems[16];
1939
1940         /* throw away old precomputation */
1941         EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1942                 nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1943         if (ctx == NULL)
1944                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1945         BN_CTX_start(ctx);
1946         if (((x = BN_CTX_get(ctx)) == NULL) ||
1947                 ((y = BN_CTX_get(ctx)) == NULL))
1948                 goto err;
1949         /* get the generator */
1950         if (group->generator == NULL) goto err;
1951         generator = EC_POINT_new(group);
1952         if (generator == NULL)
1953                 goto err;
1954         BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
1955         BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
1956         if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1957                 goto err;
1958         if ((pre = nistp521_pre_comp_new()) == NULL)
1959                 goto err;
1960         /* if the generator is the standard one, use built-in precomputation */
1961         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1962                 {
1963                 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1964                 ret = 1;
1965                 goto err;
1966                 }
1967         if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1968                 (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1969                 (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1970                 goto err;
1971         /* compute 2^130*G, 2^260*G, 2^390*G */
1972         for (i = 1; i <= 4; i <<= 1)
1973                 {
1974                 point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1975                         pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1976                         pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1977                 for (j = 0; j < 129; ++j)
1978                         {
1979                         point_double(pre->g_pre_comp[2*i][0],
1980                                 pre->g_pre_comp[2*i][1],
1981                                 pre->g_pre_comp[2*i][2],
1982                                 pre->g_pre_comp[2*i][0],
1983                                 pre->g_pre_comp[2*i][1],
1984                                 pre->g_pre_comp[2*i][2]);
1985                         }
1986                 }
1987         /* g_pre_comp[0] is the point at infinity */
1988         memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1989         /* the remaining multiples */
1990         /* 2^130*G + 2^260*G */
1991         point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1992                 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1993                 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1994                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1995                 pre->g_pre_comp[2][2]);
1996         /* 2^130*G + 2^390*G */
1997         point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1998                 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1999                 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2000                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2001                 pre->g_pre_comp[2][2]);
2002         /* 2^260*G + 2^390*G */
2003         point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2004                 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2005                 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2006                 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2007                 pre->g_pre_comp[4][2]);
2008         /* 2^130*G + 2^260*G + 2^390*G */
2009         point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2010                 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2011                 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2012                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2013                 pre->g_pre_comp[2][2]);
2014         for (i = 1; i < 8; ++i)
2015                 {
2016                 /* odd multiples: add G */
2017                 point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
2018                         pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
2019                         pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
2020                         0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2021                         pre->g_pre_comp[1][2]);
2022                 }
2023         make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2024
2025         if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2026                         nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
2027                 goto err;
2028         ret = 1;
2029         pre = NULL;
2030  err:
2031         BN_CTX_end(ctx);
2032         if (generator != NULL)
2033                 EC_POINT_free(generator);
2034         if (new_ctx != NULL)
2035                 BN_CTX_free(new_ctx);
2036         if (pre)
2037                 nistp521_pre_comp_free(pre);
2038         return ret;
2039         }
2040
2041 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2042         {
2043         if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2044                         nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2045                 != NULL)
2046                 return 1;
2047         else
2048                 return 0;
2049         }
2050
2051 #else
2052 static void *dummy=&dummy;
2053 #endif