crypto/ec/ecp_nistp256.c

   1 /*
   2  * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/opensslconf.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <stdint.h>
  40 # include <string.h>
  41 # include <openssl/err.h>
  42 # include "ec_lcl.h"
  43
/*
 * The arithmetic in this file needs 128-bit integers, which standard C does
 * not provide. When the compiler identifies itself as GCC 3.1 or later the
 * built-in __uint128_t / __int128_t types are given shorter names; otherwise
 * compilation stops with an #error.
 */
# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
typedef __int128_t int128_t;
# else
#  error "Need GCC 3.1 or later to define type uint128_t"
# endif

/* Short aliases for the fixed-width unsigned types from <stdint.h>. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
  56
/*
 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
 * can serialise an element of this field into 32 bytes (256 bits). We call
 * this a felem_bytearray. The type itself does not fix a byte order: the
 * curve parameter table stores big-endian values, while bin32_to_felem and
 * smallfelem_to_bin32 work on little-endian arrays.
 */

typedef u8 felem_bytearray[32];
  64
/*
 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
 * values are big-endian.
 *
 * The table holds five 32-byte rows, in this order: the field prime p, the
 * curve coefficient a (stored as p - 3, i.e. -3 mod p), the curve coefficient
 * b, and the coordinates x and y (presumably those of the curve's base point;
 * confirm against the FIPS text).
 */
static const felem_bytearray nistp256_curve_params[5] = {
    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
};
  91
/*-
 * The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with either four 128-bit values, eight 128-bit
 * values, or four 64-bit values. The field element represented is:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
 * or:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
 *
 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 * apart, but are 128 bits wide, the most significant bits of each limb overlap
 * with the least significant bits of the next.
 *
 * A field element with four limbs is an 'felem'. One with eight limbs is a
 * 'longfelem'.
 *
 * A field element with four 64-bit values is called a 'smallfelem'. Small
 * values are used as intermediate values before multiplication.
 */

/* Number of 64-bit-spaced positions in an felem or smallfelem. */
# define NLIMBS 4

typedef uint128_t limb;               /* one 128-bit limb */
typedef limb felem[NLIMBS];           /* four limbs */
typedef limb longfelem[NLIMBS * 2];   /* eight limbs, e.g. an unreduced product */
typedef u64 smallfelem[NLIMBS];       /* four 64-bit words */
 119
/*
 * This is the value of the prime as four 64-bit words, little-endian: word i
 * carries weight 2^(64*i), so the array spells out
 * 2^256 - 2^224 + 2^192 + 2^96 - 1.
 */
static const u64 kPrime[4] =
    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
/* Mask with the low 63 bits set, i.e. everything except the top bit. */
static const u64 bottom63bits = 0x7ffffffffffffffful;
 124
 125 /*
 126  * bin32_to_felem takes a little-endian byte array and converts it into felem
 127  * form. This assumes that the CPU is little-endian.
 128  */
 129 static void bin32_to_felem(felem out, const u8 in[32])
 130 {
 131     out[0] = *((u64 *)&in[0]);
 132     out[1] = *((u64 *)&in[8]);
 133     out[2] = *((u64 *)&in[16]);
 134     out[3] = *((u64 *)&in[24]);
 135 }
 136
 137 /*
 138  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 139  * endian, 32 byte array. This assumes that the CPU is little-endian.
 140  */
 141 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 142 {
 143     *((u64 *)&out[0]) = in[0];
 144     *((u64 *)&out[8]) = in[1];
 145     *((u64 *)&out[16]) = in[2];
 146     *((u64 *)&out[24]) = in[3];
 147 }
 148
 149 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 150 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 151 {
 152     unsigned i;
 153     for (i = 0; i < len; ++i)
 154         out[i] = in[len - 1 - i];
 155 }
 156
 157 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 158 static int BN_to_felem(felem out, const BIGNUM *bn)
 159 {
 160     felem_bytearray b_in;
 161     felem_bytearray b_out;
 162     unsigned num_bytes;
 163
 164     /* BN_bn2bin eats leading zeroes */
 165     memset(b_out, 0, sizeof(b_out));
 166     num_bytes = BN_num_bytes(bn);
 167     if (num_bytes > sizeof(b_out)) {
 168         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 169         return 0;
 170     }
 171     if (BN_is_negative(bn)) {
 172         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 173         return 0;
 174     }
 175     num_bytes = BN_bn2bin(bn, b_in);
 176     flip_endian(b_out, b_in, num_bytes);
 177     bin32_to_felem(out, b_out);
 178     return 1;
 179 }
 180
 181 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 182 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 183 {
 184     felem_bytearray b_in, b_out;
 185     smallfelem_to_bin32(b_in, in);
 186     flip_endian(b_out, b_in, sizeof(b_out));
 187     return BN_bin2bn(b_out, sizeof(b_out), out);
 188 }
 189
 190 /*-
 191  * Field operations
 192  * ----------------
 193  */
 194
 195 static void smallfelem_one(smallfelem out)
 196 {
 197     out[0] = 1;
 198     out[1] = 0;
 199     out[2] = 0;
 200     out[3] = 0;
 201 }
 202
 203 static void smallfelem_assign(smallfelem out, const smallfelem in)
 204 {
 205     out[0] = in[0];
 206     out[1] = in[1];
 207     out[2] = in[2];
 208     out[3] = in[3];
 209 }
 210
 211 static void felem_assign(felem out, const felem in)
 212 {
 213     out[0] = in[0];
 214     out[1] = in[1];
 215     out[2] = in[2];
 216     out[3] = in[3];
 217 }
 218
 219 /* felem_sum sets out = out + in. */
 220 static void felem_sum(felem out, const felem in)
 221 {
 222     out[0] += in[0];
 223     out[1] += in[1];
 224     out[2] += in[2];
 225     out[3] += in[3];
 226 }
 227
 228 /* felem_small_sum sets out = out + in. */
 229 static void felem_small_sum(felem out, const smallfelem in)
 230 {
 231     out[0] += in[0];
 232     out[1] += in[1];
 233     out[2] += in[2];
 234     out[3] += in[3];
 235 }
 236
 237 /* felem_scalar sets out = out * scalar */
 238 static void felem_scalar(felem out, const u64 scalar)
 239 {
 240     out[0] *= scalar;
 241     out[1] *= scalar;
 242     out[2] *= scalar;
 243     out[3] *= scalar;
 244 }
 245
 246 /* longfelem_scalar sets out = out * scalar */
 247 static void longfelem_scalar(longfelem out, const u64 scalar)
 248 {
 249     out[0] *= scalar;
 250     out[1] *= scalar;
 251     out[2] *= scalar;
 252     out[3] *= scalar;
 253     out[4] *= scalar;
 254     out[5] *= scalar;
 255     out[6] *= scalar;
 256     out[7] *= scalar;
 257 }
 258
 259 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 260 # define two105 (((limb)1) << 105)
 261 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 262
 263 /* zero105 is 0 mod p */
 264 static const felem zero105 =
 265     { two105m41m9, two105, two105m41p9, two105m41p9 };
 266
 267 /*-
 268  * smallfelem_neg sets |out| to |-small|
 269  * On exit:
 270  *   out[i] < out[i] + 2^105
 271  */
 272 static void smallfelem_neg(felem out, const smallfelem small)
 273 {
 274     /* In order to prevent underflow, we subtract from 0 mod p. */
 275     out[0] = zero105[0] - small[0];
 276     out[1] = zero105[1] - small[1];
 277     out[2] = zero105[2] - small[2];
 278     out[3] = zero105[3] - small[3];
 279 }
 280
 281 /*-
 282  * felem_diff subtracts |in| from |out|
 283  * On entry:
 284  *   in[i] < 2^104
 285  * On exit:
 286  *   out[i] < out[i] + 2^105
 287  */
 288 static void felem_diff(felem out, const felem in)
 289 {
 290     /*
 291      * In order to prevent underflow, we add 0 mod p before subtracting.
 292      */
 293     out[0] += zero105[0];
 294     out[1] += zero105[1];
 295     out[2] += zero105[2];
 296     out[3] += zero105[3];
 297
 298     out[0] -= in[0];
 299     out[1] -= in[1];
 300     out[2] -= in[2];
 301     out[3] -= in[3];
 302 }
 303
 304 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 305 # define two107 (((limb)1) << 107)
 306 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 307
 308 /* zero107 is 0 mod p */
 309 static const felem zero107 =
 310     { two107m43m11, two107, two107m43p11, two107m43p11 };
 311
 312 /*-
 313  * An alternative felem_diff for larger inputs |in|
 314  * felem_diff_zero107 subtracts |in| from |out|
 315  * On entry:
 316  *   in[i] < 2^106
 317  * On exit:
 318  *   out[i] < out[i] + 2^107
 319  */
 320 static void felem_diff_zero107(felem out, const felem in)
 321 {
 322     /*
 323      * In order to prevent underflow, we add 0 mod p before subtracting.
 324      */
 325     out[0] += zero107[0];
 326     out[1] += zero107[1];
 327     out[2] += zero107[2];
 328     out[3] += zero107[3];
 329
 330     out[0] -= in[0];
 331     out[1] -= in[1];
 332     out[2] -= in[2];
 333     out[3] -= in[3];
 334 }
 335
 336 /*-
 337  * longfelem_diff subtracts |in| from |out|
 338  * On entry:
 339  *   in[i] < 7*2^67
 340  * On exit:
 341  *   out[i] < out[i] + 2^70 + 2^40
 342  */
 343 static void longfelem_diff(longfelem out, const longfelem in)
 344 {
 345     static const limb two70m8p6 =
 346         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 347     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 348     static const limb two70 = (((limb) 1) << 70);
 349     static const limb two70m40m38p6 =
 350         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 351         (((limb) 1) << 6);
 352     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 353
 354     /* add 0 mod p to avoid underflow */
 355     out[0] += two70m8p6;
 356     out[1] += two70p40;
 357     out[2] += two70;
 358     out[3] += two70m40m38p6;
 359     out[4] += two70m6;
 360     out[5] += two70m6;
 361     out[6] += two70m6;
 362     out[7] += two70m6;
 363
 364     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 365     out[0] -= in[0];
 366     out[1] -= in[1];
 367     out[2] -= in[2];
 368     out[3] -= in[3];
 369     out[4] -= in[4];
 370     out[5] -= in[5];
 371     out[6] -= in[6];
 372     out[7] -= in[7];
 373 }
 374
 375 # define two64m0 (((limb)1) << 64) - 1
 376 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 377 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 378 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 379
 380 /* zero110 is 0 mod p */
 381 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 382
/*-
 * felem_shrink converts an felem into a smallfelem. The result isn't quite
 * minimal as the value may be greater than p.
 *
 * Outline: the limbs are first offset by zero110 (which is 0 mod p), the part
 * of limb 3 above 2^64 is folded back into limbs 3, 1 and 0, p is subtracted
 * once under a mask (no branch) when limb 3 is too large, and finally the
 * carries are propagated from limb 0 up to limb 3.
 *
 * On entry:
 *   in[i] < 2^109
 * On exit:
 *   out[i] < 2^64
 */
static void felem_shrink(smallfelem out, const felem in)
{
    felem tmp;
    u64 a, b, mask;
    u64 high, low;
    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */

    /* Carry 2->3 */
    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
    /* tmp[3] < 2^110 */

    tmp[2] = zero110[2] + (u64)in[2];
    tmp[0] = zero110[0] + in[0];
    tmp[1] = zero110[1] + in[1];
    /* tmp[0] < 2^110, tmp[1] < 2^111, tmp[2] < 2^65 */

    /*
     * We perform two partial reductions where we eliminate the high-word of
     * tmp[3]. We don't update the other words till the end.
     *
     * A high word |a| of tmp[3] stands for a*2^256, and
     * 2^256 = 2^224 - 2^192 - 2^96 + 1 (mod p), so it is replaced by
     * +a*2^32 and -a in limb 3 now, and by +a in limb 0 and -a*2^32 in
     * limb 1 later (accumulated in |b|).
     */
    a = tmp[3] >> 64;           /* a < 2^46 */
    tmp[3] = (u64)tmp[3];
    tmp[3] -= a;
    tmp[3] += ((limb) a) << 32;
    /* tmp[3] < 2^79 */

    b = a;
    a = tmp[3] >> 64;           /* a < 2^15 */
    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
    tmp[3] = (u64)tmp[3];
    tmp[3] -= a;
    tmp[3] += ((limb) a) << 32;
    /* tmp[3] < 2^64 + 2^47 */

    /*
     * This adjusts the other two words to complete the two partial
     * reductions.
     */
    tmp[0] += b;
    tmp[1] -= (((limb) b) << 32);

    /*
     * In order to make space in tmp[3] for the carry from 2 -> 3, we
     * conditionally subtract kPrime if tmp[3] is large enough.
     */
    high = (u64)(tmp[3] >> 64);
    /* As tmp[3] < 2^65, high is either 1 or 0 */
    high = 0 - high;
    /*-
     * high is:
     *   all ones   if the high word of tmp[3] is 1
     *   all zeros  if the high word of tmp[3] is 0
     */
    low = (u64)tmp[3];
    mask = 0 - (low >> 63);
    /*-
     * mask is:
     *   all ones   if the MSB of low is 1
     *   all zeros  if the MSB of low is 0
     */
    low &= bottom63bits;
    low -= kPrime3Test;
    /* if low was greater than kPrime3Test then the MSB is zero */
    low = ~low;
    low = 0 - (low >> 63);
    /*-
     * low is:
     *   all ones   if low was > kPrime3Test
     *   all zeros  if low was <= kPrime3Test
     */
    mask = (mask & low) | high;
    /* mask is now all ones exactly when kPrime has to be subtracted */
    tmp[0] -= mask & kPrime[0];
    tmp[1] -= mask & kPrime[1];
    /* kPrime[2] is zero, so omitted */
    tmp[3] -= mask & kPrime[3];
    /* tmp[3] < 2^64 - 2^32 + 1 */

    /* Propagate carries upwards, truncating each limb to 64 bits. */
    tmp[1] += ((u64)(tmp[0] >> 64));
    tmp[0] = (u64)tmp[0];
    tmp[2] += ((u64)(tmp[1] >> 64));
    tmp[1] = (u64)tmp[1];
    tmp[3] += ((u64)(tmp[2] >> 64));
    tmp[2] = (u64)tmp[2];
    /* tmp[i] < 2^64 */

    out[0] = tmp[0];
    out[1] = tmp[1];
    out[2] = tmp[2];
    out[3] = tmp[3];
}
 482
 483 /* smallfelem_expand converts a smallfelem to an felem */
 484 static void smallfelem_expand(felem out, const smallfelem in)
 485 {
 486     out[0] = in[0];
 487     out[1] = in[1];
 488     out[2] = in[2];
 489     out[3] = in[3];
 490 }
 491
 492 /*-
 493  * smallfelem_square sets |out| = |small|^2
 494  * On entry:
 495  *   small[i] < 2^64
 496  * On exit:
 497  *   out[i] < 7 * 2^64 < 2^67
 498  */
 499 static void smallfelem_square(longfelem out, const smallfelem small)
 500 {
 501     limb a;
 502     u64 high, low;
 503
 504     a = ((uint128_t) small[0]) * small[0];
 505     low = a;
 506     high = a >> 64;
 507     out[0] = low;
 508     out[1] = high;
 509
 510     a = ((uint128_t) small[0]) * small[1];
 511     low = a;
 512     high = a >> 64;
 513     out[1] += low;
 514     out[1] += low;
 515     out[2] = high;
 516
 517     a = ((uint128_t) small[0]) * small[2];
 518     low = a;
 519     high = a >> 64;
 520     out[2] += low;
 521     out[2] *= 2;
 522     out[3] = high;
 523
 524     a = ((uint128_t) small[0]) * small[3];
 525     low = a;
 526     high = a >> 64;
 527     out[3] += low;
 528     out[4] = high;
 529
 530     a = ((uint128_t) small[1]) * small[2];
 531     low = a;
 532     high = a >> 64;
 533     out[3] += low;
 534     out[3] *= 2;
 535     out[4] += high;
 536
 537     a = ((uint128_t) small[1]) * small[1];
 538     low = a;
 539     high = a >> 64;
 540     out[2] += low;
 541     out[3] += high;
 542
 543     a = ((uint128_t) small[1]) * small[3];
 544     low = a;
 545     high = a >> 64;
 546     out[4] += low;
 547     out[4] *= 2;
 548     out[5] = high;
 549
 550     a = ((uint128_t) small[2]) * small[3];
 551     low = a;
 552     high = a >> 64;
 553     out[5] += low;
 554     out[5] *= 2;
 555     out[6] = high;
 556     out[6] += high;
 557
 558     a = ((uint128_t) small[2]) * small[2];
 559     low = a;
 560     high = a >> 64;
 561     out[4] += low;
 562     out[5] += high;
 563
 564     a = ((uint128_t) small[3]) * small[3];
 565     low = a;
 566     high = a >> 64;
 567     out[6] += low;
 568     out[7] = high;
 569 }
 570
 571 /*-
 572  * felem_square sets |out| = |in|^2
 573  * On entry:
 574  *   in[i] < 2^109
 575  * On exit:
 576  *   out[i] < 7 * 2^64 < 2^67
 577  */
 578 static void felem_square(longfelem out, const felem in)
 579 {
 580     u64 small[4];
 581     felem_shrink(small, in);
 582     smallfelem_square(out, small);
 583 }
 584
 585 /*-
 586  * smallfelem_mul sets |out| = |small1| * |small2|
 587  * On entry:
 588  *   small1[i] < 2^64
 589  *   small2[i] < 2^64
 590  * On exit:
 591  *   out[i] < 7 * 2^64 < 2^67
 592  */
 593 static void smallfelem_mul(longfelem out, const smallfelem small1,
 594                            const smallfelem small2)
 595 {
 596     limb a;
 597     u64 high, low;
 598
 599     a = ((uint128_t) small1[0]) * small2[0];
 600     low = a;
 601     high = a >> 64;
 602     out[0] = low;
 603     out[1] = high;
 604
 605     a = ((uint128_t) small1[0]) * small2[1];
 606     low = a;
 607     high = a >> 64;
 608     out[1] += low;
 609     out[2] = high;
 610
 611     a = ((uint128_t) small1[1]) * small2[0];
 612     low = a;
 613     high = a >> 64;
 614     out[1] += low;
 615     out[2] += high;
 616
 617     a = ((uint128_t) small1[0]) * small2[2];
 618     low = a;
 619     high = a >> 64;
 620     out[2] += low;
 621     out[3] = high;
 622
 623     a = ((uint128_t) small1[1]) * small2[1];
 624     low = a;
 625     high = a >> 64;
 626     out[2] += low;
 627     out[3] += high;
 628
 629     a = ((uint128_t) small1[2]) * small2[0];
 630     low = a;
 631     high = a >> 64;
 632     out[2] += low;
 633     out[3] += high;
 634
 635     a = ((uint128_t) small1[0]) * small2[3];
 636     low = a;
 637     high = a >> 64;
 638     out[3] += low;
 639     out[4] = high;
 640
 641     a = ((uint128_t) small1[1]) * small2[2];
 642     low = a;
 643     high = a >> 64;
 644     out[3] += low;
 645     out[4] += high;
 646
 647     a = ((uint128_t) small1[2]) * small2[1];
 648     low = a;
 649     high = a >> 64;
 650     out[3] += low;
 651     out[4] += high;
 652
 653     a = ((uint128_t) small1[3]) * small2[0];
 654     low = a;
 655     high = a >> 64;
 656     out[3] += low;
 657     out[4] += high;
 658
 659     a = ((uint128_t) small1[1]) * small2[3];
 660     low = a;
 661     high = a >> 64;
 662     out[4] += low;
 663     out[5] = high;
 664
 665     a = ((uint128_t) small1[2]) * small2[2];
 666     low = a;
 667     high = a >> 64;
 668     out[4] += low;
 669     out[5] += high;
 670
 671     a = ((uint128_t) small1[3]) * small2[1];
 672     low = a;
 673     high = a >> 64;
 674     out[4] += low;
 675     out[5] += high;
 676
 677     a = ((uint128_t) small1[2]) * small2[3];
 678     low = a;
 679     high = a >> 64;
 680     out[5] += low;
 681     out[6] = high;
 682
 683     a = ((uint128_t) small1[3]) * small2[2];
 684     low = a;
 685     high = a >> 64;
 686     out[5] += low;
 687     out[6] += high;
 688
 689     a = ((uint128_t) small1[3]) * small2[3];
 690     low = a;
 691     high = a >> 64;
 692     out[6] += low;
 693     out[7] = high;
 694 }
 695
 696 /*-
 697  * felem_mul sets |out| = |in1| * |in2|
 698  * On entry:
 699  *   in1[i] < 2^109
 700  *   in2[i] < 2^109
 701  * On exit:
 702  *   out[i] < 7 * 2^64 < 2^67
 703  */
 704 static void felem_mul(longfelem out, const felem in1, const felem in2)
 705 {
 706     smallfelem small1, small2;
 707     felem_shrink(small1, in1);
 708     felem_shrink(small2, in2);
 709     smallfelem_mul(out, small1, small2);
 710 }
 711
 712 /*-
 713  * felem_small_mul sets |out| = |small1| * |in2|
 714  * On entry:
 715  *   small1[i] < 2^64
 716  *   in2[i] < 2^109
 717  * On exit:
 718  *   out[i] < 7 * 2^64 < 2^67
 719  */
 720 static void felem_small_mul(longfelem out, const smallfelem small1,
 721                             const felem in2)
 722 {
 723     smallfelem small2;
 724     felem_shrink(small2, in2);
 725     smallfelem_mul(out, small1, small2);
 726 }
 727
 728 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 729 # define two100 (((limb)1) << 100)
 730 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 731 /* zero100 is 0 mod p */
 732 static const felem zero100 =
 733     { two100m36m4, two100, two100m36p4, two100m36p4 };
 734
 735 /*-
 736  * Internal function for the different flavours of felem_reduce.
 737  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 738  * On entry:
 739  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 740  *   out[1] >= in[7] + 2^32*in[4]
 741  *   out[2] >= in[5] + 2^32*in[5]
 742  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 743  * On exit:
 744  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 745  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 746  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 747  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 748  */
 749 static void felem_reduce_(felem out, const longfelem in)
 750 {
 751     int128_t c;
 752     /* combine common terms from below */
 753     c = in[4] + (in[5] << 32);
 754     out[0] += c;
 755     out[3] -= c;
 756
 757     c = in[5] - in[7];
 758     out[1] += c;
 759     out[2] -= c;
 760
 761     /* the remaining terms */
 762     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 763     out[1] -= (in[4] << 32);
 764     out[3] += (in[4] << 32);
 765
 766     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 767     out[2] -= (in[5] << 32);
 768
 769     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 770     out[0] -= in[6];
 771     out[0] -= (in[6] << 32);
 772     out[1] += (in[6] << 33);
 773     out[2] += (in[6] * 2);
 774     out[3] -= (in[6] << 32);
 775
 776     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 777     out[0] -= in[7];
 778     out[0] -= (in[7] << 32);
 779     out[2] += (in[7] << 33);
 780     out[3] += (in[7] * 3);
 781 }
 782
 783 /*-
 784  * felem_reduce converts a longfelem into an felem.
 785  * To be called directly after felem_square or felem_mul.
 786  * On entry:
 787  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 788  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 789  * On exit:
 790  *   out[i] < 2^101
 791  */
 792 static void felem_reduce(felem out, const longfelem in)
 793 {
 794     out[0] = zero100[0] + in[0];
 795     out[1] = zero100[1] + in[1];
 796     out[2] = zero100[2] + in[2];
 797     out[3] = zero100[3] + in[3];
 798
 799     felem_reduce_(out, in);
 800
 801     /*-
 802      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 803      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 804      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 805      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 806      *
 807      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 808      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 809      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 810      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 811      */
 812 }
 813
 814 /*-
 815  * felem_reduce_zero105 converts a larger longfelem into an felem.
 816  * On entry:
 817  *   in[0] < 2^71
 818  * On exit:
 819  *   out[i] < 2^106
 820  */
 821 static void felem_reduce_zero105(felem out, const longfelem in)
 822 {
 823     out[0] = zero105[0] + in[0];
 824     out[1] = zero105[1] + in[1];
 825     out[2] = zero105[2] + in[2];
 826     out[3] = zero105[3] + in[3];
 827
 828     felem_reduce_(out, in);
 829
 830     /*-
 831      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 832      * out[1] > 2^105 - 2^71 - 2^103 > 0
 833      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 834      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 835      *
 836      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 837      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 838      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 839      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 840      */
 841 }
 842
 843 /*
 844  * subtract_u64 sets *result = *result - v and *carry to one if the
 845  * subtraction underflowed.
 846  */
 847 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 848 {
 849     uint128_t r = *result;
 850     r -= v;
 851     *carry = (r >> 64) & 1;
 852     *result = (u64)r;
 853 }
 854
/*
 * felem_contract converts |in| to its unique, minimal representation: |in| is
 * shrunk into |out|, |out| is compared against kPrime using masks rather than
 * branches, and kPrime is subtracted once if out >= kPrime.
 * On entry:
 *   in[i] < 2^109
 */
static void felem_contract(smallfelem out, const felem in)
{
    unsigned i;
    u64 all_equal_so_far = 0, result = 0, carry;

    felem_shrink(out, in);
    /* out is minimal except that the value might be > p */

    /* 0 - 1 wraps around: all_equal_so_far starts out as all ones */
    all_equal_so_far--;
    /*
     * We are doing a constant time test if out >= kPrime. We need to compare
     * each u64, from most-significant to least significant. For each one, if
     * all words so far have been equal (all_equal_so_far is all ones) then a
     * non-equal result is the answer. Otherwise we continue.
     *
     * |i| is unsigned, so decrementing it past zero wraps to a large value
     * and the |i < 4| test ends the loop after the i == 0 iteration.
     */
    for (i = 3; i < 4; i--) {
        u64 equal;
        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
        /*
         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
         * will all be set.
         */
        result |= all_equal_so_far & ((u64)(a >> 64));

        /*
         * if kPrime[i] == out[i] then |equal| will be all zeros and the
         * decrement will make it all ones.
         */
        equal = kPrime[i] ^ out[i];
        equal--;
        /* AND all 64 bits together into the top bit */
        equal &= equal << 32;
        equal &= equal << 16;
        equal &= equal << 8;
        equal &= equal << 4;
        equal &= equal << 2;
        equal &= equal << 1;
        /* spread the top bit over the whole word */
        equal = 0 - (equal >> 63);

        all_equal_so_far &= equal;
    }

    /*
     * if all_equal_so_far is still all ones then the two values are equal
     * and so out >= kPrime is true.
     */
    result |= all_equal_so_far;

    /*
     * if out >= kPrime then we subtract kPrime. |result| is all ones or all
     * zeros, so the subtrahend is either the kPrime word or zero. After each
     * word is subtracted the borrow is rippled through the higher words.
     */
    subtract_u64(&out[0], &carry, result & kPrime[0]);
    subtract_u64(&out[1], &carry, carry);
    subtract_u64(&out[2], &carry, carry);
    subtract_u64(&out[3], &carry, carry);

    subtract_u64(&out[1], &carry, result & kPrime[1]);
    subtract_u64(&out[2], &carry, carry);
    subtract_u64(&out[3], &carry, carry);

    subtract_u64(&out[2], &carry, result & kPrime[2]);
    subtract_u64(&out[3], &carry, carry);

    subtract_u64(&out[3], &carry, result & kPrime[3]);
}
 921
 922 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 923 {
 924     longfelem longtmp;
 925     felem tmp;
 926
 927     smallfelem_square(longtmp, in);
 928     felem_reduce(tmp, longtmp);
 929     felem_contract(out, tmp);
 930 }
 931
 932 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 933                                     const smallfelem in2)
 934 {
 935     longfelem longtmp;
 936     felem tmp;
 937
 938     smallfelem_mul(longtmp, in1, in2);
 939     felem_reduce(tmp, longtmp);
 940     felem_contract(out, tmp);
 941 }
 942
 943 /*-
 944  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 945  * otherwise.
 946  * On entry:
 947  *   small[i] < 2^64
 948  */
 949 static limb smallfelem_is_zero(const smallfelem small)
 950 {
 951     limb result;
 952     u64 is_p;
 953
 954     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 955     is_zero--;
 956     is_zero &= is_zero << 32;
 957     is_zero &= is_zero << 16;
 958     is_zero &= is_zero << 8;
 959     is_zero &= is_zero << 4;
 960     is_zero &= is_zero << 2;
 961     is_zero &= is_zero << 1;
 962     is_zero = 0 - (is_zero >> 63);
 963
 964     is_p = (small[0] ^ kPrime[0]) |
 965         (small[1] ^ kPrime[1]) |
 966         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 967     is_p--;
 968     is_p &= is_p << 32;
 969     is_p &= is_p << 16;
 970     is_p &= is_p << 8;
 971     is_p &= is_p << 4;
 972     is_p &= is_p << 2;
 973     is_p &= is_p << 1;
 974     is_p = 0 - (is_p >> 63);
 975
 976     is_zero |= is_p;
 977
 978     result = is_zero;
 979     result |= ((limb) is_zero) << 64;
 980     return result;
 981 }
 982
 983 static int smallfelem_is_zero_int(const void *small)
 984 {
 985     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 986 }
 987
 988 /*-
 989  * felem_inv calculates |out| = |in|^{-1}
 990  *
 991  * Based on Fermat's Little Theorem:
 992  *   a^p = a (mod p)
 993  *   a^{p-1} = 1 (mod p)
 994  *   a^{p-2} = a^{-1} (mod p)
      *
      * The exponent p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 is reached with a
      * fixed sequence of squarings and multiplications, so the sequence of
      * operations does not depend on the value of |in|.
      *
      * The comment that trails each felem_reduce gives the exponent e for which
      * the value just written equals |in|^e. Squaring doubles the exponent and
      * multiplying adds the exponents of the two operands.
      *
      * The high part of the exponent (2^256 - 2^224 + 2^192) is accumulated in
      * ftmp and the low part (2^96 - 3) in ftmp2; the final multiplication
      * combines them into |out|.
 995  */
 996 static void felem_inv(felem out, const felem in)
 997 {
 998     felem ftmp, ftmp2;
 999     /*
          * e2, e4, e8, e16 and e32 hold |in|^{2^I - 1}. e64 is the exception: it
          * is saved before the closing multiplication and holds
          * |in|^{2^64 - 2^32}; it is multiplied by e32 further down to obtain
          * |in|^{2^64 - 1}.
          */
1000     felem e2, e4, e8, e16, e32, e64;
1001     longfelem tmp;
1002     unsigned i;
1003
1004     felem_square(tmp, in);
1005     felem_reduce(ftmp, tmp);    /* 2^1 */
1006     felem_mul(tmp, in, ftmp);
1007     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1008     felem_assign(e2, ftmp);
1009     felem_square(tmp, ftmp);
1010     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1011     felem_square(tmp, ftmp);
1012     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1013     felem_mul(tmp, ftmp, e2);
1014     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1015     felem_assign(e4, ftmp);
1016     felem_square(tmp, ftmp);
1017     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1018     felem_square(tmp, ftmp);
1019     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1020     felem_square(tmp, ftmp);
1021     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1022     felem_square(tmp, ftmp);
1023     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1024     felem_mul(tmp, ftmp, e4);
1025     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1026     felem_assign(e8, ftmp);
1027     for (i = 0; i < 8; i++) {
1028         felem_square(tmp, ftmp);
1029         felem_reduce(ftmp, tmp);
1030     }                           /* 2^16 - 2^8 */
1031     felem_mul(tmp, ftmp, e8);
1032     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1033     felem_assign(e16, ftmp);
1034     for (i = 0; i < 16; i++) {
1035         felem_square(tmp, ftmp);
1036         felem_reduce(ftmp, tmp);
1037     }                           /* 2^32 - 2^16 */
1038     felem_mul(tmp, ftmp, e16);
1039     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1040     felem_assign(e32, ftmp);
1041     for (i = 0; i < 32; i++) {
1042         felem_square(tmp, ftmp);
1043         felem_reduce(ftmp, tmp);
1044     }                           /* 2^64 - 2^32 */
1045     felem_assign(e64, ftmp);
1046     felem_mul(tmp, ftmp, in);
1047     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1048     for (i = 0; i < 192; i++) {
1049         felem_square(tmp, ftmp);
1050         felem_reduce(ftmp, tmp);
1051     }                           /* 2^256 - 2^224 + 2^192 */
1052
         /* ftmp now holds the high part; build the low part 2^96 - 3 in ftmp2 */
1053     felem_mul(tmp, e64, e32);
1054     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1055     for (i = 0; i < 16; i++) {
1056         felem_square(tmp, ftmp2);
1057         felem_reduce(ftmp2, tmp);
1058     }                           /* 2^80 - 2^16 */
1059     felem_mul(tmp, ftmp2, e16);
1060     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1061     for (i = 0; i < 8; i++) {
1062         felem_square(tmp, ftmp2);
1063         felem_reduce(ftmp2, tmp);
1064     }                           /* 2^88 - 2^8 */
1065     felem_mul(tmp, ftmp2, e8);
1066     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1067     for (i = 0; i < 4; i++) {
1068         felem_square(tmp, ftmp2);
1069         felem_reduce(ftmp2, tmp);
1070     }                           /* 2^92 - 2^4 */
1071     felem_mul(tmp, ftmp2, e4);
1072     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1073     felem_square(tmp, ftmp2);
1074     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1075     felem_square(tmp, ftmp2);
1076     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1077     felem_mul(tmp, ftmp2, e2);
1078     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1079     felem_square(tmp, ftmp2);
1080     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1081     felem_square(tmp, ftmp2);
1082     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1083     felem_mul(tmp, ftmp2, in);
1084     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1085
         /* combine the high part (ftmp) and the low part (ftmp2) */
1086     felem_mul(tmp, ftmp2, ftmp);
1087     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1088 }
1089
1090 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1091 {
1092     felem tmp;
1093
1094     smallfelem_expand(tmp, in);
1095     felem_inv(tmp, tmp);
1096     felem_contract(out, tmp);
1097 }
1098
1099 /*-
1100  * Group operations
1101  * ----------------
1102  *
1103  * Building on top of the field operations we have the operations on the
1104  * elliptic curve group itself. Points on the curve are represented in Jacobian
1105  * coordinates
1106  */
1107
1108 /*-
1109  * point_double calculates 2*(x_in, y_in, z_in)
1110  *
1111  * The method is taken from:
1112  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1113  *
1114  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1115  * while x_out == y_in is not (maybe this works, but it's not tested).
      *
      * Intermediate values, in the order they are computed:
      *   delta = z^2
      *   gamma = y^2
      *   beta  = x*gamma
      *   alpha = 3*(x - delta)*(x + delta)
      * from which
      *   x' = alpha^2 - 8*beta
      *   z' = (y + z)^2 - gamma - delta
      *   y' = alpha*(4*beta - x') - 8*gamma^2
      *
      * x_in is copied into ftmp and ftmp2 before anything is written to x_out,
      * and y_in and z_in are last read before z_out is written, which is what
      * allows each output to alias its corresponding input.
      *
      * Comments of the form "v[i] < 2^k" record the per-limb upper bound of
      * the value that has just been computed.
1116  */
1117 static void
1118 point_double(felem x_out, felem y_out, felem z_out,
1119              const felem x_in, const felem y_in, const felem z_in)
1120 {
1121     longfelem tmp, tmp2;
1122     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1123     smallfelem small1, small2;
1124
1125     felem_assign(ftmp, x_in);
1126     /* ftmp[i] < 2^106 */
1127     felem_assign(ftmp2, x_in);
1128     /* ftmp2[i] < 2^106 */
1129
1130     /* delta = z^2 */
1131     felem_square(tmp, z_in);
1132     felem_reduce(delta, tmp);
1133     /* delta[i] < 2^101 */
1134
1135     /* gamma = y^2 */
1136     felem_square(tmp, y_in);
1137     felem_reduce(gamma, tmp);
1138     /* gamma[i] < 2^101 */
1139     felem_shrink(small1, gamma);
1140
1141     /* beta = x*gamma */
1142     felem_small_mul(tmp, small1, x_in);
1143     felem_reduce(beta, tmp);
1144     /* beta[i] < 2^101 */
1145
1146     /* alpha = 3*(x-delta)*(x+delta) */
1147     felem_diff(ftmp, delta);
1148     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1149     felem_sum(ftmp2, delta);
1150     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1151     felem_scalar(ftmp2, 3);
1152     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1153     felem_mul(tmp, ftmp, ftmp2);
1154     felem_reduce(alpha, tmp);
1155     /* alpha[i] < 2^101 */
1156     felem_shrink(small2, alpha);
1157
1158     /* x' = alpha^2 - 8*beta */
1159     smallfelem_square(tmp, small2);
1160     felem_reduce(x_out, tmp);
1161     felem_assign(ftmp, beta);
1162     felem_scalar(ftmp, 8);
1163     /* ftmp[i] < 8 * 2^101 = 2^104 */
1164     felem_diff(x_out, ftmp);
1165     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1166
1167     /* z' = (y + z)^2 - gamma - delta */
1168     felem_sum(delta, gamma);
1169     /* delta[i] < 2^101 + 2^101 = 2^102 */
1170     felem_assign(ftmp, y_in);
1171     felem_sum(ftmp, z_in);
1172     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1173     felem_square(tmp, ftmp);
1174     felem_reduce(z_out, tmp);
1175     felem_diff(z_out, delta);
1176     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1177
1178     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1179     felem_scalar(beta, 4);
1180     /* beta[i] < 4 * 2^101 = 2^103 */
1181     felem_diff_zero107(beta, x_out);
1182     /* beta[i] < 2^107 + 2^103 < 2^108 */
1183     felem_small_mul(tmp, small2, beta);
1184     /* tmp[i] < 7 * 2^64 < 2^67 */
1185     smallfelem_square(tmp2, small1);
1186     /* tmp2[i] < 7 * 2^64 */
1187     longfelem_scalar(tmp2, 8);
1188     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1189     longfelem_diff(tmp, tmp2);
1190     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1191     felem_reduce_zero105(y_out, tmp);
1192     /* y_out[i] < 2^106 */
1193 }
1194
1195 /*
1196  * point_double_small is the same as point_double, except that it operates on
1197  * smallfelems
1198  */
1199 static void
1200 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1201                    const smallfelem x_in, const smallfelem y_in,
1202                    const smallfelem z_in)
1203 {
1204     felem felem_x_out, felem_y_out, felem_z_out;
1205     felem felem_x_in, felem_y_in, felem_z_in;
1206
1207     smallfelem_expand(felem_x_in, x_in);
1208     smallfelem_expand(felem_y_in, y_in);
1209     smallfelem_expand(felem_z_in, z_in);
1210     point_double(felem_x_out, felem_y_out, felem_z_out,
1211                  felem_x_in, felem_y_in, felem_z_in);
1212     felem_shrink(x_out, felem_x_out);
1213     felem_shrink(y_out, felem_y_out);
1214     felem_shrink(z_out, felem_z_out);
1215 }
1216
1217 /* copy_conditional copies in to out iff mask is all ones. */
1218 static void copy_conditional(felem out, const felem in, limb mask)
1219 {
1220     unsigned i;
1221     for (i = 0; i < NLIMBS; ++i) {
1222         const limb tmp = mask & (in[i] ^ out[i]);
1223         out[i] ^= tmp;
1224     }
1225 }
1226
1227 /* copy_small_conditional copies in to out iff mask is all ones. */
1228 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1229 {
1230     unsigned i;
1231     const u64 mask64 = mask;
1232     for (i = 0; i < NLIMBS; ++i) {
1233         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1234     }
1235 }
1236
1237 /*-
1238  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1239  *
1240  * The method is taken from:
1241  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1242  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1243  *
1244  * This function includes a branch for checking whether the two input points
1245  * are equal, (while not equal to the point at infinity). This case never
1246  * happens during single point multiplication, so there is no timing leak for
1247  * ECDH or ECDSA signing.
      *
      * The first point is passed as felems and the second as smallfelems; the
      * sum is written to (x3, y3, z3). When |mixed| is non-zero the products
      * that involve z2 are skipped and z2 = 1 is assumed; z2 = 0 is still
      * handled by the conditional copies at the end.
      *
      * Outside the doubling branch the result is built in the locals x_out,
      * y_out and z_out and only copied to x3, y3 and z3 at the very end, so the
      * outputs may alias (x1, y1, z1).
      *
      * If z1 is zero the result is (x2, y2, z2); if z2 is zero the result is
      * (x1, y1, z1). Both selections are made with masked copies, not branches.
1248  */
1249 static void point_add(felem x3, felem y3, felem z3,
1250                       const felem x1, const felem y1, const felem z1,
1251                       const int mixed, const smallfelem x2,
1252                       const smallfelem y2, const smallfelem z2)
1253 {
1254     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1255     longfelem tmp, tmp2;
1256     smallfelem small1, small2, small3, small4, small5;
1257     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1258
1259     felem_shrink(small3, z1);
1260
         /* masks: all ones when the corresponding z coordinate is zero */
1261     z1_is_zero = smallfelem_is_zero(small3);
1262     z2_is_zero = smallfelem_is_zero(z2);
1263
1264     /* ftmp = z1z1 = z1**2 */
1265     smallfelem_square(tmp, small3);
1266     felem_reduce(ftmp, tmp);
1267     /* ftmp[i] < 2^101 */
1268     felem_shrink(small1, ftmp);
1269
1270     if (!mixed) {
1271         /* ftmp2 = z2z2 = z2**2 */
1272         smallfelem_square(tmp, z2);
1273         felem_reduce(ftmp2, tmp);
1274         /* ftmp2[i] < 2^101 */
1275         felem_shrink(small2, ftmp2);
1276
1277         felem_shrink(small5, x1);
1278
1279         /* u1 = ftmp3 = x1*z2z2 */
1280         smallfelem_mul(tmp, small5, small2);
1281         felem_reduce(ftmp3, tmp);
1282         /* ftmp3[i] < 2^101 */
1283
1284         /* ftmp5 = z1 + z2 */
1285         felem_assign(ftmp5, z1);
1286         felem_small_sum(ftmp5, z2);
1287         /* ftmp5[i] < 2^107 */
1288
1289         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1290         felem_square(tmp, ftmp5);
1291         felem_reduce(ftmp5, tmp);
1292         /* ftmp2 = z2z2 + z1z1 */
1293         felem_sum(ftmp2, ftmp);
1294         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1295         felem_diff(ftmp5, ftmp2);
1296         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1297
1298         /* ftmp2 = z2 * z2z2 */
1299         smallfelem_mul(tmp, small2, z2);
1300         felem_reduce(ftmp2, tmp);
1301
1302         /* s1 = ftmp6 = y1 * z2**3 */
1303         felem_mul(tmp, y1, ftmp2);
1304         felem_reduce(ftmp6, tmp);
1305         /* ftmp6[i] < 2^101 */
1306     } else {
1307         /*
1308          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1309          */
1310
1311         /* u1 = ftmp3 = x1*z2z2 */
1312         felem_assign(ftmp3, x1);
1313         /* ftmp3[i] < 2^106 */
1314
1315         /* ftmp5 = 2z1z2 */
1316         felem_assign(ftmp5, z1);
1317         felem_scalar(ftmp5, 2);
1318         /* ftmp5[i] < 2*2^106 = 2^107 */
1319
1320         /* s1 = ftmp6 = y1 * z2**3 */
1321         felem_assign(ftmp6, y1);
1322         /* ftmp6[i] < 2^106 */
1323     }
1324
1325     /* u2 = ftmp4 = x2*z1z1 */
1326     smallfelem_mul(tmp, x2, small1);
1327     felem_reduce(ftmp4, tmp);
1328
1329     /* h = ftmp4 = u2 - u1 */
1330     felem_diff_zero107(ftmp4, ftmp3);
1331     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1332     felem_shrink(small4, ftmp4);
1333
         /* x_equal is non-zero when h == 0, i.e. u1 == u2 */
1334     x_equal = smallfelem_is_zero(small4);
1335
1336     /* z_out = ftmp5 * h */
1337     felem_small_mul(tmp, small4, ftmp5);
1338     felem_reduce(z_out, tmp);
1339     /* z_out[i] < 2^101 */
1340
1341     /* ftmp = z1 * z1z1 */
1342     smallfelem_mul(tmp, small1, small3);
1343     felem_reduce(ftmp, tmp);
1344
1345     /* s2 = ftmp5 = y2 * z1**3 */
1346     felem_small_mul(tmp, y2, ftmp);
1347     felem_reduce(ftmp5, tmp);
1348
1349     /* r = ftmp5 = (s2 - s1)*2 */
1350     felem_diff_zero107(ftmp5, ftmp6);
1351     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1352     felem_scalar(ftmp5, 2);
1353     /* ftmp5[i] < 2^109 */
1354     felem_shrink(small1, ftmp5);
         /* y_equal is non-zero when r == 0, i.e. s1 == s2; small1 now holds r */
1355     y_equal = smallfelem_is_zero(small1);
1356
         /*
          * Both inputs are the same point and neither is the point at infinity:
          * double the first point instead. This is the branch mentioned in the
          * header comment.
          */
1357     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1358         point_double(x3, y3, z3, x1, y1, z1);
1359         return;
1360     }
1361
1362     /* I = ftmp = (2h)**2 */
1363     felem_assign(ftmp, ftmp4);
1364     felem_scalar(ftmp, 2);
1365     /* ftmp[i] < 2*2^108 = 2^109 */
1366     felem_square(tmp, ftmp);
1367     felem_reduce(ftmp, tmp);
1368
1369     /* J = ftmp2 = h * I */
1370     felem_mul(tmp, ftmp4, ftmp);
1371     felem_reduce(ftmp2, tmp);
1372
1373     /* V = ftmp4 = U1 * I */
1374     felem_mul(tmp, ftmp3, ftmp);
1375     felem_reduce(ftmp4, tmp);
1376
1377     /* x_out = r**2 - J - 2V */
1378     smallfelem_square(tmp, small1);
1379     felem_reduce(x_out, tmp);
         /* keep a copy of V in ftmp3 before ftmp4 is turned into 2V + J */
1380     felem_assign(ftmp3, ftmp4);
1381     felem_scalar(ftmp4, 2);
1382     felem_sum(ftmp4, ftmp2);
1383     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1384     felem_diff(x_out, ftmp4);
1385     /* x_out[i] < 2^105 + 2^101 */
1386
1387     /* y_out = r(V-x_out) - 2 * s1 * J */
1388     felem_diff_zero107(ftmp3, x_out);
1389     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1390     felem_small_mul(tmp, small1, ftmp3);
1391     felem_mul(tmp2, ftmp6, ftmp2);
1392     longfelem_scalar(tmp2, 2);
1393     /* tmp2[i] < 2*2^67 = 2^68 */
1394     longfelem_diff(tmp, tmp2);
1395     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1396     felem_reduce_zero105(y_out, tmp);
1397     /* y_out[i] < 2^106 */
1398
         /*
          * Point at infinity handling: when z1 is zero the computed result is
          * replaced by (x2, y2, z2), and when z2 is zero it is replaced by
          * (x1, y1, z1). The copies for z2 run second, so they win if both
          * masks are set.
          */
1399     copy_small_conditional(x_out, x2, z1_is_zero);
1400     copy_conditional(x_out, x1, z2_is_zero);
1401     copy_small_conditional(y_out, y2, z1_is_zero);
1402     copy_conditional(y_out, y1, z2_is_zero);
1403     copy_small_conditional(z_out, z2, z1_is_zero);
1404     copy_conditional(z_out, z1, z2_is_zero);
1405     felem_assign(x3, x_out);
1406     felem_assign(y3, y_out);
1407     felem_assign(z3, z_out);
1408 }
1409
1410 /*
1411  * point_add_small is the same as point_add, except that it operates on
1412  * smallfelems
1413  */
1414 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1415                             smallfelem x1, smallfelem y1, smallfelem z1,
1416                             smallfelem x2, smallfelem y2, smallfelem z2)
1417 {
1418     felem felem_x3, felem_y3, felem_z3;
1419     felem felem_x1, felem_y1, felem_z1;
1420     smallfelem_expand(felem_x1, x1);
1421     smallfelem_expand(felem_y1, y1);
1422     smallfelem_expand(felem_z1, z1);
1423     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1424               x2, y2, z2);
1425     felem_shrink(x3, felem_x3);
1426     felem_shrink(y3, felem_y3);
1427     felem_shrink(z3, felem_z3);
1428 }
1429
1430 /*-
1431  * Base point pre computation
1432  * --------------------------
1433  *
1434  * Two different sorts of precomputed tables are used in the following code.
1435  * Each contains various points on the curve, where each point is three field
1436  * elements (x, y, z).
1437  *
1438  * For the base point table, z is usually 1 (0 for the point at infinity).
1439  * This table has 2 * 16 elements, starting with the following:
1440  * index | bits    | point
1441  * ------+---------+------------------------------
1442  *     0 | 0 0 0 0 | 0G
1443  *     1 | 0 0 0 1 | 1G
1444  *     2 | 0 0 1 0 | 2^64G
1445  *     3 | 0 0 1 1 | (2^64 + 1)G
1446  *     4 | 0 1 0 0 | 2^128G
1447  *     5 | 0 1 0 1 | (2^128 + 1)G
1448  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1449  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1450  *     8 | 1 0 0 0 | 2^192G
1451  *     9 | 1 0 0 1 | (2^192 + 1)G
1452  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1453  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1454  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1455  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1456  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1457  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1458  * followed by a copy of this with each element multiplied by 2^32.
1459  *
1460  * The reason for this is so that we can clock bits into four different
1461  * locations when doing simple scalar multiplies against the base point,
1462  * and then another four locations using the second 16 elements.
1463  *
1464  * Tables for other points P have table[i] = iP for i in 0 .. 16. */
1465
1466 /*
      * gmul is the table of precomputed base points.
      *
      * gmul[0] holds 16 entries indexed by a 4-bit value and gmul[1] holds a
      * second set of 16 entries indexed the same way. Each entry is the triple
      * (x, y, z), each coordinate a smallfelem of four 64-bit words with the
      * least significant word first. Entry 0 of each half is all zero (the
      * point at infinity); every other entry has z = {1, 0, 0, 0}, i.e. z = 1.
      */
1467 static const smallfelem gmul[2][16][3] = {
1468     {{{0, 0, 0, 0},
1469       {0, 0, 0, 0},
1470       {0, 0, 0, 0}},
1471      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1472        0x6b17d1f2e12c4247},
1473       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1474        0x4fe342e2fe1a7f9b},
1475       {1, 0, 0, 0}},
1476      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1477        0x0fa822bc2811aaa5},
1478       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1479        0xbff44ae8f5dba80d},
1480       {1, 0, 0, 0}},
1481      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1482        0x300a4bbc89d6726f},
1483       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1484        0x72aac7e0d09b4644},
1485       {1, 0, 0, 0}},
1486      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1487        0x447d739beedb5e67},
1488       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1489        0x2d4825ab834131ee},
1490       {1, 0, 0, 0}},
1491      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1492        0xef9519328a9c72ff},
1493       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1494        0x611e9fc37dbb2c9b},
1495       {1, 0, 0, 0}},
1496      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1497        0x550663797b51f5d8},
1498       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1499        0x157164848aecb851},
1500       {1, 0, 0, 0}},
1501      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1502        0xeb5d7745b21141ea},
1503       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1504        0xeafd72ebdbecc17b},
1505       {1, 0, 0, 0}},
1506      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1507        0xa6d39677a7849276},
1508       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1509        0x674f84749b0b8816},
1510       {1, 0, 0, 0}},
1511      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1512        0x4e769e7672c9ddad},
1513       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1514        0x42b99082de830663},
1515       {1, 0, 0, 0}},
1516      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1517        0x78878ef61c6ce04d},
1518       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1519        0xb6cb3f5d7b72c321},
1520       {1, 0, 0, 0}},
1521      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1522        0x0c88bc4d716b1287},
1523       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1524        0xdd5ddea3f3901dc6},
1525       {1, 0, 0, 0}},
1526      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1527        0x68f344af6b317466},
1528       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1529        0x31b9c405f8540a20},
1530       {1, 0, 0, 0}},
1531      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1532        0x4052bf4b6f461db9},
1533       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1534        0xfecf4d5190b0fc61},
1535       {1, 0, 0, 0}},
1536      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1537        0x1eddbae2c802e41a},
1538       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1539        0x43104d86560ebcfc},
1540       {1, 0, 0, 0}},
1541      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1542        0xb48e26b484f7a21c},
1543       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1544        0xfac015404d4d3dab},
1545       {1, 0, 0, 0}}},
1546     {{{0, 0, 0, 0},
1547       {0, 0, 0, 0},
1548       {0, 0, 0, 0}},
1549      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1550        0x7fe36b40af22af89},
1551       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1552        0xe697d45825b63624},
1553       {1, 0, 0, 0}},
1554      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1555        0x4a5b506612a677a6},
1556       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1557        0xeb13461ceac089f1},
1558       {1, 0, 0, 0}},
1559      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1560        0x0781b8291c6a220a},
1561       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1562        0x690cde8df0151593},
1563       {1, 0, 0, 0}},
1564      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1565        0x8a535f566ec73617},
1566       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1567        0x0455c08468b08bd7},
1568       {1, 0, 0, 0}},
1569      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1570        0x06bada7ab77f8276},
1571       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1572        0x5b476dfd0e6cb18a},
1573       {1, 0, 0, 0}},
1574      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1575        0x3e29864e8a2ec908},
1576       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1577        0x239b90ea3dc31e7e},
1578       {1, 0, 0, 0}},
1579      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1580        0x820f4dd949f72ff7},
1581       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1582        0x140406ec783a05ec},
1583       {1, 0, 0, 0}},
1584      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1585        0x68f6b8542783dfee},
1586       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1587        0xcbe1feba92e40ce6},
1588       {1, 0, 0, 0}},
1589      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1590        0xd0b2f94d2f420109},
1591       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1592        0x971459828b0719e5},
1593       {1, 0, 0, 0}},
1594      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1595        0x961610004a866aba},
1596       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1597        0x7acb9fadcee75e44},
1598       {1, 0, 0, 0}},
1599      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1600        0x24eb9acca333bf5b},
1601       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1602        0x69f891c5acd079cc},
1603       {1, 0, 0, 0}},
1604      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1605        0xe51f547c5972a107},
1606       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1607        0x1c309a2b25bb1387},
1608       {1, 0, 0, 0}},
1609      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1610        0x20b87b8aa2c4e503},
1611       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1612        0xf5c6fa49919776be},
1613       {1, 0, 0, 0}},
1614      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1615        0x1ed7d1b9332010b9},
1616       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1617        0x3a2b03f03217257a},
1618       {1, 0, 0, 0}},
1619      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1620        0x15fee545c78dd9f6},
1621       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1622        0x4ab5b6b2b8753f81},
1623       {1, 0, 0, 0}}}
1624 };
1625
1626 /*
1627  * select_point selects the |idx|th point from a precomputation table and
1628  * copies it to out.
1629  */
1630 static void select_point(const u64 idx, unsigned int size,
1631                          const smallfelem pre_comp[16][3], smallfelem out[3])
1632 {
1633     unsigned i, j;
1634     u64 *outlimbs = &out[0][0];
1635
1636     memset(out, 0, sizeof(*out) * 3);
1637
1638     for (i = 0; i < size; i++) {
1639         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1640         u64 mask = i ^ idx;
1641         mask |= mask >> 4;
1642         mask |= mask >> 2;
1643         mask |= mask >> 1;
1644         mask &= 1;
1645         mask--;
1646         for (j = 0; j < NLIMBS * 3; j++)
1647             outlimbs[j] |= inlimbs[j] & mask;
1648     }
1649 }
1650
1651 /* get_bit returns the |i|th bit in |in| */
1652 static char get_bit(const felem_bytearray in, int i)
1653 {
1654     if ((i < 0) || (i >= 256))
1655         return 0;
1656     return (in[i >> 3] >> (i & 7)) & 1;
1657 }
1658
1659 /*
1660  * Interleaved point multiplication using precomputed point multiples: The
1661  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1662  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1663  * generator, using certain (large) precomputed multiples in g_pre_comp.
1664  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
      *
      * scalars[] and pre_comp[] each have num_points entries. When num_points
      * is zero only the low 32 loop positions are visited, which is all that
      * the generator tables need.
      *
      * |mixed| is passed through to point_add for the additions that use
      * pre_comp[]; the generator additions always pass 1.
      * NOTE(review): presumably |mixed| may only be non-zero when every
      * pre_comp[] entry has z equal to 0 or 1 - confirm against the caller.
1665  */
1666 static void batch_mul(felem x_out, felem y_out, felem z_out,
1667                       const felem_bytearray scalars[],
1668                       const unsigned num_points, const u8 *g_scalar,
1669                       const int mixed, const smallfelem pre_comp[][17][3],
1670                       const smallfelem g_pre_comp[2][16][3])
1671 {
1672     int i, skip;
1673     unsigned num, gen_mul = (g_scalar != NULL);
1674     felem nq[3], ftmp;
1675     smallfelem tmp[3];
1676     u64 bits;
1677     u8 sign, digit;
1678
1679     /* set nq to the point at infinity */
1680     memset(nq, 0, sizeof(nq));
1681
1682     /*
1683      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1684      * of the generator (two in each of the last 32 rounds) and additions of
1685      * other points multiples (every 5th round).
1686      */
1687     skip = 1;                   /* save two point operations in the first
1688                                  * round */
1689     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1690         /* double */
1691         if (!skip)
1692             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1693
1694         /* add multiples of the generator */
1695         if (gen_mul && (i <= 31)) {
1696             /* first, look 32 bits upwards */
1697             bits = get_bit(g_scalar, i + 224) << 3;
1698             bits |= get_bit(g_scalar, i + 160) << 2;
1699             bits |= get_bit(g_scalar, i + 96) << 1;
1700             bits |= get_bit(g_scalar, i + 32);
1701             /* select the point to add, in constant time */
1702             select_point(bits, 16, g_pre_comp[1], tmp);
1703
1704             if (!skip) {
1705                 /* Arg 1 below is for "mixed" */
1706                 point_add(nq[0], nq[1], nq[2],
1707                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1708             } else {
                     /* nq is still unset: load the selected point directly */
1709                 smallfelem_expand(nq[0], tmp[0]);
1710                 smallfelem_expand(nq[1], tmp[1]);
1711                 smallfelem_expand(nq[2], tmp[2]);
1712                 skip = 0;
1713             }
1714
1715             /* second, look at the current position */
1716             bits = get_bit(g_scalar, i + 192) << 3;
1717             bits |= get_bit(g_scalar, i + 128) << 2;
1718             bits |= get_bit(g_scalar, i + 64) << 1;
1719             bits |= get_bit(g_scalar, i);
1720             /* select the point to add, in constant time */
1721             select_point(bits, 16, g_pre_comp[0], tmp);
1722             /* Arg 1 below is for "mixed" */
1723             point_add(nq[0], nq[1], nq[2],
1724                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1725         }
1726
1727         /* do other additions every 5 doublings */
1728         if (num_points && (i % 5 == 0)) {
1729             /* loop over all scalars */
1730             for (num = 0; num < num_points; ++num) {
                     /*
                      * six-bit window: bits i+4 .. i-1 of the scalar; get_bit
                      * returns 0 for positions outside 0 .. 255
                      */
1731                 bits = get_bit(scalars[num], i + 4) << 5;
1732                 bits |= get_bit(scalars[num], i + 3) << 4;
1733                 bits |= get_bit(scalars[num], i + 2) << 3;
1734                 bits |= get_bit(scalars[num], i + 1) << 2;
1735                 bits |= get_bit(scalars[num], i) << 1;
1736                 bits |= get_bit(scalars[num], i - 1);
1737                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1738
1739                 /*
1740                  * select the point to add or subtract, in constant time
1741                  */
1742                 select_point(digit, 17, pre_comp[num], tmp);
1743                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1744                                                * point */
                     /*
                      * (limb) sign - 1 is all ones when sign is 0, which puts
                      * the original Y back; when sign is 1 the mask is zero
                      * and the negated Y is kept
                      */
1745                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1746                 felem_contract(tmp[1], ftmp);
1747
1748                 if (!skip) {
1749                     point_add(nq[0], nq[1], nq[2],
1750                               nq[0], nq[1], nq[2],
1751                               mixed, tmp[0], tmp[1], tmp[2]);
1752                 } else {
                         /* nq is still unset: load the selected point directly */
1753                     smallfelem_expand(nq[0], tmp[0]);
1754                     smallfelem_expand(nq[1], tmp[1]);
1755                     smallfelem_expand(nq[2], tmp[2]);
1756                     skip = 0;
1757                 }
1758             }
1759         }
1760     }
1761     felem_assign(x_out, nq[0]);
1762     felem_assign(y_out, nq[1]);
1763     felem_assign(z_out, nq[2]);
1764 }
1765
1766 /*
      * Precomputation for the group generator.
      *
      * Objects of this type are reference counted: they are created with one
      * reference by nistp256_pre_comp_new, shared with
      * EC_nistp256_pre_comp_dup and released with EC_nistp256_pre_comp_free.
      */
1767 struct nistp256_pre_comp_st {
         /* generator multiples; same dimensions as the built-in gmul table */
1768     smallfelem g_pre_comp[2][16][3];
         /* reference count, adjusted through CRYPTO_atomic_add */
1769     int references;
         /* lock handed to CRYPTO_atomic_add together with |references| */
1770     CRYPTO_RWLOCK *lock;
1771 };
1772
1773 const EC_METHOD *EC_GFp_nistp256_method(void)
1774 {
         /*
          * The method table for this implementation. It is a function-local
          * static const object, so every call returns the same address.
          *
          * Entries with the ec_GFp_nistp256_ prefix are specific to this file;
          * the rest are shared ec_GFp_simple_, ec_GFp_nist_, ec_key_simple_
          * and ecdh_simple_ routines. A 0 leaves the corresponding slot empty.
          *
          * NOTE(review): the initializer is positional, so the order of the
          * entries has to follow the field order of EC_METHOD, which is
          * declared outside this chunk - confirm before adding or removing an
          * entry.
          */
1775     static const EC_METHOD ret = {
1776         EC_FLAGS_DEFAULT_OCT,
1777         NID_X9_62_prime_field,
1778         ec_GFp_nistp256_group_init,
1779         ec_GFp_simple_group_finish,
1780         ec_GFp_simple_group_clear_finish,
1781         ec_GFp_nist_group_copy,
1782         ec_GFp_nistp256_group_set_curve,
1783         ec_GFp_simple_group_get_curve,
1784         ec_GFp_simple_group_get_degree,
1785         ec_group_simple_order_bits,
1786         ec_GFp_simple_group_check_discriminant,
1787         ec_GFp_simple_point_init,
1788         ec_GFp_simple_point_finish,
1789         ec_GFp_simple_point_clear_finish,
1790         ec_GFp_simple_point_copy,
1791         ec_GFp_simple_point_set_to_infinity,
1792         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1793         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1794         ec_GFp_simple_point_set_affine_coordinates,
1795         ec_GFp_nistp256_point_get_affine_coordinates,
1796         0 /* point_set_compressed_coordinates */ ,
1797         0 /* point2oct */ ,
1798         0 /* oct2point */ ,
1799         ec_GFp_simple_add,
1800         ec_GFp_simple_dbl,
1801         ec_GFp_simple_invert,
1802         ec_GFp_simple_is_at_infinity,
1803         ec_GFp_simple_is_on_curve,
1804         ec_GFp_simple_cmp,
1805         ec_GFp_simple_make_affine,
1806         ec_GFp_simple_points_make_affine,
1807         ec_GFp_nistp256_points_mul,
1808         ec_GFp_nistp256_precompute_mult,
1809         ec_GFp_nistp256_have_precompute_mult,
1810         ec_GFp_nist_field_mul,
1811         ec_GFp_nist_field_sqr,
1812         0 /* field_div */ ,
1813         0 /* field_encode */ ,
1814         0 /* field_decode */ ,
1815         0,                      /* field_set_to_one */
1816         ec_key_simple_priv2oct,
1817         ec_key_simple_oct2priv,
1818         0, /* set private */
1819         ec_key_simple_generate_key,
1820         ec_key_simple_check_key,
1821         ec_key_simple_generate_public_key,
1822         0, /* keycopy */
1823         0, /* keyfinish */
1824         ecdh_simple_compute_key
1825     };
1826
1827     return &ret;
1828 }
1829
1830 /******************************************************************************/
1831 /*
1832  * FUNCTIONS TO MANAGE PRECOMPUTATION
1833  */
1834
1835 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1836 {
1837     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1838
1839     if (ret == NULL) {
1840         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1841         return ret;
1842     }
1843
1844     ret->references = 1;
1845
1846     ret->lock = CRYPTO_THREAD_lock_new();
1847     if (ret->lock == NULL) {
1848         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1849         OPENSSL_free(ret);
1850         return NULL;
1851     }
1852     return ret;
1853 }
1854
1855 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1856 {
1857     int i;
1858     if (p != NULL)
1859         CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
1860     return p;
1861 }
1862
1863 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1864 {
1865     int i;
1866
1867     if (pre == NULL)
1868         return;
1869
1870     CRYPTO_atomic_add(&pre->references, -1, &i, pre->lock);
1871     REF_PRINT_COUNT("EC_nistp256", x);
1872     if (i > 0)
1873         return;
1874     REF_ASSERT_ISNT(i < 0);
1875
1876     CRYPTO_THREAD_lock_free(pre->lock);
1877     OPENSSL_free(pre);
1878 }
1879
1880 /******************************************************************************/
1881 /*
1882  * OPENSSL EC_METHOD FUNCTIONS
1883  */
1884
1885 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1886 {
1887     int ret;
1888     ret = ec_GFp_simple_group_init(group);
1889     group->a_is_minus3 = 1;
1890     return ret;
1891 }
1892
1893 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1894                                     const BIGNUM *a, const BIGNUM *b,
1895                                     BN_CTX *ctx)
1896 {
1897     int ret = 0;
1898     BN_CTX *new_ctx = NULL;
1899     BIGNUM *curve_p, *curve_a, *curve_b;
1900
1901     if (ctx == NULL)
1902         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1903             return 0;
1904     BN_CTX_start(ctx);
1905     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1906         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1907         ((curve_b = BN_CTX_get(ctx)) == NULL))
1908         goto err;
1909     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1910     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1911     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1912     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1913         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1914               EC_R_WRONG_CURVE_PARAMETERS);
1915         goto err;
1916     }
1917     group->field_mod_func = BN_nist_mod_256;
1918     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1919  err:
1920     BN_CTX_end(ctx);
1921     BN_CTX_free(new_ctx);
1922     return ret;
1923 }
1924
1925 /*
1926  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1927  * (X/Z^2, Y/Z^3)
1928  */
1929 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1930                                                  const EC_POINT *point,
1931                                                  BIGNUM *x, BIGNUM *y,
1932                                                  BN_CTX *ctx)
1933 {
1934     felem z1, z2, x_in, y_in;
1935     smallfelem x_out, y_out;
1936     longfelem tmp;
1937
1938     if (EC_POINT_is_at_infinity(group, point)) {
1939         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1940               EC_R_POINT_AT_INFINITY);
1941         return 0;
1942     }
1943     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1944         (!BN_to_felem(z1, point->Z)))
1945         return 0;
1946     felem_inv(z2, z1);
1947     felem_square(tmp, z2);
1948     felem_reduce(z1, tmp);
1949     felem_mul(tmp, x_in, z1);
1950     felem_reduce(x_in, tmp);
1951     felem_contract(x_out, x_in);
1952     if (x != NULL) {
1953         if (!smallfelem_to_BN(x, x_out)) {
1954             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1955                   ERR_R_BN_LIB);
1956             return 0;
1957         }
1958     }
1959     felem_mul(tmp, z1, z2);
1960     felem_reduce(z1, tmp);
1961     felem_mul(tmp, y_in, z1);
1962     felem_reduce(y_in, tmp);
1963     felem_contract(y_out, y_in);
1964     if (y != NULL) {
1965         if (!smallfelem_to_BN(y, y_out)) {
1966             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1967                   ERR_R_BN_LIB);
1968             return 0;
1969         }
1970     }
1971     return 1;
1972 }
1973
1974 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1975 static void make_points_affine(size_t num, smallfelem points[][3],
1976                                smallfelem tmp_smallfelems[])
1977 {
1978     /*
1979      * Runs in constant time, unless an input is the point at infinity (which
1980      * normally shouldn't happen).
1981      */
1982     ec_GFp_nistp_points_make_affine_internal(num,
1983                                              points,
1984                                              sizeof(smallfelem),
1985                                              tmp_smallfelems,
1986                                              (void (*)(void *))smallfelem_one,
1987                                              smallfelem_is_zero_int,
1988                                              (void (*)(void *, const void *))
1989                                              smallfelem_assign,
1990                                              (void (*)(void *, const void *))
1991                                              smallfelem_square_contract,
1992                                              (void (*)
1993                                               (void *, const void *,
1994                                                const void *))
1995                                              smallfelem_mul_contract,
1996                                              (void (*)(void *, const void *))
1997                                              smallfelem_inv_contract,
1998                                              /* nothing to contract */
1999                                              (void (*)(void *, const void *))
2000                                              smallfelem_assign);
2001 }
2002
2003 /*
2004  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2005  * values Result is stored in r (r can equal one of the inputs).
2006  */
2007 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2008                                const BIGNUM *scalar, size_t num,
2009                                const EC_POINT *points[],
2010                                const BIGNUM *scalars[], BN_CTX *ctx)
2011 {
2012     int ret = 0;
2013     int j;
2014     int mixed = 0;
2015     BN_CTX *new_ctx = NULL;
2016     BIGNUM *x, *y, *z, *tmp_scalar;
2017     felem_bytearray g_secret;
2018     felem_bytearray *secrets = NULL;
2019     smallfelem (*pre_comp)[17][3] = NULL;
2020     smallfelem *tmp_smallfelems = NULL;
2021     felem_bytearray tmp;
2022     unsigned i, num_bytes;
2023     int have_pre_comp = 0;
2024     size_t num_points = num;
2025     smallfelem x_in, y_in, z_in;
2026     felem x_out, y_out, z_out;
2027     NISTP256_PRE_COMP *pre = NULL;
2028     const smallfelem(*g_pre_comp)[16][3] = NULL;
2029     EC_POINT *generator = NULL;
2030     const EC_POINT *p = NULL;
2031     const BIGNUM *p_scalar = NULL;
2032
2033     if (ctx == NULL)
2034         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2035             return 0;
2036     BN_CTX_start(ctx);
2037     if (((x = BN_CTX_get(ctx)) == NULL) ||
2038         ((y = BN_CTX_get(ctx)) == NULL) ||
2039         ((z = BN_CTX_get(ctx)) == NULL) ||
2040         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2041         goto err;
2042
2043     if (scalar != NULL) {
2044         pre = group->pre_comp.nistp256;
2045         if (pre)
2046             /* we have precomputation, try to use it */
2047             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2048         else
2049             /* try to use the standard precomputation */
2050             g_pre_comp = &gmul[0];
2051         generator = EC_POINT_new(group);
2052         if (generator == NULL)
2053             goto err;
2054         /* get the generator from precomputation */
2055         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2056             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2057             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2058             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2059             goto err;
2060         }
2061         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2062                                                       generator, x, y, z,
2063                                                       ctx))
2064             goto err;
2065         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2066             /* precomputation matches generator */
2067             have_pre_comp = 1;
2068         else
2069             /*
2070              * we don't have valid precomputation: treat the generator as a
2071              * random point
2072              */
2073             num_points++;
2074     }
2075     if (num_points > 0) {
2076         if (num_points >= 3) {
2077             /*
2078              * unless we precompute multiples for just one or two points,
2079              * converting those into affine form is time well spent
2080              */
2081             mixed = 1;
2082         }
2083         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2084         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2085         if (mixed)
2086             tmp_smallfelems =
2087               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2088         if ((secrets == NULL) || (pre_comp == NULL)
2089             || (mixed && (tmp_smallfelems == NULL))) {
2090             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2091             goto err;
2092         }
2093
2094         /*
2095          * we treat NULL scalars as 0, and NULL points as points at infinity,
2096          * i.e., they contribute nothing to the linear combination
2097          */
2098         memset(secrets, 0, sizeof(*secrets) * num_points);
2099         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2100         for (i = 0; i < num_points; ++i) {
2101             if (i == num)
2102                 /*
2103                  * we didn't have a valid precomputation, so we pick the
2104                  * generator
2105                  */
2106             {
2107                 p = EC_GROUP_get0_generator(group);
2108                 p_scalar = scalar;
2109             } else
2110                 /* the i^th point */
2111             {
2112                 p = points[i];
2113                 p_scalar = scalars[i];
2114             }
2115             if ((p_scalar != NULL) && (p != NULL)) {
2116                 /* reduce scalar to 0 <= scalar < 2^256 */
2117                 if ((BN_num_bits(p_scalar) > 256)
2118                     || (BN_is_negative(p_scalar))) {
2119                     /*
2120                      * this is an unusual input, and we don't guarantee
2121                      * constant-timeness
2122                      */
2123                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2124                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2125                         goto err;
2126                     }
2127                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2128                 } else
2129                     num_bytes = BN_bn2bin(p_scalar, tmp);
2130                 flip_endian(secrets[i], tmp, num_bytes);
2131                 /* precompute multiples */
2132                 if ((!BN_to_felem(x_out, p->X)) ||
2133                     (!BN_to_felem(y_out, p->Y)) ||
2134                     (!BN_to_felem(z_out, p->Z)))
2135                     goto err;
2136                 felem_shrink(pre_comp[i][1][0], x_out);
2137                 felem_shrink(pre_comp[i][1][1], y_out);
2138                 felem_shrink(pre_comp[i][1][2], z_out);
2139                 for (j = 2; j <= 16; ++j) {
2140                     if (j & 1) {
2141                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2142                                         pre_comp[i][j][2], pre_comp[i][1][0],
2143                                         pre_comp[i][1][1], pre_comp[i][1][2],
2144                                         pre_comp[i][j - 1][0],
2145                                         pre_comp[i][j - 1][1],
2146                                         pre_comp[i][j - 1][2]);
2147                     } else {
2148                         point_double_small(pre_comp[i][j][0],
2149                                            pre_comp[i][j][1],
2150                                            pre_comp[i][j][2],
2151                                            pre_comp[i][j / 2][0],
2152                                            pre_comp[i][j / 2][1],
2153                                            pre_comp[i][j / 2][2]);
2154                     }
2155                 }
2156             }
2157         }
2158         if (mixed)
2159             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2160     }
2161
2162     /* the scalar for the generator */
2163     if ((scalar != NULL) && (have_pre_comp)) {
2164         memset(g_secret, 0, sizeof(g_secret));
2165         /* reduce scalar to 0 <= scalar < 2^256 */
2166         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2167             /*
2168              * this is an unusual input, and we don't guarantee
2169              * constant-timeness
2170              */
2171             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2172                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2173                 goto err;
2174             }
2175             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2176         } else
2177             num_bytes = BN_bn2bin(scalar, tmp);
2178         flip_endian(g_secret, tmp, num_bytes);
2179         /* do the multiplication with generator precomputation */
2180         batch_mul(x_out, y_out, z_out,
2181                   (const felem_bytearray(*))secrets, num_points,
2182                   g_secret,
2183                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2184     } else
2185         /* do the multiplication without generator precomputation */
2186         batch_mul(x_out, y_out, z_out,
2187                   (const felem_bytearray(*))secrets, num_points,
2188                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2189     /* reduce the output to its unique minimal representation */
2190     felem_contract(x_in, x_out);
2191     felem_contract(y_in, y_out);
2192     felem_contract(z_in, z_out);
2193     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2194         (!smallfelem_to_BN(z, z_in))) {
2195         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2196         goto err;
2197     }
2198     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2199
2200  err:
2201     BN_CTX_end(ctx);
2202     EC_POINT_free(generator);
2203     BN_CTX_free(new_ctx);
2204     OPENSSL_free(secrets);
2205     OPENSSL_free(pre_comp);
2206     OPENSSL_free(tmp_smallfelems);
2207     return ret;
2208 }
2209
2210 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2211 {
2212     int ret = 0;
2213     NISTP256_PRE_COMP *pre = NULL;
2214     int i, j;
2215     BN_CTX *new_ctx = NULL;
2216     BIGNUM *x, *y;
2217     EC_POINT *generator = NULL;
2218     smallfelem tmp_smallfelems[32];
2219     felem x_tmp, y_tmp, z_tmp;
2220
2221     /* throw away old precomputation */
2222     EC_pre_comp_free(group);
2223     if (ctx == NULL)
2224         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2225             return 0;
2226     BN_CTX_start(ctx);
2227     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2228         goto err;
2229     /* get the generator */
2230     if (group->generator == NULL)
2231         goto err;
2232     generator = EC_POINT_new(group);
2233     if (generator == NULL)
2234         goto err;
2235     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2236     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2237     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2238         goto err;
2239     if ((pre = nistp256_pre_comp_new()) == NULL)
2240         goto err;
2241     /*
2242      * if the generator is the standard one, use built-in precomputation
2243      */
2244     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2245         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2246         goto done;
2247     }
2248     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2249         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2250         (!BN_to_felem(z_tmp, group->generator->Z)))
2251         goto err;
2252     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2253     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2254     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2255     /*
2256      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2257      * 2^160*G, 2^224*G for the second one
2258      */
2259     for (i = 1; i <= 8; i <<= 1) {
2260         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2261                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2262                            pre->g_pre_comp[0][i][1],
2263                            pre->g_pre_comp[0][i][2]);
2264         for (j = 0; j < 31; ++j) {
2265             point_double_small(pre->g_pre_comp[1][i][0],
2266                                pre->g_pre_comp[1][i][1],
2267                                pre->g_pre_comp[1][i][2],
2268                                pre->g_pre_comp[1][i][0],
2269                                pre->g_pre_comp[1][i][1],
2270                                pre->g_pre_comp[1][i][2]);
2271         }
2272         if (i == 8)
2273             break;
2274         point_double_small(pre->g_pre_comp[0][2 * i][0],
2275                            pre->g_pre_comp[0][2 * i][1],
2276                            pre->g_pre_comp[0][2 * i][2],
2277                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2278                            pre->g_pre_comp[1][i][2]);
2279         for (j = 0; j < 31; ++j) {
2280             point_double_small(pre->g_pre_comp[0][2 * i][0],
2281                                pre->g_pre_comp[0][2 * i][1],
2282                                pre->g_pre_comp[0][2 * i][2],
2283                                pre->g_pre_comp[0][2 * i][0],
2284                                pre->g_pre_comp[0][2 * i][1],
2285                                pre->g_pre_comp[0][2 * i][2]);
2286         }
2287     }
2288     for (i = 0; i < 2; i++) {
2289         /* g_pre_comp[i][0] is the point at infinity */
2290         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2291         /* the remaining multiples */
2292         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2293         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2294                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2295                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2296                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2297                         pre->g_pre_comp[i][2][2]);
2298         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2299         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2300                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2301                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2302                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2303                         pre->g_pre_comp[i][2][2]);
2304         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2305         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2306                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2307                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2308                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2309                         pre->g_pre_comp[i][4][2]);
2310         /*
2311          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2312          */
2313         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2314                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2315                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2316                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2317                         pre->g_pre_comp[i][2][2]);
2318         for (j = 1; j < 8; ++j) {
2319             /* odd multiples: add G resp. 2^32*G */
2320             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2321                             pre->g_pre_comp[i][2 * j + 1][1],
2322                             pre->g_pre_comp[i][2 * j + 1][2],
2323                             pre->g_pre_comp[i][2 * j][0],
2324                             pre->g_pre_comp[i][2 * j][1],
2325                             pre->g_pre_comp[i][2 * j][2],
2326                             pre->g_pre_comp[i][1][0],
2327                             pre->g_pre_comp[i][1][1],
2328                             pre->g_pre_comp[i][1][2]);
2329         }
2330     }
2331     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2332
2333  done:
2334     SETPRECOMP(group, nistp256, pre);
2335     pre = NULL;
2336     ret = 1;
2337
2338  err:
2339     BN_CTX_end(ctx);
2340     EC_POINT_free(generator);
2341     BN_CTX_free(new_ctx);
2342     EC_nistp256_pre_comp_free(pre);
2343     return ret;
2344 }
2345
2346 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2347 {
2348     return HAVEPRECOMP(group, nistp256);
2349 }
2350 #endif