crypto/ec/ecp_nistp256.c

   1 /*
   2  * Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/opensslconf.h>
  41 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  42 NON_EMPTY_TRANSLATION_UNIT
  43 #else
  44
  45 # include <stdint.h>
  46 # include <string.h>
  47 # include <openssl/err.h>
  48 # include "ec_local.h"
  49
/*
 * The field arithmetic in this file needs 128-bit integers. They are taken
 * from the compiler's nonstandard __uint128_t/__int128_t types, which are only
 * used when __SIZEOF_INT128__ is defined and equal to 16.
 */
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
typedef __int128_t int128_t;
# else
  /* Without 128-bit integers this file cannot be built: stop the compile. */
#  error "Your compiler doesn't appear to support 128-bit integer types"
# endif
  58
/* Short aliases for the fixed-width unsigned types from <stdint.h>. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;

/*
 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
 * can serialise an element of this field into 32 bytes. We call this an
 * felem_bytearray.
 */

typedef u8 felem_bytearray[32];
  70
/*
 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
 * values are big-endian.
 *
 * The five 32-byte rows are, in order: the prime p, the coefficient a (equal
 * to p - 3, i.e. -3 mod p), the coefficient b, and the x and y rows.
 */
static const felem_bytearray nistp256_curve_params[5] = {
    /* p */
    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    /* a = -3 */
    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
    /* b */
    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
    /* x */
    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47,
     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
    /* y */
    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b,
     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
};
  97
  98 /*-
  99  * The representation of field elements.
 100  * ------------------------------------
 101  *
 102  * We represent field elements with either four 128-bit values, eight 128-bit
 103  * values, or four 64-bit values. The field element represented is:
 104  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
 105  * or:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
 107  *
 108  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 109  * apart, but are 128-bits wide, the most significant bits of each limb overlap
 110  * with the least significant bits of the next.
 111  *
 112  * A field element with four limbs is an 'felem'. One with eight limbs is a
 113  * 'longfelem'
 114  *
 115  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 116  * values are used as intermediate values before multiplication.
 117  */
 118
/* Number of limbs in an felem or smallfelem; a longfelem has twice as many. */
# define NLIMBS 4

typedef uint128_t limb;             /* one 128-bit limb */
typedef limb felem[NLIMBS];         /* four 128-bit limbs */
typedef limb longfelem[NLIMBS * 2]; /* eight 128-bit limbs */
typedef u64 smallfelem[NLIMBS];     /* four 64-bit words */
 125
 126 /* This is the value of the prime as four 64-bit words, little-endian. */
 127 static const u64 kPrime[4] =
 128     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 129 static const u64 bottom63bits = 0x7ffffffffffffffful;
 130
 131 /*
 132  * bin32_to_felem takes a little-endian byte array and converts it into felem
 133  * form. This assumes that the CPU is little-endian.
 134  */
 135 static void bin32_to_felem(felem out, const u8 in[32])
 136 {
 137     out[0] = *((u64 *)&in[0]);
 138     out[1] = *((u64 *)&in[8]);
 139     out[2] = *((u64 *)&in[16]);
 140     out[3] = *((u64 *)&in[24]);
 141 }
 142
 143 /*
 144  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 145  * endian, 32 byte array. This assumes that the CPU is little-endian.
 146  */
 147 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 148 {
 149     *((u64 *)&out[0]) = in[0];
 150     *((u64 *)&out[8]) = in[1];
 151     *((u64 *)&out[16]) = in[2];
 152     *((u64 *)&out[24]) = in[3];
 153 }
 154
 155 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 156 static int BN_to_felem(felem out, const BIGNUM *bn)
 157 {
 158     felem_bytearray b_out;
 159     int num_bytes;
 160
 161     if (BN_is_negative(bn)) {
 162         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 163         return 0;
 164     }
 165     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 166     if (num_bytes < 0) {
 167         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 168         return 0;
 169     }
 170     bin32_to_felem(out, b_out);
 171     return 1;
 172 }
 173
 174 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 175 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 176 {
 177     felem_bytearray b_out;
 178     smallfelem_to_bin32(b_out, in);
 179     return BN_lebin2bn(b_out, sizeof(b_out), out);
 180 }
 181
 182 /*-
 183  * Field operations
 184  * ----------------
 185  */
 186
 187 static void smallfelem_one(smallfelem out)
 188 {
 189     out[0] = 1;
 190     out[1] = 0;
 191     out[2] = 0;
 192     out[3] = 0;
 193 }
 194
 195 static void smallfelem_assign(smallfelem out, const smallfelem in)
 196 {
 197     out[0] = in[0];
 198     out[1] = in[1];
 199     out[2] = in[2];
 200     out[3] = in[3];
 201 }
 202
 203 static void felem_assign(felem out, const felem in)
 204 {
 205     out[0] = in[0];
 206     out[1] = in[1];
 207     out[2] = in[2];
 208     out[3] = in[3];
 209 }
 210
 211 /* felem_sum sets out = out + in. */
 212 static void felem_sum(felem out, const felem in)
 213 {
 214     out[0] += in[0];
 215     out[1] += in[1];
 216     out[2] += in[2];
 217     out[3] += in[3];
 218 }
 219
 220 /* felem_small_sum sets out = out + in. */
 221 static void felem_small_sum(felem out, const smallfelem in)
 222 {
 223     out[0] += in[0];
 224     out[1] += in[1];
 225     out[2] += in[2];
 226     out[3] += in[3];
 227 }
 228
 229 /* felem_scalar sets out = out * scalar */
 230 static void felem_scalar(felem out, const u64 scalar)
 231 {
 232     out[0] *= scalar;
 233     out[1] *= scalar;
 234     out[2] *= scalar;
 235     out[3] *= scalar;
 236 }
 237
 238 /* longfelem_scalar sets out = out * scalar */
 239 static void longfelem_scalar(longfelem out, const u64 scalar)
 240 {
 241     out[0] *= scalar;
 242     out[1] *= scalar;
 243     out[2] *= scalar;
 244     out[3] *= scalar;
 245     out[4] *= scalar;
 246     out[5] *= scalar;
 247     out[6] *= scalar;
 248     out[7] *= scalar;
 249 }
 250
 251 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 252 # define two105 (((limb)1) << 105)
 253 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 254
 255 /* zero105 is 0 mod p */
 256 static const felem zero105 =
 257     { two105m41m9, two105, two105m41p9, two105m41p9 };
 258
 259 /*-
 260  * smallfelem_neg sets |out| to |-small|
 261  * On exit:
 262  *   out[i] < out[i] + 2^105
 263  */
 264 static void smallfelem_neg(felem out, const smallfelem small)
 265 {
 266     /* In order to prevent underflow, we subtract from 0 mod p. */
 267     out[0] = zero105[0] - small[0];
 268     out[1] = zero105[1] - small[1];
 269     out[2] = zero105[2] - small[2];
 270     out[3] = zero105[3] - small[3];
 271 }
 272
 273 /*-
 274  * felem_diff subtracts |in| from |out|
 275  * On entry:
 276  *   in[i] < 2^104
 277  * On exit:
 278  *   out[i] < out[i] + 2^105
 279  */
 280 static void felem_diff(felem out, const felem in)
 281 {
 282     /*
 283      * In order to prevent underflow, we add 0 mod p before subtracting.
 284      */
 285     out[0] += zero105[0];
 286     out[1] += zero105[1];
 287     out[2] += zero105[2];
 288     out[3] += zero105[3];
 289
 290     out[0] -= in[0];
 291     out[1] -= in[1];
 292     out[2] -= in[2];
 293     out[3] -= in[3];
 294 }
 295
 296 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 297 # define two107 (((limb)1) << 107)
 298 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 299
 300 /* zero107 is 0 mod p */
 301 static const felem zero107 =
 302     { two107m43m11, two107, two107m43p11, two107m43p11 };
 303
 304 /*-
 305  * An alternative felem_diff for larger inputs |in|
 306  * felem_diff_zero107 subtracts |in| from |out|
 307  * On entry:
 308  *   in[i] < 2^106
 309  * On exit:
 310  *   out[i] < out[i] + 2^107
 311  */
 312 static void felem_diff_zero107(felem out, const felem in)
 313 {
 314     /*
 315      * In order to prevent underflow, we add 0 mod p before subtracting.
 316      */
 317     out[0] += zero107[0];
 318     out[1] += zero107[1];
 319     out[2] += zero107[2];
 320     out[3] += zero107[3];
 321
 322     out[0] -= in[0];
 323     out[1] -= in[1];
 324     out[2] -= in[2];
 325     out[3] -= in[3];
 326 }
 327
 328 /*-
 329  * longfelem_diff subtracts |in| from |out|
 330  * On entry:
 331  *   in[i] < 7*2^67
 332  * On exit:
 333  *   out[i] < out[i] + 2^70 + 2^40
 334  */
 335 static void longfelem_diff(longfelem out, const longfelem in)
 336 {
 337     static const limb two70m8p6 =
 338         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 339     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 340     static const limb two70 = (((limb) 1) << 70);
 341     static const limb two70m40m38p6 =
 342         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 343         (((limb) 1) << 6);
 344     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 345
 346     /* add 0 mod p to avoid underflow */
 347     out[0] += two70m8p6;
 348     out[1] += two70p40;
 349     out[2] += two70;
 350     out[3] += two70m40m38p6;
 351     out[4] += two70m6;
 352     out[5] += two70m6;
 353     out[6] += two70m6;
 354     out[7] += two70m6;
 355
 356     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 357     out[0] -= in[0];
 358     out[1] -= in[1];
 359     out[2] -= in[2];
 360     out[3] -= in[3];
 361     out[4] -= in[4];
 362     out[5] -= in[5];
 363     out[6] -= in[6];
 364     out[7] -= in[7];
 365 }
 366
 367 # define two64m0 (((limb)1) << 64) - 1
 368 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 369 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 370 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 371
 372 /* zero110 is 0 mod p */
 373 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 374
/*-
 * felem_shrink converts an felem into a smallfelem. The result isn't quite
 * minimal as the value may be greater than p.
 *
 * The work is done on |tmp|, which starts as |in| plus zero110 with the high
 * half of in[2] already carried into limb 3. The high 64 bits of tmp[3] are
 * then folded into the lower limbs using
 *   2^256 = 2^224 - 2^192 - 2^96 + 1  (mod p),
 * kPrime is conditionally subtracted under a mask, and finally carries are
 * propagated from limb 0 up to limb 3.
 *
 * On entry:
 *   in[i] < 2^109
 * On exit:
 *   out[i] < 2^64
 */
static void felem_shrink(smallfelem out, const felem in)
{
    felem tmp;
    u64 a, b, mask;
    u64 high, low;
    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */

    /* Carry 2->3 */
    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
    /* tmp[3] < 2^110 */

    tmp[2] = zero110[2] + (u64)in[2];
    tmp[0] = zero110[0] + in[0];
    tmp[1] = zero110[1] + in[1];
    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */

    /*
     * We perform two partial reductions where we eliminate the high-word of
     * tmp[3]. We don't update the other words till the end.
     *
     * With |a| the high word of tmp[3], a * 2^256 is replaced by
     * a * (2^224 - 2^192) in tmp[3] here; the remaining a * (1 - 2^96) is
     * accumulated in |b| and applied to tmp[0] and tmp[1] below.
     */
    a = tmp[3] >> 64;           /* a < 2^46 */
    tmp[3] = (u64)tmp[3];
    tmp[3] -= a;
    tmp[3] += ((limb) a) << 32;
    /* tmp[3] < 2^79 */

    b = a;
    a = tmp[3] >> 64;           /* a < 2^15 */
    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
    tmp[3] = (u64)tmp[3];
    tmp[3] -= a;
    tmp[3] += ((limb) a) << 32;
    /* tmp[3] < 2^64 + 2^47 */

    /*
     * This adjusts the other two words to complete the two partial
     * reductions.
     */
    tmp[0] += b;
    tmp[1] -= (((limb) b) << 32);

    /*
     * In order to make space in tmp[3] for the carry from 2 -> 3, we
     * conditionally subtract kPrime if tmp[3] is large enough.
     */
    high = (u64)(tmp[3] >> 64);
    /* As tmp[3] < 2^65, high is either 1 or 0 */
    high = 0 - high;
    /*-
     * high is:
     *   all ones   if the high word of tmp[3] is 1
     *   all zeros  if the high word of tmp[3] is 0
     */
    low = (u64)tmp[3];
    mask = 0 - (low >> 63);
    /*-
     * mask is:
     *   all ones   if the MSB of low is 1
     *   all zeros  if the MSB of low is 0
     */
    low &= bottom63bits;
    low -= kPrime3Test;
    /*
     * low held a 63-bit value, so the subtraction wraps (setting the MSB)
     * exactly when low was less than kPrime3Test; otherwise the MSB is zero.
     */
    low = ~low;
    low = 0 - (low >> 63);
    /*-
     * low is:
     *   all ones   if low was >= kPrime3Test
     *   all zeros  if low was < kPrime3Test
     */
    mask = (mask & low) | high;
    tmp[0] -= mask & kPrime[0];
    tmp[1] -= mask & kPrime[1];
    /* kPrime[2] is zero, so omitted */
    tmp[3] -= mask & kPrime[3];
    /* tmp[3] < 2**64 - 2**32 + 1 */

    /* Propagate carries upwards so that every limb fits in 64 bits. */
    tmp[1] += ((u64)(tmp[0] >> 64));
    tmp[0] = (u64)tmp[0];
    tmp[2] += ((u64)(tmp[1] >> 64));
    tmp[1] = (u64)tmp[1];
    tmp[3] += ((u64)(tmp[2] >> 64));
    tmp[2] = (u64)tmp[2];
    /* tmp[i] < 2^64 */

    out[0] = tmp[0];
    out[1] = tmp[1];
    out[2] = tmp[2];
    out[3] = tmp[3];
}
 474
 475 /* smallfelem_expand converts a smallfelem to an felem */
 476 static void smallfelem_expand(felem out, const smallfelem in)
 477 {
 478     out[0] = in[0];
 479     out[1] = in[1];
 480     out[2] = in[2];
 481     out[3] = in[3];
 482 }
 483
 484 /*-
 485  * smallfelem_square sets |out| = |small|^2
 486  * On entry:
 487  *   small[i] < 2^64
 488  * On exit:
 489  *   out[i] < 7 * 2^64 < 2^67
 490  */
 491 static void smallfelem_square(longfelem out, const smallfelem small)
 492 {
 493     limb a;
 494     u64 high, low;
 495
 496     a = ((uint128_t) small[0]) * small[0];
 497     low = a;
 498     high = a >> 64;
 499     out[0] = low;
 500     out[1] = high;
 501
 502     a = ((uint128_t) small[0]) * small[1];
 503     low = a;
 504     high = a >> 64;
 505     out[1] += low;
 506     out[1] += low;
 507     out[2] = high;
 508
 509     a = ((uint128_t) small[0]) * small[2];
 510     low = a;
 511     high = a >> 64;
 512     out[2] += low;
 513     out[2] *= 2;
 514     out[3] = high;
 515
 516     a = ((uint128_t) small[0]) * small[3];
 517     low = a;
 518     high = a >> 64;
 519     out[3] += low;
 520     out[4] = high;
 521
 522     a = ((uint128_t) small[1]) * small[2];
 523     low = a;
 524     high = a >> 64;
 525     out[3] += low;
 526     out[3] *= 2;
 527     out[4] += high;
 528
 529     a = ((uint128_t) small[1]) * small[1];
 530     low = a;
 531     high = a >> 64;
 532     out[2] += low;
 533     out[3] += high;
 534
 535     a = ((uint128_t) small[1]) * small[3];
 536     low = a;
 537     high = a >> 64;
 538     out[4] += low;
 539     out[4] *= 2;
 540     out[5] = high;
 541
 542     a = ((uint128_t) small[2]) * small[3];
 543     low = a;
 544     high = a >> 64;
 545     out[5] += low;
 546     out[5] *= 2;
 547     out[6] = high;
 548     out[6] += high;
 549
 550     a = ((uint128_t) small[2]) * small[2];
 551     low = a;
 552     high = a >> 64;
 553     out[4] += low;
 554     out[5] += high;
 555
 556     a = ((uint128_t) small[3]) * small[3];
 557     low = a;
 558     high = a >> 64;
 559     out[6] += low;
 560     out[7] = high;
 561 }
 562
 563 /*-
 564  * felem_square sets |out| = |in|^2
 565  * On entry:
 566  *   in[i] < 2^109
 567  * On exit:
 568  *   out[i] < 7 * 2^64 < 2^67
 569  */
 570 static void felem_square(longfelem out, const felem in)
 571 {
 572     u64 small[4];
 573     felem_shrink(small, in);
 574     smallfelem_square(out, small);
 575 }
 576
 577 /*-
 578  * smallfelem_mul sets |out| = |small1| * |small2|
 579  * On entry:
 580  *   small1[i] < 2^64
 581  *   small2[i] < 2^64
 582  * On exit:
 583  *   out[i] < 7 * 2^64 < 2^67
 584  */
 585 static void smallfelem_mul(longfelem out, const smallfelem small1,
 586                            const smallfelem small2)
 587 {
 588     limb a;
 589     u64 high, low;
 590
 591     a = ((uint128_t) small1[0]) * small2[0];
 592     low = a;
 593     high = a >> 64;
 594     out[0] = low;
 595     out[1] = high;
 596
 597     a = ((uint128_t) small1[0]) * small2[1];
 598     low = a;
 599     high = a >> 64;
 600     out[1] += low;
 601     out[2] = high;
 602
 603     a = ((uint128_t) small1[1]) * small2[0];
 604     low = a;
 605     high = a >> 64;
 606     out[1] += low;
 607     out[2] += high;
 608
 609     a = ((uint128_t) small1[0]) * small2[2];
 610     low = a;
 611     high = a >> 64;
 612     out[2] += low;
 613     out[3] = high;
 614
 615     a = ((uint128_t) small1[1]) * small2[1];
 616     low = a;
 617     high = a >> 64;
 618     out[2] += low;
 619     out[3] += high;
 620
 621     a = ((uint128_t) small1[2]) * small2[0];
 622     low = a;
 623     high = a >> 64;
 624     out[2] += low;
 625     out[3] += high;
 626
 627     a = ((uint128_t) small1[0]) * small2[3];
 628     low = a;
 629     high = a >> 64;
 630     out[3] += low;
 631     out[4] = high;
 632
 633     a = ((uint128_t) small1[1]) * small2[2];
 634     low = a;
 635     high = a >> 64;
 636     out[3] += low;
 637     out[4] += high;
 638
 639     a = ((uint128_t) small1[2]) * small2[1];
 640     low = a;
 641     high = a >> 64;
 642     out[3] += low;
 643     out[4] += high;
 644
 645     a = ((uint128_t) small1[3]) * small2[0];
 646     low = a;
 647     high = a >> 64;
 648     out[3] += low;
 649     out[4] += high;
 650
 651     a = ((uint128_t) small1[1]) * small2[3];
 652     low = a;
 653     high = a >> 64;
 654     out[4] += low;
 655     out[5] = high;
 656
 657     a = ((uint128_t) small1[2]) * small2[2];
 658     low = a;
 659     high = a >> 64;
 660     out[4] += low;
 661     out[5] += high;
 662
 663     a = ((uint128_t) small1[3]) * small2[1];
 664     low = a;
 665     high = a >> 64;
 666     out[4] += low;
 667     out[5] += high;
 668
 669     a = ((uint128_t) small1[2]) * small2[3];
 670     low = a;
 671     high = a >> 64;
 672     out[5] += low;
 673     out[6] = high;
 674
 675     a = ((uint128_t) small1[3]) * small2[2];
 676     low = a;
 677     high = a >> 64;
 678     out[5] += low;
 679     out[6] += high;
 680
 681     a = ((uint128_t) small1[3]) * small2[3];
 682     low = a;
 683     high = a >> 64;
 684     out[6] += low;
 685     out[7] = high;
 686 }
 687
 688 /*-
 689  * felem_mul sets |out| = |in1| * |in2|
 690  * On entry:
 691  *   in1[i] < 2^109
 692  *   in2[i] < 2^109
 693  * On exit:
 694  *   out[i] < 7 * 2^64 < 2^67
 695  */
 696 static void felem_mul(longfelem out, const felem in1, const felem in2)
 697 {
 698     smallfelem small1, small2;
 699     felem_shrink(small1, in1);
 700     felem_shrink(small2, in2);
 701     smallfelem_mul(out, small1, small2);
 702 }
 703
 704 /*-
 705  * felem_small_mul sets |out| = |small1| * |in2|
 706  * On entry:
 707  *   small1[i] < 2^64
 708  *   in2[i] < 2^109
 709  * On exit:
 710  *   out[i] < 7 * 2^64 < 2^67
 711  */
 712 static void felem_small_mul(longfelem out, const smallfelem small1,
 713                             const felem in2)
 714 {
 715     smallfelem small2;
 716     felem_shrink(small2, in2);
 717     smallfelem_mul(out, small1, small2);
 718 }
 719
 720 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 721 # define two100 (((limb)1) << 100)
 722 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 723 /* zero100 is 0 mod p */
 724 static const felem zero100 =
 725     { two100m36m4, two100, two100m36p4, two100m36p4 };
 726
/*-
 * Internal function for the different flavours of felem_reduce.
 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 *
 * Only in[4]..in[7] are read. Their contributions are added to or
 * subtracted from the four limbs of |out|, which the caller must have
 * initialised so that the subtractions cannot underflow (see "On entry").
 * On entry:
 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 *   out[1] >= in[7] + 2^32*in[4]
 *   out[2] >= in[5] + 2^32*in[5]
 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 * On exit:
 *   out[0] <= out[0] + in[4] + 2^32*in[5]
 *   out[1] <= out[1] + in[5] + 2^33*in[6]
 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 */
static void felem_reduce_(felem out, const longfelem in)
{
    int128_t c;
    /* combine common terms from below */
    c = in[4] + (in[5] << 32);
    out[0] += c;
    out[3] -= c;

    c = in[5] - in[7];
    out[1] += c;
    out[2] -= c;

    /*
     * the remaining terms
     *
     * Each list below gives, for the power of two named before the colon
     * (the weight of in[4], in[5], in[6] and in[7] respectively), its
     * reduction mod p as (bit position, coefficient) pairs. Pairs already
     * handled by the combined terms above are not repeated in the code.
     */
    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
    out[1] -= (in[4] << 32);
    out[3] += (in[4] << 32);

    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
    out[2] -= (in[5] << 32);

    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
    out[0] -= in[6];
    out[0] -= (in[6] << 32);
    out[1] += (in[6] << 33);
    out[2] += (in[6] * 2);
    out[3] -= (in[6] << 32);

    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
    out[0] -= in[7];
    out[0] -= (in[7] << 32);
    out[2] += (in[7] << 33);
    out[3] += (in[7] * 3);
}
 774
 775 /*-
 776  * felem_reduce converts a longfelem into an felem.
 777  * To be called directly after felem_square or felem_mul.
 778  * On entry:
 779  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 780  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 781  * On exit:
 782  *   out[i] < 2^101
 783  */
 784 static void felem_reduce(felem out, const longfelem in)
 785 {
 786     out[0] = zero100[0] + in[0];
 787     out[1] = zero100[1] + in[1];
 788     out[2] = zero100[2] + in[2];
 789     out[3] = zero100[3] + in[3];
 790
 791     felem_reduce_(out, in);
 792
 793     /*-
 794      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 795      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 796      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 797      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 798      *
 799      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 800      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 801      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 802      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 803      */
 804 }
 805
 806 /*-
 807  * felem_reduce_zero105 converts a larger longfelem into an felem.
 808  * On entry:
 809  *   in[0] < 2^71
 810  * On exit:
 811  *   out[i] < 2^106
 812  */
 813 static void felem_reduce_zero105(felem out, const longfelem in)
 814 {
 815     out[0] = zero105[0] + in[0];
 816     out[1] = zero105[1] + in[1];
 817     out[2] = zero105[2] + in[2];
 818     out[3] = zero105[3] + in[3];
 819
 820     felem_reduce_(out, in);
 821
 822     /*-
 823      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 824      * out[1] > 2^105 - 2^71 - 2^103 > 0
 825      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 826      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 827      *
 828      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 829      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 831      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 832      */
 833 }
 834
 835 /*
 836  * subtract_u64 sets *result = *result - v and *carry to one if the
 837  * subtraction underflowed.
 838  */
 839 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 840 {
 841     uint128_t r = *result;
 842     r -= v;
 843     *carry = (r >> 64) & 1;
 844     *result = (u64)r;
 845 }
 846
/*
 * felem_contract converts |in| to its unique, minimal representation. On
 * entry: in[i] < 2^109
 *
 * |in| is first shrunk into |out|; kPrime is then subtracted from |out| under
 * an all-ones/all-zeros mask that is set when out >= kPrime. Neither the
 * comparison nor the subtraction branches on the value of |out|.
 */
static void felem_contract(smallfelem out, const felem in)
{
    unsigned i;
    u64 all_equal_so_far = 0, result = 0, carry;

    felem_shrink(out, in);
    /* out is minimal except that the value might be > p */

    /* 0 - 1 wraps to all ones: no word has compared unequal yet. */
    all_equal_so_far--;
    /*
     * We are doing a constant time test if out >= kPrime. We need to compare
     * each u64, from most-significant to least significant. For each one, if
     * all words so far have been equal (all_equal_so_far is all ones) then a
     * non-equal result is the answer. Otherwise we continue.
     *
     * |i| is unsigned, so decrementing it past zero wraps to a large value
     * and the i < 4 test ends the loop after i = 3, 2, 1, 0.
     */
    for (i = 3; i < 4; i--) {
        u64 equal;
        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
        /*
         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
         * will all be set.
         */
        result |= all_equal_so_far & ((u64)(a >> 64));

        /*
         * if kPrime[i] == out[i] then |equal| will be all zeros and the
         * decrement will make it all ones.
         */
        equal = kPrime[i] ^ out[i];
        equal--;
        /* AND all 64 bits together into the top bit ... */
        equal &= equal << 32;
        equal &= equal << 16;
        equal &= equal << 8;
        equal &= equal << 4;
        equal &= equal << 2;
        equal &= equal << 1;
        /* ... and spread that bit over the whole word. */
        equal = 0 - (equal >> 63);

        all_equal_so_far &= equal;
    }

    /*
     * if all_equal_so_far is still all ones then the two values are equal
     * and so out >= kPrime is true.
     */
    result |= all_equal_so_far;

    /*
     * if out >= kPrime then we subtract kPrime. Each word of kPrime is
     * masked with |result| and subtracted in turn, with the borrow from every
     * subtraction pushed through all higher words of |out|.
     */
    subtract_u64(&out[0], &carry, result & kPrime[0]);
    subtract_u64(&out[1], &carry, carry);
    subtract_u64(&out[2], &carry, carry);
    subtract_u64(&out[3], &carry, carry);

    subtract_u64(&out[1], &carry, result & kPrime[1]);
    subtract_u64(&out[2], &carry, carry);
    subtract_u64(&out[3], &carry, carry);

    subtract_u64(&out[2], &carry, result & kPrime[2]);
    subtract_u64(&out[3], &carry, carry);

    subtract_u64(&out[3], &carry, result & kPrime[3]);
}
 913
 914 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 915 {
 916     longfelem longtmp;
 917     felem tmp;
 918
 919     smallfelem_square(longtmp, in);
 920     felem_reduce(tmp, longtmp);
 921     felem_contract(out, tmp);
 922 }
 923
 924 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 925                                     const smallfelem in2)
 926 {
 927     longfelem longtmp;
 928     felem tmp;
 929
 930     smallfelem_mul(longtmp, in1, in2);
 931     felem_reduce(tmp, longtmp);
 932     felem_contract(out, tmp);
 933 }
 934
 935 /*-
 936  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 937  * otherwise.
 938  * On entry:
 939  *   small[i] < 2^64
 940  */
 941 static limb smallfelem_is_zero(const smallfelem small)
 942 {
 943     limb result;
 944     u64 is_p;
 945
 946     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 947     is_zero--;
 948     is_zero &= is_zero << 32;
 949     is_zero &= is_zero << 16;
 950     is_zero &= is_zero << 8;
 951     is_zero &= is_zero << 4;
 952     is_zero &= is_zero << 2;
 953     is_zero &= is_zero << 1;
 954     is_zero = 0 - (is_zero >> 63);
 955
 956     is_p = (small[0] ^ kPrime[0]) |
 957         (small[1] ^ kPrime[1]) |
 958         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 959     is_p--;
 960     is_p &= is_p << 32;
 961     is_p &= is_p << 16;
 962     is_p &= is_p << 8;
 963     is_p &= is_p << 4;
 964     is_p &= is_p << 2;
 965     is_p &= is_p << 1;
 966     is_p = 0 - (is_p >> 63);
 967
 968     is_zero |= is_p;
 969
 970     result = is_zero;
 971     result |= ((limb) is_zero) << 64;
 972     return result;
 973 }
 974
 975 static int smallfelem_is_zero_int(const void *small)
 976 {
 977     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 978 }
 979
 980 /*-
 981  * felem_inv calculates |out| = |in|^{-1}
 982  *
 983  * Based on Fermat's Little Theorem:
 984  *   a^p = a (mod p)
 985  *   a^{p-1} = 1 (mod p)
 986  *   a^{p-2} = a^{-1} (mod p)
 987  */
 988 static void felem_inv(felem out, const felem in)
 989 {
 990     felem ftmp, ftmp2;
 991     /* each e_I will hold |in|^{2^I - 1} */
 992     felem e2, e4, e8, e16, e32, e64;
 993     longfelem tmp;
 994     unsigned i;
 995
 996     felem_square(tmp, in);
 997     felem_reduce(ftmp, tmp);    /* 2^1 */
 998     felem_mul(tmp, in, ftmp);
 999     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1000     felem_assign(e2, ftmp);
1001     felem_square(tmp, ftmp);
1002     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1003     felem_square(tmp, ftmp);
1004     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1005     felem_mul(tmp, ftmp, e2);
1006     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1007     felem_assign(e4, ftmp);
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1010     felem_square(tmp, ftmp);
1011     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1012     felem_square(tmp, ftmp);
1013     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1014     felem_square(tmp, ftmp);
1015     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1016     felem_mul(tmp, ftmp, e4);
1017     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1018     felem_assign(e8, ftmp);
1019     for (i = 0; i < 8; i++) {
1020         felem_square(tmp, ftmp);
1021         felem_reduce(ftmp, tmp);
1022     }                           /* 2^16 - 2^8 */
1023     felem_mul(tmp, ftmp, e8);
1024     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1025     felem_assign(e16, ftmp);
1026     for (i = 0; i < 16; i++) {
1027         felem_square(tmp, ftmp);
1028         felem_reduce(ftmp, tmp);
1029     }                           /* 2^32 - 2^16 */
1030     felem_mul(tmp, ftmp, e16);
1031     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1032     felem_assign(e32, ftmp);
1033     for (i = 0; i < 32; i++) {
1034         felem_square(tmp, ftmp);
1035         felem_reduce(ftmp, tmp);
1036     }                           /* 2^64 - 2^32 */
1037     felem_assign(e64, ftmp);
1038     felem_mul(tmp, ftmp, in);
1039     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1040     for (i = 0; i < 192; i++) {
1041         felem_square(tmp, ftmp);
1042         felem_reduce(ftmp, tmp);
1043     }                           /* 2^256 - 2^224 + 2^192 */
1044
1045     felem_mul(tmp, e64, e32);
1046     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1047     for (i = 0; i < 16; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^80 - 2^16 */
1051     felem_mul(tmp, ftmp2, e16);
1052     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1053     for (i = 0; i < 8; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^88 - 2^8 */
1057     felem_mul(tmp, ftmp2, e8);
1058     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1059     for (i = 0; i < 4; i++) {
1060         felem_square(tmp, ftmp2);
1061         felem_reduce(ftmp2, tmp);
1062     }                           /* 2^92 - 2^4 */
1063     felem_mul(tmp, ftmp2, e4);
1064     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1069     felem_mul(tmp, ftmp2, e2);
1070     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1071     felem_square(tmp, ftmp2);
1072     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1073     felem_square(tmp, ftmp2);
1074     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1075     felem_mul(tmp, ftmp2, in);
1076     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1077
1078     felem_mul(tmp, ftmp2, ftmp);
1079     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1080 }
1081
1082 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1083 {
1084     felem tmp;
1085
1086     smallfelem_expand(tmp, in);
1087     felem_inv(tmp, tmp);
1088     felem_contract(out, tmp);
1089 }
1090
1091 /*-
1092  * Group operations
1093  * ----------------
1094  *
1095  * Building on top of the field operations we have the operations on the
1096  * elliptic curve group itself. Points on the curve are represented in Jacobian
1097  * coordinates
1098  */
1099
/*-
 * point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * x_out, y_out, z_out receive the doubled point; x_in, y_in, z_in are the
 * coordinates of the point to double.
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). Note
 * that y_in and z_in are still read after x_out has been written.
 *
 * The "name[i] < 2^k" comments record the upper bound assumed for every limb
 * of the value that was just computed.
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    longfelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
    smallfelem small1, small2;

    /*
     * Two working copies of x: further down ftmp is turned into x - delta
     * and ftmp2 into x + delta.
     */
    felem_assign(ftmp, x_in);
    /* ftmp[i] < 2^106 */
    felem_assign(ftmp2, x_in);
    /* ftmp2[i] < 2^106 */

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);
    /* delta[i] < 2^101 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);
    /* gamma[i] < 2^101 */
    /* small1 = gamma as a smallfelem; reused below for gamma^2 */
    felem_shrink(small1, gamma);

    /* beta = x*gamma */
    felem_small_mul(tmp, small1, x_in);
    felem_reduce(beta, tmp);
    /* beta[i] < 2^101 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff(ftmp, delta);
    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
    felem_sum(ftmp2, delta);
    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
    felem_scalar(ftmp2, 3);
    /* ftmp2[i] < 3 * 2^107 < 2^109 */
    felem_mul(tmp, ftmp, ftmp2);
    felem_reduce(alpha, tmp);
    /* alpha[i] < 2^101 */
    /* small2 = alpha as a smallfelem; reused below for x' and y' */
    felem_shrink(small2, alpha);

    /* x' = alpha^2 - 8*beta */
    smallfelem_square(tmp, small2);
    felem_reduce(x_out, tmp);
    felem_assign(ftmp, beta);
    felem_scalar(ftmp, 8);
    /* ftmp[i] < 8 * 2^101 = 2^104 */
    felem_diff(x_out, ftmp);
    /* x_out[i] < 2^105 + 2^101 < 2^106 */

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is overwritten with gamma + delta so one subtraction suffices */
    felem_sum(delta, gamma);
    /* delta[i] < 2^101 + 2^101 = 2^102 */
    felem_assign(ftmp, y_in);
    felem_sum(ftmp, z_in);
    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
    felem_square(tmp, ftmp);
    felem_reduce(z_out, tmp);
    felem_diff(z_out, delta);
    /* z_out[i] < 2^105 + 2^101 < 2^106 */

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar(beta, 4);
    /* beta[i] < 4 * 2^101 = 2^103 */
    felem_diff_zero107(beta, x_out);
    /* beta[i] < 2^107 + 2^103 < 2^108 */
    /* tmp = alpha * (4*beta - x'), unreduced */
    felem_small_mul(tmp, small2, beta);
    /* tmp[i] < 7 * 2^64 < 2^67 */
    /* tmp2 = gamma^2, unreduced */
    smallfelem_square(tmp2, small1);
    /* tmp2[i] < 7 * 2^64 */
    longfelem_scalar(tmp2, 8);
    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
    longfelem_diff(tmp, tmp2);
    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
    felem_reduce_zero105(y_out, tmp);
    /* y_out[i] < 2^106 */
}
1186
1187 /*
1188  * point_double_small is the same as point_double, except that it operates on
1189  * smallfelems
1190  */
1191 static void
1192 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1193                    const smallfelem x_in, const smallfelem y_in,
1194                    const smallfelem z_in)
1195 {
1196     felem felem_x_out, felem_y_out, felem_z_out;
1197     felem felem_x_in, felem_y_in, felem_z_in;
1198
1199     smallfelem_expand(felem_x_in, x_in);
1200     smallfelem_expand(felem_y_in, y_in);
1201     smallfelem_expand(felem_z_in, z_in);
1202     point_double(felem_x_out, felem_y_out, felem_z_out,
1203                  felem_x_in, felem_y_in, felem_z_in);
1204     felem_shrink(x_out, felem_x_out);
1205     felem_shrink(y_out, felem_y_out);
1206     felem_shrink(z_out, felem_z_out);
1207 }
1208
1209 /* copy_conditional copies in to out iff mask is all ones. */
1210 static void copy_conditional(felem out, const felem in, limb mask)
1211 {
1212     unsigned i;
1213     for (i = 0; i < NLIMBS; ++i) {
1214         const limb tmp = mask & (in[i] ^ out[i]);
1215         out[i] ^= tmp;
1216     }
1217 }
1218
1219 /* copy_small_conditional copies in to out iff mask is all ones. */
1220 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1221 {
1222     unsigned i;
1223     const u64 mask64 = mask;
1224     for (i = 0; i < NLIMBS; ++i) {
1225         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1226     }
1227 }
1228
/*-
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * x3, y3, z3 receive the sum. The first point is given as felems, the second
 * as smallfelems. If |mixed| is non-zero, z2 is assumed to be 1 in the
 * formulae and the z2-dependent multiplications are skipped.
 *
 * If z1 is zero the result is (x2, y2, z2); if z2 is zero the result is
 * (x1, y1, z1). Both cases are resolved by masked copies at the end.
 *
 * This function includes a branch for checking whether the two input points
 * are equal, (while not equal to the point at infinity). This case never
 * happens during single point multiplication, so there is no timing leak for
 * ECDH or ECDSA signing.
 */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const smallfelem x2,
                      const smallfelem y2, const smallfelem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
    longfelem tmp, tmp2;
    smallfelem small1, small2, small3, small4, small5;
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
    limb points_equal;

    /* small3 = z1 as a smallfelem */
    felem_shrink(small3, z1);

    /* these are used as masks for the conditional copies at the end */
    z1_is_zero = smallfelem_is_zero(small3);
    z2_is_zero = smallfelem_is_zero(z2);

    /* ftmp = z1z1 = z1**2 */
    smallfelem_square(tmp, small3);
    felem_reduce(ftmp, tmp);
    /* ftmp[i] < 2^101 */
    /* small1 = z1z1 as a smallfelem */
    felem_shrink(small1, ftmp);

    if (!mixed) {
        /* ftmp2 = z2z2 = z2**2 */
        smallfelem_square(tmp, z2);
        felem_reduce(ftmp2, tmp);
        /* ftmp2[i] < 2^101 */
        /* small2 = z2z2 as a smallfelem */
        felem_shrink(small2, ftmp2);

        /* small5 = x1 as a smallfelem */
        felem_shrink(small5, x1);

        /* u1 = ftmp3 = x1*z2z2 */
        smallfelem_mul(tmp, small5, small2);
        felem_reduce(ftmp3, tmp);
        /* ftmp3[i] < 2^101 */

        /* ftmp5 = z1 + z2 */
        felem_assign(ftmp5, z1);
        felem_small_sum(ftmp5, z2);
        /* ftmp5[i] < 2^107 */

        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
        felem_square(tmp, ftmp5);
        felem_reduce(ftmp5, tmp);
        /* ftmp2 = z2z2 + z1z1 */
        felem_sum(ftmp2, ftmp);
        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
        felem_diff(ftmp5, ftmp2);
        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */

        /* ftmp2 = z2 * z2z2 */
        smallfelem_mul(tmp, small2, z2);
        felem_reduce(ftmp2, tmp);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_mul(tmp, y1, ftmp2);
        felem_reduce(ftmp6, tmp);
        /* ftmp6[i] < 2^101 */
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
         */

        /* u1 = ftmp3 = x1*z2z2 = x1 */
        felem_assign(ftmp3, x1);
        /* ftmp3[i] < 2^106 */

        /* ftmp5 = 2z1z2 = 2z1 */
        felem_assign(ftmp5, z1);
        felem_scalar(ftmp5, 2);
        /* ftmp5[i] < 2*2^106 = 2^107 */

        /* s1 = ftmp6 = y1 * z2**3 = y1 */
        felem_assign(ftmp6, y1);
        /* ftmp6[i] < 2^106 */
    }

    /* u2 = ftmp4 = x2*z1z1 */
    smallfelem_mul(tmp, x2, small1);
    felem_reduce(ftmp4, tmp);

    /* h = ftmp4 = u2 - u1 */
    felem_diff_zero107(ftmp4, ftmp3);
    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
    /* small4 = h as a smallfelem */
    felem_shrink(small4, ftmp4);

    /* h == 0 means both points have the same affine x coordinate */
    x_equal = smallfelem_is_zero(small4);

    /* z_out = ftmp5 * h */
    felem_small_mul(tmp, small4, ftmp5);
    felem_reduce(z_out, tmp);
    /* z_out[i] < 2^101 */

    /* ftmp = z1 * z1z1 */
    smallfelem_mul(tmp, small1, small3);
    felem_reduce(ftmp, tmp);

    /* s2 = ftmp5 = y2 * z1**3 */
    felem_small_mul(tmp, y2, ftmp);
    felem_reduce(ftmp5, tmp);

    /* r = ftmp5 = (s2 - s1)*2 */
    felem_diff_zero107(ftmp5, ftmp6);
    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
    felem_scalar(ftmp5, 2);
    /* ftmp5[i] < 2^109 */
    /* small1 is reused: from here on it holds r as a smallfelem */
    felem_shrink(small1, ftmp5);
    /* r == 0 means both points have the same affine y coordinate */
    y_equal = smallfelem_is_zero(small1);

    /*
     * The formulae are incorrect if the points are equal, in affine coordinates
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
     * happens.
     *
     * We use bitwise operations to avoid potential side-channels introduced by
     * the short-circuiting behaviour of boolean operators.
     *
     * The special case of either point being the point at infinity (z1 and/or
     * z2 are zero), is handled separately later on in this function, so we
     * avoid jumping to point_double here in those special cases.
     */
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));

    if (points_equal) {
        /*
         * This is obviously not constant-time but, as mentioned before, this
         * case never happens during single point multiplication, so there is no
         * timing leak for ECDH or ECDSA signing.
         */
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* I = ftmp = (2h)**2 */
    felem_assign(ftmp, ftmp4);
    felem_scalar(ftmp, 2);
    /* ftmp[i] < 2*2^108 = 2^109 */
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);

    /* J = ftmp2 = h * I */
    felem_mul(tmp, ftmp4, ftmp);
    felem_reduce(ftmp2, tmp);

    /* V = ftmp4 = U1 * I */
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);

    /* x_out = r**2 - J - 2V */
    smallfelem_square(tmp, small1);
    felem_reduce(x_out, tmp);
    /* keep a copy of V in ftmp3 before ftmp4 is turned into 2V + J */
    felem_assign(ftmp3, ftmp4);
    felem_scalar(ftmp4, 2);
    felem_sum(ftmp4, ftmp2);
    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
    felem_diff(x_out, ftmp4);
    /* x_out[i] < 2^105 + 2^101 */

    /* y_out = r(V-x_out) - 2 * s1 * J */
    felem_diff_zero107(ftmp3, x_out);
    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
    /* tmp = r * (V - x_out), unreduced */
    felem_small_mul(tmp, small1, ftmp3);
    /* tmp2 = s1 * J, unreduced */
    felem_mul(tmp2, ftmp6, ftmp2);
    longfelem_scalar(tmp2, 2);
    /* tmp2[i] < 2*2^67 = 2^68 */
    longfelem_diff(tmp, tmp2);
    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
    felem_reduce_zero105(y_out, tmp);
    /* y_out[i] < 2^106 */

    /*
     * Handle the point at infinity without branching: if z1 is zero the
     * result is replaced by point 2, and after that, if z2 is zero, it is
     * replaced by point 1.
     */
    copy_small_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_small_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_small_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    /* the outputs are written only now, so they may alias the inputs */
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1421
1422 /*
1423  * point_add_small is the same as point_add, except that it operates on
1424  * smallfelems
1425  */
1426 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1427                             smallfelem x1, smallfelem y1, smallfelem z1,
1428                             smallfelem x2, smallfelem y2, smallfelem z2)
1429 {
1430     felem felem_x3, felem_y3, felem_z3;
1431     felem felem_x1, felem_y1, felem_z1;
1432     smallfelem_expand(felem_x1, x1);
1433     smallfelem_expand(felem_y1, y1);
1434     smallfelem_expand(felem_z1, z1);
1435     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1436               x2, y2, z2);
1437     felem_shrink(x3, felem_x3);
1438     felem_shrink(y3, felem_y3);
1439     felem_shrink(z3, felem_z3);
1440 }
1441
1442 /*-
1443  * Base point pre computation
1444  * --------------------------
1445  *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
1449  *
1450  * For the base point table, z is usually 1 (0 for the point at infinity).
1451  * This table has 2 * 16 elements, starting with the following:
1452  * index | bits    | point
1453  * ------+---------+------------------------------
1454  *     0 | 0 0 0 0 | 0G
1455  *     1 | 0 0 0 1 | 1G
1456  *     2 | 0 0 1 0 | 2^64G
1457  *     3 | 0 0 1 1 | (2^64 + 1)G
1458  *     4 | 0 1 0 0 | 2^128G
1459  *     5 | 0 1 0 1 | (2^128 + 1)G
1460  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1461  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1462  *     8 | 1 0 0 0 | 2^192G
1463  *     9 | 1 0 0 1 | (2^192 + 1)G
1464  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1465  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1466  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1467  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1468  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1469  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1470  * followed by a copy of this with each element multiplied by 2^32.
1471  *
1472  * The reason for this is so that we can clock bits into four different
1473  * locations when doing simple scalar multiplies against the base point,
1474  * and then another four locations using the second 16 elements.
1475  *
 * Tables for other points have table[i] = iP for i in 0 .. 16. */
1477
/*
 * gmul is the table of precomputed base points.
 *
 * gmul[0] and gmul[1] each hold 16 points; every point is three smallfelems
 * in the order x, y, z. Limbs are listed least significant first, so
 * {1, 0, 0, 0} is the field element 1. Entry 0 of each half is all zeros
 * (z = 0, the point at infinity); every other entry has z = 1.
 */
static const smallfelem gmul[2][16][3] = {
    {{{0, 0, 0, 0},
      {0, 0, 0, 0},
      {0, 0, 0, 0}},
     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
       0x6b17d1f2e12c4247},
      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
       0x4fe342e2fe1a7f9b},
      {1, 0, 0, 0}},
     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
       0x0fa822bc2811aaa5},
      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
       0xbff44ae8f5dba80d},
      {1, 0, 0, 0}},
     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
       0x300a4bbc89d6726f},
      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
       0x72aac7e0d09b4644},
      {1, 0, 0, 0}},
     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
       0x447d739beedb5e67},
      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
       0x2d4825ab834131ee},
      {1, 0, 0, 0}},
     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
       0xef9519328a9c72ff},
      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
       0x611e9fc37dbb2c9b},
      {1, 0, 0, 0}},
     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
       0x550663797b51f5d8},
      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
       0x157164848aecb851},
      {1, 0, 0, 0}},
     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
       0xeb5d7745b21141ea},
      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
       0xeafd72ebdbecc17b},
      {1, 0, 0, 0}},
     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
       0xa6d39677a7849276},
      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
       0x674f84749b0b8816},
      {1, 0, 0, 0}},
     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
       0x4e769e7672c9ddad},
      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
       0x42b99082de830663},
      {1, 0, 0, 0}},
     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
       0x78878ef61c6ce04d},
      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
       0xb6cb3f5d7b72c321},
      {1, 0, 0, 0}},
     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
       0x0c88bc4d716b1287},
      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
       0xdd5ddea3f3901dc6},
      {1, 0, 0, 0}},
     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
       0x68f344af6b317466},
      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
       0x31b9c405f8540a20},
      {1, 0, 0, 0}},
     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
       0x4052bf4b6f461db9},
      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
       0xfecf4d5190b0fc61},
      {1, 0, 0, 0}},
     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
       0x1eddbae2c802e41a},
      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
       0x43104d86560ebcfc},
      {1, 0, 0, 0}},
     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
       0xb48e26b484f7a21c},
      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
       0xfac015404d4d3dab},
      {1, 0, 0, 0}}},
    {{{0, 0, 0, 0},
      {0, 0, 0, 0},
      {0, 0, 0, 0}},
     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
       0x7fe36b40af22af89},
      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
       0xe697d45825b63624},
      {1, 0, 0, 0}},
     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
       0x4a5b506612a677a6},
      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
       0xeb13461ceac089f1},
      {1, 0, 0, 0}},
     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
       0x0781b8291c6a220a},
      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
       0x690cde8df0151593},
      {1, 0, 0, 0}},
     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
       0x8a535f566ec73617},
      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
       0x0455c08468b08bd7},
      {1, 0, 0, 0}},
     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
       0x06bada7ab77f8276},
      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
       0x5b476dfd0e6cb18a},
      {1, 0, 0, 0}},
     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
       0x3e29864e8a2ec908},
      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
       0x239b90ea3dc31e7e},
      {1, 0, 0, 0}},
     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
       0x820f4dd949f72ff7},
      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
       0x140406ec783a05ec},
      {1, 0, 0, 0}},
     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
       0x68f6b8542783dfee},
      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
       0xcbe1feba92e40ce6},
      {1, 0, 0, 0}},
     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
       0xd0b2f94d2f420109},
      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
       0x971459828b0719e5},
      {1, 0, 0, 0}},
     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
       0x961610004a866aba},
      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
       0x7acb9fadcee75e44},
      {1, 0, 0, 0}},
     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
       0x24eb9acca333bf5b},
      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
       0x69f891c5acd079cc},
      {1, 0, 0, 0}},
     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
       0xe51f547c5972a107},
      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
       0x1c309a2b25bb1387},
      {1, 0, 0, 0}},
     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
       0x20b87b8aa2c4e503},
      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
       0xf5c6fa49919776be},
      {1, 0, 0, 0}},
     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
       0x1ed7d1b9332010b9},
      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
       0x3a2b03f03217257a},
      {1, 0, 0, 0}},
     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
       0x15fee545c78dd9f6},
      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
       0x4ab5b6b2b8753f81},
      {1, 0, 0, 0}}}
};
1637
1638 /*
1639  * select_point selects the |idx|th point from a precomputation table and
1640  * copies it to out.
1641  */
1642 static void select_point(const u64 idx, unsigned int size,
1643                          const smallfelem pre_comp[16][3], smallfelem out[3])
1644 {
1645     unsigned i, j;
1646     u64 *outlimbs = &out[0][0];
1647
1648     memset(out, 0, sizeof(*out) * 3);
1649
1650     for (i = 0; i < size; i++) {
1651         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1652         u64 mask = i ^ idx;
1653         mask |= mask >> 4;
1654         mask |= mask >> 2;
1655         mask |= mask >> 1;
1656         mask &= 1;
1657         mask--;
1658         for (j = 0; j < NLIMBS * 3; j++)
1659             outlimbs[j] |= inlimbs[j] & mask;
1660     }
1661 }
1662
1663 /* get_bit returns the |i|th bit in |in| */
1664 static char get_bit(const felem_bytearray in, int i)
1665 {
1666     if ((i < 0) || (i >= 256))
1667         return 0;
1668     return (in[i >> 3] >> (i & 7)) & 1;
1669 }
1670
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 *
 * scalars[] and pre_comp[] both have num_points entries. |mixed| is passed
 * through to point_add for the additions of the non-generator points; the
 * generator additions always use mixed addition.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const smallfelem pre_comp[][17][3],
                      const smallfelem g_pre_comp[2][16][3])
{
    int i, skip;
    unsigned num, gen_mul = (g_scalar != NULL);
    felem nq[3], ftmp;
    smallfelem tmp[3];
    u64 bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (two in each of the last 32 rounds) and additions of
     * other points multiples (every 5th round).
     *
     * With no other points, only the last 32 rounds are needed.
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 255 : 31); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 31)) {
            /* first, look 32 bits upwards */
            bits = get_bit(g_scalar, i + 224) << 3;
            bits |= get_bit(g_scalar, i + 160) << 2;
            bits |= get_bit(g_scalar, i + 96) << 1;
            bits |= get_bit(g_scalar, i + 32);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[1], tmp);

            if (!skip) {
                /* Arg 1 below is for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* nq is still untouched: copy the point instead of adding */
                smallfelem_expand(nq[0], tmp[0]);
                smallfelem_expand(nq[1], tmp[1]);
                smallfelem_expand(nq[2], tmp[2]);
                skip = 0;
            }

            /* second, look at the current position */
            bits = get_bit(g_scalar, i + 192) << 3;
            bits |= get_bit(g_scalar, i + 128) << 2;
            bits |= get_bit(g_scalar, i + 64) << 1;
            bits |= get_bit(g_scalar, i);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[0], tmp);
            /* Arg 1 below is for "mixed" */
            point_add(nq[0], nq[1], nq[2],
                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Collect a 6-bit window, bits i+4 .. i-1 of the scalar;
                 * get_bit yields 0 for positions outside 0..255.
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
                                               * point */
                /*
                 * The mask is all ones when sign is 0, which restores the
                 * original Y; otherwise the negated Y is kept (sign is
                 * presumably 0 or 1 - confirm against the recoding helper).
                 */
                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
                felem_contract(tmp[1], ftmp);

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* nq is still untouched: copy the point instead */
                    smallfelem_expand(nq[0], tmp[0]);
                    smallfelem_expand(nq[1], tmp[1]);
                    smallfelem_expand(nq[2], tmp[2]);
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1777
/*
 * Precomputation for the group generator. Objects of this type are
 * reference counted.
 */
struct nistp256_pre_comp_st {
    /* generator table, same dimensions as the static gmul table */
    smallfelem g_pre_comp[2][16][3];
    /* reference count; set to 1 when the object is created */
    CRYPTO_REF_COUNT references;
    /* lock handed to CRYPTO_UP_REF / CRYPTO_DOWN_REF alongside |references| */
    CRYPTO_RWLOCK *lock;
};
1784
/*
 * EC_GFp_nistp256_method returns the EC_METHOD for this implementation.
 *
 * The method table is a function-local static const object, so every call
 * returns the same address. The initializer is positional: entries must stay
 * in the order of the EC_METHOD members, and a 0 leaves that slot unset.
 */
const EC_METHOD *EC_GFp_nistp256_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp256_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp256_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp256_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp256_points_mul,
        ec_GFp_nistp256_precompute_mult,
        ec_GFp_nistp256_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        ec_GFp_simple_field_inv,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0, /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ecdh_simple_compute_key,
        ecdsa_simple_sign_setup,
        ecdsa_simple_sign_sig,
        ecdsa_simple_verify_sig,
        0, /* field_inverse_mod_ord */
        0, /* blind_coordinates */
        0, /* ladder_pre */
        0, /* ladder_step */
        0  /* ladder_post */
    };

    return &ret;
}
1850
1851 /******************************************************************************/
1852 /*
1853  * FUNCTIONS TO MANAGE PRECOMPUTATION
1854  */
1855
1856 static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1857 {
1858     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1859
1860     if (ret == NULL) {
1861         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1862         return ret;
1863     }
1864
1865     ret->references = 1;
1866
1867     ret->lock = CRYPTO_THREAD_lock_new();
1868     if (ret->lock == NULL) {
1869         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1870         OPENSSL_free(ret);
1871         return NULL;
1872     }
1873     return ret;
1874 }
1875
1876 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1877 {
1878     int i;
1879     if (p != NULL)
1880         CRYPTO_UP_REF(&p->references, &i, p->lock);
1881     return p;
1882 }
1883
1884 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1885 {
1886     int i;
1887
1888     if (pre == NULL)
1889         return;
1890
1891     CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
1892     REF_PRINT_COUNT("EC_nistp256", x);
1893     if (i > 0)
1894         return;
1895     REF_ASSERT_ISNT(i < 0);
1896
1897     CRYPTO_THREAD_lock_free(pre->lock);
1898     OPENSSL_free(pre);
1899 }
1900
1901 /******************************************************************************/
1902 /*
1903  * OPENSSL EC_METHOD FUNCTIONS
1904  */
1905
1906 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1907 {
1908     int ret;
1909     ret = ec_GFp_simple_group_init(group);
1910     group->a_is_minus3 = 1;
1911     return ret;
1912 }
1913
1914 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1915                                     const BIGNUM *a, const BIGNUM *b,
1916                                     BN_CTX *ctx)
1917 {
1918     int ret = 0;
1919     BIGNUM *curve_p, *curve_a, *curve_b;
1920 #ifndef FIPS_MODE
1921     BN_CTX *new_ctx = NULL;
1922
1923     if (ctx == NULL)
1924         ctx = new_ctx = BN_CTX_new();
1925 #endif
1926     if (ctx == NULL)
1927         return 0;
1928
1929     BN_CTX_start(ctx);
1930     curve_p = BN_CTX_get(ctx);
1931     curve_a = BN_CTX_get(ctx);
1932     curve_b = BN_CTX_get(ctx);
1933     if (curve_b == NULL)
1934         goto err;
1935     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1936     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1937     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1938     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1939         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1940               EC_R_WRONG_CURVE_PARAMETERS);
1941         goto err;
1942     }
1943     group->field_mod_func = BN_nist_mod_256;
1944     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1945  err:
1946     BN_CTX_end(ctx);
1947 #ifndef FIPS_MODE
1948     BN_CTX_free(new_ctx);
1949 #endif
1950     return ret;
1951 }
1952
1953 /*
1954  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1955  * (X/Z^2, Y/Z^3)
1956  */
1957 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1958                                                  const EC_POINT *point,
1959                                                  BIGNUM *x, BIGNUM *y,
1960                                                  BN_CTX *ctx)
1961 {
1962     felem z1, z2, x_in, y_in;
1963     smallfelem x_out, y_out;
1964     longfelem tmp;
1965
1966     if (EC_POINT_is_at_infinity(group, point)) {
1967         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1968               EC_R_POINT_AT_INFINITY);
1969         return 0;
1970     }
1971     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1972         (!BN_to_felem(z1, point->Z)))
1973         return 0;
1974     felem_inv(z2, z1);
1975     felem_square(tmp, z2);
1976     felem_reduce(z1, tmp);
1977     felem_mul(tmp, x_in, z1);
1978     felem_reduce(x_in, tmp);
1979     felem_contract(x_out, x_in);
1980     if (x != NULL) {
1981         if (!smallfelem_to_BN(x, x_out)) {
1982             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1983                   ERR_R_BN_LIB);
1984             return 0;
1985         }
1986     }
1987     felem_mul(tmp, z1, z2);
1988     felem_reduce(z1, tmp);
1989     felem_mul(tmp, y_in, z1);
1990     felem_reduce(y_in, tmp);
1991     felem_contract(y_out, y_in);
1992     if (y != NULL) {
1993         if (!smallfelem_to_BN(y, y_out)) {
1994             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1995                   ERR_R_BN_LIB);
1996             return 0;
1997         }
1998     }
1999     return 1;
2000 }
2001
2002 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
2003 static void make_points_affine(size_t num, smallfelem points[][3],
2004                                smallfelem tmp_smallfelems[])
2005 {
2006     /*
2007      * Runs in constant time, unless an input is the point at infinity (which
2008      * normally shouldn't happen).
2009      */
2010     ec_GFp_nistp_points_make_affine_internal(num,
2011                                              points,
2012                                              sizeof(smallfelem),
2013                                              tmp_smallfelems,
2014                                              (void (*)(void *))smallfelem_one,
2015                                              smallfelem_is_zero_int,
2016                                              (void (*)(void *, const void *))
2017                                              smallfelem_assign,
2018                                              (void (*)(void *, const void *))
2019                                              smallfelem_square_contract,
2020                                              (void (*)
2021                                               (void *, const void *,
2022                                                const void *))
2023                                              smallfelem_mul_contract,
2024                                              (void (*)(void *, const void *))
2025                                              smallfelem_inv_contract,
2026                                              /* nothing to contract */
2027                                              (void (*)(void *, const void *))
2028                                              smallfelem_assign);
2029 }
2030
2031 /*
2032  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2033  * values Result is stored in r (r can equal one of the inputs).
2034  */
2035 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2036                                const BIGNUM *scalar, size_t num,
2037                                const EC_POINT *points[],
2038                                const BIGNUM *scalars[], BN_CTX *ctx)
2039 {
2040     int ret = 0;
2041     int j;
2042     int mixed = 0;
2043     BIGNUM *x, *y, *z, *tmp_scalar;
2044     felem_bytearray g_secret;
2045     felem_bytearray *secrets = NULL;
2046     smallfelem (*pre_comp)[17][3] = NULL;
2047     smallfelem *tmp_smallfelems = NULL;
2048     unsigned i;
2049     int num_bytes;
2050     int have_pre_comp = 0;
2051     size_t num_points = num;
2052     smallfelem x_in, y_in, z_in;
2053     felem x_out, y_out, z_out;
2054     NISTP256_PRE_COMP *pre = NULL;
2055     const smallfelem(*g_pre_comp)[16][3] = NULL;
2056     EC_POINT *generator = NULL;
2057     const EC_POINT *p = NULL;
2058     const BIGNUM *p_scalar = NULL;
2059
2060     BN_CTX_start(ctx);
2061     x = BN_CTX_get(ctx);
2062     y = BN_CTX_get(ctx);
2063     z = BN_CTX_get(ctx);
2064     tmp_scalar = BN_CTX_get(ctx);
2065     if (tmp_scalar == NULL)
2066         goto err;
2067
2068     if (scalar != NULL) {
2069         pre = group->pre_comp.nistp256;
2070         if (pre)
2071             /* we have precomputation, try to use it */
2072             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2073         else
2074             /* try to use the standard precomputation */
2075             g_pre_comp = &gmul[0];
2076         generator = EC_POINT_new(group);
2077         if (generator == NULL)
2078             goto err;
2079         /* get the generator from precomputation */
2080         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2081             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2082             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2083             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2084             goto err;
2085         }
2086         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2087                                                       generator, x, y, z,
2088                                                       ctx))
2089             goto err;
2090         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2091             /* precomputation matches generator */
2092             have_pre_comp = 1;
2093         else
2094             /*
2095              * we don't have valid precomputation: treat the generator as a
2096              * random point
2097              */
2098             num_points++;
2099     }
2100     if (num_points > 0) {
2101         if (num_points >= 3) {
2102             /*
2103              * unless we precompute multiples for just one or two points,
2104              * converting those into affine form is time well spent
2105              */
2106             mixed = 1;
2107         }
2108         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2109         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2110         if (mixed)
2111             tmp_smallfelems =
2112               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2113         if ((secrets == NULL) || (pre_comp == NULL)
2114             || (mixed && (tmp_smallfelems == NULL))) {
2115             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2116             goto err;
2117         }
2118
2119         /*
2120          * we treat NULL scalars as 0, and NULL points as points at infinity,
2121          * i.e., they contribute nothing to the linear combination
2122          */
2123         memset(secrets, 0, sizeof(*secrets) * num_points);
2124         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2125         for (i = 0; i < num_points; ++i) {
2126             if (i == num) {
2127                 /*
2128                  * we didn't have a valid precomputation, so we pick the
2129                  * generator
2130                  */
2131                 p = EC_GROUP_get0_generator(group);
2132                 p_scalar = scalar;
2133             } else {
2134                 /* the i^th point */
2135                 p = points[i];
2136                 p_scalar = scalars[i];
2137             }
2138             if ((p_scalar != NULL) && (p != NULL)) {
2139                 /* reduce scalar to 0 <= scalar < 2^256 */
2140                 if ((BN_num_bits(p_scalar) > 256)
2141                     || (BN_is_negative(p_scalar))) {
2142                     /*
2143                      * this is an unusual input, and we don't guarantee
2144                      * constant-timeness
2145                      */
2146                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2147                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2148                         goto err;
2149                     }
2150                     num_bytes = BN_bn2lebinpad(tmp_scalar,
2151                                                secrets[i], sizeof(secrets[i]));
2152                 } else {
2153                     num_bytes = BN_bn2lebinpad(p_scalar,
2154                                                secrets[i], sizeof(secrets[i]));
2155                 }
2156                 if (num_bytes < 0) {
2157                     ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2158                     goto err;
2159                 }
2160                 /* precompute multiples */
2161                 if ((!BN_to_felem(x_out, p->X)) ||
2162                     (!BN_to_felem(y_out, p->Y)) ||
2163                     (!BN_to_felem(z_out, p->Z)))
2164                     goto err;
2165                 felem_shrink(pre_comp[i][1][0], x_out);
2166                 felem_shrink(pre_comp[i][1][1], y_out);
2167                 felem_shrink(pre_comp[i][1][2], z_out);
2168                 for (j = 2; j <= 16; ++j) {
2169                     if (j & 1) {
2170                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2171                                         pre_comp[i][j][2], pre_comp[i][1][0],
2172                                         pre_comp[i][1][1], pre_comp[i][1][2],
2173                                         pre_comp[i][j - 1][0],
2174                                         pre_comp[i][j - 1][1],
2175                                         pre_comp[i][j - 1][2]);
2176                     } else {
2177                         point_double_small(pre_comp[i][j][0],
2178                                            pre_comp[i][j][1],
2179                                            pre_comp[i][j][2],
2180                                            pre_comp[i][j / 2][0],
2181                                            pre_comp[i][j / 2][1],
2182                                            pre_comp[i][j / 2][2]);
2183                     }
2184                 }
2185             }
2186         }
2187         if (mixed)
2188             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2189     }
2190
2191     /* the scalar for the generator */
2192     if ((scalar != NULL) && (have_pre_comp)) {
2193         memset(g_secret, 0, sizeof(g_secret));
2194         /* reduce scalar to 0 <= scalar < 2^256 */
2195         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2196             /*
2197              * this is an unusual input, and we don't guarantee
2198              * constant-timeness
2199              */
2200             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2201                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2202                 goto err;
2203             }
2204             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2205         } else {
2206             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2207         }
2208         /* do the multiplication with generator precomputation */
2209         batch_mul(x_out, y_out, z_out,
2210                   (const felem_bytearray(*))secrets, num_points,
2211                   g_secret,
2212                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2213     } else {
2214         /* do the multiplication without generator precomputation */
2215         batch_mul(x_out, y_out, z_out,
2216                   (const felem_bytearray(*))secrets, num_points,
2217                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2218     }
2219     /* reduce the output to its unique minimal representation */
2220     felem_contract(x_in, x_out);
2221     felem_contract(y_in, y_out);
2222     felem_contract(z_in, z_out);
2223     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2224         (!smallfelem_to_BN(z, z_in))) {
2225         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2226         goto err;
2227     }
2228     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2229
2230  err:
2231     BN_CTX_end(ctx);
2232     EC_POINT_free(generator);
2233     OPENSSL_free(secrets);
2234     OPENSSL_free(pre_comp);
2235     OPENSSL_free(tmp_smallfelems);
2236     return ret;
2237 }
2238
2239 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2240 {
2241     int ret = 0;
2242     NISTP256_PRE_COMP *pre = NULL;
2243     int i, j;
2244     BIGNUM *x, *y;
2245     EC_POINT *generator = NULL;
2246     smallfelem tmp_smallfelems[32];
2247     felem x_tmp, y_tmp, z_tmp;
2248 #ifndef FIPS_MODE
2249     BN_CTX *new_ctx = NULL;
2250 #endif
2251
2252     /* throw away old precomputation */
2253     EC_pre_comp_free(group);
2254
2255 #ifndef FIPS_MODE
2256     if (ctx == NULL)
2257         ctx = new_ctx = BN_CTX_new();
2258 #endif
2259     if (ctx == NULL)
2260         return 0;
2261
2262     BN_CTX_start(ctx);
2263     x = BN_CTX_get(ctx);
2264     y = BN_CTX_get(ctx);
2265     if (y == NULL)
2266         goto err;
2267     /* get the generator */
2268     if (group->generator == NULL)
2269         goto err;
2270     generator = EC_POINT_new(group);
2271     if (generator == NULL)
2272         goto err;
2273     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2274     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2275     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2276         goto err;
2277     if ((pre = nistp256_pre_comp_new()) == NULL)
2278         goto err;
2279     /*
2280      * if the generator is the standard one, use built-in precomputation
2281      */
2282     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2283         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2284         goto done;
2285     }
2286     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2287         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2288         (!BN_to_felem(z_tmp, group->generator->Z)))
2289         goto err;
2290     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2291     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2292     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2293     /*
2294      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2295      * 2^160*G, 2^224*G for the second one
2296      */
2297     for (i = 1; i <= 8; i <<= 1) {
2298         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2299                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2300                            pre->g_pre_comp[0][i][1],
2301                            pre->g_pre_comp[0][i][2]);
2302         for (j = 0; j < 31; ++j) {
2303             point_double_small(pre->g_pre_comp[1][i][0],
2304                                pre->g_pre_comp[1][i][1],
2305                                pre->g_pre_comp[1][i][2],
2306                                pre->g_pre_comp[1][i][0],
2307                                pre->g_pre_comp[1][i][1],
2308                                pre->g_pre_comp[1][i][2]);
2309         }
2310         if (i == 8)
2311             break;
2312         point_double_small(pre->g_pre_comp[0][2 * i][0],
2313                            pre->g_pre_comp[0][2 * i][1],
2314                            pre->g_pre_comp[0][2 * i][2],
2315                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2316                            pre->g_pre_comp[1][i][2]);
2317         for (j = 0; j < 31; ++j) {
2318             point_double_small(pre->g_pre_comp[0][2 * i][0],
2319                                pre->g_pre_comp[0][2 * i][1],
2320                                pre->g_pre_comp[0][2 * i][2],
2321                                pre->g_pre_comp[0][2 * i][0],
2322                                pre->g_pre_comp[0][2 * i][1],
2323                                pre->g_pre_comp[0][2 * i][2]);
2324         }
2325     }
2326     for (i = 0; i < 2; i++) {
2327         /* g_pre_comp[i][0] is the point at infinity */
2328         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2329         /* the remaining multiples */
2330         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2331         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2332                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2333                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2334                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2335                         pre->g_pre_comp[i][2][2]);
2336         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2337         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2338                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2339                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2340                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2341                         pre->g_pre_comp[i][2][2]);
2342         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2343         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2344                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2345                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2346                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2347                         pre->g_pre_comp[i][4][2]);
2348         /*
2349          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2350          */
2351         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2352                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2353                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2354                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2355                         pre->g_pre_comp[i][2][2]);
2356         for (j = 1; j < 8; ++j) {
2357             /* odd multiples: add G resp. 2^32*G */
2358             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2359                             pre->g_pre_comp[i][2 * j + 1][1],
2360                             pre->g_pre_comp[i][2 * j + 1][2],
2361                             pre->g_pre_comp[i][2 * j][0],
2362                             pre->g_pre_comp[i][2 * j][1],
2363                             pre->g_pre_comp[i][2 * j][2],
2364                             pre->g_pre_comp[i][1][0],
2365                             pre->g_pre_comp[i][1][1],
2366                             pre->g_pre_comp[i][1][2]);
2367         }
2368     }
2369     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2370
2371  done:
2372     SETPRECOMP(group, nistp256, pre);
2373     pre = NULL;
2374     ret = 1;
2375
2376  err:
2377     BN_CTX_end(ctx);
2378     EC_POINT_free(generator);
2379 #ifndef FIPS_MODE
2380     BN_CTX_free(new_ctx);
2381 #endif
2382     EC_nistp256_pre_comp_free(pre);
2383     return ret;
2384 }
2385
2386 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2387 {
2388     return HAVEPRECOMP(group, nistp256);
2389 }
2390 #endif