crypto/ec/ecp_nistp256.c

   1 /*
   2  * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/opensslconf.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <stdint.h>
  40 # include <string.h>
  41 # include <openssl/err.h>
  42 # include "ec_lcl.h"
  43
  44 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  45   /* even with gcc, the typedef won't work for 32-bit platforms */
  46 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  47                                  * platforms */
  48 typedef __int128_t int128_t;
  49 # else
  50 #  error "Need GCC 3.1 or later to define type uint128_t"
  51 # endif
  52
  53 typedef uint8_t u8;
  54 typedef uint32_t u32;
  55 typedef uint64_t u64;
  56 typedef int64_t s64;
  57
  58 /*
  59  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
  60  * can serialise an element of this field into 32 bytes. We call this an
  61  * felem_bytearray.
  62  */
  63
  64 typedef u8 felem_bytearray[32];
  65
  66 /*
  67  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  68  * values are big-endian.
  69  */
  70 static const felem_bytearray nistp256_curve_params[5] = {
  71     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  72      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  73      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  75     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  76      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  77      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
  79     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
  80      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  81      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  82      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  83     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  84      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  85      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  86      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  87     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  88      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  89      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  90      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  91 };
  92
  93 /*-
  94  * The representation of field elements.
  95  * ------------------------------------
  96  *
  97  * We represent field elements with either four 128-bit values, eight 128-bit
  98  * values, or four 64-bit values. The field element represented is:
  99  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
 100  * or:
 101  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
 102  *
 103  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 104  * apart, but are 128-bits wide, the most significant bits of each limb overlap
 105  * with the least significant bits of the next.
 106  *
 107  * A field element with four limbs is an 'felem'. One with eight limbs is a
 108  * 'longfelem'
 109  *
 110  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 111  * values are used as intermediate values before multiplication.
 112  */
 113
 114 # define NLIMBS 4
 115
 116 typedef uint128_t limb;
 117 typedef limb felem[NLIMBS];
 118 typedef limb longfelem[NLIMBS * 2];
 119 typedef u64 smallfelem[NLIMBS];
 120
 121 /* This is the value of the prime as four 64-bit words, little-endian. */
 122 static const u64 kPrime[4] =
 123     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 124 static const u64 bottom63bits = 0x7ffffffffffffffful;
 125
 126 /*
 127  * bin32_to_felem takes a little-endian byte array and converts it into felem
 128  * form. This assumes that the CPU is little-endian.
 129  */
 130 static void bin32_to_felem(felem out, const u8 in[32])
 131 {
 132     out[0] = *((u64 *)&in[0]);
 133     out[1] = *((u64 *)&in[8]);
 134     out[2] = *((u64 *)&in[16]);
 135     out[3] = *((u64 *)&in[24]);
 136 }
 137
 138 /*
 139  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 140  * endian, 32 byte array. This assumes that the CPU is little-endian.
 141  */
 142 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 143 {
 144     *((u64 *)&out[0]) = in[0];
 145     *((u64 *)&out[8]) = in[1];
 146     *((u64 *)&out[16]) = in[2];
 147     *((u64 *)&out[24]) = in[3];
 148 }
 149
 150 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 151 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 152 {
 153     unsigned i;
 154     for (i = 0; i < len; ++i)
 155         out[i] = in[len - 1 - i];
 156 }
 157
 158 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 159 static int BN_to_felem(felem out, const BIGNUM *bn)
 160 {
 161     felem_bytearray b_in;
 162     felem_bytearray b_out;
 163     unsigned num_bytes;
 164
 165     /* BN_bn2bin eats leading zeroes */
 166     memset(b_out, 0, sizeof(b_out));
 167     num_bytes = BN_num_bytes(bn);
 168     if (num_bytes > sizeof b_out) {
 169         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 170         return 0;
 171     }
 172     if (BN_is_negative(bn)) {
 173         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 174         return 0;
 175     }
 176     num_bytes = BN_bn2bin(bn, b_in);
 177     flip_endian(b_out, b_in, num_bytes);
 178     bin32_to_felem(out, b_out);
 179     return 1;
 180 }
 181
 182 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 183 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 184 {
 185     felem_bytearray b_in, b_out;
 186     smallfelem_to_bin32(b_in, in);
 187     flip_endian(b_out, b_in, sizeof b_out);
 188     return BN_bin2bn(b_out, sizeof b_out, out);
 189 }
 190
 191 /*-
 192  * Field operations
 193  * ----------------
 194  */
 195
 196 static void smallfelem_one(smallfelem out)
 197 {
 198     out[0] = 1;
 199     out[1] = 0;
 200     out[2] = 0;
 201     out[3] = 0;
 202 }
 203
 204 static void smallfelem_assign(smallfelem out, const smallfelem in)
 205 {
 206     out[0] = in[0];
 207     out[1] = in[1];
 208     out[2] = in[2];
 209     out[3] = in[3];
 210 }
 211
 212 static void felem_assign(felem out, const felem in)
 213 {
 214     out[0] = in[0];
 215     out[1] = in[1];
 216     out[2] = in[2];
 217     out[3] = in[3];
 218 }
 219
 220 /* felem_sum sets out = out + in. */
 221 static void felem_sum(felem out, const felem in)
 222 {
 223     out[0] += in[0];
 224     out[1] += in[1];
 225     out[2] += in[2];
 226     out[3] += in[3];
 227 }
 228
 229 /* felem_small_sum sets out = out + in. */
 230 static void felem_small_sum(felem out, const smallfelem in)
 231 {
 232     out[0] += in[0];
 233     out[1] += in[1];
 234     out[2] += in[2];
 235     out[3] += in[3];
 236 }
 237
 238 /* felem_scalar sets out = out * scalar */
 239 static void felem_scalar(felem out, const u64 scalar)
 240 {
 241     out[0] *= scalar;
 242     out[1] *= scalar;
 243     out[2] *= scalar;
 244     out[3] *= scalar;
 245 }
 246
 247 /* longfelem_scalar sets out = out * scalar */
 248 static void longfelem_scalar(longfelem out, const u64 scalar)
 249 {
 250     out[0] *= scalar;
 251     out[1] *= scalar;
 252     out[2] *= scalar;
 253     out[3] *= scalar;
 254     out[4] *= scalar;
 255     out[5] *= scalar;
 256     out[6] *= scalar;
 257     out[7] *= scalar;
 258 }
 259
 260 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 261 # define two105 (((limb)1) << 105)
 262 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 263
 264 /* zero105 is 0 mod p */
 265 static const felem zero105 =
 266     { two105m41m9, two105, two105m41p9, two105m41p9 };
 267
 268 /*-
 269  * smallfelem_neg sets |out| to |-small|
 270  * On exit:
 271  *   out[i] < out[i] + 2^105
 272  */
 273 static void smallfelem_neg(felem out, const smallfelem small)
 274 {
 275     /* In order to prevent underflow, we subtract from 0 mod p. */
 276     out[0] = zero105[0] - small[0];
 277     out[1] = zero105[1] - small[1];
 278     out[2] = zero105[2] - small[2];
 279     out[3] = zero105[3] - small[3];
 280 }
 281
 282 /*-
 283  * felem_diff subtracts |in| from |out|
 284  * On entry:
 285  *   in[i] < 2^104
 286  * On exit:
 287  *   out[i] < out[i] + 2^105
 288  */
 289 static void felem_diff(felem out, const felem in)
 290 {
 291     /*
 292      * In order to prevent underflow, we add 0 mod p before subtracting.
 293      */
 294     out[0] += zero105[0];
 295     out[1] += zero105[1];
 296     out[2] += zero105[2];
 297     out[3] += zero105[3];
 298
 299     out[0] -= in[0];
 300     out[1] -= in[1];
 301     out[2] -= in[2];
 302     out[3] -= in[3];
 303 }
 304
 305 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 306 # define two107 (((limb)1) << 107)
 307 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 308
 309 /* zero107 is 0 mod p */
 310 static const felem zero107 =
 311     { two107m43m11, two107, two107m43p11, two107m43p11 };
 312
 313 /*-
 314  * An alternative felem_diff for larger inputs |in|
 315  * felem_diff_zero107 subtracts |in| from |out|
 316  * On entry:
 317  *   in[i] < 2^106
 318  * On exit:
 319  *   out[i] < out[i] + 2^107
 320  */
 321 static void felem_diff_zero107(felem out, const felem in)
 322 {
 323     /*
 324      * In order to prevent underflow, we add 0 mod p before subtracting.
 325      */
 326     out[0] += zero107[0];
 327     out[1] += zero107[1];
 328     out[2] += zero107[2];
 329     out[3] += zero107[3];
 330
 331     out[0] -= in[0];
 332     out[1] -= in[1];
 333     out[2] -= in[2];
 334     out[3] -= in[3];
 335 }
 336
 337 /*-
 338  * longfelem_diff subtracts |in| from |out|
 339  * On entry:
 340  *   in[i] < 7*2^67
 341  * On exit:
 342  *   out[i] < out[i] + 2^70 + 2^40
 343  */
 344 static void longfelem_diff(longfelem out, const longfelem in)
 345 {
 346     static const limb two70m8p6 =
 347         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 348     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 349     static const limb two70 = (((limb) 1) << 70);
 350     static const limb two70m40m38p6 =
 351         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 352         (((limb) 1) << 6);
 353     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 354
 355     /* add 0 mod p to avoid underflow */
 356     out[0] += two70m8p6;
 357     out[1] += two70p40;
 358     out[2] += two70;
 359     out[3] += two70m40m38p6;
 360     out[4] += two70m6;
 361     out[5] += two70m6;
 362     out[6] += two70m6;
 363     out[7] += two70m6;
 364
 365     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 366     out[0] -= in[0];
 367     out[1] -= in[1];
 368     out[2] -= in[2];
 369     out[3] -= in[3];
 370     out[4] -= in[4];
 371     out[5] -= in[5];
 372     out[6] -= in[6];
 373     out[7] -= in[7];
 374 }
 375
 376 # define two64m0 (((limb)1) << 64) - 1
 377 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 378 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 379 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 380
 381 /* zero110 is 0 mod p */
 382 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 383
 384 /*-
 385  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 386  * minimal as the value may be greater than p.
 387  *
 388  * On entry:
 389  *   in[i] < 2^109
 390  * On exit:
 391  *   out[i] < 2^64
 392  */
 393 static void felem_shrink(smallfelem out, const felem in)
 394 {
 395     felem tmp;
 396     u64 a, b, mask;
 397     s64 high, low;
 398     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 399
 400     /* Carry 2->3 */
 401     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 402     /* tmp[3] < 2^110 */
 403
 404     tmp[2] = zero110[2] + (u64)in[2];
 405     tmp[0] = zero110[0] + in[0];
 406     tmp[1] = zero110[1] + in[1];
 407     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 408
 409     /*
 410      * We perform two partial reductions where we eliminate the high-word of
 411      * tmp[3]. We don't update the other words till the end.
 412      */
 413     a = tmp[3] >> 64;           /* a < 2^46 */
 414     tmp[3] = (u64)tmp[3];
 415     tmp[3] -= a;
 416     tmp[3] += ((limb) a) << 32;
 417     /* tmp[3] < 2^79 */
 418
 419     b = a;
 420     a = tmp[3] >> 64;           /* a < 2^15 */
 421     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 422     tmp[3] = (u64)tmp[3];
 423     tmp[3] -= a;
 424     tmp[3] += ((limb) a) << 32;
 425     /* tmp[3] < 2^64 + 2^47 */
 426
 427     /*
 428      * This adjusts the other two words to complete the two partial
 429      * reductions.
 430      */
 431     tmp[0] += b;
 432     tmp[1] -= (((limb) b) << 32);
 433
 434     /*
 435      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 436      * conditionally subtract kPrime if tmp[3] is large enough.
 437      */
 438     high = tmp[3] >> 64;
 439     /* As tmp[3] < 2^65, high is either 1 or 0 */
 440     high <<= 63;
 441     high >>= 63;
 442     /*-
 443      * high is:
 444      *   all ones   if the high word of tmp[3] is 1
 445      *   all zeros  if the high word of tmp[3] if 0 */
 446     low = tmp[3];
 447     mask = low >> 63;
 448     /*-
 449      * mask is:
 450      *   all ones   if the MSB of low is 1
 451      *   all zeros  if the MSB of low if 0 */
 452     low &= bottom63bits;
 453     low -= kPrime3Test;
 454     /* if low was greater than kPrime3Test then the MSB is zero */
 455     low = ~low;
 456     low >>= 63;
 457     /*-
 458      * low is:
 459      *   all ones   if low was > kPrime3Test
 460      *   all zeros  if low was <= kPrime3Test */
 461     mask = (mask & low) | high;
 462     tmp[0] -= mask & kPrime[0];
 463     tmp[1] -= mask & kPrime[1];
 464     /* kPrime[2] is zero, so omitted */
 465     tmp[3] -= mask & kPrime[3];
 466     /* tmp[3] < 2**64 - 2**32 + 1 */
 467
 468     tmp[1] += ((u64)(tmp[0] >> 64));
 469     tmp[0] = (u64)tmp[0];
 470     tmp[2] += ((u64)(tmp[1] >> 64));
 471     tmp[1] = (u64)tmp[1];
 472     tmp[3] += ((u64)(tmp[2] >> 64));
 473     tmp[2] = (u64)tmp[2];
 474     /* tmp[i] < 2^64 */
 475
 476     out[0] = tmp[0];
 477     out[1] = tmp[1];
 478     out[2] = tmp[2];
 479     out[3] = tmp[3];
 480 }
 481
 482 /* smallfelem_expand converts a smallfelem to an felem */
 483 static void smallfelem_expand(felem out, const smallfelem in)
 484 {
 485     out[0] = in[0];
 486     out[1] = in[1];
 487     out[2] = in[2];
 488     out[3] = in[3];
 489 }
 490
 491 /*-
 492  * smallfelem_square sets |out| = |small|^2
 493  * On entry:
 494  *   small[i] < 2^64
 495  * On exit:
 496  *   out[i] < 7 * 2^64 < 2^67
 497  */
 498 static void smallfelem_square(longfelem out, const smallfelem small)
 499 {
 500     limb a;
 501     u64 high, low;
 502
 503     a = ((uint128_t) small[0]) * small[0];
 504     low = a;
 505     high = a >> 64;
 506     out[0] = low;
 507     out[1] = high;
 508
 509     a = ((uint128_t) small[0]) * small[1];
 510     low = a;
 511     high = a >> 64;
 512     out[1] += low;
 513     out[1] += low;
 514     out[2] = high;
 515
 516     a = ((uint128_t) small[0]) * small[2];
 517     low = a;
 518     high = a >> 64;
 519     out[2] += low;
 520     out[2] *= 2;
 521     out[3] = high;
 522
 523     a = ((uint128_t) small[0]) * small[3];
 524     low = a;
 525     high = a >> 64;
 526     out[3] += low;
 527     out[4] = high;
 528
 529     a = ((uint128_t) small[1]) * small[2];
 530     low = a;
 531     high = a >> 64;
 532     out[3] += low;
 533     out[3] *= 2;
 534     out[4] += high;
 535
 536     a = ((uint128_t) small[1]) * small[1];
 537     low = a;
 538     high = a >> 64;
 539     out[2] += low;
 540     out[3] += high;
 541
 542     a = ((uint128_t) small[1]) * small[3];
 543     low = a;
 544     high = a >> 64;
 545     out[4] += low;
 546     out[4] *= 2;
 547     out[5] = high;
 548
 549     a = ((uint128_t) small[2]) * small[3];
 550     low = a;
 551     high = a >> 64;
 552     out[5] += low;
 553     out[5] *= 2;
 554     out[6] = high;
 555     out[6] += high;
 556
 557     a = ((uint128_t) small[2]) * small[2];
 558     low = a;
 559     high = a >> 64;
 560     out[4] += low;
 561     out[5] += high;
 562
 563     a = ((uint128_t) small[3]) * small[3];
 564     low = a;
 565     high = a >> 64;
 566     out[6] += low;
 567     out[7] = high;
 568 }
 569
 570 /*-
 571  * felem_square sets |out| = |in|^2
 572  * On entry:
 573  *   in[i] < 2^109
 574  * On exit:
 575  *   out[i] < 7 * 2^64 < 2^67
 576  */
 577 static void felem_square(longfelem out, const felem in)
 578 {
 579     u64 small[4];
 580     felem_shrink(small, in);
 581     smallfelem_square(out, small);
 582 }
 583
 584 /*-
 585  * smallfelem_mul sets |out| = |small1| * |small2|
 586  * On entry:
 587  *   small1[i] < 2^64
 588  *   small2[i] < 2^64
 589  * On exit:
 590  *   out[i] < 7 * 2^64 < 2^67
 591  */
 592 static void smallfelem_mul(longfelem out, const smallfelem small1,
 593                            const smallfelem small2)
 594 {
 595     limb a;
 596     u64 high, low;
 597
 598     a = ((uint128_t) small1[0]) * small2[0];
 599     low = a;
 600     high = a >> 64;
 601     out[0] = low;
 602     out[1] = high;
 603
 604     a = ((uint128_t) small1[0]) * small2[1];
 605     low = a;
 606     high = a >> 64;
 607     out[1] += low;
 608     out[2] = high;
 609
 610     a = ((uint128_t) small1[1]) * small2[0];
 611     low = a;
 612     high = a >> 64;
 613     out[1] += low;
 614     out[2] += high;
 615
 616     a = ((uint128_t) small1[0]) * small2[2];
 617     low = a;
 618     high = a >> 64;
 619     out[2] += low;
 620     out[3] = high;
 621
 622     a = ((uint128_t) small1[1]) * small2[1];
 623     low = a;
 624     high = a >> 64;
 625     out[2] += low;
 626     out[3] += high;
 627
 628     a = ((uint128_t) small1[2]) * small2[0];
 629     low = a;
 630     high = a >> 64;
 631     out[2] += low;
 632     out[3] += high;
 633
 634     a = ((uint128_t) small1[0]) * small2[3];
 635     low = a;
 636     high = a >> 64;
 637     out[3] += low;
 638     out[4] = high;
 639
 640     a = ((uint128_t) small1[1]) * small2[2];
 641     low = a;
 642     high = a >> 64;
 643     out[3] += low;
 644     out[4] += high;
 645
 646     a = ((uint128_t) small1[2]) * small2[1];
 647     low = a;
 648     high = a >> 64;
 649     out[3] += low;
 650     out[4] += high;
 651
 652     a = ((uint128_t) small1[3]) * small2[0];
 653     low = a;
 654     high = a >> 64;
 655     out[3] += low;
 656     out[4] += high;
 657
 658     a = ((uint128_t) small1[1]) * small2[3];
 659     low = a;
 660     high = a >> 64;
 661     out[4] += low;
 662     out[5] = high;
 663
 664     a = ((uint128_t) small1[2]) * small2[2];
 665     low = a;
 666     high = a >> 64;
 667     out[4] += low;
 668     out[5] += high;
 669
 670     a = ((uint128_t) small1[3]) * small2[1];
 671     low = a;
 672     high = a >> 64;
 673     out[4] += low;
 674     out[5] += high;
 675
 676     a = ((uint128_t) small1[2]) * small2[3];
 677     low = a;
 678     high = a >> 64;
 679     out[5] += low;
 680     out[6] = high;
 681
 682     a = ((uint128_t) small1[3]) * small2[2];
 683     low = a;
 684     high = a >> 64;
 685     out[5] += low;
 686     out[6] += high;
 687
 688     a = ((uint128_t) small1[3]) * small2[3];
 689     low = a;
 690     high = a >> 64;
 691     out[6] += low;
 692     out[7] = high;
 693 }
 694
 695 /*-
 696  * felem_mul sets |out| = |in1| * |in2|
 697  * On entry:
 698  *   in1[i] < 2^109
 699  *   in2[i] < 2^109
 700  * On exit:
 701  *   out[i] < 7 * 2^64 < 2^67
 702  */
 703 static void felem_mul(longfelem out, const felem in1, const felem in2)
 704 {
 705     smallfelem small1, small2;
 706     felem_shrink(small1, in1);
 707     felem_shrink(small2, in2);
 708     smallfelem_mul(out, small1, small2);
 709 }
 710
 711 /*-
 712  * felem_small_mul sets |out| = |small1| * |in2|
 713  * On entry:
 714  *   small1[i] < 2^64
 715  *   in2[i] < 2^109
 716  * On exit:
 717  *   out[i] < 7 * 2^64 < 2^67
 718  */
 719 static void felem_small_mul(longfelem out, const smallfelem small1,
 720                             const felem in2)
 721 {
 722     smallfelem small2;
 723     felem_shrink(small2, in2);
 724     smallfelem_mul(out, small1, small2);
 725 }
 726
 727 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 728 # define two100 (((limb)1) << 100)
 729 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 730 /* zero100 is 0 mod p */
 731 static const felem zero100 =
 732     { two100m36m4, two100, two100m36p4, two100m36p4 };
 733
 734 /*-
 735  * Internal function for the different flavours of felem_reduce.
 736  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 737  * On entry:
 738  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 739  *   out[1] >= in[7] + 2^32*in[4]
 740  *   out[2] >= in[5] + 2^32*in[5]
 741  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 742  * On exit:
 743  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 744  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 745  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 746  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 747  */
 748 static void felem_reduce_(felem out, const longfelem in)
 749 {
 750     int128_t c;
 751     /* combine common terms from below */
 752     c = in[4] + (in[5] << 32);
 753     out[0] += c;
 754     out[3] -= c;
 755
 756     c = in[5] - in[7];
 757     out[1] += c;
 758     out[2] -= c;
 759
 760     /* the remaining terms */
 761     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 762     out[1] -= (in[4] << 32);
 763     out[3] += (in[4] << 32);
 764
 765     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 766     out[2] -= (in[5] << 32);
 767
 768     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 769     out[0] -= in[6];
 770     out[0] -= (in[6] << 32);
 771     out[1] += (in[6] << 33);
 772     out[2] += (in[6] * 2);
 773     out[3] -= (in[6] << 32);
 774
 775     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 776     out[0] -= in[7];
 777     out[0] -= (in[7] << 32);
 778     out[2] += (in[7] << 33);
 779     out[3] += (in[7] * 3);
 780 }
 781
 782 /*-
 783  * felem_reduce converts a longfelem into an felem.
 784  * To be called directly after felem_square or felem_mul.
 785  * On entry:
 786  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 787  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 788  * On exit:
 789  *   out[i] < 2^101
 790  */
 791 static void felem_reduce(felem out, const longfelem in)
 792 {
 793     out[0] = zero100[0] + in[0];
 794     out[1] = zero100[1] + in[1];
 795     out[2] = zero100[2] + in[2];
 796     out[3] = zero100[3] + in[3];
 797
 798     felem_reduce_(out, in);
 799
 800     /*-
 801      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 802      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 803      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 804      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 805      *
 806      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 807      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 808      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 809      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 810      */
 811 }
 812
 813 /*-
 814  * felem_reduce_zero105 converts a larger longfelem into an felem.
 815  * On entry:
 816  *   in[0] < 2^71
 817  * On exit:
 818  *   out[i] < 2^106
 819  */
 820 static void felem_reduce_zero105(felem out, const longfelem in)
 821 {
 822     out[0] = zero105[0] + in[0];
 823     out[1] = zero105[1] + in[1];
 824     out[2] = zero105[2] + in[2];
 825     out[3] = zero105[3] + in[3];
 826
 827     felem_reduce_(out, in);
 828
 829     /*-
 830      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 831      * out[1] > 2^105 - 2^71 - 2^103 > 0
 832      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 833      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 834      *
 835      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 836      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 837      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 838      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 839      */
 840 }
 841
 842 /*
 843  * subtract_u64 sets *result = *result - v and *carry to one if the
 844  * subtraction underflowed.
 845  */
 846 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 847 {
 848     uint128_t r = *result;
 849     r -= v;
 850     *carry = (r >> 64) & 1;
 851     *result = (u64)r;
 852 }
 853
 854 /*
 855  * felem_contract converts |in| to its unique, minimal representation. On
 856  * entry: in[i] < 2^109
 857  */
 858 static void felem_contract(smallfelem out, const felem in)
 859 {
 860     unsigned i;
 861     u64 all_equal_so_far = 0, result = 0, carry;
 862
 863     felem_shrink(out, in);
 864     /* small is minimal except that the value might be > p */
 865
 866     all_equal_so_far--;
 867     /*
 868      * We are doing a constant time test if out >= kPrime. We need to compare
 869      * each u64, from most-significant to least significant. For each one, if
 870      * all words so far have been equal (m is all ones) then a non-equal
 871      * result is the answer. Otherwise we continue.
 872      */
 873     for (i = 3; i < 4; i--) {
 874         u64 equal;
 875         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 876         /*
 877          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 878          * will all be set.
 879          */
 880         result |= all_equal_so_far & ((u64)(a >> 64));
 881
 882         /*
 883          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 884          * decrement will make it all ones.
 885          */
 886         equal = kPrime[i] ^ out[i];
 887         equal--;
 888         equal &= equal << 32;
 889         equal &= equal << 16;
 890         equal &= equal << 8;
 891         equal &= equal << 4;
 892         equal &= equal << 2;
 893         equal &= equal << 1;
 894         equal = ((s64) equal) >> 63;
 895
 896         all_equal_so_far &= equal;
 897     }
 898
 899     /*
 900      * if all_equal_so_far is still all ones then the two values are equal
 901      * and so out >= kPrime is true.
 902      */
 903     result |= all_equal_so_far;
 904
 905     /* if out >= kPrime then we subtract kPrime. */
 906     subtract_u64(&out[0], &carry, result & kPrime[0]);
 907     subtract_u64(&out[1], &carry, carry);
 908     subtract_u64(&out[2], &carry, carry);
 909     subtract_u64(&out[3], &carry, carry);
 910
 911     subtract_u64(&out[1], &carry, result & kPrime[1]);
 912     subtract_u64(&out[2], &carry, carry);
 913     subtract_u64(&out[3], &carry, carry);
 914
 915     subtract_u64(&out[2], &carry, result & kPrime[2]);
 916     subtract_u64(&out[3], &carry, carry);
 917
 918     subtract_u64(&out[3], &carry, result & kPrime[3]);
 919 }
 920
 921 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 922 {
 923     longfelem longtmp;
 924     felem tmp;
 925
 926     smallfelem_square(longtmp, in);
 927     felem_reduce(tmp, longtmp);
 928     felem_contract(out, tmp);
 929 }
 930
 931 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 932                                     const smallfelem in2)
 933 {
 934     longfelem longtmp;
 935     felem tmp;
 936
 937     smallfelem_mul(longtmp, in1, in2);
 938     felem_reduce(tmp, longtmp);
 939     felem_contract(out, tmp);
 940 }
 941
 942 /*-
 943  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 944  * otherwise.
 945  * On entry:
 946  *   small[i] < 2^64
 947  */
 948 static limb smallfelem_is_zero(const smallfelem small)
 949 {
 950     limb result;
 951     u64 is_p;
 952
 953     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 954     is_zero--;
 955     is_zero &= is_zero << 32;
 956     is_zero &= is_zero << 16;
 957     is_zero &= is_zero << 8;
 958     is_zero &= is_zero << 4;
 959     is_zero &= is_zero << 2;
 960     is_zero &= is_zero << 1;
 961     is_zero = ((s64) is_zero) >> 63;
 962
 963     is_p = (small[0] ^ kPrime[0]) |
 964         (small[1] ^ kPrime[1]) |
 965         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 966     is_p--;
 967     is_p &= is_p << 32;
 968     is_p &= is_p << 16;
 969     is_p &= is_p << 8;
 970     is_p &= is_p << 4;
 971     is_p &= is_p << 2;
 972     is_p &= is_p << 1;
 973     is_p = ((s64) is_p) >> 63;
 974
 975     is_zero |= is_p;
 976
 977     result = is_zero;
 978     result |= ((limb) is_zero) << 64;
 979     return result;
 980 }
 981
 982 static int smallfelem_is_zero_int(const smallfelem small)
 983 {
 984     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 985 }
 986
 987 /*-
 988  * felem_inv calculates |out| = |in|^{-1}
 989  *
 990  * Based on Fermat's Little Theorem:
 991  *   a^p = a (mod p)
 992  *   a^{p-1} = 1 (mod p)
 993  *   a^{p-2} = a^{-1} (mod p)
 994  */
 995 static void felem_inv(felem out, const felem in)
 996 {
 997     felem ftmp, ftmp2;
 998     /* each e_I will hold |in|^{2^I - 1} */
 999     felem e2, e4, e8, e16, e32, e64;
1000     longfelem tmp;
1001     unsigned i;
1002
1003     felem_square(tmp, in);
1004     felem_reduce(ftmp, tmp);    /* 2^1 */
1005     felem_mul(tmp, in, ftmp);
1006     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1007     felem_assign(e2, ftmp);
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1010     felem_square(tmp, ftmp);
1011     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1012     felem_mul(tmp, ftmp, e2);
1013     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1014     felem_assign(e4, ftmp);
1015     felem_square(tmp, ftmp);
1016     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1017     felem_square(tmp, ftmp);
1018     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1019     felem_square(tmp, ftmp);
1020     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1021     felem_square(tmp, ftmp);
1022     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1023     felem_mul(tmp, ftmp, e4);
1024     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1025     felem_assign(e8, ftmp);
1026     for (i = 0; i < 8; i++) {
1027         felem_square(tmp, ftmp);
1028         felem_reduce(ftmp, tmp);
1029     }                           /* 2^16 - 2^8 */
1030     felem_mul(tmp, ftmp, e8);
1031     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1032     felem_assign(e16, ftmp);
1033     for (i = 0; i < 16; i++) {
1034         felem_square(tmp, ftmp);
1035         felem_reduce(ftmp, tmp);
1036     }                           /* 2^32 - 2^16 */
1037     felem_mul(tmp, ftmp, e16);
1038     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1039     felem_assign(e32, ftmp);
1040     for (i = 0; i < 32; i++) {
1041         felem_square(tmp, ftmp);
1042         felem_reduce(ftmp, tmp);
1043     }                           /* 2^64 - 2^32 */
1044     felem_assign(e64, ftmp);
1045     felem_mul(tmp, ftmp, in);
1046     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1047     for (i = 0; i < 192; i++) {
1048         felem_square(tmp, ftmp);
1049         felem_reduce(ftmp, tmp);
1050     }                           /* 2^256 - 2^224 + 2^192 */
1051
1052     felem_mul(tmp, e64, e32);
1053     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1054     for (i = 0; i < 16; i++) {
1055         felem_square(tmp, ftmp2);
1056         felem_reduce(ftmp2, tmp);
1057     }                           /* 2^80 - 2^16 */
1058     felem_mul(tmp, ftmp2, e16);
1059     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1060     for (i = 0; i < 8; i++) {
1061         felem_square(tmp, ftmp2);
1062         felem_reduce(ftmp2, tmp);
1063     }                           /* 2^88 - 2^8 */
1064     felem_mul(tmp, ftmp2, e8);
1065     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1066     for (i = 0; i < 4; i++) {
1067         felem_square(tmp, ftmp2);
1068         felem_reduce(ftmp2, tmp);
1069     }                           /* 2^92 - 2^4 */
1070     felem_mul(tmp, ftmp2, e4);
1071     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1072     felem_square(tmp, ftmp2);
1073     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1074     felem_square(tmp, ftmp2);
1075     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1076     felem_mul(tmp, ftmp2, e2);
1077     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1078     felem_square(tmp, ftmp2);
1079     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1080     felem_square(tmp, ftmp2);
1081     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1082     felem_mul(tmp, ftmp2, in);
1083     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1084
1085     felem_mul(tmp, ftmp2, ftmp);
1086     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1087 }
1088
1089 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1090 {
1091     felem tmp;
1092
1093     smallfelem_expand(tmp, in);
1094     felem_inv(tmp, tmp);
1095     felem_contract(out, tmp);
1096 }
1097
1098 /*-
1099  * Group operations
1100  * ----------------
1101  *
1102  * Building on top of the field operations we have the operations on the
1103  * elliptic curve group itself. Points on the curve are represented in Jacobian
1104  * coordinates
1105  */
1106
1107 /*-
1108  * point_double calculates 2*(x_in, y_in, z_in)
1109  *
1110  * The method is taken from:
1111  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1112  *
1113  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1114  * while x_out == y_in is not (maybe this works, but it's not tested).
1115  */
1116 static void
1117 point_double(felem x_out, felem y_out, felem z_out,
1118              const felem x_in, const felem y_in, const felem z_in)
1119 {
1120     longfelem tmp, tmp2;
1121     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1122     smallfelem small1, small2;
1123
1124     felem_assign(ftmp, x_in);
1125     /* ftmp[i] < 2^106 */
1126     felem_assign(ftmp2, x_in);
1127     /* ftmp2[i] < 2^106 */
1128
1129     /* delta = z^2 */
1130     felem_square(tmp, z_in);
1131     felem_reduce(delta, tmp);
1132     /* delta[i] < 2^101 */
1133
1134     /* gamma = y^2 */
1135     felem_square(tmp, y_in);
1136     felem_reduce(gamma, tmp);
1137     /* gamma[i] < 2^101 */
1138     felem_shrink(small1, gamma);
1139
1140     /* beta = x*gamma */
1141     felem_small_mul(tmp, small1, x_in);
1142     felem_reduce(beta, tmp);
1143     /* beta[i] < 2^101 */
1144
1145     /* alpha = 3*(x-delta)*(x+delta) */
1146     felem_diff(ftmp, delta);
1147     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1148     felem_sum(ftmp2, delta);
1149     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1150     felem_scalar(ftmp2, 3);
1151     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1152     felem_mul(tmp, ftmp, ftmp2);
1153     felem_reduce(alpha, tmp);
1154     /* alpha[i] < 2^101 */
1155     felem_shrink(small2, alpha);
1156
1157     /* x' = alpha^2 - 8*beta */
1158     smallfelem_square(tmp, small2);
1159     felem_reduce(x_out, tmp);
1160     felem_assign(ftmp, beta);
1161     felem_scalar(ftmp, 8);
1162     /* ftmp[i] < 8 * 2^101 = 2^104 */
1163     felem_diff(x_out, ftmp);
1164     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1165
1166     /* z' = (y + z)^2 - gamma - delta */
1167     felem_sum(delta, gamma);
1168     /* delta[i] < 2^101 + 2^101 = 2^102 */
1169     felem_assign(ftmp, y_in);
1170     felem_sum(ftmp, z_in);
1171     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1172     felem_square(tmp, ftmp);
1173     felem_reduce(z_out, tmp);
1174     felem_diff(z_out, delta);
1175     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1176
1177     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1178     felem_scalar(beta, 4);
1179     /* beta[i] < 4 * 2^101 = 2^103 */
1180     felem_diff_zero107(beta, x_out);
1181     /* beta[i] < 2^107 + 2^103 < 2^108 */
1182     felem_small_mul(tmp, small2, beta);
1183     /* tmp[i] < 7 * 2^64 < 2^67 */
1184     smallfelem_square(tmp2, small1);
1185     /* tmp2[i] < 7 * 2^64 */
1186     longfelem_scalar(tmp2, 8);
1187     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1188     longfelem_diff(tmp, tmp2);
1189     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1190     felem_reduce_zero105(y_out, tmp);
1191     /* y_out[i] < 2^106 */
1192 }
1193
1194 /*
1195  * point_double_small is the same as point_double, except that it operates on
1196  * smallfelems
1197  */
1198 static void
1199 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1200                    const smallfelem x_in, const smallfelem y_in,
1201                    const smallfelem z_in)
1202 {
1203     felem felem_x_out, felem_y_out, felem_z_out;
1204     felem felem_x_in, felem_y_in, felem_z_in;
1205
1206     smallfelem_expand(felem_x_in, x_in);
1207     smallfelem_expand(felem_y_in, y_in);
1208     smallfelem_expand(felem_z_in, z_in);
1209     point_double(felem_x_out, felem_y_out, felem_z_out,
1210                  felem_x_in, felem_y_in, felem_z_in);
1211     felem_shrink(x_out, felem_x_out);
1212     felem_shrink(y_out, felem_y_out);
1213     felem_shrink(z_out, felem_z_out);
1214 }
1215
1216 /* copy_conditional copies in to out iff mask is all ones. */
1217 static void copy_conditional(felem out, const felem in, limb mask)
1218 {
1219     unsigned i;
1220     for (i = 0; i < NLIMBS; ++i) {
1221         const limb tmp = mask & (in[i] ^ out[i]);
1222         out[i] ^= tmp;
1223     }
1224 }
1225
1226 /* copy_small_conditional copies in to out iff mask is all ones. */
1227 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1228 {
1229     unsigned i;
1230     const u64 mask64 = mask;
1231     for (i = 0; i < NLIMBS; ++i) {
1232         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1233     }
1234 }
1235
1236 /*-
1237  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1238  *
1239  * The method is taken from:
1240  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1241  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1242  *
1243  * This function includes a branch for checking whether the two input points
1244  * are equal, (while not equal to the point at infinity). This case never
1245  * happens during single point multiplication, so there is no timing leak for
1246  * ECDH or ECDSA signing.
1247  */
1248 static void point_add(felem x3, felem y3, felem z3,
1249                       const felem x1, const felem y1, const felem z1,
1250                       const int mixed, const smallfelem x2,
1251                       const smallfelem y2, const smallfelem z2)
1252 {
1253     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1254     longfelem tmp, tmp2;
1255     smallfelem small1, small2, small3, small4, small5;
1256     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1257
1258     felem_shrink(small3, z1);
1259
1260     z1_is_zero = smallfelem_is_zero(small3);
1261     z2_is_zero = smallfelem_is_zero(z2);
1262
1263     /* ftmp = z1z1 = z1**2 */
1264     smallfelem_square(tmp, small3);
1265     felem_reduce(ftmp, tmp);
1266     /* ftmp[i] < 2^101 */
1267     felem_shrink(small1, ftmp);
1268
1269     if (!mixed) {
1270         /* ftmp2 = z2z2 = z2**2 */
1271         smallfelem_square(tmp, z2);
1272         felem_reduce(ftmp2, tmp);
1273         /* ftmp2[i] < 2^101 */
1274         felem_shrink(small2, ftmp2);
1275
1276         felem_shrink(small5, x1);
1277
1278         /* u1 = ftmp3 = x1*z2z2 */
1279         smallfelem_mul(tmp, small5, small2);
1280         felem_reduce(ftmp3, tmp);
1281         /* ftmp3[i] < 2^101 */
1282
1283         /* ftmp5 = z1 + z2 */
1284         felem_assign(ftmp5, z1);
1285         felem_small_sum(ftmp5, z2);
1286         /* ftmp5[i] < 2^107 */
1287
1288         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1289         felem_square(tmp, ftmp5);
1290         felem_reduce(ftmp5, tmp);
1291         /* ftmp2 = z2z2 + z1z1 */
1292         felem_sum(ftmp2, ftmp);
1293         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1294         felem_diff(ftmp5, ftmp2);
1295         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1296
1297         /* ftmp2 = z2 * z2z2 */
1298         smallfelem_mul(tmp, small2, z2);
1299         felem_reduce(ftmp2, tmp);
1300
1301         /* s1 = ftmp2 = y1 * z2**3 */
1302         felem_mul(tmp, y1, ftmp2);
1303         felem_reduce(ftmp6, tmp);
1304         /* ftmp6[i] < 2^101 */
1305     } else {
1306         /*
1307          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1308          */
1309
1310         /* u1 = ftmp3 = x1*z2z2 */
1311         felem_assign(ftmp3, x1);
1312         /* ftmp3[i] < 2^106 */
1313
1314         /* ftmp5 = 2z1z2 */
1315         felem_assign(ftmp5, z1);
1316         felem_scalar(ftmp5, 2);
1317         /* ftmp5[i] < 2*2^106 = 2^107 */
1318
1319         /* s1 = ftmp2 = y1 * z2**3 */
1320         felem_assign(ftmp6, y1);
1321         /* ftmp6[i] < 2^106 */
1322     }
1323
1324     /* u2 = x2*z1z1 */
1325     smallfelem_mul(tmp, x2, small1);
1326     felem_reduce(ftmp4, tmp);
1327
1328     /* h = ftmp4 = u2 - u1 */
1329     felem_diff_zero107(ftmp4, ftmp3);
1330     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1331     felem_shrink(small4, ftmp4);
1332
1333     x_equal = smallfelem_is_zero(small4);
1334
1335     /* z_out = ftmp5 * h */
1336     felem_small_mul(tmp, small4, ftmp5);
1337     felem_reduce(z_out, tmp);
1338     /* z_out[i] < 2^101 */
1339
1340     /* ftmp = z1 * z1z1 */
1341     smallfelem_mul(tmp, small1, small3);
1342     felem_reduce(ftmp, tmp);
1343
1344     /* s2 = tmp = y2 * z1**3 */
1345     felem_small_mul(tmp, y2, ftmp);
1346     felem_reduce(ftmp5, tmp);
1347
1348     /* r = ftmp5 = (s2 - s1)*2 */
1349     felem_diff_zero107(ftmp5, ftmp6);
1350     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1351     felem_scalar(ftmp5, 2);
1352     /* ftmp5[i] < 2^109 */
1353     felem_shrink(small1, ftmp5);
1354     y_equal = smallfelem_is_zero(small1);
1355
1356     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1357         point_double(x3, y3, z3, x1, y1, z1);
1358         return;
1359     }
1360
1361     /* I = ftmp = (2h)**2 */
1362     felem_assign(ftmp, ftmp4);
1363     felem_scalar(ftmp, 2);
1364     /* ftmp[i] < 2*2^108 = 2^109 */
1365     felem_square(tmp, ftmp);
1366     felem_reduce(ftmp, tmp);
1367
1368     /* J = ftmp2 = h * I */
1369     felem_mul(tmp, ftmp4, ftmp);
1370     felem_reduce(ftmp2, tmp);
1371
1372     /* V = ftmp4 = U1 * I */
1373     felem_mul(tmp, ftmp3, ftmp);
1374     felem_reduce(ftmp4, tmp);
1375
1376     /* x_out = r**2 - J - 2V */
1377     smallfelem_square(tmp, small1);
1378     felem_reduce(x_out, tmp);
1379     felem_assign(ftmp3, ftmp4);
1380     felem_scalar(ftmp4, 2);
1381     felem_sum(ftmp4, ftmp2);
1382     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1383     felem_diff(x_out, ftmp4);
1384     /* x_out[i] < 2^105 + 2^101 */
1385
1386     /* y_out = r(V-x_out) - 2 * s1 * J */
1387     felem_diff_zero107(ftmp3, x_out);
1388     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1389     felem_small_mul(tmp, small1, ftmp3);
1390     felem_mul(tmp2, ftmp6, ftmp2);
1391     longfelem_scalar(tmp2, 2);
1392     /* tmp2[i] < 2*2^67 = 2^68 */
1393     longfelem_diff(tmp, tmp2);
1394     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1395     felem_reduce_zero105(y_out, tmp);
1396     /* y_out[i] < 2^106 */
1397
1398     copy_small_conditional(x_out, x2, z1_is_zero);
1399     copy_conditional(x_out, x1, z2_is_zero);
1400     copy_small_conditional(y_out, y2, z1_is_zero);
1401     copy_conditional(y_out, y1, z2_is_zero);
1402     copy_small_conditional(z_out, z2, z1_is_zero);
1403     copy_conditional(z_out, z1, z2_is_zero);
1404     felem_assign(x3, x_out);
1405     felem_assign(y3, y_out);
1406     felem_assign(z3, z_out);
1407 }
1408
1409 /*
1410  * point_add_small is the same as point_add, except that it operates on
1411  * smallfelems
1412  */
1413 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1414                             smallfelem x1, smallfelem y1, smallfelem z1,
1415                             smallfelem x2, smallfelem y2, smallfelem z2)
1416 {
1417     felem felem_x3, felem_y3, felem_z3;
1418     felem felem_x1, felem_y1, felem_z1;
1419     smallfelem_expand(felem_x1, x1);
1420     smallfelem_expand(felem_y1, y1);
1421     smallfelem_expand(felem_z1, z1);
1422     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1423               x2, y2, z2);
1424     felem_shrink(x3, felem_x3);
1425     felem_shrink(y3, felem_y3);
1426     felem_shrink(z3, felem_z3);
1427 }
1428
1429 /*-
1430  * Base point pre computation
1431  * --------------------------
1432  *
1433  * Two different sorts of precomputed tables are used in the following code.
1434  * Each contain various points on the curve, where each point is three field
1435  * elements (x, y, z).
1436  *
1437  * For the base point table, z is usually 1 (0 for the point at infinity).
1438  * This table has 2 * 16 elements, starting with the following:
1439  * index | bits    | point
1440  * ------+---------+------------------------------
1441  *     0 | 0 0 0 0 | 0G
1442  *     1 | 0 0 0 1 | 1G
1443  *     2 | 0 0 1 0 | 2^64G
1444  *     3 | 0 0 1 1 | (2^64 + 1)G
1445  *     4 | 0 1 0 0 | 2^128G
1446  *     5 | 0 1 0 1 | (2^128 + 1)G
1447  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1448  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1449  *     8 | 1 0 0 0 | 2^192G
1450  *     9 | 1 0 0 1 | (2^192 + 1)G
1451  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1452  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1453  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1454  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1455  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1456  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1457  * followed by a copy of this with each element multiplied by 2^32.
1458  *
1459  * The reason for this is so that we can clock bits into four different
1460  * locations when doing simple scalar multiplies against the base point,
1461  * and then another four locations using the second 16 elements.
1462  *
1463  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1464
1465 /* gmul is the table of precomputed base points */
1466 static const smallfelem gmul[2][16][3] = {
1467     {{{0, 0, 0, 0},
1468       {0, 0, 0, 0},
1469       {0, 0, 0, 0}},
1470      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1471        0x6b17d1f2e12c4247},
1472       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1473        0x4fe342e2fe1a7f9b},
1474       {1, 0, 0, 0}},
1475      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1476        0x0fa822bc2811aaa5},
1477       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1478        0xbff44ae8f5dba80d},
1479       {1, 0, 0, 0}},
1480      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1481        0x300a4bbc89d6726f},
1482       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1483        0x72aac7e0d09b4644},
1484       {1, 0, 0, 0}},
1485      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1486        0x447d739beedb5e67},
1487       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1488        0x2d4825ab834131ee},
1489       {1, 0, 0, 0}},
1490      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1491        0xef9519328a9c72ff},
1492       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1493        0x611e9fc37dbb2c9b},
1494       {1, 0, 0, 0}},
1495      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1496        0x550663797b51f5d8},
1497       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1498        0x157164848aecb851},
1499       {1, 0, 0, 0}},
1500      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1501        0xeb5d7745b21141ea},
1502       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1503        0xeafd72ebdbecc17b},
1504       {1, 0, 0, 0}},
1505      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1506        0xa6d39677a7849276},
1507       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1508        0x674f84749b0b8816},
1509       {1, 0, 0, 0}},
1510      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1511        0x4e769e7672c9ddad},
1512       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1513        0x42b99082de830663},
1514       {1, 0, 0, 0}},
1515      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1516        0x78878ef61c6ce04d},
1517       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1518        0xb6cb3f5d7b72c321},
1519       {1, 0, 0, 0}},
1520      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1521        0x0c88bc4d716b1287},
1522       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1523        0xdd5ddea3f3901dc6},
1524       {1, 0, 0, 0}},
1525      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1526        0x68f344af6b317466},
1527       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1528        0x31b9c405f8540a20},
1529       {1, 0, 0, 0}},
1530      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1531        0x4052bf4b6f461db9},
1532       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1533        0xfecf4d5190b0fc61},
1534       {1, 0, 0, 0}},
1535      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1536        0x1eddbae2c802e41a},
1537       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1538        0x43104d86560ebcfc},
1539       {1, 0, 0, 0}},
1540      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1541        0xb48e26b484f7a21c},
1542       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1543        0xfac015404d4d3dab},
1544       {1, 0, 0, 0}}},
1545     {{{0, 0, 0, 0},
1546       {0, 0, 0, 0},
1547       {0, 0, 0, 0}},
1548      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1549        0x7fe36b40af22af89},
1550       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1551        0xe697d45825b63624},
1552       {1, 0, 0, 0}},
1553      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1554        0x4a5b506612a677a6},
1555       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1556        0xeb13461ceac089f1},
1557       {1, 0, 0, 0}},
1558      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1559        0x0781b8291c6a220a},
1560       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1561        0x690cde8df0151593},
1562       {1, 0, 0, 0}},
1563      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1564        0x8a535f566ec73617},
1565       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1566        0x0455c08468b08bd7},
1567       {1, 0, 0, 0}},
1568      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1569        0x06bada7ab77f8276},
1570       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1571        0x5b476dfd0e6cb18a},
1572       {1, 0, 0, 0}},
1573      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1574        0x3e29864e8a2ec908},
1575       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1576        0x239b90ea3dc31e7e},
1577       {1, 0, 0, 0}},
1578      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1579        0x820f4dd949f72ff7},
1580       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1581        0x140406ec783a05ec},
1582       {1, 0, 0, 0}},
1583      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1584        0x68f6b8542783dfee},
1585       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1586        0xcbe1feba92e40ce6},
1587       {1, 0, 0, 0}},
1588      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1589        0xd0b2f94d2f420109},
1590       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1591        0x971459828b0719e5},
1592       {1, 0, 0, 0}},
1593      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1594        0x961610004a866aba},
1595       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1596        0x7acb9fadcee75e44},
1597       {1, 0, 0, 0}},
1598      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1599        0x24eb9acca333bf5b},
1600       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1601        0x69f891c5acd079cc},
1602       {1, 0, 0, 0}},
1603      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1604        0xe51f547c5972a107},
1605       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1606        0x1c309a2b25bb1387},
1607       {1, 0, 0, 0}},
1608      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1609        0x20b87b8aa2c4e503},
1610       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1611        0xf5c6fa49919776be},
1612       {1, 0, 0, 0}},
1613      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1614        0x1ed7d1b9332010b9},
1615       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1616        0x3a2b03f03217257a},
1617       {1, 0, 0, 0}},
1618      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1619        0x15fee545c78dd9f6},
1620       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1621        0x4ab5b6b2b8753f81},
1622       {1, 0, 0, 0}}}
1623 };
1624
1625 /*
1626  * select_point selects the |idx|th point from a precomputation table and
1627  * copies it to out.
1628  */
1629 static void select_point(const u64 idx, unsigned int size,
1630                          const smallfelem pre_comp[16][3], smallfelem out[3])
1631 {
1632     unsigned i, j;
1633     u64 *outlimbs = &out[0][0];
1634
1635     memset(out, 0, sizeof(*out) * 3);
1636
1637     for (i = 0; i < size; i++) {
1638         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1639         u64 mask = i ^ idx;
1640         mask |= mask >> 4;
1641         mask |= mask >> 2;
1642         mask |= mask >> 1;
1643         mask &= 1;
1644         mask--;
1645         for (j = 0; j < NLIMBS * 3; j++)
1646             outlimbs[j] |= inlimbs[j] & mask;
1647     }
1648 }
1649
1650 /* get_bit returns the |i|th bit in |in| */
1651 static char get_bit(const felem_bytearray in, int i)
1652 {
1653     if ((i < 0) || (i >= 256))
1654         return 0;
1655     return (in[i >> 3] >> (i & 7)) & 1;
1656 }
1657
1658 /*
1659  * Interleaved point multiplication using precomputed point multiples: The
1660  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1661  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1662  * generator, using certain (large) precomputed multiples in g_pre_comp.
1663  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1664  */
1665 static void batch_mul(felem x_out, felem y_out, felem z_out,
1666                       const felem_bytearray scalars[],
1667                       const unsigned num_points, const u8 *g_scalar,
1668                       const int mixed, const smallfelem pre_comp[][17][3],
1669                       const smallfelem g_pre_comp[2][16][3])
1670 {
1671     int i, skip;
1672     unsigned num, gen_mul = (g_scalar != NULL);
1673     felem nq[3], ftmp;
1674     smallfelem tmp[3];
1675     u64 bits;
1676     u8 sign, digit;
1677
1678     /* set nq to the point at infinity */
1679     memset(nq, 0, sizeof(nq));
1680
1681     /*
1682      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1683      * of the generator (two in each of the last 32 rounds) and additions of
1684      * other points multiples (every 5th round).
1685      */
1686     skip = 1;                   /* save two point operations in the first
1687                                  * round */
1688     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1689         /* double */
1690         if (!skip)
1691             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1692
1693         /* add multiples of the generator */
1694         if (gen_mul && (i <= 31)) {
1695             /* first, look 32 bits upwards */
1696             bits = get_bit(g_scalar, i + 224) << 3;
1697             bits |= get_bit(g_scalar, i + 160) << 2;
1698             bits |= get_bit(g_scalar, i + 96) << 1;
1699             bits |= get_bit(g_scalar, i + 32);
1700             /* select the point to add, in constant time */
1701             select_point(bits, 16, g_pre_comp[1], tmp);
1702
1703             if (!skip) {
1704                 /* Arg 1 below is for "mixed" */
1705                 point_add(nq[0], nq[1], nq[2],
1706                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1707             } else {
1708                 smallfelem_expand(nq[0], tmp[0]);
1709                 smallfelem_expand(nq[1], tmp[1]);
1710                 smallfelem_expand(nq[2], tmp[2]);
1711                 skip = 0;
1712             }
1713
1714             /* second, look at the current position */
1715             bits = get_bit(g_scalar, i + 192) << 3;
1716             bits |= get_bit(g_scalar, i + 128) << 2;
1717             bits |= get_bit(g_scalar, i + 64) << 1;
1718             bits |= get_bit(g_scalar, i);
1719             /* select the point to add, in constant time */
1720             select_point(bits, 16, g_pre_comp[0], tmp);
1721             /* Arg 1 below is for "mixed" */
1722             point_add(nq[0], nq[1], nq[2],
1723                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1724         }
1725
1726         /* do other additions every 5 doublings */
1727         if (num_points && (i % 5 == 0)) {
1728             /* loop over all scalars */
1729             for (num = 0; num < num_points; ++num) {
1730                 bits = get_bit(scalars[num], i + 4) << 5;
1731                 bits |= get_bit(scalars[num], i + 3) << 4;
1732                 bits |= get_bit(scalars[num], i + 2) << 3;
1733                 bits |= get_bit(scalars[num], i + 1) << 2;
1734                 bits |= get_bit(scalars[num], i) << 1;
1735                 bits |= get_bit(scalars[num], i - 1);
1736                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1737
1738                 /*
1739                  * select the point to add or subtract, in constant time
1740                  */
1741                 select_point(digit, 17, pre_comp[num], tmp);
1742                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1743                                                * point */
1744                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1745                 felem_contract(tmp[1], ftmp);
1746
1747                 if (!skip) {
1748                     point_add(nq[0], nq[1], nq[2],
1749                               nq[0], nq[1], nq[2],
1750                               mixed, tmp[0], tmp[1], tmp[2]);
1751                 } else {
1752                     smallfelem_expand(nq[0], tmp[0]);
1753                     smallfelem_expand(nq[1], tmp[1]);
1754                     smallfelem_expand(nq[2], tmp[2]);
1755                     skip = 0;
1756                 }
1757             }
1758         }
1759     }
1760     felem_assign(x_out, nq[0]);
1761     felem_assign(y_out, nq[1]);
1762     felem_assign(z_out, nq[2]);
1763 }
1764
1765 /* Precomputation for the group generator. */
1766 struct nistp256_pre_comp_st {
1767     smallfelem g_pre_comp[2][16][3];
1768     CRYPTO_REF_COUNT references;
1769     CRYPTO_RWLOCK *lock;
1770 };
1771
1772 const EC_METHOD *EC_GFp_nistp256_method(void)
1773 {
1774     static const EC_METHOD ret = {
1775         EC_FLAGS_DEFAULT_OCT,
1776         NID_X9_62_prime_field,
1777         ec_GFp_nistp256_group_init,
1778         ec_GFp_simple_group_finish,
1779         ec_GFp_simple_group_clear_finish,
1780         ec_GFp_nist_group_copy,
1781         ec_GFp_nistp256_group_set_curve,
1782         ec_GFp_simple_group_get_curve,
1783         ec_GFp_simple_group_get_degree,
1784         ec_group_simple_order_bits,
1785         ec_GFp_simple_group_check_discriminant,
1786         ec_GFp_simple_point_init,
1787         ec_GFp_simple_point_finish,
1788         ec_GFp_simple_point_clear_finish,
1789         ec_GFp_simple_point_copy,
1790         ec_GFp_simple_point_set_to_infinity,
1791         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1792         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1793         ec_GFp_simple_point_set_affine_coordinates,
1794         ec_GFp_nistp256_point_get_affine_coordinates,
1795         0 /* point_set_compressed_coordinates */ ,
1796         0 /* point2oct */ ,
1797         0 /* oct2point */ ,
1798         ec_GFp_simple_add,
1799         ec_GFp_simple_dbl,
1800         ec_GFp_simple_invert,
1801         ec_GFp_simple_is_at_infinity,
1802         ec_GFp_simple_is_on_curve,
1803         ec_GFp_simple_cmp,
1804         ec_GFp_simple_make_affine,
1805         ec_GFp_simple_points_make_affine,
1806         ec_GFp_nistp256_points_mul,
1807         ec_GFp_nistp256_precompute_mult,
1808         ec_GFp_nistp256_have_precompute_mult,
1809         ec_GFp_nist_field_mul,
1810         ec_GFp_nist_field_sqr,
1811         0 /* field_div */ ,
1812         0 /* field_encode */ ,
1813         0 /* field_decode */ ,
1814         0,                      /* field_set_to_one */
1815         ec_key_simple_priv2oct,
1816         ec_key_simple_oct2priv,
1817         0, /* set private */
1818         ec_key_simple_generate_key,
1819         ec_key_simple_check_key,
1820         ec_key_simple_generate_public_key,
1821         0, /* keycopy */
1822         0, /* keyfinish */
1823         ecdh_simple_compute_key
1824     };
1825
1826     return &ret;
1827 }
1828
1829 /******************************************************************************/
1830 /*
1831  * FUNCTIONS TO MANAGE PRECOMPUTATION
1832  */
1833
1834 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1835 {
1836     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1837
1838     if (ret == NULL) {
1839         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1840         return ret;
1841     }
1842
1843     ret->references = 1;
1844
1845     ret->lock = CRYPTO_THREAD_lock_new();
1846     if (ret->lock == NULL) {
1847         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1848         OPENSSL_free(ret);
1849         return NULL;
1850     }
1851     return ret;
1852 }
1853
1854 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1855 {
1856     int i;
1857     if (p != NULL)
1858         CRYPTO_UP_REF(&p->references, &i, p->lock);
1859     return p;
1860 }
1861
1862 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1863 {
1864     int i;
1865
1866     if (pre == NULL)
1867         return;
1868
1869     CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
1870     REF_PRINT_COUNT("EC_nistp256", x);
1871     if (i > 0)
1872         return;
1873     REF_ASSERT_ISNT(i < 0);
1874
1875     CRYPTO_THREAD_lock_free(pre->lock);
1876     OPENSSL_free(pre);
1877 }
1878
1879 /******************************************************************************/
1880 /*
1881  * OPENSSL EC_METHOD FUNCTIONS
1882  */
1883
1884 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1885 {
1886     int ret;
1887     ret = ec_GFp_simple_group_init(group);
1888     group->a_is_minus3 = 1;
1889     return ret;
1890 }
1891
1892 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1893                                     const BIGNUM *a, const BIGNUM *b,
1894                                     BN_CTX *ctx)
1895 {
1896     int ret = 0;
1897     BN_CTX *new_ctx = NULL;
1898     BIGNUM *curve_p, *curve_a, *curve_b;
1899
1900     if (ctx == NULL)
1901         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1902             return 0;
1903     BN_CTX_start(ctx);
1904     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1905         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1906         ((curve_b = BN_CTX_get(ctx)) == NULL))
1907         goto err;
1908     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1909     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1910     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1911     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1912         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1913               EC_R_WRONG_CURVE_PARAMETERS);
1914         goto err;
1915     }
1916     group->field_mod_func = BN_nist_mod_256;
1917     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1918  err:
1919     BN_CTX_end(ctx);
1920     BN_CTX_free(new_ctx);
1921     return ret;
1922 }
1923
1924 /*
1925  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1926  * (X/Z^2, Y/Z^3)
1927  */
1928 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1929                                                  const EC_POINT *point,
1930                                                  BIGNUM *x, BIGNUM *y,
1931                                                  BN_CTX *ctx)
1932 {
1933     felem z1, z2, x_in, y_in;
1934     smallfelem x_out, y_out;
1935     longfelem tmp;
1936
1937     if (EC_POINT_is_at_infinity(group, point)) {
1938         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1939               EC_R_POINT_AT_INFINITY);
1940         return 0;
1941     }
1942     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1943         (!BN_to_felem(z1, point->Z)))
1944         return 0;
1945     felem_inv(z2, z1);
1946     felem_square(tmp, z2);
1947     felem_reduce(z1, tmp);
1948     felem_mul(tmp, x_in, z1);
1949     felem_reduce(x_in, tmp);
1950     felem_contract(x_out, x_in);
1951     if (x != NULL) {
1952         if (!smallfelem_to_BN(x, x_out)) {
1953             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1954                   ERR_R_BN_LIB);
1955             return 0;
1956         }
1957     }
1958     felem_mul(tmp, z1, z2);
1959     felem_reduce(z1, tmp);
1960     felem_mul(tmp, y_in, z1);
1961     felem_reduce(y_in, tmp);
1962     felem_contract(y_out, y_in);
1963     if (y != NULL) {
1964         if (!smallfelem_to_BN(y, y_out)) {
1965             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1966                   ERR_R_BN_LIB);
1967             return 0;
1968         }
1969     }
1970     return 1;
1971 }
1972
1973 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1974 static void make_points_affine(size_t num, smallfelem points[][3],
1975                                smallfelem tmp_smallfelems[])
1976 {
1977     /*
1978      * Runs in constant time, unless an input is the point at infinity (which
1979      * normally shouldn't happen).
1980      */
1981     ec_GFp_nistp_points_make_affine_internal(num,
1982                                              points,
1983                                              sizeof(smallfelem),
1984                                              tmp_smallfelems,
1985                                              (void (*)(void *))smallfelem_one,
1986                                              (int (*)(const void *))
1987                                              smallfelem_is_zero_int,
1988                                              (void (*)(void *, const void *))
1989                                              smallfelem_assign,
1990                                              (void (*)(void *, const void *))
1991                                              smallfelem_square_contract,
1992                                              (void (*)
1993                                               (void *, const void *,
1994                                                const void *))
1995                                              smallfelem_mul_contract,
1996                                              (void (*)(void *, const void *))
1997                                              smallfelem_inv_contract,
1998                                              /* nothing to contract */
1999                                              (void (*)(void *, const void *))
2000                                              smallfelem_assign);
2001 }
2002
2003 /*
2004  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2005  * values Result is stored in r (r can equal one of the inputs).
2006  */
2007 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2008                                const BIGNUM *scalar, size_t num,
2009                                const EC_POINT *points[],
2010                                const BIGNUM *scalars[], BN_CTX *ctx)
2011 {
2012     int ret = 0;
2013     int j;
2014     int mixed = 0;
2015     BN_CTX *new_ctx = NULL;
2016     BIGNUM *x, *y, *z, *tmp_scalar;
2017     felem_bytearray g_secret;
2018     felem_bytearray *secrets = NULL;
2019     smallfelem (*pre_comp)[17][3] = NULL;
2020     smallfelem *tmp_smallfelems = NULL;
2021     felem_bytearray tmp;
2022     unsigned i, num_bytes;
2023     int have_pre_comp = 0;
2024     size_t num_points = num;
2025     smallfelem x_in, y_in, z_in;
2026     felem x_out, y_out, z_out;
2027     NISTP256_PRE_COMP *pre = NULL;
2028     const smallfelem(*g_pre_comp)[16][3] = NULL;
2029     EC_POINT *generator = NULL;
2030     const EC_POINT *p = NULL;
2031     const BIGNUM *p_scalar = NULL;
2032
2033     if (ctx == NULL)
2034         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2035             return 0;
2036     BN_CTX_start(ctx);
2037     if (((x = BN_CTX_get(ctx)) == NULL) ||
2038         ((y = BN_CTX_get(ctx)) == NULL) ||
2039         ((z = BN_CTX_get(ctx)) == NULL) ||
2040         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2041         goto err;
2042
2043     if (scalar != NULL) {
2044         pre = group->pre_comp.nistp256;
2045         if (pre)
2046             /* we have precomputation, try to use it */
2047             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2048         else
2049             /* try to use the standard precomputation */
2050             g_pre_comp = &gmul[0];
2051         generator = EC_POINT_new(group);
2052         if (generator == NULL)
2053             goto err;
2054         /* get the generator from precomputation */
2055         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2056             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2057             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2058             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2059             goto err;
2060         }
2061         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2062                                                       generator, x, y, z,
2063                                                       ctx))
2064             goto err;
2065         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2066             /* precomputation matches generator */
2067             have_pre_comp = 1;
2068         else
2069             /*
2070              * we don't have valid precomputation: treat the generator as a
2071              * random point
2072              */
2073             num_points++;
2074     }
2075     if (num_points > 0) {
2076         if (num_points >= 3) {
2077             /*
2078              * unless we precompute multiples for just one or two points,
2079              * converting those into affine form is time well spent
2080              */
2081             mixed = 1;
2082         }
2083         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2084         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2085         if (mixed)
2086             tmp_smallfelems =
2087               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2088         if ((secrets == NULL) || (pre_comp == NULL)
2089             || (mixed && (tmp_smallfelems == NULL))) {
2090             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2091             goto err;
2092         }
2093
2094         /*
2095          * we treat NULL scalars as 0, and NULL points as points at infinity,
2096          * i.e., they contribute nothing to the linear combination
2097          */
2098         memset(secrets, 0, sizeof(*secrets) * num_points);
2099         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2100         for (i = 0; i < num_points; ++i) {
2101             if (i == num)
2102                 /*
2103                  * we didn't have a valid precomputation, so we pick the
2104                  * generator
2105                  */
2106             {
2107                 p = EC_GROUP_get0_generator(group);
2108                 p_scalar = scalar;
2109             } else
2110                 /* the i^th point */
2111             {
2112                 p = points[i];
2113                 p_scalar = scalars[i];
2114             }
2115             if ((p_scalar != NULL) && (p != NULL)) {
2116                 /* reduce scalar to 0 <= scalar < 2^256 */
2117                 if ((BN_num_bits(p_scalar) > 256)
2118                     || (BN_is_negative(p_scalar))) {
2119                     /*
2120                      * this is an unusual input, and we don't guarantee
2121                      * constant-timeness
2122                      */
2123                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2124                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2125                         goto err;
2126                     }
2127                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2128                 } else
2129                     num_bytes = BN_bn2bin(p_scalar, tmp);
2130                 flip_endian(secrets[i], tmp, num_bytes);
2131                 /* precompute multiples */
2132                 if ((!BN_to_felem(x_out, p->X)) ||
2133                     (!BN_to_felem(y_out, p->Y)) ||
2134                     (!BN_to_felem(z_out, p->Z)))
2135                     goto err;
2136                 felem_shrink(pre_comp[i][1][0], x_out);
2137                 felem_shrink(pre_comp[i][1][1], y_out);
2138                 felem_shrink(pre_comp[i][1][2], z_out);
2139                 for (j = 2; j <= 16; ++j) {
2140                     if (j & 1) {
2141                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2142                                         pre_comp[i][j][2], pre_comp[i][1][0],
2143                                         pre_comp[i][1][1], pre_comp[i][1][2],
2144                                         pre_comp[i][j - 1][0],
2145                                         pre_comp[i][j - 1][1],
2146                                         pre_comp[i][j - 1][2]);
2147                     } else {
2148                         point_double_small(pre_comp[i][j][0],
2149                                            pre_comp[i][j][1],
2150                                            pre_comp[i][j][2],
2151                                            pre_comp[i][j / 2][0],
2152                                            pre_comp[i][j / 2][1],
2153                                            pre_comp[i][j / 2][2]);
2154                     }
2155                 }
2156             }
2157         }
2158         if (mixed)
2159             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2160     }
2161
2162     /* the scalar for the generator */
2163     if ((scalar != NULL) && (have_pre_comp)) {
2164         memset(g_secret, 0, sizeof(g_secret));
2165         /* reduce scalar to 0 <= scalar < 2^256 */
2166         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2167             /*
2168              * this is an unusual input, and we don't guarantee
2169              * constant-timeness
2170              */
2171             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2172                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2173                 goto err;
2174             }
2175             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2176         } else
2177             num_bytes = BN_bn2bin(scalar, tmp);
2178         flip_endian(g_secret, tmp, num_bytes);
2179         /* do the multiplication with generator precomputation */
2180         batch_mul(x_out, y_out, z_out,
2181                   (const felem_bytearray(*))secrets, num_points,
2182                   g_secret,
2183                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2184     } else
2185         /* do the multiplication without generator precomputation */
2186         batch_mul(x_out, y_out, z_out,
2187                   (const felem_bytearray(*))secrets, num_points,
2188                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2189     /* reduce the output to its unique minimal representation */
2190     felem_contract(x_in, x_out);
2191     felem_contract(y_in, y_out);
2192     felem_contract(z_in, z_out);
2193     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2194         (!smallfelem_to_BN(z, z_in))) {
2195         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2196         goto err;
2197     }
2198     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2199
2200  err:
2201     BN_CTX_end(ctx);
2202     EC_POINT_free(generator);
2203     BN_CTX_free(new_ctx);
2204     OPENSSL_free(secrets);
2205     OPENSSL_free(pre_comp);
2206     OPENSSL_free(tmp_smallfelems);
2207     return ret;
2208 }
2209
2210 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2211 {
2212     int ret = 0;
2213     NISTP256_PRE_COMP *pre = NULL;
2214     int i, j;
2215     BN_CTX *new_ctx = NULL;
2216     BIGNUM *x, *y;
2217     EC_POINT *generator = NULL;
2218     smallfelem tmp_smallfelems[32];
2219     felem x_tmp, y_tmp, z_tmp;
2220
2221     /* throw away old precomputation */
2222     EC_pre_comp_free(group);
2223     if (ctx == NULL)
2224         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2225             return 0;
2226     BN_CTX_start(ctx);
2227     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2228         goto err;
2229     /* get the generator */
2230     if (group->generator == NULL)
2231         goto err;
2232     generator = EC_POINT_new(group);
2233     if (generator == NULL)
2234         goto err;
2235     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2236     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2237     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2238         goto err;
2239     if ((pre = nistp256_pre_comp_new()) == NULL)
2240         goto err;
2241     /*
2242      * if the generator is the standard one, use built-in precomputation
2243      */
2244     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2245         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2246         goto done;
2247     }
2248     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2249         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2250         (!BN_to_felem(z_tmp, group->generator->Z)))
2251         goto err;
2252     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2253     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2254     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2255     /*
2256      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2257      * 2^160*G, 2^224*G for the second one
2258      */
2259     for (i = 1; i <= 8; i <<= 1) {
2260         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2261                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2262                            pre->g_pre_comp[0][i][1],
2263                            pre->g_pre_comp[0][i][2]);
2264         for (j = 0; j < 31; ++j) {
2265             point_double_small(pre->g_pre_comp[1][i][0],
2266                                pre->g_pre_comp[1][i][1],
2267                                pre->g_pre_comp[1][i][2],
2268                                pre->g_pre_comp[1][i][0],
2269                                pre->g_pre_comp[1][i][1],
2270                                pre->g_pre_comp[1][i][2]);
2271         }
2272         if (i == 8)
2273             break;
2274         point_double_small(pre->g_pre_comp[0][2 * i][0],
2275                            pre->g_pre_comp[0][2 * i][1],
2276                            pre->g_pre_comp[0][2 * i][2],
2277                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2278                            pre->g_pre_comp[1][i][2]);
2279         for (j = 0; j < 31; ++j) {
2280             point_double_small(pre->g_pre_comp[0][2 * i][0],
2281                                pre->g_pre_comp[0][2 * i][1],
2282                                pre->g_pre_comp[0][2 * i][2],
2283                                pre->g_pre_comp[0][2 * i][0],
2284                                pre->g_pre_comp[0][2 * i][1],
2285                                pre->g_pre_comp[0][2 * i][2]);
2286         }
2287     }
2288     for (i = 0; i < 2; i++) {
2289         /* g_pre_comp[i][0] is the point at infinity */
2290         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2291         /* the remaining multiples */
2292         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2293         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2294                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2295                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2296                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2297                         pre->g_pre_comp[i][2][2]);
2298         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2299         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2300                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2301                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2302                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2303                         pre->g_pre_comp[i][2][2]);
2304         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2305         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2306                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2307                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2308                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2309                         pre->g_pre_comp[i][4][2]);
2310         /*
2311          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2312          */
2313         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2314                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2315                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2316                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2317                         pre->g_pre_comp[i][2][2]);
2318         for (j = 1; j < 8; ++j) {
2319             /* odd multiples: add G resp. 2^32*G */
2320             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2321                             pre->g_pre_comp[i][2 * j + 1][1],
2322                             pre->g_pre_comp[i][2 * j + 1][2],
2323                             pre->g_pre_comp[i][2 * j][0],
2324                             pre->g_pre_comp[i][2 * j][1],
2325                             pre->g_pre_comp[i][2 * j][2],
2326                             pre->g_pre_comp[i][1][0],
2327                             pre->g_pre_comp[i][1][1],
2328                             pre->g_pre_comp[i][1][2]);
2329         }
2330     }
2331     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2332
2333  done:
2334     SETPRECOMP(group, nistp256, pre);
2335     pre = NULL;
2336     ret = 1;
2337
2338  err:
2339     BN_CTX_end(ctx);
2340     EC_POINT_free(generator);
2341     BN_CTX_free(new_ctx);
2342     EC_nistp256_pre_comp_free(pre);
2343     return ret;
2344 }
2345
2346 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2347 {
2348     return HAVEPRECOMP(group, nistp256);
2349 }
2350 #endif