crypto/ec/ecp_nistp256.c

   1 /*
   2  * Written by Adam Langley (Google) for the OpenSSL project
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  22  *
  23  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  24  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  25  * work which got its smarts from Daniel J. Bernstein's work on the same.
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  30 NON_EMPTY_TRANSLATION_UNIT
  31 #else
  32
  33 # include <stdint.h>
  34 # include <string.h>
  35 # include <openssl/err.h>
  36 # include "ec_lcl.h"
  37
  38 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  39   /* 128-bit integers are an extension; even gcc lacks them on 32-bit platforms */
  40 typedef __uint128_t uint128_t;  /* unsigned 128-bit type: nonstandard, provided
  41                                  * by gcc on 64-bit platforms */
  42 typedef __int128_t int128_t;    /* signed 128-bit counterpart */
  43 # else
  44 #  error "Need GCC 3.1 or later to define type uint128_t"
  45 # endif
  46
  47 typedef uint8_t u8;             /* unsigned 8-bit byte */
  48 typedef uint32_t u32;           /* unsigned 32-bit word */
  49 typedef uint64_t u64;           /* unsigned 64-bit word */
  50 typedef int64_t s64;            /* signed 64-bit word */
  51
  52 /*
  53  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). An
  54  * element of this field can be serialised into 32 bytes; such a 32-byte
  55  * buffer is called a felem_bytearray.
  56  */
  57
  58 typedef u8 felem_bytearray[32];
  59
  60 /*
  61  * These are the parameters of P256, taken from FIPS 186-3, page 86, stored
  62  * in the order p, a, b, x, y. Each 32-byte value is big-endian.
  63  */
  64 static const felem_bytearray nistp256_curve_params[5] = {
  65     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  66      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  67      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  69     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  70      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  71      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
  73     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
  74      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  75      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  76      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  77     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  78      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  79      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  80      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  81     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  82      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  83      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  84      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  85 };
  86
  87 /*-
  88  * The representation of field elements.
  89  * ------------------------------------
  90  *
  91  * We represent field elements with either four 128-bit values, eight 128-bit
  92  * values, or four 64-bit values. The field element represented is:
  93  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  94  * or:
  95  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
  96  *
  97  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  98  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  99  * with the least significant bits of the next.
 100  *
 101  * A field element with four limbs is an 'felem'. One with eight limbs is a
 102  * 'longfelem'.
 103  *
 104  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 105  * values are used as intermediate values before multiplication.
 106  */
 107
 108 # define NLIMBS 4                     /* limbs per felem, words per smallfelem */
 109
 110 typedef uint128_t limb;               /* one 128-bit limb */
 111 typedef limb felem[NLIMBS];           /* four 128-bit limbs */
 112 typedef limb longfelem[NLIMBS * 2];   /* eight 128-bit limbs */
 113 typedef u64 smallfelem[NLIMBS];       /* four 64-bit words */
 114
 115 /* The prime p as four 64-bit words, least-significant word first. */
 116 static const u64 kPrime[4] =
 117     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 118 static const u64 bottom63bits = 0x7ffffffffffffffful; /* 2^63 - 1: low 63 bits set */
 119
 120 /*
 121  * bin32_to_felem takes a little-endian byte array and converts it into felem
 122  * form. This assumes that the CPU is little-endian.
 123  */
 124 static void bin32_to_felem(felem out, const u8 in[32])
 125 {
 126     out[0] = *((u64 *)&in[0]);
 127     out[1] = *((u64 *)&in[8]);
 128     out[2] = *((u64 *)&in[16]);
 129     out[3] = *((u64 *)&in[24]);
 130 }
 131
 132 /*
 133  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 134  * endian, 32 byte array. This assumes that the CPU is little-endian.
 135  */
 136 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 137 {
 138     *((u64 *)&out[0]) = in[0];
 139     *((u64 *)&out[8]) = in[1];
 140     *((u64 *)&out[16]) = in[2];
 141     *((u64 *)&out[24]) = in[3];
 142 }
 143
 144 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 145 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 146 {
 147     unsigned i;
 148     for (i = 0; i < len; ++i)
 149         out[i] = in[len - 1 - i];
 150 }
 151
 152 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 153 static int BN_to_felem(felem out, const BIGNUM *bn)
 154 {
 155     felem_bytearray b_in;
 156     felem_bytearray b_out;
 157     unsigned num_bytes;
 158
 159     /* BN_bn2bin eats leading zeroes */
 160     memset(b_out, 0, sizeof(b_out));
 161     num_bytes = BN_num_bytes(bn);
 162     if (num_bytes > sizeof b_out) {
 163         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 164         return 0;
 165     }
 166     if (BN_is_negative(bn)) {
 167         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 168         return 0;
 169     }
 170     num_bytes = BN_bn2bin(bn, b_in);
 171     flip_endian(b_out, b_in, num_bytes);
 172     bin32_to_felem(out, b_out);
 173     return 1;
 174 }
 175
 176 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 177 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 178 {
 179     felem_bytearray b_in, b_out;
 180     smallfelem_to_bin32(b_in, in);
 181     flip_endian(b_out, b_in, sizeof b_out);
 182     return BN_bin2bn(b_out, sizeof b_out, out);
 183 }
 184
 185 /*-
 186  * Field operations
 187  * ----------------
 188  */
 189
 190 static void smallfelem_one(smallfelem out)
 191 {
 192     out[0] = 1;
 193     out[1] = 0;
 194     out[2] = 0;
 195     out[3] = 0;
 196 }
 197
 198 static void smallfelem_assign(smallfelem out, const smallfelem in)
 199 {
 200     out[0] = in[0];
 201     out[1] = in[1];
 202     out[2] = in[2];
 203     out[3] = in[3];
 204 }
 205
 206 static void felem_assign(felem out, const felem in)
 207 {
 208     out[0] = in[0];
 209     out[1] = in[1];
 210     out[2] = in[2];
 211     out[3] = in[3];
 212 }
 213
 214 /* felem_sum sets out = out + in. */
 215 static void felem_sum(felem out, const felem in)
 216 {
 217     out[0] += in[0];
 218     out[1] += in[1];
 219     out[2] += in[2];
 220     out[3] += in[3];
 221 }
 222
 223 /* felem_small_sum sets out = out + in. */
 224 static void felem_small_sum(felem out, const smallfelem in)
 225 {
 226     out[0] += in[0];
 227     out[1] += in[1];
 228     out[2] += in[2];
 229     out[3] += in[3];
 230 }
 231
 232 /* felem_scalar sets out = out * scalar */
 233 static void felem_scalar(felem out, const u64 scalar)
 234 {
 235     out[0] *= scalar;
 236     out[1] *= scalar;
 237     out[2] *= scalar;
 238     out[3] *= scalar;
 239 }
 240
 241 /* longfelem_scalar sets out = out * scalar */
 242 static void longfelem_scalar(longfelem out, const u64 scalar)
 243 {
 244     out[0] *= scalar;
 245     out[1] *= scalar;
 246     out[2] *= scalar;
 247     out[3] *= scalar;
 248     out[4] *= scalar;
 249     out[5] *= scalar;
 250     out[6] *= scalar;
 251     out[7] *= scalar;
 252 }
 253
 254 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 255 # define two105 (((limb)1) << 105)
 256 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 257
 258 /* zero105 is 0 mod p */
 259 static const felem zero105 =
 260     { two105m41m9, two105, two105m41p9, two105m41p9 };
 261
 262 /*-
 263  * smallfelem_neg sets |out| to |-small|
 264  * On exit:
 265  *   out[i] < out[i] + 2^105
 266  */
 267 static void smallfelem_neg(felem out, const smallfelem small)
 268 {
 269     /* In order to prevent underflow, we subtract from 0 mod p. */
 270     out[0] = zero105[0] - small[0];
 271     out[1] = zero105[1] - small[1];
 272     out[2] = zero105[2] - small[2];
 273     out[3] = zero105[3] - small[3];
 274 }
 275
 276 /*-
 277  * felem_diff subtracts |in| from |out|
 278  * On entry:
 279  *   in[i] < 2^104
 280  * On exit:
 281  *   out[i] < out[i] + 2^105
 282  */
 283 static void felem_diff(felem out, const felem in)
 284 {
 285     /*
 286      * In order to prevent underflow, we add 0 mod p before subtracting.
 287      */
 288     out[0] += zero105[0];
 289     out[1] += zero105[1];
 290     out[2] += zero105[2];
 291     out[3] += zero105[3];
 292
 293     out[0] -= in[0];
 294     out[1] -= in[1];
 295     out[2] -= in[2];
 296     out[3] -= in[3];
 297 }
 298
 299 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 300 # define two107 (((limb)1) << 107)
 301 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 302
 303 /* zero107 is 0 mod p */
 304 static const felem zero107 =
 305     { two107m43m11, two107, two107m43p11, two107m43p11 };
 306
 307 /*-
 308  * An alternative felem_diff for larger inputs |in|
 309  * felem_diff_zero107 subtracts |in| from |out|
 310  * On entry:
 311  *   in[i] < 2^106
 312  * On exit:
 313  *   out[i] < out[i] + 2^107
 314  */
 315 static void felem_diff_zero107(felem out, const felem in)
 316 {
 317     /*
 318      * In order to prevent underflow, we add 0 mod p before subtracting.
 319      */
 320     out[0] += zero107[0];
 321     out[1] += zero107[1];
 322     out[2] += zero107[2];
 323     out[3] += zero107[3];
 324
 325     out[0] -= in[0];
 326     out[1] -= in[1];
 327     out[2] -= in[2];
 328     out[3] -= in[3];
 329 }
 330
 331 /*-
 332  * longfelem_diff subtracts |in| from |out|
 333  * On entry:
 334  *   in[i] < 7*2^67
 335  * On exit:
 336  *   out[i] < out[i] + 2^70 + 2^40
 337  */
 338 static void longfelem_diff(longfelem out, const longfelem in)
 339 {
 340     static const limb two70m8p6 =
 341         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 342     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 343     static const limb two70 = (((limb) 1) << 70);
 344     static const limb two70m40m38p6 =
 345         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 346         (((limb) 1) << 6);
 347     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 348
 349     /* add 0 mod p to avoid underflow */
 350     out[0] += two70m8p6;
 351     out[1] += two70p40;
 352     out[2] += two70;
 353     out[3] += two70m40m38p6;
 354     out[4] += two70m6;
 355     out[5] += two70m6;
 356     out[6] += two70m6;
 357     out[7] += two70m6;
 358
 359     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 360     out[0] -= in[0];
 361     out[1] -= in[1];
 362     out[2] -= in[2];
 363     out[3] -= in[3];
 364     out[4] -= in[4];
 365     out[5] -= in[5];
 366     out[6] -= in[6];
 367     out[7] -= in[7];
 368 }
 369
 370 # define two64m0 (((limb)1) << 64) - 1
 371 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 372 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 373 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 374
 375 /* zero110 is 0 mod p */
 376 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 377
 378 /*-
 379  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 380  * minimal as the value may be greater than p.
 381  *
 382  * On entry:
 383  *   in[i] < 2^109
 384  * On exit:
 385  *   out[i] < 2^64
 386  */
 387 static void felem_shrink(smallfelem out, const felem in)
 388 {
 389     felem tmp;
 390     u64 a, b, mask;
 391     s64 high, low;
 392     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 393
 394     /* Carry 2->3 */
 395     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 396     /* tmp[3] < 2^110 */
 397
 398     tmp[2] = zero110[2] + (u64)in[2];
 399     tmp[0] = zero110[0] + in[0];
 400     tmp[1] = zero110[1] + in[1];
 401     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 402
 403     /*
 404      * We perform two partial reductions where we eliminate the high-word of
 405      * tmp[3]. We don't update the other words till the end.
 406      */
 407     a = tmp[3] >> 64;           /* a < 2^46 */
 408     tmp[3] = (u64)tmp[3];
 409     tmp[3] -= a;
 410     tmp[3] += ((limb) a) << 32;
 411     /* tmp[3] < 2^79 */
 412
 413     b = a;
 414     a = tmp[3] >> 64;           /* a < 2^15 */
 415     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 416     tmp[3] = (u64)tmp[3];
 417     tmp[3] -= a;
 418     tmp[3] += ((limb) a) << 32;
 419     /* tmp[3] < 2^64 + 2^47 */
 420
 421     /*
 422      * This adjusts the other two words to complete the two partial
 423      * reductions.
 424      */
 425     tmp[0] += b;
 426     tmp[1] -= (((limb) b) << 32);
 427
 428     /*
 429      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 430      * conditionally subtract kPrime if tmp[3] is large enough.
 431      */
 432     high = tmp[3] >> 64;
 433     /* As tmp[3] < 2^65, high is either 1 or 0 */
 434     high <<= 63;
 435     high >>= 63;
 436     /*-
 437      * high is:
 438      *   all ones   if the high word of tmp[3] is 1
 439      *   all zeros  if the high word of tmp[3] if 0 */
 440     low = tmp[3];
 441     mask = low >> 63;
 442     /*-
 443      * mask is:
 444      *   all ones   if the MSB of low is 1
 445      *   all zeros  if the MSB of low if 0 */
 446     low &= bottom63bits;
 447     low -= kPrime3Test;
 448     /* if low was greater than kPrime3Test then the MSB is zero */
 449     low = ~low;
 450     low >>= 63;
 451     /*-
 452      * low is:
 453      *   all ones   if low was > kPrime3Test
 454      *   all zeros  if low was <= kPrime3Test */
 455     mask = (mask & low) | high;
 456     tmp[0] -= mask & kPrime[0];
 457     tmp[1] -= mask & kPrime[1];
 458     /* kPrime[2] is zero, so omitted */
 459     tmp[3] -= mask & kPrime[3];
 460     /* tmp[3] < 2**64 - 2**32 + 1 */
 461
 462     tmp[1] += ((u64)(tmp[0] >> 64));
 463     tmp[0] = (u64)tmp[0];
 464     tmp[2] += ((u64)(tmp[1] >> 64));
 465     tmp[1] = (u64)tmp[1];
 466     tmp[3] += ((u64)(tmp[2] >> 64));
 467     tmp[2] = (u64)tmp[2];
 468     /* tmp[i] < 2^64 */
 469
 470     out[0] = tmp[0];
 471     out[1] = tmp[1];
 472     out[2] = tmp[2];
 473     out[3] = tmp[3];
 474 }
 475
 476 /* smallfelem_expand converts a smallfelem to an felem */
 477 static void smallfelem_expand(felem out, const smallfelem in)
 478 {
 479     out[0] = in[0];
 480     out[1] = in[1];
 481     out[2] = in[2];
 482     out[3] = in[3];
 483 }
 484
 485 /*-
 486  * smallfelem_square sets |out| = |small|^2
 487  * On entry:
 488  *   small[i] < 2^64
 489  * On exit:
 490  *   out[i] < 7 * 2^64 < 2^67
 491  */
 492 static void smallfelem_square(longfelem out, const smallfelem small)
 493 {
 494     limb a;
 495     u64 high, low;
 496
 497     a = ((uint128_t) small[0]) * small[0];
 498     low = a;
 499     high = a >> 64;
 500     out[0] = low;
 501     out[1] = high;
 502
 503     a = ((uint128_t) small[0]) * small[1];
 504     low = a;
 505     high = a >> 64;
 506     out[1] += low;
 507     out[1] += low;
 508     out[2] = high;
 509
 510     a = ((uint128_t) small[0]) * small[2];
 511     low = a;
 512     high = a >> 64;
 513     out[2] += low;
 514     out[2] *= 2;
 515     out[3] = high;
 516
 517     a = ((uint128_t) small[0]) * small[3];
 518     low = a;
 519     high = a >> 64;
 520     out[3] += low;
 521     out[4] = high;
 522
 523     a = ((uint128_t) small[1]) * small[2];
 524     low = a;
 525     high = a >> 64;
 526     out[3] += low;
 527     out[3] *= 2;
 528     out[4] += high;
 529
 530     a = ((uint128_t) small[1]) * small[1];
 531     low = a;
 532     high = a >> 64;
 533     out[2] += low;
 534     out[3] += high;
 535
 536     a = ((uint128_t) small[1]) * small[3];
 537     low = a;
 538     high = a >> 64;
 539     out[4] += low;
 540     out[4] *= 2;
 541     out[5] = high;
 542
 543     a = ((uint128_t) small[2]) * small[3];
 544     low = a;
 545     high = a >> 64;
 546     out[5] += low;
 547     out[5] *= 2;
 548     out[6] = high;
 549     out[6] += high;
 550
 551     a = ((uint128_t) small[2]) * small[2];
 552     low = a;
 553     high = a >> 64;
 554     out[4] += low;
 555     out[5] += high;
 556
 557     a = ((uint128_t) small[3]) * small[3];
 558     low = a;
 559     high = a >> 64;
 560     out[6] += low;
 561     out[7] = high;
 562 }
 563
 564 /*-
 565  * felem_square sets |out| = |in|^2
 566  * On entry:
 567  *   in[i] < 2^109
 568  * On exit:
 569  *   out[i] < 7 * 2^64 < 2^67
 570  */
 571 static void felem_square(longfelem out, const felem in)
 572 {
 573     u64 small[4];
 574     felem_shrink(small, in);
 575     smallfelem_square(out, small);
 576 }
 577
 578 /*-
 579  * smallfelem_mul sets |out| = |small1| * |small2|
 580  * On entry:
 581  *   small1[i] < 2^64
 582  *   small2[i] < 2^64
 583  * On exit:
 584  *   out[i] < 7 * 2^64 < 2^67
 585  */
 586 static void smallfelem_mul(longfelem out, const smallfelem small1,
 587                            const smallfelem small2)
 588 {
 589     limb a;
 590     u64 high, low;
 591
 592     a = ((uint128_t) small1[0]) * small2[0];
 593     low = a;
 594     high = a >> 64;
 595     out[0] = low;
 596     out[1] = high;
 597
 598     a = ((uint128_t) small1[0]) * small2[1];
 599     low = a;
 600     high = a >> 64;
 601     out[1] += low;
 602     out[2] = high;
 603
 604     a = ((uint128_t) small1[1]) * small2[0];
 605     low = a;
 606     high = a >> 64;
 607     out[1] += low;
 608     out[2] += high;
 609
 610     a = ((uint128_t) small1[0]) * small2[2];
 611     low = a;
 612     high = a >> 64;
 613     out[2] += low;
 614     out[3] = high;
 615
 616     a = ((uint128_t) small1[1]) * small2[1];
 617     low = a;
 618     high = a >> 64;
 619     out[2] += low;
 620     out[3] += high;
 621
 622     a = ((uint128_t) small1[2]) * small2[0];
 623     low = a;
 624     high = a >> 64;
 625     out[2] += low;
 626     out[3] += high;
 627
 628     a = ((uint128_t) small1[0]) * small2[3];
 629     low = a;
 630     high = a >> 64;
 631     out[3] += low;
 632     out[4] = high;
 633
 634     a = ((uint128_t) small1[1]) * small2[2];
 635     low = a;
 636     high = a >> 64;
 637     out[3] += low;
 638     out[4] += high;
 639
 640     a = ((uint128_t) small1[2]) * small2[1];
 641     low = a;
 642     high = a >> 64;
 643     out[3] += low;
 644     out[4] += high;
 645
 646     a = ((uint128_t) small1[3]) * small2[0];
 647     low = a;
 648     high = a >> 64;
 649     out[3] += low;
 650     out[4] += high;
 651
 652     a = ((uint128_t) small1[1]) * small2[3];
 653     low = a;
 654     high = a >> 64;
 655     out[4] += low;
 656     out[5] = high;
 657
 658     a = ((uint128_t) small1[2]) * small2[2];
 659     low = a;
 660     high = a >> 64;
 661     out[4] += low;
 662     out[5] += high;
 663
 664     a = ((uint128_t) small1[3]) * small2[1];
 665     low = a;
 666     high = a >> 64;
 667     out[4] += low;
 668     out[5] += high;
 669
 670     a = ((uint128_t) small1[2]) * small2[3];
 671     low = a;
 672     high = a >> 64;
 673     out[5] += low;
 674     out[6] = high;
 675
 676     a = ((uint128_t) small1[3]) * small2[2];
 677     low = a;
 678     high = a >> 64;
 679     out[5] += low;
 680     out[6] += high;
 681
 682     a = ((uint128_t) small1[3]) * small2[3];
 683     low = a;
 684     high = a >> 64;
 685     out[6] += low;
 686     out[7] = high;
 687 }
 688
 689 /*-
 690  * felem_mul sets |out| = |in1| * |in2|
 691  * On entry:
 692  *   in1[i] < 2^109
 693  *   in2[i] < 2^109
 694  * On exit:
 695  *   out[i] < 7 * 2^64 < 2^67
 696  */
 697 static void felem_mul(longfelem out, const felem in1, const felem in2)
 698 {
 699     smallfelem small1, small2;
 700     felem_shrink(small1, in1);
 701     felem_shrink(small2, in2);
 702     smallfelem_mul(out, small1, small2);
 703 }
 704
 705 /*-
 706  * felem_small_mul sets |out| = |small1| * |in2|
 707  * On entry:
 708  *   small1[i] < 2^64
 709  *   in2[i] < 2^109
 710  * On exit:
 711  *   out[i] < 7 * 2^64 < 2^67
 712  */
 713 static void felem_small_mul(longfelem out, const smallfelem small1,
 714                             const felem in2)
 715 {
 716     smallfelem small2;
 717     felem_shrink(small2, in2);
 718     smallfelem_mul(out, small1, small2);
 719 }
 720
 721 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 722 # define two100 (((limb)1) << 100)
 723 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 724 /* zero100 is 0 mod p */
 725 static const felem zero100 =
 726     { two100m36m4, two100, two100m36p4, two100m36p4 };
 727
 728 /*-
 729  * Internal function for the different flavours of felem_reduce.
 730  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 731  * On entry:
 732  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 733  *   out[1] >= in[7] + 2^32*in[4]
 734  *   out[2] >= in[5] + 2^32*in[5]
 735  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 736  * On exit:
 737  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 738  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 739  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 740  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 741  */
 742 static void felem_reduce_(felem out, const longfelem in)
 743 {
 744     int128_t c;
 745     /* combine common terms from below */
 746     c = in[4] + (in[5] << 32);
 747     out[0] += c;
 748     out[3] -= c;
 749
 750     c = in[5] - in[7];
 751     out[1] += c;
 752     out[2] -= c;
 753
 754     /* the remaining terms */
 755     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 756     out[1] -= (in[4] << 32);
 757     out[3] += (in[4] << 32);
 758
 759     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 760     out[2] -= (in[5] << 32);
 761
 762     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 763     out[0] -= in[6];
 764     out[0] -= (in[6] << 32);
 765     out[1] += (in[6] << 33);
 766     out[2] += (in[6] * 2);
 767     out[3] -= (in[6] << 32);
 768
 769     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 770     out[0] -= in[7];
 771     out[0] -= (in[7] << 32);
 772     out[2] += (in[7] << 33);
 773     out[3] += (in[7] * 3);
 774 }
 775
 776 /*-
 777  * felem_reduce converts a longfelem into an felem.
 778  * To be called directly after felem_square or felem_mul.
 779  * On entry:
 780  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 781  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 782  * On exit:
 783  *   out[i] < 2^101
 784  */
 785 static void felem_reduce(felem out, const longfelem in)
 786 {
 787     out[0] = zero100[0] + in[0];
 788     out[1] = zero100[1] + in[1];
 789     out[2] = zero100[2] + in[2];
 790     out[3] = zero100[3] + in[3];
 791
 792     felem_reduce_(out, in);
 793
 794     /*-
 795      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 796      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 797      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 798      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 799      *
 800      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 801      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 802      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 803      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 804      */
 805 }
 806
 807 /*-
 808  * felem_reduce_zero105 converts a larger longfelem into an felem.
 809  * On entry:
 810  *   in[0] < 2^71
 811  * On exit:
 812  *   out[i] < 2^106
 813  */
 814 static void felem_reduce_zero105(felem out, const longfelem in)
 815 {
 816     out[0] = zero105[0] + in[0];
 817     out[1] = zero105[1] + in[1];
 818     out[2] = zero105[2] + in[2];
 819     out[3] = zero105[3] + in[3];
 820
 821     felem_reduce_(out, in);
 822
 823     /*-
 824      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 825      * out[1] > 2^105 - 2^71 - 2^103 > 0
 826      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 827      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 828      *
 829      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 831      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 832      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 833      */
 834 }
 835
 836 /*
 837  * subtract_u64 sets *result = *result - v and *carry to one if the
 838  * subtraction underflowed.
 839  */
 840 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 841 {
 842     uint128_t r = *result;
 843     r -= v;
 844     *carry = (r >> 64) & 1;
 845     *result = (u64)r;
 846 }
 847
 848 /*
 849  * felem_contract converts |in| to its unique, minimal representation. On
 850  * entry: in[i] < 2^109
 851  */
 852 static void felem_contract(smallfelem out, const felem in)
 853 {
 854     unsigned i;
 855     u64 all_equal_so_far = 0, result = 0, carry;
 856
 857     felem_shrink(out, in);
 858     /* small is minimal except that the value might be > p */
 859
 860     all_equal_so_far--;
 861     /*
 862      * We are doing a constant time test if out >= kPrime. We need to compare
 863      * each u64, from most-significant to least significant. For each one, if
 864      * all words so far have been equal (m is all ones) then a non-equal
 865      * result is the answer. Otherwise we continue.
 866      */
 867     for (i = 3; i < 4; i--) {
 868         u64 equal;
 869         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 870         /*
 871          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 872          * will all be set.
 873          */
 874         result |= all_equal_so_far & ((u64)(a >> 64));
 875
 876         /*
 877          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 878          * decrement will make it all ones.
 879          */
 880         equal = kPrime[i] ^ out[i];
 881         equal--;
 882         equal &= equal << 32;
 883         equal &= equal << 16;
 884         equal &= equal << 8;
 885         equal &= equal << 4;
 886         equal &= equal << 2;
 887         equal &= equal << 1;
 888         equal = ((s64) equal) >> 63;
 889
 890         all_equal_so_far &= equal;
 891     }
 892
 893     /*
 894      * if all_equal_so_far is still all ones then the two values are equal
 895      * and so out >= kPrime is true.
 896      */
 897     result |= all_equal_so_far;
 898
 899     /* if out >= kPrime then we subtract kPrime. */
 900     subtract_u64(&out[0], &carry, result & kPrime[0]);
 901     subtract_u64(&out[1], &carry, carry);
 902     subtract_u64(&out[2], &carry, carry);
 903     subtract_u64(&out[3], &carry, carry);
 904
 905     subtract_u64(&out[1], &carry, result & kPrime[1]);
 906     subtract_u64(&out[2], &carry, carry);
 907     subtract_u64(&out[3], &carry, carry);
 908
 909     subtract_u64(&out[2], &carry, result & kPrime[2]);
 910     subtract_u64(&out[3], &carry, carry);
 911
 912     subtract_u64(&out[3], &carry, result & kPrime[3]);
 913 }
 914
 915 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 916 {
 917     longfelem longtmp;
 918     felem tmp;
 919
 920     smallfelem_square(longtmp, in);
 921     felem_reduce(tmp, longtmp);
 922     felem_contract(out, tmp);
 923 }
 924
 925 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 926                                     const smallfelem in2)
 927 {
 928     longfelem longtmp;
 929     felem tmp;
 930
 931     smallfelem_mul(longtmp, in1, in2);
 932     felem_reduce(tmp, longtmp);
 933     felem_contract(out, tmp);
 934 }
 935
 936 /*-
 937  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 938  * otherwise.
 939  * On entry:
 940  *   small[i] < 2^64
 941  */
 942 static limb smallfelem_is_zero(const smallfelem small)
 943 {
 944     limb result;
 945     u64 is_p;
 946
 947     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 948     is_zero--;
 949     is_zero &= is_zero << 32;
 950     is_zero &= is_zero << 16;
 951     is_zero &= is_zero << 8;
 952     is_zero &= is_zero << 4;
 953     is_zero &= is_zero << 2;
 954     is_zero &= is_zero << 1;
 955     is_zero = ((s64) is_zero) >> 63;
 956
 957     is_p = (small[0] ^ kPrime[0]) |
 958         (small[1] ^ kPrime[1]) |
 959         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 960     is_p--;
 961     is_p &= is_p << 32;
 962     is_p &= is_p << 16;
 963     is_p &= is_p << 8;
 964     is_p &= is_p << 4;
 965     is_p &= is_p << 2;
 966     is_p &= is_p << 1;
 967     is_p = ((s64) is_p) >> 63;
 968
 969     is_zero |= is_p;
 970
 971     result = is_zero;
 972     result |= ((limb) is_zero) << 64;
 973     return result;
 974 }
 975
 976 static int smallfelem_is_zero_int(const smallfelem small)
 977 {
 978     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 979 }
 980
 981 /*-
 982  * felem_inv calculates |out| = |in|^{-1} via a fixed square-and-multiply chain;
 983  * trailing comments give the exponent of |in| so far. |out| may alias |in|.
 984  * Based on Fermat's Little Theorem:
 985  *   a^p = a (mod p)
 986  *   a^{p-1} = 1 (mod p)
 987  *   a^{p-2} = a^{-1} (mod p), with p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3
 988  */
 989 static void felem_inv(felem out, const felem in)
 990 {
 991     felem ftmp, ftmp2;
 992     /* e_I holds |in|^{2^I - 1} for I <= 32; e64 holds |in|^{2^64 - 2^32} */
 993     felem e2, e4, e8, e16, e32, e64;
 994     longfelem tmp;
 995     unsigned i;
 996
 997     felem_square(tmp, in);
 998     felem_reduce(ftmp, tmp);    /* 2^1 */
 999     felem_mul(tmp, in, ftmp);
1000     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1001     felem_assign(e2, ftmp);
1002     felem_square(tmp, ftmp);
1003     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1004     felem_square(tmp, ftmp);
1005     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1006     felem_mul(tmp, ftmp, e2);
1007     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1008     felem_assign(e4, ftmp);
1009     felem_square(tmp, ftmp);
1010     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1011     felem_square(tmp, ftmp);
1012     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1013     felem_square(tmp, ftmp);
1014     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1015     felem_square(tmp, ftmp);
1016     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1017     felem_mul(tmp, ftmp, e4);
1018     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1019     felem_assign(e8, ftmp);
1020     for (i = 0; i < 8; i++) {
1021         felem_square(tmp, ftmp);
1022         felem_reduce(ftmp, tmp);
1023     }                           /* 2^16 - 2^8 */
1024     felem_mul(tmp, ftmp, e8);
1025     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1026     felem_assign(e16, ftmp);
1027     for (i = 0; i < 16; i++) {
1028         felem_square(tmp, ftmp);
1029         felem_reduce(ftmp, tmp);
1030     }                           /* 2^32 - 2^16 */
1031     felem_mul(tmp, ftmp, e16);
1032     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1033     felem_assign(e32, ftmp);
1034     for (i = 0; i < 32; i++) {
1035         felem_square(tmp, ftmp);
1036         felem_reduce(ftmp, tmp);
1037     }                           /* 2^64 - 2^32 */
1038     felem_assign(e64, ftmp);    /* saved before the + 2^0 step below */
1039     felem_mul(tmp, ftmp, in);
1040     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1041     for (i = 0; i < 192; i++) {
1042         felem_square(tmp, ftmp);
1043         felem_reduce(ftmp, tmp);
1044     }                           /* 2^256 - 2^224 + 2^192 */
1045
1046     felem_mul(tmp, e64, e32);   /* second half of the chain, built in ftmp2 */
1047     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1048     for (i = 0; i < 16; i++) {
1049         felem_square(tmp, ftmp2);
1050         felem_reduce(ftmp2, tmp);
1051     }                           /* 2^80 - 2^16 */
1052     felem_mul(tmp, ftmp2, e16);
1053     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1054     for (i = 0; i < 8; i++) {
1055         felem_square(tmp, ftmp2);
1056         felem_reduce(ftmp2, tmp);
1057     }                           /* 2^88 - 2^8 */
1058     felem_mul(tmp, ftmp2, e8);
1059     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1060     for (i = 0; i < 4; i++) {
1061         felem_square(tmp, ftmp2);
1062         felem_reduce(ftmp2, tmp);
1063     }                           /* 2^92 - 2^4 */
1064     felem_mul(tmp, ftmp2, e4);
1065     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1066     felem_square(tmp, ftmp2);
1067     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1068     felem_square(tmp, ftmp2);
1069     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1070     felem_mul(tmp, ftmp2, e2);
1071     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1072     felem_square(tmp, ftmp2);
1073     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1074     felem_square(tmp, ftmp2);
1075     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1076     felem_mul(tmp, ftmp2, in);
1077     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1078
1079     felem_mul(tmp, ftmp2, ftmp); /* combine both halves; exponents add */
1080     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1081 }
1082
1083 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1084 {
1085     felem wide;                 /* |in| in felem form, inverted in place */
1086
1087     smallfelem_expand(wide, in);
1088     felem_inv(wide, wide);      /* felem_inv reads its input before writing */
1089     felem_contract(out, wide);  /* back to smallfelem form for the caller */
1090 }
1091
1092 /*-
1093  * Group operations
1094  * ----------------
1095  *
1096  * Building on top of the field operations we have the operations on the
1097  * elliptic curve group itself. Points on the curve are represented in Jacobian
1098  * coordinates, i.e. (X, Y, Z) represents the affine point (X/Z^2, Y/Z^3).
1099  */
1100
1101 /*-
1102  * point_double calculates 2*(x_in, y_in, z_in) and writes the result to
1103  * (x_out, y_out, z_out).
1104  * The method is taken from:
1105  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1106  * The "[i] < 2^k" comments in the body track the bound on each limb.
1107  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1108  * while x_out == y_in is not (maybe this works, but it's not tested).
1109  */
1110 static void
1111 point_double(felem x_out, felem y_out, felem z_out,
1112              const felem x_in, const felem y_in, const felem z_in)
1113 {
1114     longfelem tmp, tmp2;
1115     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1116     smallfelem small1, small2;  /* shrunk gamma and shrunk alpha */
1117
1118     felem_assign(ftmp, x_in);
1119     /* ftmp = x; ftmp[i] < 2^106 */
1120     felem_assign(ftmp2, x_in);
1121     /* ftmp2 = x; ftmp2[i] < 2^106 */
1122
1123     /* delta = z^2 */
1124     felem_square(tmp, z_in);
1125     felem_reduce(delta, tmp);
1126     /* delta[i] < 2^101 */
1127
1128     /* gamma = y^2 */
1129     felem_square(tmp, y_in);
1130     felem_reduce(gamma, tmp);
1131     /* gamma[i] < 2^101 */
1132     felem_shrink(small1, gamma);
1133
1134     /* beta = x*gamma */
1135     felem_small_mul(tmp, small1, x_in);
1136     felem_reduce(beta, tmp);
1137     /* beta[i] < 2^101 */
1138
1139     /* alpha = 3*(x-delta)*(x+delta) = ftmp * ftmp2 */
1140     felem_diff(ftmp, delta);
1141     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1142     felem_sum(ftmp2, delta);
1143     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1144     felem_scalar(ftmp2, 3);
1145     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1146     felem_mul(tmp, ftmp, ftmp2);
1147     felem_reduce(alpha, tmp);
1148     /* alpha[i] < 2^101 */
1149     felem_shrink(small2, alpha);
1150
1151     /* x' = alpha^2 - 8*beta */
1152     smallfelem_square(tmp, small2);
1153     felem_reduce(x_out, tmp);
1154     felem_assign(ftmp, beta);
1155     felem_scalar(ftmp, 8);
1156     /* ftmp[i] < 8 * 2^101 = 2^104 */
1157     felem_diff(x_out, ftmp);
1158     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1159
1160     /* z' = (y + z)^2 - gamma - delta */
1161     felem_sum(delta, gamma);
1162     /* delta now holds gamma + delta; delta[i] < 2^101 + 2^101 = 2^102 */
1163     felem_assign(ftmp, y_in);
1164     felem_sum(ftmp, z_in);
1165     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1166     felem_square(tmp, ftmp);
1167     felem_reduce(z_out, tmp);
1168     felem_diff(z_out, delta);
1169     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1170
1171     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1172     felem_scalar(beta, 4);
1173     /* beta[i] < 4 * 2^101 = 2^103 */
1174     felem_diff_zero107(beta, x_out);
1175     /* beta now holds 4*beta - x'; beta[i] < 2^107 + 2^103 < 2^108 */
1176     felem_small_mul(tmp, small2, beta);
1177     /* tmp = alpha*(4*beta - x'); tmp[i] < 7 * 2^64 < 2^67 */
1178     smallfelem_square(tmp2, small1);
1179     /* tmp2 = gamma^2; tmp2[i] < 7 * 2^64 */
1180     longfelem_scalar(tmp2, 8);
1181     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1182     longfelem_diff(tmp, tmp2);
1183     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1184     felem_reduce_zero105(y_out, tmp);
1185     /* y_out[i] < 2^106 */
1186 }
1187
1188 /*
1189  * point_double_small doubles a point given as smallfelems: it expands the
1190  * inputs to felems, calls point_double and shrinks the outputs again.
1191  */
1192 static void
1193 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1194                    const smallfelem x_in, const smallfelem y_in,
1195                    const smallfelem z_in)
1196 {
1197     felem wide_in[3];           /* x, y, z of the input in felem form */
1198     felem wide_out[3];          /* x, y, z of the doubled point */
1199
1200     smallfelem_expand(wide_in[0], x_in);
1201     smallfelem_expand(wide_in[1], y_in);
1202     smallfelem_expand(wide_in[2], z_in);
1203     point_double(wide_out[0], wide_out[1], wide_out[2],
1204                  wide_in[0], wide_in[1], wide_in[2]);
1205     felem_shrink(x_out, wide_out[0]);
1206     felem_shrink(y_out, wide_out[1]);
1207     felem_shrink(z_out, wide_out[2]);
1208 }
1209
1210 /* copy_conditional copies in to out if mask is all ones, not at all if 0. */
1211 static void copy_conditional(felem out, const felem in, limb mask)
1212 {
1213     unsigned k = 0;
1214
1215     /* branch-free: XOR in the masked difference between in and out */
1216     for (; k < NLIMBS; k++)
1217         out[k] ^= (out[k] ^ in[k]) & mask;
1218 }
1219
1220 /* copy_small_conditional widens in into out iff mask is all ones. */
1221 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1222 {
1223     const u64 low_mask = mask;  /* mask narrowed to the width of in[] */
1224     unsigned k;
1225
1226     for (k = 0; k < NLIMBS; k++)
1227         out[k] = (out[k] & ~mask) | (limb)(in[k] & low_mask);
1228 }
1229
1230 /*-
1231  * point_add calculates (x3, y3, z3) = (x1, y1, z1) + (x2, y2, z2). If |mixed|
1232  * is non-zero, the z2 = 1 shortcut below is taken instead of using z2.
1233  * The method is taken from:
1234  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1235  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1236  * If one input has z = 0, the other input is copied to the output.
1237  * This function includes a branch for checking whether the two input points
1238  * are equal (while not equal to the point at infinity), in which case it calls
1239  * point_double. This case never happens during single point multiplication,
1240  * so there is no timing leak for ECDH or ECDSA signing.
1241  */
1242 static void point_add(felem x3, felem y3, felem z3,
1243                       const felem x1, const felem y1, const felem z1,
1244                       const int mixed, const smallfelem x2,
1245                       const smallfelem y2, const smallfelem z2)
1246 {
1247     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1248     longfelem tmp, tmp2;
1249     smallfelem small1, small2, small3, small4, small5;
1250     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1251
1252     felem_shrink(small3, z1);   /* small3 = z1 */
1253
1254     z1_is_zero = smallfelem_is_zero(small3);
1255     z2_is_zero = smallfelem_is_zero(z2);
1256
1257     /* ftmp = z1z1 = z1**2 */
1258     smallfelem_square(tmp, small3);
1259     felem_reduce(ftmp, tmp);
1260     /* ftmp[i] < 2^101 */
1261     felem_shrink(small1, ftmp); /* small1 = z1z1 */
1262
1263     if (!mixed) {
1264         /* ftmp2 = z2z2 = z2**2 */
1265         smallfelem_square(tmp, z2);
1266         felem_reduce(ftmp2, tmp);
1267         /* ftmp2[i] < 2^101 */
1268         felem_shrink(small2, ftmp2);
1269
1270         felem_shrink(small5, x1);
1271
1272         /* u1 = ftmp3 = x1*z2z2 */
1273         smallfelem_mul(tmp, small5, small2);
1274         felem_reduce(ftmp3, tmp);
1275         /* ftmp3[i] < 2^101 */
1276
1277         /* ftmp5 = z1 + z2 */
1278         felem_assign(ftmp5, z1);
1279         felem_small_sum(ftmp5, z2);
1280         /* ftmp5[i] < 2^107 */
1281
1282         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1283         felem_square(tmp, ftmp5);
1284         felem_reduce(ftmp5, tmp);
1285         /* ftmp2 = z2z2 + z1z1 */
1286         felem_sum(ftmp2, ftmp);
1287         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1288         felem_diff(ftmp5, ftmp2);
1289         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1290
1291         /* ftmp2 = z2 * z2z2 */
1292         smallfelem_mul(tmp, small2, z2);
1293         felem_reduce(ftmp2, tmp);
1294
1295         /* s1 = ftmp6 = y1 * z2**3 */
1296         felem_mul(tmp, y1, ftmp2);
1297         felem_reduce(ftmp6, tmp);
1298         /* ftmp6[i] < 2^101 */
1299     } else {
1300         /*
1301          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1302          */
1303
1304         /* u1 = ftmp3 = x1*z2z2 */
1305         felem_assign(ftmp3, x1);
1306         /* ftmp3[i] < 2^106 */
1307
1308         /* ftmp5 = 2z1z2 */
1309         felem_assign(ftmp5, z1);
1310         felem_scalar(ftmp5, 2);
1311         /* ftmp5[i] < 2*2^106 = 2^107 */
1312
1313         /* s1 = ftmp6 = y1 * z2**3 */
1314         felem_assign(ftmp6, y1);
1315         /* ftmp6[i] < 2^106 */
1316     }
1317
1318     /* u2 = ftmp4 = x2*z1z1 */
1319     smallfelem_mul(tmp, x2, small1);
1320     felem_reduce(ftmp4, tmp);
1321
1322     /* h = ftmp4 = u2 - u1 */
1323     felem_diff_zero107(ftmp4, ftmp3);
1324     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1325     felem_shrink(small4, ftmp4);
1326
1327     x_equal = smallfelem_is_zero(small4);
1328
1329     /* z_out = ftmp5 * h */
1330     felem_small_mul(tmp, small4, ftmp5);
1331     felem_reduce(z_out, tmp);
1332     /* z_out[i] < 2^101 */
1333
1334     /* ftmp = z1 * z1z1 */
1335     smallfelem_mul(tmp, small1, small3);
1336     felem_reduce(ftmp, tmp);
1337
1338     /* s2 = ftmp5 = y2 * z1**3 */
1339     felem_small_mul(tmp, y2, ftmp);
1340     felem_reduce(ftmp5, tmp);
1341
1342     /* r = ftmp5 = (s2 - s1)*2 */
1343     felem_diff_zero107(ftmp5, ftmp6);
1344     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1345     felem_scalar(ftmp5, 2);
1346     /* ftmp5[i] < 2^109 */
1347     felem_shrink(small1, ftmp5); /* small1 is reused: it now holds r */
1348     y_equal = smallfelem_is_zero(small1);
1349
1350     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1351         point_double(x3, y3, z3, x1, y1, z1);
1352         return;
1353     }
1354
1355     /* I = ftmp = (2h)**2 */
1356     felem_assign(ftmp, ftmp4);
1357     felem_scalar(ftmp, 2);
1358     /* ftmp[i] < 2*2^108 = 2^109 */
1359     felem_square(tmp, ftmp);
1360     felem_reduce(ftmp, tmp);
1361
1362     /* J = ftmp2 = h * I */
1363     felem_mul(tmp, ftmp4, ftmp);
1364     felem_reduce(ftmp2, tmp);
1365
1366     /* V = ftmp4 = U1 * I */
1367     felem_mul(tmp, ftmp3, ftmp);
1368     felem_reduce(ftmp4, tmp);
1369
1370     /* x_out = r**2 - J - 2V */
1371     smallfelem_square(tmp, small1);
1372     felem_reduce(x_out, tmp);
1373     felem_assign(ftmp3, ftmp4); /* ftmp3 = V, kept for y_out below */
1374     felem_scalar(ftmp4, 2);
1375     felem_sum(ftmp4, ftmp2);
1376     /* ftmp4 = 2V + J; ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1377     felem_diff(x_out, ftmp4);
1378     /* x_out[i] < 2^105 + 2^101 */
1379
1380     /* y_out = r(V-x_out) - 2 * s1 * J */
1381     felem_diff_zero107(ftmp3, x_out);
1382     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1383     felem_small_mul(tmp, small1, ftmp3);
1384     felem_mul(tmp2, ftmp6, ftmp2);
1385     longfelem_scalar(tmp2, 2);
1386     /* tmp2[i] < 2*2^67 = 2^68 */
1387     longfelem_diff(tmp, tmp2);
1388     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1389     felem_reduce_zero105(y_out, tmp);
1390     /* y_out[i] < 2^106 */
1391     /* z1 == 0: result is point 2; z2 == 0: result is point 1 (applied last) */
1392     copy_small_conditional(x_out, x2, z1_is_zero);
1393     copy_conditional(x_out, x1, z2_is_zero);
1394     copy_small_conditional(y_out, y2, z1_is_zero);
1395     copy_conditional(y_out, y1, z2_is_zero);
1396     copy_small_conditional(z_out, z2, z1_is_zero);
1397     copy_conditional(z_out, z1, z2_is_zero);
1398     felem_assign(x3, x_out);
1399     felem_assign(y3, y_out);
1400     felem_assign(z3, z_out);
1401 }
1402
1403 /*
1404  * point_add_small adds two points given as smallfelems: the first point is
1405  * expanded to felems, point_add is run non-mixed and the sum is shrunk again.
1406  */
1407 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1408                             smallfelem x1, smallfelem y1, smallfelem z1,
1409                             smallfelem x2, smallfelem y2, smallfelem z2)
1410 {
1411     felem lhs[3];               /* (x1, y1, z1) in felem form */
1412     felem sum[3];               /* point_add output before shrinking */
1413     smallfelem_expand(lhs[0], x1);
1414     smallfelem_expand(lhs[1], y1);
1415     smallfelem_expand(lhs[2], z1);
1416     point_add(sum[0], sum[1], sum[2], lhs[0], lhs[1], lhs[2],
1417               0 /* mixed */, x2, y2, z2);
1418     felem_shrink(x3, sum[0]);
1419     felem_shrink(y3, sum[1]);
1420     felem_shrink(z3, sum[2]);
1421 }
1422
1423 /*-
1424  * Base point pre computation
1425  * --------------------------
1426  *
1427  * Two different sorts of precomputed tables are used in the following code.
1428  * Each contains various points on the curve, where each point is three field
1429  * elements (x, y, z).
1430  *
1431  * For the base point table, z is usually 1 (0 for the point at infinity).
1432  * This table has 2 * 16 elements, starting with the following:
1433  * index | bits    | point
1434  * ------+---------+------------------------------
1435  *     0 | 0 0 0 0 | 0G
1436  *     1 | 0 0 0 1 | 1G
1437  *     2 | 0 0 1 0 | 2^64G
1438  *     3 | 0 0 1 1 | (2^64 + 1)G
1439  *     4 | 0 1 0 0 | 2^128G
1440  *     5 | 0 1 0 1 | (2^128 + 1)G
1441  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1442  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1443  *     8 | 1 0 0 0 | 2^192G
1444  *     9 | 1 0 0 1 | (2^192 + 1)G
1445  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1446  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1447  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1448  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1449  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1450  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1451  * followed by a copy of this with each element multiplied by 2^32.
1452  *
1453  * The reason for this is so that we can clock bits into four different
1454  * locations when doing simple scalar multiplies against the base point,
1455  * and then another four locations using the second 16 elements.
1456  *
1457  * Tables for other points P have table[i] = iP for i in 0 .. 16. */
1458
1459 /* gmul is the table of precomputed base points, each stored as {x, y, z} */
1460 static const smallfelem gmul[2][16][3] = {
1461     {{{0, 0, 0, 0},             /* gmul[0][0] = 0G: all zero, so z = 0 */
1462       {0, 0, 0, 0},
1463       {0, 0, 0, 0}},
1464      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1465        0x6b17d1f2e12c4247},     /* gmul[0][1] = 1G */
1466       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1467        0x4fe342e2fe1a7f9b},
1468       {1, 0, 0, 0}},            /* z = 1 for every entry except index 0 */
1469      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1470        0x0fa822bc2811aaa5},
1471       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1472        0xbff44ae8f5dba80d},
1473       {1, 0, 0, 0}},
1474      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1475        0x300a4bbc89d6726f},
1476       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1477        0x72aac7e0d09b4644},
1478       {1, 0, 0, 0}},
1479      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1480        0x447d739beedb5e67},
1481       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1482        0x2d4825ab834131ee},
1483       {1, 0, 0, 0}},
1484      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1485        0xef9519328a9c72ff},
1486       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1487        0x611e9fc37dbb2c9b},
1488       {1, 0, 0, 0}},
1489      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1490        0x550663797b51f5d8},
1491       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1492        0x157164848aecb851},
1493       {1, 0, 0, 0}},
1494      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1495        0xeb5d7745b21141ea},
1496       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1497        0xeafd72ebdbecc17b},
1498       {1, 0, 0, 0}},
1499      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1500        0xa6d39677a7849276},
1501       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1502        0x674f84749b0b8816},
1503       {1, 0, 0, 0}},
1504      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1505        0x4e769e7672c9ddad},
1506       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1507        0x42b99082de830663},
1508       {1, 0, 0, 0}},
1509      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1510        0x78878ef61c6ce04d},
1511       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1512        0xb6cb3f5d7b72c321},
1513       {1, 0, 0, 0}},
1514      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1515        0x0c88bc4d716b1287},
1516       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1517        0xdd5ddea3f3901dc6},
1518       {1, 0, 0, 0}},
1519      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1520        0x68f344af6b317466},
1521       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1522        0x31b9c405f8540a20},
1523       {1, 0, 0, 0}},
1524      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1525        0x4052bf4b6f461db9},
1526       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1527        0xfecf4d5190b0fc61},
1528       {1, 0, 0, 0}},
1529      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1530        0x1eddbae2c802e41a},
1531       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1532        0x43104d86560ebcfc},
1533       {1, 0, 0, 0}},
1534      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1535        0xb48e26b484f7a21c},
1536       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1537        0xfac015404d4d3dab},
1538       {1, 0, 0, 0}}},
1539     {{{0, 0, 0, 0},             /* gmul[1]: the same 16 points times 2^32 */
1540       {0, 0, 0, 0},
1541       {0, 0, 0, 0}},
1542      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1543        0x7fe36b40af22af89},
1544       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1545        0xe697d45825b63624},
1546       {1, 0, 0, 0}},
1547      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1548        0x4a5b506612a677a6},
1549       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1550        0xeb13461ceac089f1},
1551       {1, 0, 0, 0}},
1552      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1553        0x0781b8291c6a220a},
1554       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1555        0x690cde8df0151593},
1556       {1, 0, 0, 0}},
1557      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1558        0x8a535f566ec73617},
1559       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1560        0x0455c08468b08bd7},
1561       {1, 0, 0, 0}},
1562      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1563        0x06bada7ab77f8276},
1564       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1565        0x5b476dfd0e6cb18a},
1566       {1, 0, 0, 0}},
1567      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1568        0x3e29864e8a2ec908},
1569       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1570        0x239b90ea3dc31e7e},
1571       {1, 0, 0, 0}},
1572      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1573        0x820f4dd949f72ff7},
1574       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1575        0x140406ec783a05ec},
1576       {1, 0, 0, 0}},
1577      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1578        0x68f6b8542783dfee},
1579       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1580        0xcbe1feba92e40ce6},
1581       {1, 0, 0, 0}},
1582      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1583        0xd0b2f94d2f420109},
1584       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1585        0x971459828b0719e5},
1586       {1, 0, 0, 0}},
1587      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1588        0x961610004a866aba},
1589       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1590        0x7acb9fadcee75e44},
1591       {1, 0, 0, 0}},
1592      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1593        0x24eb9acca333bf5b},
1594       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1595        0x69f891c5acd079cc},
1596       {1, 0, 0, 0}},
1597      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1598        0xe51f547c5972a107},
1599       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1600        0x1c309a2b25bb1387},
1601       {1, 0, 0, 0}},
1602      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1603        0x20b87b8aa2c4e503},
1604       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1605        0xf5c6fa49919776be},
1606       {1, 0, 0, 0}},
1607      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1608        0x1ed7d1b9332010b9},
1609       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1610        0x3a2b03f03217257a},
1611       {1, 0, 0, 0}},
1612      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1613        0x15fee545c78dd9f6},
1614       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1615        0x4ab5b6b2b8753f81},
1616       {1, 0, 0, 0}}}
1617 };
1618
1619 /*
1620  * select_point copies entry |idx| of the |size|-entry table |pre_comp| to out.
1621  * Every entry is read and masked, so the access pattern is the same for any idx.
1622  */
1623 static void select_point(const u64 idx, unsigned int size,
1624                          const smallfelem pre_comp[16][3], smallfelem out[3])
1625 {
1626     unsigned entry, k;
1627     u64 *dst = &out[0][0];
1628
1629     memset(out, 0, 3 * sizeof(*out));
1630
1631     for (entry = 0; entry < size; entry++) {
1632         const u64 *src = (const u64 *)&pre_comp[entry][0][0];
1633         u64 mask, diff = entry ^ idx;
1634         /* OR bits 0..7 of diff into bit 0; it stays clear only on a match */
1635         diff |= diff >> 4;
1636         diff |= diff >> 2;
1637         diff |= diff >> 1;
1638         mask = (diff & 1) - 1;  /* all ones on a match, zero otherwise */
1639         for (k = 0; k < 3 * NLIMBS; k++)
1640             dst[k] |= src[k] & mask;
1641     }
1642 }
1643
1644 /* get_bit returns bit |i| of |in| (bit 0 = lsb of in[0]); 0 if out of range */
1645 static char get_bit(const felem_bytearray in, int i)
1646 {
1647     if (i >= 0 && i < 256)
1648         return (in[i / 8] >> (i % 8)) & 1;
1649     return 0;
1650 }
1651
1652 /*
1653  * Interleaved point multiplication using precomputed point multiples: The
1654  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1655  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1656  * generator, using certain (large) precomputed multiples in g_pre_comp.
1657  * |mixed| is forwarded to point_add; output (X, Y, Z) goes to x_out, y_out, z_out
1658  */
1659 static void batch_mul(felem x_out, felem y_out, felem z_out,
1660                       const felem_bytearray scalars[],
1661                       const unsigned num_points, const u8 *g_scalar,
1662                       const int mixed, const smallfelem pre_comp[][17][3],
1663                       const smallfelem g_pre_comp[2][16][3])
1664 {
1665     int i, skip;
1666     unsigned num, gen_mul = (g_scalar != NULL);
1667     felem nq[3], ftmp;          /* nq is the running result (x, y, z) */
1668     smallfelem tmp[3];          /* the table entry picked by select_point */
1669     u64 bits;
1670     u8 sign, digit;
1671
1672     /* set nq to the point at infinity */
1673     memset(nq, 0, sizeof(nq));
1674
1675     /*
1676      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1677      * of the generator (two in each of the last 32 rounds) and additions of
1678      * other points' multiples (every 5th round); 32 rounds if num_points == 0.
1679      */
1680     skip = 1;                   /* save two point operations in the first
1681                                  * round */
1682     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1683         /* double */
1684         if (!skip)
1685             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1686
1687         /* add multiples of the generator */
1688         if (gen_mul && (i <= 31)) {
1689             /* first, look 32 bits upwards */
1690             bits = get_bit(g_scalar, i + 224) << 3;
1691             bits |= get_bit(g_scalar, i + 160) << 2;
1692             bits |= get_bit(g_scalar, i + 96) << 1;
1693             bits |= get_bit(g_scalar, i + 32);
1694             /* select the point to add, in constant time */
1695             select_point(bits, 16, g_pre_comp[1], tmp);
1696
1697             if (!skip) {
1698                 /* Arg 1 below is for "mixed" */
1699                 point_add(nq[0], nq[1], nq[2],
1700                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1701             } else {
1702                 smallfelem_expand(nq[0], tmp[0]);
1703                 smallfelem_expand(nq[1], tmp[1]);
1704                 smallfelem_expand(nq[2], tmp[2]);
1705                 skip = 0;
1706             }
1707
1708             /* second, look at the current position */
1709             bits = get_bit(g_scalar, i + 192) << 3;
1710             bits |= get_bit(g_scalar, i + 128) << 2;
1711             bits |= get_bit(g_scalar, i + 64) << 1;
1712             bits |= get_bit(g_scalar, i);
1713             /* select the point to add, in constant time */
1714             select_point(bits, 16, g_pre_comp[0], tmp);
1715             /* Arg 1 below is for "mixed" */
1716             point_add(nq[0], nq[1], nq[2],
1717                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1718         }
1719
1720         /* do other additions every 5 doublings */
1721         if (num_points && (i % 5 == 0)) {
1722             /* loop over all scalars */
1723             for (num = 0; num < num_points; ++num) {
1724                 bits = get_bit(scalars[num], i + 4) << 5;
1725                 bits |= get_bit(scalars[num], i + 3) << 4;
1726                 bits |= get_bit(scalars[num], i + 2) << 3;
1727                 bits |= get_bit(scalars[num], i + 1) << 2;
1728                 bits |= get_bit(scalars[num], i) << 1;
1729                 bits |= get_bit(scalars[num], i - 1);
1730                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1731
1732                 /*
1733                  * select the point to add or subtract, in constant time
1734                  */
1735                 select_point(digit, 17, pre_comp[num], tmp);
1736                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1737                                                * point; +Y restored if !sign */
1738                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1739                 felem_contract(tmp[1], ftmp);
1740
1741                 if (!skip) {
1742                     point_add(nq[0], nq[1], nq[2],
1743                               nq[0], nq[1], nq[2],
1744                               mixed, tmp[0], tmp[1], tmp[2]);
1745                 } else {
1746                     smallfelem_expand(nq[0], tmp[0]);
1747                     smallfelem_expand(nq[1], tmp[1]);
1748                     smallfelem_expand(nq[2], tmp[2]);
1749                     skip = 0;
1750                 }
1751             }
1752         }
1753     }
1754     felem_assign(x_out, nq[0]);
1755     felem_assign(y_out, nq[1]);
1756     felem_assign(z_out, nq[2]);
1757 }
1758
1759 /* Precomputation for the group generator, shared by reference counting. */
1760 struct nistp256_pre_comp_st {
1761     smallfelem g_pre_comp[2][16][3]; /* same dimensions as gmul */
1762     int references;             /* changed via CRYPTO_atomic_add with |lock| */
1763     CRYPTO_RWLOCK *lock;        /* created in nistp256_pre_comp_new */
1764 };
1765
1766 const EC_METHOD *EC_GFp_nistp256_method(void)
1767 {                               /* returns a function-local static table */
1768     static const EC_METHOD ret = { /* positional: order follows EC_METHOD */
1769         EC_FLAGS_DEFAULT_OCT,
1770         NID_X9_62_prime_field,
1771         ec_GFp_nistp256_group_init, /* defined in this file */
1772         ec_GFp_simple_group_finish,
1773         ec_GFp_simple_group_clear_finish,
1774         ec_GFp_nist_group_copy,
1775         ec_GFp_nistp256_group_set_curve, /* defined in this file */
1776         ec_GFp_simple_group_get_curve,
1777         ec_GFp_simple_group_get_degree,
1778         ec_group_simple_order_bits,
1779         ec_GFp_simple_group_check_discriminant,
1780         ec_GFp_simple_point_init,
1781         ec_GFp_simple_point_finish,
1782         ec_GFp_simple_point_clear_finish,
1783         ec_GFp_simple_point_copy,
1784         ec_GFp_simple_point_set_to_infinity,
1785         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1786         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1787         ec_GFp_simple_point_set_affine_coordinates,
1788         ec_GFp_nistp256_point_get_affine_coordinates,
1789         0 /* point_set_compressed_coordinates */ ,
1790         0 /* point2oct */ ,
1791         0 /* oct2point */ ,
1792         ec_GFp_simple_add,
1793         ec_GFp_simple_dbl,
1794         ec_GFp_simple_invert,
1795         ec_GFp_simple_is_at_infinity,
1796         ec_GFp_simple_is_on_curve,
1797         ec_GFp_simple_cmp,
1798         ec_GFp_simple_make_affine,
1799         ec_GFp_simple_points_make_affine,
1800         ec_GFp_nistp256_points_mul,
1801         ec_GFp_nistp256_precompute_mult,
1802         ec_GFp_nistp256_have_precompute_mult,
1803         ec_GFp_nist_field_mul,
1804         ec_GFp_nist_field_sqr,
1805         0 /* field_div */ ,
1806         0 /* field_encode */ ,
1807         0 /* field_decode */ ,
1808         0,                      /* field_set_to_one */
1809         ec_key_simple_priv2oct,
1810         ec_key_simple_oct2priv,
1811         0, /* set private */
1812         ec_key_simple_generate_key,
1813         ec_key_simple_check_key,
1814         ec_key_simple_generate_public_key,
1815         0, /* keycopy */
1816         0, /* keyfinish */
1817         ecdh_simple_compute_key
1818     };
1819
1820     return &ret;                /* the same address on every call */
1821 }
1822
1823 /******************************************************************************/
1824 /*
1825  * FUNCTIONS TO MANAGE PRECOMPUTATION
1826  */
1827
1828 static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1829 {
1830     /* Returns a zero-filled object holding one reference, or NULL on error. */
1831     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1832
1833     if (ret == NULL) {
1834         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1835         return NULL;
1836     }
1837
1838     ret->references = 1;
1839     ret->lock = CRYPTO_THREAD_lock_new();
1840     if (ret->lock == NULL) {
1841         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1842         OPENSSL_free(ret);      /* do not leak the half-built object */
1843         return NULL;
1844     }
1845     return ret;
1846 }
1847
1848 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1849 {
1850     int i;
1851     if (p != NULL)
1852         CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
1853     return p;
1854 }
1855
1856 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1857 {
1858     int i;
1859
1860     if (pre == NULL)
1861         return;
1862
1863     CRYPTO_atomic_add(&pre->references, -1, &i, pre->lock);
1864     REF_PRINT_COUNT("EC_nistp256", x);
1865     if (i > 0)
1866         return;
1867     REF_ASSERT_ISNT(i < 0);
1868
1869     CRYPTO_THREAD_lock_free(pre->lock);
1870     OPENSSL_free(pre);
1871 }
1872
1873 /******************************************************************************/
1874 /*
1875  * OPENSSL EC_METHOD FUNCTIONS
1876  */
1877
1878 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1879 {
1880     int ret;
1881     ret = ec_GFp_simple_group_init(group);
1882     group->a_is_minus3 = 1;
1883     return ret;
1884 }
1885
1886 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1887                                     const BIGNUM *a, const BIGNUM *b,
1888                                     BN_CTX *ctx)
1889 {
1890     int ret = 0;
1891     BN_CTX *new_ctx = NULL;
1892     BIGNUM *curve_p, *curve_a, *curve_b;
1893
1894     if (ctx == NULL)
1895         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1896             return 0;
1897     BN_CTX_start(ctx);
1898     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1899         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1900         ((curve_b = BN_CTX_get(ctx)) == NULL))
1901         goto err;
1902     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1903     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1904     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1905     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1906         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1907               EC_R_WRONG_CURVE_PARAMETERS);
1908         goto err;
1909     }
1910     group->field_mod_func = BN_nist_mod_256;
1911     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1912  err:
1913     BN_CTX_end(ctx);
1914     BN_CTX_free(new_ctx);
1915     return ret;
1916 }
1917
1918 /*
1919  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1920  * (X/Z^2, Y/Z^3)
1921  */
1922 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1923                                                  const EC_POINT *point,
1924                                                  BIGNUM *x, BIGNUM *y,
1925                                                  BN_CTX *ctx)
1926 {
1927     felem z1, z2, x_in, y_in;
1928     smallfelem x_out, y_out;
1929     longfelem tmp;
1930
1931     if (EC_POINT_is_at_infinity(group, point)) {
1932         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1933               EC_R_POINT_AT_INFINITY);
1934         return 0;
1935     }
1936     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1937         (!BN_to_felem(z1, point->Z)))
1938         return 0;
1939     felem_inv(z2, z1);
1940     felem_square(tmp, z2);
1941     felem_reduce(z1, tmp);
1942     felem_mul(tmp, x_in, z1);
1943     felem_reduce(x_in, tmp);
1944     felem_contract(x_out, x_in);
1945     if (x != NULL) {
1946         if (!smallfelem_to_BN(x, x_out)) {
1947             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1948                   ERR_R_BN_LIB);
1949             return 0;
1950         }
1951     }
1952     felem_mul(tmp, z1, z2);
1953     felem_reduce(z1, tmp);
1954     felem_mul(tmp, y_in, z1);
1955     felem_reduce(y_in, tmp);
1956     felem_contract(y_out, y_in);
1957     if (y != NULL) {
1958         if (!smallfelem_to_BN(y, y_out)) {
1959             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1960                   ERR_R_BN_LIB);
1961             return 0;
1962         }
1963     }
1964     return 1;
1965 }
1966
1967 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1968 static void make_points_affine(size_t num, smallfelem points[][3],
1969                                smallfelem tmp_smallfelems[])
1970 {
1971     /*
1972      * Runs in constant time, unless an input is the point at infinity (which
1973      * normally shouldn't happen).
1974      */
1975     ec_GFp_nistp_points_make_affine_internal(num,
1976                                              points,
1977                                              sizeof(smallfelem),
1978                                              tmp_smallfelems,
1979                                              (void (*)(void *))smallfelem_one,
1980                                              (int (*)(const void *))
1981                                              smallfelem_is_zero_int,
1982                                              (void (*)(void *, const void *))
1983                                              smallfelem_assign,
1984                                              (void (*)(void *, const void *))
1985                                              smallfelem_square_contract,
1986                                              (void (*)
1987                                               (void *, const void *,
1988                                                const void *))
1989                                              smallfelem_mul_contract,
1990                                              (void (*)(void *, const void *))
1991                                              smallfelem_inv_contract,
1992                                              /* nothing to contract */
1993                                              (void (*)(void *, const void *))
1994                                              smallfelem_assign);
1995 }
1996
1997 /*
1998  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1999  * values Result is stored in r (r can equal one of the inputs).
2000  */
2001 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2002                                const BIGNUM *scalar, size_t num,
2003                                const EC_POINT *points[],
2004                                const BIGNUM *scalars[], BN_CTX *ctx)
2005 {
2006     int ret = 0;
2007     int j;
2008     int mixed = 0;
2009     BN_CTX *new_ctx = NULL;
2010     BIGNUM *x, *y, *z, *tmp_scalar;
2011     felem_bytearray g_secret;
2012     felem_bytearray *secrets = NULL;
2013     smallfelem (*pre_comp)[17][3] = NULL;
2014     smallfelem *tmp_smallfelems = NULL;
2015     felem_bytearray tmp;
2016     unsigned i, num_bytes;
2017     int have_pre_comp = 0;
2018     size_t num_points = num;
2019     smallfelem x_in, y_in, z_in;
2020     felem x_out, y_out, z_out;
2021     NISTP256_PRE_COMP *pre = NULL;
2022     const smallfelem(*g_pre_comp)[16][3] = NULL;
2023     EC_POINT *generator = NULL;
2024     const EC_POINT *p = NULL;
2025     const BIGNUM *p_scalar = NULL;
2026
2027     if (ctx == NULL)
2028         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2029             return 0;
2030     BN_CTX_start(ctx);
2031     if (((x = BN_CTX_get(ctx)) == NULL) ||
2032         ((y = BN_CTX_get(ctx)) == NULL) ||
2033         ((z = BN_CTX_get(ctx)) == NULL) ||
2034         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2035         goto err;
2036
2037     if (scalar != NULL) {
2038         pre = group->pre_comp.nistp256;
2039         if (pre)
2040             /* we have precomputation, try to use it */
2041             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2042         else
2043             /* try to use the standard precomputation */
2044             g_pre_comp = &gmul[0];
2045         generator = EC_POINT_new(group);
2046         if (generator == NULL)
2047             goto err;
2048         /* get the generator from precomputation */
2049         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2050             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2051             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2052             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2053             goto err;
2054         }
2055         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2056                                                       generator, x, y, z,
2057                                                       ctx))
2058             goto err;
2059         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2060             /* precomputation matches generator */
2061             have_pre_comp = 1;
2062         else
2063             /*
2064              * we don't have valid precomputation: treat the generator as a
2065              * random point
2066              */
2067             num_points++;
2068     }
2069     if (num_points > 0) {
2070         if (num_points >= 3) {
2071             /*
2072              * unless we precompute multiples for just one or two points,
2073              * converting those into affine form is time well spent
2074              */
2075             mixed = 1;
2076         }
2077         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2078         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2079         if (mixed)
2080             tmp_smallfelems =
2081               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2082         if ((secrets == NULL) || (pre_comp == NULL)
2083             || (mixed && (tmp_smallfelems == NULL))) {
2084             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2085             goto err;
2086         }
2087
2088         /*
2089          * we treat NULL scalars as 0, and NULL points as points at infinity,
2090          * i.e., they contribute nothing to the linear combination
2091          */
2092         memset(secrets, 0, sizeof(*secrets) * num_points);
2093         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2094         for (i = 0; i < num_points; ++i) {
2095             if (i == num)
2096                 /*
2097                  * we didn't have a valid precomputation, so we pick the
2098                  * generator
2099                  */
2100             {
2101                 p = EC_GROUP_get0_generator(group);
2102                 p_scalar = scalar;
2103             } else
2104                 /* the i^th point */
2105             {
2106                 p = points[i];
2107                 p_scalar = scalars[i];
2108             }
2109             if ((p_scalar != NULL) && (p != NULL)) {
2110                 /* reduce scalar to 0 <= scalar < 2^256 */
2111                 if ((BN_num_bits(p_scalar) > 256)
2112                     || (BN_is_negative(p_scalar))) {
2113                     /*
2114                      * this is an unusual input, and we don't guarantee
2115                      * constant-timeness
2116                      */
2117                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2118                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2119                         goto err;
2120                     }
2121                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2122                 } else
2123                     num_bytes = BN_bn2bin(p_scalar, tmp);
2124                 flip_endian(secrets[i], tmp, num_bytes);
2125                 /* precompute multiples */
2126                 if ((!BN_to_felem(x_out, p->X)) ||
2127                     (!BN_to_felem(y_out, p->Y)) ||
2128                     (!BN_to_felem(z_out, p->Z)))
2129                     goto err;
2130                 felem_shrink(pre_comp[i][1][0], x_out);
2131                 felem_shrink(pre_comp[i][1][1], y_out);
2132                 felem_shrink(pre_comp[i][1][2], z_out);
2133                 for (j = 2; j <= 16; ++j) {
2134                     if (j & 1) {
2135                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2136                                         pre_comp[i][j][2], pre_comp[i][1][0],
2137                                         pre_comp[i][1][1], pre_comp[i][1][2],
2138                                         pre_comp[i][j - 1][0],
2139                                         pre_comp[i][j - 1][1],
2140                                         pre_comp[i][j - 1][2]);
2141                     } else {
2142                         point_double_small(pre_comp[i][j][0],
2143                                            pre_comp[i][j][1],
2144                                            pre_comp[i][j][2],
2145                                            pre_comp[i][j / 2][0],
2146                                            pre_comp[i][j / 2][1],
2147                                            pre_comp[i][j / 2][2]);
2148                     }
2149                 }
2150             }
2151         }
2152         if (mixed)
2153             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2154     }
2155
2156     /* the scalar for the generator */
2157     if ((scalar != NULL) && (have_pre_comp)) {
2158         memset(g_secret, 0, sizeof(g_secret));
2159         /* reduce scalar to 0 <= scalar < 2^256 */
2160         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2161             /*
2162              * this is an unusual input, and we don't guarantee
2163              * constant-timeness
2164              */
2165             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2166                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2167                 goto err;
2168             }
2169             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2170         } else
2171             num_bytes = BN_bn2bin(scalar, tmp);
2172         flip_endian(g_secret, tmp, num_bytes);
2173         /* do the multiplication with generator precomputation */
2174         batch_mul(x_out, y_out, z_out,
2175                   (const felem_bytearray(*))secrets, num_points,
2176                   g_secret,
2177                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2178     } else
2179         /* do the multiplication without generator precomputation */
2180         batch_mul(x_out, y_out, z_out,
2181                   (const felem_bytearray(*))secrets, num_points,
2182                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2183     /* reduce the output to its unique minimal representation */
2184     felem_contract(x_in, x_out);
2185     felem_contract(y_in, y_out);
2186     felem_contract(z_in, z_out);
2187     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2188         (!smallfelem_to_BN(z, z_in))) {
2189         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2190         goto err;
2191     }
2192     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2193
2194  err:
2195     BN_CTX_end(ctx);
2196     EC_POINT_free(generator);
2197     BN_CTX_free(new_ctx);
2198     OPENSSL_free(secrets);
2199     OPENSSL_free(pre_comp);
2200     OPENSSL_free(tmp_smallfelems);
2201     return ret;
2202 }
2203
2204 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2205 {
2206     int ret = 0;
2207     NISTP256_PRE_COMP *pre = NULL;
2208     int i, j;
2209     BN_CTX *new_ctx = NULL;
2210     BIGNUM *x, *y;
2211     EC_POINT *generator = NULL;
2212     smallfelem tmp_smallfelems[32];
2213     felem x_tmp, y_tmp, z_tmp;
2214
2215     /* throw away old precomputation */
2216     EC_pre_comp_free(group);
2217     if (ctx == NULL)
2218         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2219             return 0;
2220     BN_CTX_start(ctx);
2221     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2222         goto err;
2223     /* get the generator */
2224     if (group->generator == NULL)
2225         goto err;
2226     generator = EC_POINT_new(group);
2227     if (generator == NULL)
2228         goto err;
2229     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2230     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2231     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2232         goto err;
2233     if ((pre = nistp256_pre_comp_new()) == NULL)
2234         goto err;
2235     /*
2236      * if the generator is the standard one, use built-in precomputation
2237      */
2238     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2239         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2240         goto done;
2241     }
2242     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2243         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2244         (!BN_to_felem(z_tmp, group->generator->Z)))
2245         goto err;
2246     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2247     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2248     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2249     /*
2250      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2251      * 2^160*G, 2^224*G for the second one
2252      */
2253     for (i = 1; i <= 8; i <<= 1) {
2254         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2255                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2256                            pre->g_pre_comp[0][i][1],
2257                            pre->g_pre_comp[0][i][2]);
2258         for (j = 0; j < 31; ++j) {
2259             point_double_small(pre->g_pre_comp[1][i][0],
2260                                pre->g_pre_comp[1][i][1],
2261                                pre->g_pre_comp[1][i][2],
2262                                pre->g_pre_comp[1][i][0],
2263                                pre->g_pre_comp[1][i][1],
2264                                pre->g_pre_comp[1][i][2]);
2265         }
2266         if (i == 8)
2267             break;
2268         point_double_small(pre->g_pre_comp[0][2 * i][0],
2269                            pre->g_pre_comp[0][2 * i][1],
2270                            pre->g_pre_comp[0][2 * i][2],
2271                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2272                            pre->g_pre_comp[1][i][2]);
2273         for (j = 0; j < 31; ++j) {
2274             point_double_small(pre->g_pre_comp[0][2 * i][0],
2275                                pre->g_pre_comp[0][2 * i][1],
2276                                pre->g_pre_comp[0][2 * i][2],
2277                                pre->g_pre_comp[0][2 * i][0],
2278                                pre->g_pre_comp[0][2 * i][1],
2279                                pre->g_pre_comp[0][2 * i][2]);
2280         }
2281     }
2282     for (i = 0; i < 2; i++) {
2283         /* g_pre_comp[i][0] is the point at infinity */
2284         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2285         /* the remaining multiples */
2286         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2287         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2288                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2289                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2290                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2291                         pre->g_pre_comp[i][2][2]);
2292         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2293         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2294                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2295                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2296                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2297                         pre->g_pre_comp[i][2][2]);
2298         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2299         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2300                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2301                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2302                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2303                         pre->g_pre_comp[i][4][2]);
2304         /*
2305          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2306          */
2307         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2308                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2309                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2310                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2311                         pre->g_pre_comp[i][2][2]);
2312         for (j = 1; j < 8; ++j) {
2313             /* odd multiples: add G resp. 2^32*G */
2314             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2315                             pre->g_pre_comp[i][2 * j + 1][1],
2316                             pre->g_pre_comp[i][2 * j + 1][2],
2317                             pre->g_pre_comp[i][2 * j][0],
2318                             pre->g_pre_comp[i][2 * j][1],
2319                             pre->g_pre_comp[i][2 * j][2],
2320                             pre->g_pre_comp[i][1][0],
2321                             pre->g_pre_comp[i][1][1],
2322                             pre->g_pre_comp[i][1][2]);
2323         }
2324     }
2325     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2326
2327  done:
2328     SETPRECOMP(group, nistp256, pre);
2329     pre = NULL;
2330     ret = 1;
2331
2332  err:
2333     BN_CTX_end(ctx);
2334     EC_POINT_free(generator);
2335     BN_CTX_free(new_ctx);
2336     EC_nistp256_pre_comp_free(pre);
2337     return ret;
2338 }
2339
2340 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2341 {
2342     return HAVEPRECOMP(group, nistp256);
2343 }
2344 #endif