crypto/ec/ecp_nistp256.c

   1 /* crypto/ec/ecp_nistp256.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 typedef __int128_t int128_t;
  42 # else
  43 #  error "Need GCC 3.1 or later to define type uint128_t"
  44 # endif
  45
  46 typedef uint8_t u8;
  47 typedef uint32_t u32;
  48 typedef uint64_t u64;
  49 typedef int64_t s64;
  50
  51 /*
  52  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
  53  * can serialise an element of this field into 32 bytes. We call this an
  54  * felem_bytearray.
  55  */
  56
  57 typedef u8 felem_bytearray[32];
  58
  59 /*
  60  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  61  * values are big-endian.
  62  */
  63 static const felem_bytearray nistp256_curve_params[5] = {
  64     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  65      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  66      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  68     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  69      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  70      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
  72     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
  73      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  74      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  75      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  76     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  77      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  78      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  79      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  80     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  81      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  82      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  83      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  84 };
  85
  86 /*-
  87  * The representation of field elements.
  88  * ------------------------------------
  89  *
  90  * We represent field elements with either four 128-bit values, eight 128-bit
  91  * values, or four 64-bit values. The field element represented is:
  92  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  93  * or:
  94  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
  95  *
  96  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  97  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  98  * with the least significant bits of the next.
  99  *
 100  * A field element with four limbs is an 'felem'. One with eight limbs is a
 101  * 'longfelem'
 102  *
 103  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 104  * values are used as intermediate values before multiplication.
 105  */
 106
 107 # define NLIMBS 4
 108
 109 typedef uint128_t limb;
 110 typedef limb felem[NLIMBS];
 111 typedef limb longfelem[NLIMBS * 2];
 112 typedef u64 smallfelem[NLIMBS];
 113
 114 /* This is the value of the prime as four 64-bit words, little-endian. */
 115 static const u64 kPrime[4] =
 116     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 117 static const u64 bottom63bits = 0x7ffffffffffffffful;
 118
 119 /*
 120  * bin32_to_felem takes a little-endian byte array and converts it into felem
 121  * form. This assumes that the CPU is little-endian.
 122  */
 123 static void bin32_to_felem(felem out, const u8 in[32])
 124 {
 125     out[0] = *((u64 *)&in[0]);
 126     out[1] = *((u64 *)&in[8]);
 127     out[2] = *((u64 *)&in[16]);
 128     out[3] = *((u64 *)&in[24]);
 129 }
 130
 131 /*
 132  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 133  * endian, 32 byte array. This assumes that the CPU is little-endian.
 134  */
 135 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 136 {
 137     *((u64 *)&out[0]) = in[0];
 138     *((u64 *)&out[8]) = in[1];
 139     *((u64 *)&out[16]) = in[2];
 140     *((u64 *)&out[24]) = in[3];
 141 }
 142
 143 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 144 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 145 {
 146     unsigned i;
 147     for (i = 0; i < len; ++i)
 148         out[i] = in[len - 1 - i];
 149 }
 150
 151 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 152 static int BN_to_felem(felem out, const BIGNUM *bn)
 153 {
 154     felem_bytearray b_in;
 155     felem_bytearray b_out;
 156     unsigned num_bytes;
 157
 158     /* BN_bn2bin eats leading zeroes */
 159     memset(b_out, 0, sizeof b_out);
 160     num_bytes = BN_num_bytes(bn);
 161     if (num_bytes > sizeof b_out) {
 162         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 163         return 0;
 164     }
 165     if (BN_is_negative(bn)) {
 166         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 167         return 0;
 168     }
 169     num_bytes = BN_bn2bin(bn, b_in);
 170     flip_endian(b_out, b_in, num_bytes);
 171     bin32_to_felem(out, b_out);
 172     return 1;
 173 }
 174
 175 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 176 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 177 {
 178     felem_bytearray b_in, b_out;
 179     smallfelem_to_bin32(b_in, in);
 180     flip_endian(b_out, b_in, sizeof b_out);
 181     return BN_bin2bn(b_out, sizeof b_out, out);
 182 }
 183
 184 /*-
 185  * Field operations
 186  * ----------------
 187  */
 188
 189 static void smallfelem_one(smallfelem out)
 190 {
 191     out[0] = 1;
 192     out[1] = 0;
 193     out[2] = 0;
 194     out[3] = 0;
 195 }
 196
 197 static void smallfelem_assign(smallfelem out, const smallfelem in)
 198 {
 199     out[0] = in[0];
 200     out[1] = in[1];
 201     out[2] = in[2];
 202     out[3] = in[3];
 203 }
 204
 205 static void felem_assign(felem out, const felem in)
 206 {
 207     out[0] = in[0];
 208     out[1] = in[1];
 209     out[2] = in[2];
 210     out[3] = in[3];
 211 }
 212
 213 /* felem_sum sets out = out + in. */
 214 static void felem_sum(felem out, const felem in)
 215 {
 216     out[0] += in[0];
 217     out[1] += in[1];
 218     out[2] += in[2];
 219     out[3] += in[3];
 220 }
 221
 222 /* felem_small_sum sets out = out + in. */
 223 static void felem_small_sum(felem out, const smallfelem in)
 224 {
 225     out[0] += in[0];
 226     out[1] += in[1];
 227     out[2] += in[2];
 228     out[3] += in[3];
 229 }
 230
 231 /* felem_scalar sets out = out * scalar */
 232 static void felem_scalar(felem out, const u64 scalar)
 233 {
 234     out[0] *= scalar;
 235     out[1] *= scalar;
 236     out[2] *= scalar;
 237     out[3] *= scalar;
 238 }
 239
 240 /* longfelem_scalar sets out = out * scalar */
 241 static void longfelem_scalar(longfelem out, const u64 scalar)
 242 {
 243     out[0] *= scalar;
 244     out[1] *= scalar;
 245     out[2] *= scalar;
 246     out[3] *= scalar;
 247     out[4] *= scalar;
 248     out[5] *= scalar;
 249     out[6] *= scalar;
 250     out[7] *= scalar;
 251 }
 252
 253 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 254 # define two105 (((limb)1) << 105)
 255 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 256
 257 /* zero105 is 0 mod p */
 258 static const felem zero105 =
 259     { two105m41m9, two105, two105m41p9, two105m41p9 };
 260
 261 /*-
 262  * smallfelem_neg sets |out| to |-small|
 263  * On exit:
 264  *   out[i] < out[i] + 2^105
 265  */
 266 static void smallfelem_neg(felem out, const smallfelem small)
 267 {
 268     /* In order to prevent underflow, we subtract from 0 mod p. */
 269     out[0] = zero105[0] - small[0];
 270     out[1] = zero105[1] - small[1];
 271     out[2] = zero105[2] - small[2];
 272     out[3] = zero105[3] - small[3];
 273 }
 274
 275 /*-
 276  * felem_diff subtracts |in| from |out|
 277  * On entry:
 278  *   in[i] < 2^104
 279  * On exit:
 280  *   out[i] < out[i] + 2^105
 281  */
 282 static void felem_diff(felem out, const felem in)
 283 {
 284     /*
 285      * In order to prevent underflow, we add 0 mod p before subtracting.
 286      */
 287     out[0] += zero105[0];
 288     out[1] += zero105[1];
 289     out[2] += zero105[2];
 290     out[3] += zero105[3];
 291
 292     out[0] -= in[0];
 293     out[1] -= in[1];
 294     out[2] -= in[2];
 295     out[3] -= in[3];
 296 }
 297
 298 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 299 # define two107 (((limb)1) << 107)
 300 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 301
 302 /* zero107 is 0 mod p */
 303 static const felem zero107 =
 304     { two107m43m11, two107, two107m43p11, two107m43p11 };
 305
 306 /*-
 307  * An alternative felem_diff for larger inputs |in|
 308  * felem_diff_zero107 subtracts |in| from |out|
 309  * On entry:
 310  *   in[i] < 2^106
 311  * On exit:
 312  *   out[i] < out[i] + 2^107
 313  */
 314 static void felem_diff_zero107(felem out, const felem in)
 315 {
 316     /*
 317      * In order to prevent underflow, we add 0 mod p before subtracting.
 318      */
 319     out[0] += zero107[0];
 320     out[1] += zero107[1];
 321     out[2] += zero107[2];
 322     out[3] += zero107[3];
 323
 324     out[0] -= in[0];
 325     out[1] -= in[1];
 326     out[2] -= in[2];
 327     out[3] -= in[3];
 328 }
 329
 330 /*-
 331  * longfelem_diff subtracts |in| from |out|
 332  * On entry:
 333  *   in[i] < 7*2^67
 334  * On exit:
 335  *   out[i] < out[i] + 2^70 + 2^40
 336  */
 337 static void longfelem_diff(longfelem out, const longfelem in)
 338 {
 339     static const limb two70m8p6 =
 340         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 341     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 342     static const limb two70 = (((limb) 1) << 70);
 343     static const limb two70m40m38p6 =
 344         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 345         (((limb) 1) << 6);
 346     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 347
 348     /* add 0 mod p to avoid underflow */
 349     out[0] += two70m8p6;
 350     out[1] += two70p40;
 351     out[2] += two70;
 352     out[3] += two70m40m38p6;
 353     out[4] += two70m6;
 354     out[5] += two70m6;
 355     out[6] += two70m6;
 356     out[7] += two70m6;
 357
 358     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 359     out[0] -= in[0];
 360     out[1] -= in[1];
 361     out[2] -= in[2];
 362     out[3] -= in[3];
 363     out[4] -= in[4];
 364     out[5] -= in[5];
 365     out[6] -= in[6];
 366     out[7] -= in[7];
 367 }
 368
 369 # define two64m0 (((limb)1) << 64) - 1
 370 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 371 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 372 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 373
 374 /* zero110 is 0 mod p */
 375 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 376
 377 /*-
 378  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 379  * minimal as the value may be greater than p.
 380  *
 381  * On entry:
 382  *   in[i] < 2^109
 383  * On exit:
 384  *   out[i] < 2^64
 385  */
 386 static void felem_shrink(smallfelem out, const felem in)
 387 {
 388     felem tmp;
 389     u64 a, b, mask;
 390     s64 high, low;
 391     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 392
 393     /* Carry 2->3 */
 394     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 395     /* tmp[3] < 2^110 */
 396
 397     tmp[2] = zero110[2] + (u64)in[2];
 398     tmp[0] = zero110[0] + in[0];
 399     tmp[1] = zero110[1] + in[1];
 400     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 401
 402     /*
 403      * We perform two partial reductions where we eliminate the high-word of
 404      * tmp[3]. We don't update the other words till the end.
 405      */
 406     a = tmp[3] >> 64;           /* a < 2^46 */
 407     tmp[3] = (u64)tmp[3];
 408     tmp[3] -= a;
 409     tmp[3] += ((limb) a) << 32;
 410     /* tmp[3] < 2^79 */
 411
 412     b = a;
 413     a = tmp[3] >> 64;           /* a < 2^15 */
 414     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 415     tmp[3] = (u64)tmp[3];
 416     tmp[3] -= a;
 417     tmp[3] += ((limb) a) << 32;
 418     /* tmp[3] < 2^64 + 2^47 */
 419
 420     /*
 421      * This adjusts the other two words to complete the two partial
 422      * reductions.
 423      */
 424     tmp[0] += b;
 425     tmp[1] -= (((limb) b) << 32);
 426
 427     /*
 428      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 429      * conditionally subtract kPrime if tmp[3] is large enough.
 430      */
 431     high = tmp[3] >> 64;
 432     /* As tmp[3] < 2^65, high is either 1 or 0 */
 433     high <<= 63;
 434     high >>= 63;
 435         /*-
 436          * high is:
 437          *   all ones   if the high word of tmp[3] is 1
 438          *   all zeros  if the high word of tmp[3] if 0 */
 439     low = tmp[3];
 440     mask = low >> 63;
 441         /*-
 442          * mask is:
 443          *   all ones   if the MSB of low is 1
 444          *   all zeros  if the MSB of low if 0 */
 445     low &= bottom63bits;
 446     low -= kPrime3Test;
 447     /* if low was greater than kPrime3Test then the MSB is zero */
 448     low = ~low;
 449     low >>= 63;
 450         /*-
 451          * low is:
 452          *   all ones   if low was > kPrime3Test
 453          *   all zeros  if low was <= kPrime3Test */
 454     mask = (mask & low) | high;
 455     tmp[0] -= mask & kPrime[0];
 456     tmp[1] -= mask & kPrime[1];
 457     /* kPrime[2] is zero, so omitted */
 458     tmp[3] -= mask & kPrime[3];
 459     /* tmp[3] < 2**64 - 2**32 + 1 */
 460
 461     tmp[1] += ((u64)(tmp[0] >> 64));
 462     tmp[0] = (u64)tmp[0];
 463     tmp[2] += ((u64)(tmp[1] >> 64));
 464     tmp[1] = (u64)tmp[1];
 465     tmp[3] += ((u64)(tmp[2] >> 64));
 466     tmp[2] = (u64)tmp[2];
 467     /* tmp[i] < 2^64 */
 468
 469     out[0] = tmp[0];
 470     out[1] = tmp[1];
 471     out[2] = tmp[2];
 472     out[3] = tmp[3];
 473 }
 474
 475 /* smallfelem_expand converts a smallfelem to an felem */
 476 static void smallfelem_expand(felem out, const smallfelem in)
 477 {
 478     out[0] = in[0];
 479     out[1] = in[1];
 480     out[2] = in[2];
 481     out[3] = in[3];
 482 }
 483
 484 /*-
 485  * smallfelem_square sets |out| = |small|^2
 486  * On entry:
 487  *   small[i] < 2^64
 488  * On exit:
 489  *   out[i] < 7 * 2^64 < 2^67
 490  */
 491 static void smallfelem_square(longfelem out, const smallfelem small)
 492 {
 493     limb a;
 494     u64 high, low;
 495
 496     a = ((uint128_t) small[0]) * small[0];
 497     low = a;
 498     high = a >> 64;
 499     out[0] = low;
 500     out[1] = high;
 501
 502     a = ((uint128_t) small[0]) * small[1];
 503     low = a;
 504     high = a >> 64;
 505     out[1] += low;
 506     out[1] += low;
 507     out[2] = high;
 508
 509     a = ((uint128_t) small[0]) * small[2];
 510     low = a;
 511     high = a >> 64;
 512     out[2] += low;
 513     out[2] *= 2;
 514     out[3] = high;
 515
 516     a = ((uint128_t) small[0]) * small[3];
 517     low = a;
 518     high = a >> 64;
 519     out[3] += low;
 520     out[4] = high;
 521
 522     a = ((uint128_t) small[1]) * small[2];
 523     low = a;
 524     high = a >> 64;
 525     out[3] += low;
 526     out[3] *= 2;
 527     out[4] += high;
 528
 529     a = ((uint128_t) small[1]) * small[1];
 530     low = a;
 531     high = a >> 64;
 532     out[2] += low;
 533     out[3] += high;
 534
 535     a = ((uint128_t) small[1]) * small[3];
 536     low = a;
 537     high = a >> 64;
 538     out[4] += low;
 539     out[4] *= 2;
 540     out[5] = high;
 541
 542     a = ((uint128_t) small[2]) * small[3];
 543     low = a;
 544     high = a >> 64;
 545     out[5] += low;
 546     out[5] *= 2;
 547     out[6] = high;
 548     out[6] += high;
 549
 550     a = ((uint128_t) small[2]) * small[2];
 551     low = a;
 552     high = a >> 64;
 553     out[4] += low;
 554     out[5] += high;
 555
 556     a = ((uint128_t) small[3]) * small[3];
 557     low = a;
 558     high = a >> 64;
 559     out[6] += low;
 560     out[7] = high;
 561 }
 562
 563 /*-
 564  * felem_square sets |out| = |in|^2
 565  * On entry:
 566  *   in[i] < 2^109
 567  * On exit:
 568  *   out[i] < 7 * 2^64 < 2^67
 569  */
 570 static void felem_square(longfelem out, const felem in)
 571 {
 572     u64 small[4];
 573     felem_shrink(small, in);
 574     smallfelem_square(out, small);
 575 }
 576
 577 /*-
 578  * smallfelem_mul sets |out| = |small1| * |small2|
 579  * On entry:
 580  *   small1[i] < 2^64
 581  *   small2[i] < 2^64
 582  * On exit:
 583  *   out[i] < 7 * 2^64 < 2^67
 584  */
 585 static void smallfelem_mul(longfelem out, const smallfelem small1,
 586                            const smallfelem small2)
 587 {
 588     limb a;
 589     u64 high, low;
 590
 591     a = ((uint128_t) small1[0]) * small2[0];
 592     low = a;
 593     high = a >> 64;
 594     out[0] = low;
 595     out[1] = high;
 596
 597     a = ((uint128_t) small1[0]) * small2[1];
 598     low = a;
 599     high = a >> 64;
 600     out[1] += low;
 601     out[2] = high;
 602
 603     a = ((uint128_t) small1[1]) * small2[0];
 604     low = a;
 605     high = a >> 64;
 606     out[1] += low;
 607     out[2] += high;
 608
 609     a = ((uint128_t) small1[0]) * small2[2];
 610     low = a;
 611     high = a >> 64;
 612     out[2] += low;
 613     out[3] = high;
 614
 615     a = ((uint128_t) small1[1]) * small2[1];
 616     low = a;
 617     high = a >> 64;
 618     out[2] += low;
 619     out[3] += high;
 620
 621     a = ((uint128_t) small1[2]) * small2[0];
 622     low = a;
 623     high = a >> 64;
 624     out[2] += low;
 625     out[3] += high;
 626
 627     a = ((uint128_t) small1[0]) * small2[3];
 628     low = a;
 629     high = a >> 64;
 630     out[3] += low;
 631     out[4] = high;
 632
 633     a = ((uint128_t) small1[1]) * small2[2];
 634     low = a;
 635     high = a >> 64;
 636     out[3] += low;
 637     out[4] += high;
 638
 639     a = ((uint128_t) small1[2]) * small2[1];
 640     low = a;
 641     high = a >> 64;
 642     out[3] += low;
 643     out[4] += high;
 644
 645     a = ((uint128_t) small1[3]) * small2[0];
 646     low = a;
 647     high = a >> 64;
 648     out[3] += low;
 649     out[4] += high;
 650
 651     a = ((uint128_t) small1[1]) * small2[3];
 652     low = a;
 653     high = a >> 64;
 654     out[4] += low;
 655     out[5] = high;
 656
 657     a = ((uint128_t) small1[2]) * small2[2];
 658     low = a;
 659     high = a >> 64;
 660     out[4] += low;
 661     out[5] += high;
 662
 663     a = ((uint128_t) small1[3]) * small2[1];
 664     low = a;
 665     high = a >> 64;
 666     out[4] += low;
 667     out[5] += high;
 668
 669     a = ((uint128_t) small1[2]) * small2[3];
 670     low = a;
 671     high = a >> 64;
 672     out[5] += low;
 673     out[6] = high;
 674
 675     a = ((uint128_t) small1[3]) * small2[2];
 676     low = a;
 677     high = a >> 64;
 678     out[5] += low;
 679     out[6] += high;
 680
 681     a = ((uint128_t) small1[3]) * small2[3];
 682     low = a;
 683     high = a >> 64;
 684     out[6] += low;
 685     out[7] = high;
 686 }
 687
 688 /*-
 689  * felem_mul sets |out| = |in1| * |in2|
 690  * On entry:
 691  *   in1[i] < 2^109
 692  *   in2[i] < 2^109
 693  * On exit:
 694  *   out[i] < 7 * 2^64 < 2^67
 695  */
 696 static void felem_mul(longfelem out, const felem in1, const felem in2)
 697 {
 698     smallfelem small1, small2;
 699     felem_shrink(small1, in1);
 700     felem_shrink(small2, in2);
 701     smallfelem_mul(out, small1, small2);
 702 }
 703
 704 /*-
 705  * felem_small_mul sets |out| = |small1| * |in2|
 706  * On entry:
 707  *   small1[i] < 2^64
 708  *   in2[i] < 2^109
 709  * On exit:
 710  *   out[i] < 7 * 2^64 < 2^67
 711  */
 712 static void felem_small_mul(longfelem out, const smallfelem small1,
 713                             const felem in2)
 714 {
 715     smallfelem small2;
 716     felem_shrink(small2, in2);
 717     smallfelem_mul(out, small1, small2);
 718 }
 719
 720 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 721 # define two100 (((limb)1) << 100)
 722 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 723 /* zero100 is 0 mod p */
 724 static const felem zero100 =
 725     { two100m36m4, two100, two100m36p4, two100m36p4 };
 726
 727 /*-
 728  * Internal function for the different flavours of felem_reduce.
 729  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 730  * On entry:
 731  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 732  *   out[1] >= in[7] + 2^32*in[4]
 733  *   out[2] >= in[5] + 2^32*in[5]
 734  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 735  * On exit:
 736  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 737  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 738  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 739  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 740  */
 741 static void felem_reduce_(felem out, const longfelem in)
 742 {
 743     int128_t c;
 744     /* combine common terms from below */
 745     c = in[4] + (in[5] << 32);
 746     out[0] += c;
 747     out[3] -= c;
 748
 749     c = in[5] - in[7];
 750     out[1] += c;
 751     out[2] -= c;
 752
 753     /* the remaining terms */
 754     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 755     out[1] -= (in[4] << 32);
 756     out[3] += (in[4] << 32);
 757
 758     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 759     out[2] -= (in[5] << 32);
 760
 761     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 762     out[0] -= in[6];
 763     out[0] -= (in[6] << 32);
 764     out[1] += (in[6] << 33);
 765     out[2] += (in[6] * 2);
 766     out[3] -= (in[6] << 32);
 767
 768     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 769     out[0] -= in[7];
 770     out[0] -= (in[7] << 32);
 771     out[2] += (in[7] << 33);
 772     out[3] += (in[7] * 3);
 773 }
 774
 775 /*-
 776  * felem_reduce converts a longfelem into an felem.
 777  * To be called directly after felem_square or felem_mul.
 778  * On entry:
 779  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 780  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 781  * On exit:
 782  *   out[i] < 2^101
 783  */
 784 static void felem_reduce(felem out, const longfelem in)
 785 {
 786     out[0] = zero100[0] + in[0];
 787     out[1] = zero100[1] + in[1];
 788     out[2] = zero100[2] + in[2];
 789     out[3] = zero100[3] + in[3];
 790
 791     felem_reduce_(out, in);
 792
 793         /*-
 794          * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 795          * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 796          * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 797          * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 798          *
 799          * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 800          * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 801          * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 802          * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 803          */
 804 }
 805
 806 /*-
 807  * felem_reduce_zero105 converts a larger longfelem into an felem.
 808  * On entry:
 809  *   in[0] < 2^71
 810  * On exit:
 811  *   out[i] < 2^106
 812  */
 813 static void felem_reduce_zero105(felem out, const longfelem in)
 814 {
 815     out[0] = zero105[0] + in[0];
 816     out[1] = zero105[1] + in[1];
 817     out[2] = zero105[2] + in[2];
 818     out[3] = zero105[3] + in[3];
 819
 820     felem_reduce_(out, in);
 821
 822         /*-
 823          * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 824          * out[1] > 2^105 - 2^71 - 2^103 > 0
 825          * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 826          * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 827          *
 828          * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 829          * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830          * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 831          * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 832          */
 833 }
 834
 835 /*
 836  * subtract_u64 sets *result = *result - v and *carry to one if the
 837  * subtraction underflowed.
 838  */
 839 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 840 {
 841     uint128_t r = *result;
 842     r -= v;
 843     *carry = (r >> 64) & 1;
 844     *result = (u64)r;
 845 }
 846
 847 /*
 848  * felem_contract converts |in| to its unique, minimal representation. On
 849  * entry: in[i] < 2^109
 850  */
 851 static void felem_contract(smallfelem out, const felem in)
 852 {
 853     unsigned i;
 854     u64 all_equal_so_far = 0, result = 0, carry;
 855
 856     felem_shrink(out, in);
 857     /* small is minimal except that the value might be > p */
 858
 859     all_equal_so_far--;
 860     /*
 861      * We are doing a constant time test if out >= kPrime. We need to compare
 862      * each u64, from most-significant to least significant. For each one, if
 863      * all words so far have been equal (m is all ones) then a non-equal
 864      * result is the answer. Otherwise we continue.
 865      */
 866     for (i = 3; i < 4; i--) {
 867         u64 equal;
 868         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 869         /*
 870          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 871          * will all be set.
 872          */
 873         result |= all_equal_so_far & ((u64)(a >> 64));
 874
 875         /*
 876          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 877          * decrement will make it all ones.
 878          */
 879         equal = kPrime[i] ^ out[i];
 880         equal--;
 881         equal &= equal << 32;
 882         equal &= equal << 16;
 883         equal &= equal << 8;
 884         equal &= equal << 4;
 885         equal &= equal << 2;
 886         equal &= equal << 1;
 887         equal = ((s64) equal) >> 63;
 888
 889         all_equal_so_far &= equal;
 890     }
 891
 892     /*
 893      * if all_equal_so_far is still all ones then the two values are equal
 894      * and so out >= kPrime is true.
 895      */
 896     result |= all_equal_so_far;
 897
 898     /* if out >= kPrime then we subtract kPrime. */
 899     subtract_u64(&out[0], &carry, result & kPrime[0]);
 900     subtract_u64(&out[1], &carry, carry);
 901     subtract_u64(&out[2], &carry, carry);
 902     subtract_u64(&out[3], &carry, carry);
 903
 904     subtract_u64(&out[1], &carry, result & kPrime[1]);
 905     subtract_u64(&out[2], &carry, carry);
 906     subtract_u64(&out[3], &carry, carry);
 907
 908     subtract_u64(&out[2], &carry, result & kPrime[2]);
 909     subtract_u64(&out[3], &carry, carry);
 910
 911     subtract_u64(&out[3], &carry, result & kPrime[3]);
 912 }
 913
 914 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 915 {
 916     longfelem longtmp;
 917     felem tmp;
 918
 919     smallfelem_square(longtmp, in);
 920     felem_reduce(tmp, longtmp);
 921     felem_contract(out, tmp);
 922 }
 923
 924 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 925                                     const smallfelem in2)
 926 {
 927     longfelem longtmp;
 928     felem tmp;
 929
 930     smallfelem_mul(longtmp, in1, in2);
 931     felem_reduce(tmp, longtmp);
 932     felem_contract(out, tmp);
 933 }
 934
 935 /*-
 936  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 937  * otherwise.
 938  * On entry:
 939  *   small[i] < 2^64
 940  */
 941 static limb smallfelem_is_zero(const smallfelem small)
 942 {
 943     limb result;
 944     u64 is_p;
 945
 946     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 947     is_zero--;
 948     is_zero &= is_zero << 32;
 949     is_zero &= is_zero << 16;
 950     is_zero &= is_zero << 8;
 951     is_zero &= is_zero << 4;
 952     is_zero &= is_zero << 2;
 953     is_zero &= is_zero << 1;
 954     is_zero = ((s64) is_zero) >> 63;
 955
 956     is_p = (small[0] ^ kPrime[0]) |
 957         (small[1] ^ kPrime[1]) |
 958         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 959     is_p--;
 960     is_p &= is_p << 32;
 961     is_p &= is_p << 16;
 962     is_p &= is_p << 8;
 963     is_p &= is_p << 4;
 964     is_p &= is_p << 2;
 965     is_p &= is_p << 1;
 966     is_p = ((s64) is_p) >> 63;
 967
 968     is_zero |= is_p;
 969
 970     result = is_zero;
 971     result |= ((limb) is_zero) << 64;
 972     return result;
 973 }
 974
 975 static int smallfelem_is_zero_int(const smallfelem small)
 976 {
 977     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 978 }
 979
 980 /*-
 981  * felem_inv calculates |out| = |in|^{-1}
 982  *
 983  * Based on Fermat's Little Theorem:
 984  *   a^p = a (mod p)
 985  *   a^{p-1} = 1 (mod p)
 986  *   a^{p-2} = a^{-1} (mod p)
 987  */
 988 static void felem_inv(felem out, const felem in)
 989 {
 990     felem ftmp, ftmp2;
 991     /* each e_I will hold |in|^{2^I - 1} */
 992     felem e2, e4, e8, e16, e32, e64;
 993     longfelem tmp;
 994     unsigned i;
 995
 996     felem_square(tmp, in);
 997     felem_reduce(ftmp, tmp);    /* 2^1 */
 998     felem_mul(tmp, in, ftmp);
 999     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1000     felem_assign(e2, ftmp);
1001     felem_square(tmp, ftmp);
1002     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1003     felem_square(tmp, ftmp);
1004     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1005     felem_mul(tmp, ftmp, e2);
1006     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1007     felem_assign(e4, ftmp);
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1010     felem_square(tmp, ftmp);
1011     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1012     felem_square(tmp, ftmp);
1013     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1014     felem_square(tmp, ftmp);
1015     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1016     felem_mul(tmp, ftmp, e4);
1017     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1018     felem_assign(e8, ftmp);
1019     for (i = 0; i < 8; i++) {
1020         felem_square(tmp, ftmp);
1021         felem_reduce(ftmp, tmp);
1022     }                           /* 2^16 - 2^8 */
1023     felem_mul(tmp, ftmp, e8);
1024     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1025     felem_assign(e16, ftmp);
1026     for (i = 0; i < 16; i++) {
1027         felem_square(tmp, ftmp);
1028         felem_reduce(ftmp, tmp);
1029     }                           /* 2^32 - 2^16 */
1030     felem_mul(tmp, ftmp, e16);
1031     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1032     felem_assign(e32, ftmp);
1033     for (i = 0; i < 32; i++) {
1034         felem_square(tmp, ftmp);
1035         felem_reduce(ftmp, tmp);
1036     }                           /* 2^64 - 2^32 */
1037     felem_assign(e64, ftmp);
1038     felem_mul(tmp, ftmp, in);
1039     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1040     for (i = 0; i < 192; i++) {
1041         felem_square(tmp, ftmp);
1042         felem_reduce(ftmp, tmp);
1043     }                           /* 2^256 - 2^224 + 2^192 */
1044
1045     felem_mul(tmp, e64, e32);
1046     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1047     for (i = 0; i < 16; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^80 - 2^16 */
1051     felem_mul(tmp, ftmp2, e16);
1052     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1053     for (i = 0; i < 8; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^88 - 2^8 */
1057     felem_mul(tmp, ftmp2, e8);
1058     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1059     for (i = 0; i < 4; i++) {
1060         felem_square(tmp, ftmp2);
1061         felem_reduce(ftmp2, tmp);
1062     }                           /* 2^92 - 2^4 */
1063     felem_mul(tmp, ftmp2, e4);
1064     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1069     felem_mul(tmp, ftmp2, e2);
1070     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1071     felem_square(tmp, ftmp2);
1072     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1073     felem_square(tmp, ftmp2);
1074     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1075     felem_mul(tmp, ftmp2, in);
1076     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1077
1078     felem_mul(tmp, ftmp2, ftmp);
1079     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1080 }
1081
1082 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1083 {
1084     felem tmp;
1085
1086     smallfelem_expand(tmp, in);
1087     felem_inv(tmp, tmp);
1088     felem_contract(out, tmp);
1089 }
1090
1091 /*-
1092  * Group operations
1093  * ----------------
1094  *
1095  * Building on top of the field operations we have the operations on the
1096  * elliptic curve group itself. Points on the curve are represented in Jacobian
1097  * coordinates */
1098
1099 /*-
1100  * point_double calculates 2*(x_in, y_in, z_in)
1101  *
1102  * The method is taken from:
1103  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1104  *
1105  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1106  * while x_out == y_in is not (maybe this works, but it's not tested). */
1107 static void
1108 point_double(felem x_out, felem y_out, felem z_out,
1109              const felem x_in, const felem y_in, const felem z_in)
1110 {
1111     longfelem tmp, tmp2;
1112     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1113     smallfelem small1, small2;
1114
1115     felem_assign(ftmp, x_in);
1116     /* ftmp[i] < 2^106 */
1117     felem_assign(ftmp2, x_in);
1118     /* ftmp2[i] < 2^106 */
1119
1120     /* delta = z^2 */
1121     felem_square(tmp, z_in);
1122     felem_reduce(delta, tmp);
1123     /* delta[i] < 2^101 */
1124
1125     /* gamma = y^2 */
1126     felem_square(tmp, y_in);
1127     felem_reduce(gamma, tmp);
1128     /* gamma[i] < 2^101 */
1129     felem_shrink(small1, gamma);
1130
1131     /* beta = x*gamma */
1132     felem_small_mul(tmp, small1, x_in);
1133     felem_reduce(beta, tmp);
1134     /* beta[i] < 2^101 */
1135
1136     /* alpha = 3*(x-delta)*(x+delta) */
1137     felem_diff(ftmp, delta);
1138     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1139     felem_sum(ftmp2, delta);
1140     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1141     felem_scalar(ftmp2, 3);
1142     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1143     felem_mul(tmp, ftmp, ftmp2);
1144     felem_reduce(alpha, tmp);
1145     /* alpha[i] < 2^101 */
1146     felem_shrink(small2, alpha);
1147
1148     /* x' = alpha^2 - 8*beta */
1149     smallfelem_square(tmp, small2);
1150     felem_reduce(x_out, tmp);
1151     felem_assign(ftmp, beta);
1152     felem_scalar(ftmp, 8);
1153     /* ftmp[i] < 8 * 2^101 = 2^104 */
1154     felem_diff(x_out, ftmp);
1155     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1156
1157     /* z' = (y + z)^2 - gamma - delta */
1158     felem_sum(delta, gamma);
1159     /* delta[i] < 2^101 + 2^101 = 2^102 */
1160     felem_assign(ftmp, y_in);
1161     felem_sum(ftmp, z_in);
1162     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1163     felem_square(tmp, ftmp);
1164     felem_reduce(z_out, tmp);
1165     felem_diff(z_out, delta);
1166     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1167
1168     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1169     felem_scalar(beta, 4);
1170     /* beta[i] < 4 * 2^101 = 2^103 */
1171     felem_diff_zero107(beta, x_out);
1172     /* beta[i] < 2^107 + 2^103 < 2^108 */
1173     felem_small_mul(tmp, small2, beta);
1174     /* tmp[i] < 7 * 2^64 < 2^67 */
1175     smallfelem_square(tmp2, small1);
1176     /* tmp2[i] < 7 * 2^64 */
1177     longfelem_scalar(tmp2, 8);
1178     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1179     longfelem_diff(tmp, tmp2);
1180     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1181     felem_reduce_zero105(y_out, tmp);
1182     /* y_out[i] < 2^106 */
1183 }
1184
1185 /*
1186  * point_double_small is the same as point_double, except that it operates on
1187  * smallfelems
1188  */
1189 static void
1190 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1191                    const smallfelem x_in, const smallfelem y_in,
1192                    const smallfelem z_in)
1193 {
1194     felem felem_x_out, felem_y_out, felem_z_out;
1195     felem felem_x_in, felem_y_in, felem_z_in;
1196
1197     smallfelem_expand(felem_x_in, x_in);
1198     smallfelem_expand(felem_y_in, y_in);
1199     smallfelem_expand(felem_z_in, z_in);
1200     point_double(felem_x_out, felem_y_out, felem_z_out,
1201                  felem_x_in, felem_y_in, felem_z_in);
1202     felem_shrink(x_out, felem_x_out);
1203     felem_shrink(y_out, felem_y_out);
1204     felem_shrink(z_out, felem_z_out);
1205 }
1206
1207 /* copy_conditional copies in to out iff mask is all ones. */
1208 static void copy_conditional(felem out, const felem in, limb mask)
1209 {
1210     unsigned i;
1211     for (i = 0; i < NLIMBS; ++i) {
1212         const limb tmp = mask & (in[i] ^ out[i]);
1213         out[i] ^= tmp;
1214     }
1215 }
1216
1217 /* copy_small_conditional copies in to out iff mask is all ones. */
1218 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1219 {
1220     unsigned i;
1221     const u64 mask64 = mask;
1222     for (i = 0; i < NLIMBS; ++i) {
1223         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1224     }
1225 }
1226
1227 /*-
1228  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1229  *
1230  * The method is taken from:
1231  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1232  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1233  *
1234  * This function includes a branch for checking whether the two input points
1235  * are equal, (while not equal to the point at infinity). This case never
1236  * happens during single point multiplication, so there is no timing leak for
1237  * ECDH or ECDSA signing. */
1238 static void point_add(felem x3, felem y3, felem z3,
1239                       const felem x1, const felem y1, const felem z1,
1240                       const int mixed, const smallfelem x2,
1241                       const smallfelem y2, const smallfelem z2)
1242 {
1243     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1244     longfelem tmp, tmp2;
1245     smallfelem small1, small2, small3, small4, small5;
1246     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1247
1248     felem_shrink(small3, z1);
1249
1250     z1_is_zero = smallfelem_is_zero(small3);
1251     z2_is_zero = smallfelem_is_zero(z2);
1252
1253     /* ftmp = z1z1 = z1**2 */
1254     smallfelem_square(tmp, small3);
1255     felem_reduce(ftmp, tmp);
1256     /* ftmp[i] < 2^101 */
1257     felem_shrink(small1, ftmp);
1258
1259     if (!mixed) {
1260         /* ftmp2 = z2z2 = z2**2 */
1261         smallfelem_square(tmp, z2);
1262         felem_reduce(ftmp2, tmp);
1263         /* ftmp2[i] < 2^101 */
1264         felem_shrink(small2, ftmp2);
1265
1266         felem_shrink(small5, x1);
1267
1268         /* u1 = ftmp3 = x1*z2z2 */
1269         smallfelem_mul(tmp, small5, small2);
1270         felem_reduce(ftmp3, tmp);
1271         /* ftmp3[i] < 2^101 */
1272
1273         /* ftmp5 = z1 + z2 */
1274         felem_assign(ftmp5, z1);
1275         felem_small_sum(ftmp5, z2);
1276         /* ftmp5[i] < 2^107 */
1277
1278         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1279         felem_square(tmp, ftmp5);
1280         felem_reduce(ftmp5, tmp);
1281         /* ftmp2 = z2z2 + z1z1 */
1282         felem_sum(ftmp2, ftmp);
1283         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1284         felem_diff(ftmp5, ftmp2);
1285         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1286
1287         /* ftmp2 = z2 * z2z2 */
1288         smallfelem_mul(tmp, small2, z2);
1289         felem_reduce(ftmp2, tmp);
1290
1291         /* s1 = ftmp2 = y1 * z2**3 */
1292         felem_mul(tmp, y1, ftmp2);
1293         felem_reduce(ftmp6, tmp);
1294         /* ftmp6[i] < 2^101 */
1295     } else {
1296         /*
1297          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1298          */
1299
1300         /* u1 = ftmp3 = x1*z2z2 */
1301         felem_assign(ftmp3, x1);
1302         /* ftmp3[i] < 2^106 */
1303
1304         /* ftmp5 = 2z1z2 */
1305         felem_assign(ftmp5, z1);
1306         felem_scalar(ftmp5, 2);
1307         /* ftmp5[i] < 2*2^106 = 2^107 */
1308
1309         /* s1 = ftmp2 = y1 * z2**3 */
1310         felem_assign(ftmp6, y1);
1311         /* ftmp6[i] < 2^106 */
1312     }
1313
1314     /* u2 = x2*z1z1 */
1315     smallfelem_mul(tmp, x2, small1);
1316     felem_reduce(ftmp4, tmp);
1317
1318     /* h = ftmp4 = u2 - u1 */
1319     felem_diff_zero107(ftmp4, ftmp3);
1320     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1321     felem_shrink(small4, ftmp4);
1322
1323     x_equal = smallfelem_is_zero(small4);
1324
1325     /* z_out = ftmp5 * h */
1326     felem_small_mul(tmp, small4, ftmp5);
1327     felem_reduce(z_out, tmp);
1328     /* z_out[i] < 2^101 */
1329
1330     /* ftmp = z1 * z1z1 */
1331     smallfelem_mul(tmp, small1, small3);
1332     felem_reduce(ftmp, tmp);
1333
1334     /* s2 = tmp = y2 * z1**3 */
1335     felem_small_mul(tmp, y2, ftmp);
1336     felem_reduce(ftmp5, tmp);
1337
1338     /* r = ftmp5 = (s2 - s1)*2 */
1339     felem_diff_zero107(ftmp5, ftmp6);
1340     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1341     felem_scalar(ftmp5, 2);
1342     /* ftmp5[i] < 2^109 */
1343     felem_shrink(small1, ftmp5);
1344     y_equal = smallfelem_is_zero(small1);
1345
1346     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1347         point_double(x3, y3, z3, x1, y1, z1);
1348         return;
1349     }
1350
1351     /* I = ftmp = (2h)**2 */
1352     felem_assign(ftmp, ftmp4);
1353     felem_scalar(ftmp, 2);
1354     /* ftmp[i] < 2*2^108 = 2^109 */
1355     felem_square(tmp, ftmp);
1356     felem_reduce(ftmp, tmp);
1357
1358     /* J = ftmp2 = h * I */
1359     felem_mul(tmp, ftmp4, ftmp);
1360     felem_reduce(ftmp2, tmp);
1361
1362     /* V = ftmp4 = U1 * I */
1363     felem_mul(tmp, ftmp3, ftmp);
1364     felem_reduce(ftmp4, tmp);
1365
1366     /* x_out = r**2 - J - 2V */
1367     smallfelem_square(tmp, small1);
1368     felem_reduce(x_out, tmp);
1369     felem_assign(ftmp3, ftmp4);
1370     felem_scalar(ftmp4, 2);
1371     felem_sum(ftmp4, ftmp2);
1372     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1373     felem_diff(x_out, ftmp4);
1374     /* x_out[i] < 2^105 + 2^101 */
1375
1376     /* y_out = r(V-x_out) - 2 * s1 * J */
1377     felem_diff_zero107(ftmp3, x_out);
1378     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1379     felem_small_mul(tmp, small1, ftmp3);
1380     felem_mul(tmp2, ftmp6, ftmp2);
1381     longfelem_scalar(tmp2, 2);
1382     /* tmp2[i] < 2*2^67 = 2^68 */
1383     longfelem_diff(tmp, tmp2);
1384     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1385     felem_reduce_zero105(y_out, tmp);
1386     /* y_out[i] < 2^106 */
1387
1388     copy_small_conditional(x_out, x2, z1_is_zero);
1389     copy_conditional(x_out, x1, z2_is_zero);
1390     copy_small_conditional(y_out, y2, z1_is_zero);
1391     copy_conditional(y_out, y1, z2_is_zero);
1392     copy_small_conditional(z_out, z2, z1_is_zero);
1393     copy_conditional(z_out, z1, z2_is_zero);
1394     felem_assign(x3, x_out);
1395     felem_assign(y3, y_out);
1396     felem_assign(z3, z_out);
1397 }
1398
1399 /*
1400  * point_add_small is the same as point_add, except that it operates on
1401  * smallfelems
1402  */
1403 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1404                             smallfelem x1, smallfelem y1, smallfelem z1,
1405                             smallfelem x2, smallfelem y2, smallfelem z2)
1406 {
1407     felem felem_x3, felem_y3, felem_z3;
1408     felem felem_x1, felem_y1, felem_z1;
1409     smallfelem_expand(felem_x1, x1);
1410     smallfelem_expand(felem_y1, y1);
1411     smallfelem_expand(felem_z1, z1);
1412     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1413               x2, y2, z2);
1414     felem_shrink(x3, felem_x3);
1415     felem_shrink(y3, felem_y3);
1416     felem_shrink(z3, felem_z3);
1417 }
1418
1419 /*-
1420  * Base point pre computation
1421  * --------------------------
1422  *
1423  * Two different sorts of precomputed tables are used in the following code.
1424  * Each contain various points on the curve, where each point is three field
1425  * elements (x, y, z).
1426  *
1427  * For the base point table, z is usually 1 (0 for the point at infinity).
1428  * This table has 2 * 16 elements, starting with the following:
1429  * index | bits    | point
1430  * ------+---------+------------------------------
1431  *     0 | 0 0 0 0 | 0G
1432  *     1 | 0 0 0 1 | 1G
1433  *     2 | 0 0 1 0 | 2^64G
1434  *     3 | 0 0 1 1 | (2^64 + 1)G
1435  *     4 | 0 1 0 0 | 2^128G
1436  *     5 | 0 1 0 1 | (2^128 + 1)G
1437  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1438  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1439  *     8 | 1 0 0 0 | 2^192G
1440  *     9 | 1 0 0 1 | (2^192 + 1)G
1441  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1442  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1443  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1444  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1445  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1446  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1447  * followed by a copy of this with each element multiplied by 2^32.
1448  *
1449  * The reason for this is so that we can clock bits into four different
1450  * locations when doing simple scalar multiplies against the base point,
1451  * and then another four locations using the second 16 elements.
1452  *
1453  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1454
1455 /* gmul is the table of precomputed base points */
1456 static const smallfelem gmul[2][16][3] = {
1457     {{{0, 0, 0, 0},
1458       {0, 0, 0, 0},
1459       {0, 0, 0, 0}},
1460      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1461        0x6b17d1f2e12c4247},
1462       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1463        0x4fe342e2fe1a7f9b},
1464       {1, 0, 0, 0}},
1465      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1466        0x0fa822bc2811aaa5},
1467       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1468        0xbff44ae8f5dba80d},
1469       {1, 0, 0, 0}},
1470      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1471        0x300a4bbc89d6726f},
1472       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1473        0x72aac7e0d09b4644},
1474       {1, 0, 0, 0}},
1475      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1476        0x447d739beedb5e67},
1477       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1478        0x2d4825ab834131ee},
1479       {1, 0, 0, 0}},
1480      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1481        0xef9519328a9c72ff},
1482       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1483        0x611e9fc37dbb2c9b},
1484       {1, 0, 0, 0}},
1485      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1486        0x550663797b51f5d8},
1487       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1488        0x157164848aecb851},
1489       {1, 0, 0, 0}},
1490      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1491        0xeb5d7745b21141ea},
1492       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1493        0xeafd72ebdbecc17b},
1494       {1, 0, 0, 0}},
1495      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1496        0xa6d39677a7849276},
1497       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1498        0x674f84749b0b8816},
1499       {1, 0, 0, 0}},
1500      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1501        0x4e769e7672c9ddad},
1502       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1503        0x42b99082de830663},
1504       {1, 0, 0, 0}},
1505      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1506        0x78878ef61c6ce04d},
1507       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1508        0xb6cb3f5d7b72c321},
1509       {1, 0, 0, 0}},
1510      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1511        0x0c88bc4d716b1287},
1512       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1513        0xdd5ddea3f3901dc6},
1514       {1, 0, 0, 0}},
1515      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1516        0x68f344af6b317466},
1517       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1518        0x31b9c405f8540a20},
1519       {1, 0, 0, 0}},
1520      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1521        0x4052bf4b6f461db9},
1522       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1523        0xfecf4d5190b0fc61},
1524       {1, 0, 0, 0}},
1525      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1526        0x1eddbae2c802e41a},
1527       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1528        0x43104d86560ebcfc},
1529       {1, 0, 0, 0}},
1530      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1531        0xb48e26b484f7a21c},
1532       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1533        0xfac015404d4d3dab},
1534       {1, 0, 0, 0}}},
1535     {{{0, 0, 0, 0},
1536       {0, 0, 0, 0},
1537       {0, 0, 0, 0}},
1538      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1539        0x7fe36b40af22af89},
1540       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1541        0xe697d45825b63624},
1542       {1, 0, 0, 0}},
1543      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1544        0x4a5b506612a677a6},
1545       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1546        0xeb13461ceac089f1},
1547       {1, 0, 0, 0}},
1548      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1549        0x0781b8291c6a220a},
1550       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1551        0x690cde8df0151593},
1552       {1, 0, 0, 0}},
1553      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1554        0x8a535f566ec73617},
1555       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1556        0x0455c08468b08bd7},
1557       {1, 0, 0, 0}},
1558      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1559        0x06bada7ab77f8276},
1560       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1561        0x5b476dfd0e6cb18a},
1562       {1, 0, 0, 0}},
1563      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1564        0x3e29864e8a2ec908},
1565       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1566        0x239b90ea3dc31e7e},
1567       {1, 0, 0, 0}},
1568      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1569        0x820f4dd949f72ff7},
1570       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1571        0x140406ec783a05ec},
1572       {1, 0, 0, 0}},
1573      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1574        0x68f6b8542783dfee},
1575       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1576        0xcbe1feba92e40ce6},
1577       {1, 0, 0, 0}},
1578      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1579        0xd0b2f94d2f420109},
1580       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1581        0x971459828b0719e5},
1582       {1, 0, 0, 0}},
1583      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1584        0x961610004a866aba},
1585       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1586        0x7acb9fadcee75e44},
1587       {1, 0, 0, 0}},
1588      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1589        0x24eb9acca333bf5b},
1590       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1591        0x69f891c5acd079cc},
1592       {1, 0, 0, 0}},
1593      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1594        0xe51f547c5972a107},
1595       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1596        0x1c309a2b25bb1387},
1597       {1, 0, 0, 0}},
1598      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1599        0x20b87b8aa2c4e503},
1600       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1601        0xf5c6fa49919776be},
1602       {1, 0, 0, 0}},
1603      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1604        0x1ed7d1b9332010b9},
1605       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1606        0x3a2b03f03217257a},
1607       {1, 0, 0, 0}},
1608      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1609        0x15fee545c78dd9f6},
1610       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1611        0x4ab5b6b2b8753f81},
1612       {1, 0, 0, 0}}}
1613 };
1614
1615 /*
1616  * select_point selects the |idx|th point from a precomputation table and
1617  * copies it to out.
1618  */
1619 static void select_point(const u64 idx, unsigned int size,
1620                          const smallfelem pre_comp[16][3], smallfelem out[3])
1621 {
1622     unsigned i, j;
1623     u64 *outlimbs = &out[0][0];
1624     memset(outlimbs, 0, 3 * sizeof(smallfelem));
1625
1626     for (i = 0; i < size; i++) {
1627         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1628         u64 mask = i ^ idx;
1629         mask |= mask >> 4;
1630         mask |= mask >> 2;
1631         mask |= mask >> 1;
1632         mask &= 1;
1633         mask--;
1634         for (j = 0; j < NLIMBS * 3; j++)
1635             outlimbs[j] |= inlimbs[j] & mask;
1636     }
1637 }
1638
1639 /* get_bit returns the |i|th bit in |in| */
1640 static char get_bit(const felem_bytearray in, int i)
1641 {
1642     if ((i < 0) || (i >= 256))
1643         return 0;
1644     return (in[i >> 3] >> (i & 7)) & 1;
1645 }
1646
1647 /*
1648  * Interleaved point multiplication using precomputed point multiples: The
1649  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1650  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1651  * generator, using certain (large) precomputed multiples in g_pre_comp.
1652  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1653  */
1654 static void batch_mul(felem x_out, felem y_out, felem z_out,
1655                       const felem_bytearray scalars[],
1656                       const unsigned num_points, const u8 *g_scalar,
1657                       const int mixed, const smallfelem pre_comp[][17][3],
1658                       const smallfelem g_pre_comp[2][16][3])
1659 {
1660     int i, skip;
1661     unsigned num, gen_mul = (g_scalar != NULL);
1662     felem nq[3], ftmp;
1663     smallfelem tmp[3];
1664     u64 bits;
1665     u8 sign, digit;
1666
1667     /* set nq to the point at infinity */
1668     memset(nq, 0, 3 * sizeof(felem));
1669
1670     /*
1671      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1672      * of the generator (two in each of the last 32 rounds) and additions of
1673      * other points multiples (every 5th round).
1674      */
1675     skip = 1;                   /* save two point operations in the first
1676                                  * round */
1677     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1678         /* double */
1679         if (!skip)
1680             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1681
1682         /* add multiples of the generator */
1683         if (gen_mul && (i <= 31)) {
1684             /* first, look 32 bits upwards */
1685             bits = get_bit(g_scalar, i + 224) << 3;
1686             bits |= get_bit(g_scalar, i + 160) << 2;
1687             bits |= get_bit(g_scalar, i + 96) << 1;
1688             bits |= get_bit(g_scalar, i + 32);
1689             /* select the point to add, in constant time */
1690             select_point(bits, 16, g_pre_comp[1], tmp);
1691
1692             if (!skip) {
1693                 /* Arg 1 below is for "mixed" */
1694                 point_add(nq[0], nq[1], nq[2],
1695                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1696             } else {
1697                 smallfelem_expand(nq[0], tmp[0]);
1698                 smallfelem_expand(nq[1], tmp[1]);
1699                 smallfelem_expand(nq[2], tmp[2]);
1700                 skip = 0;
1701             }
1702
1703             /* second, look at the current position */
1704             bits = get_bit(g_scalar, i + 192) << 3;
1705             bits |= get_bit(g_scalar, i + 128) << 2;
1706             bits |= get_bit(g_scalar, i + 64) << 1;
1707             bits |= get_bit(g_scalar, i);
1708             /* select the point to add, in constant time */
1709             select_point(bits, 16, g_pre_comp[0], tmp);
1710             /* Arg 1 below is for "mixed" */
1711             point_add(nq[0], nq[1], nq[2],
1712                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1713         }
1714
1715         /* do other additions every 5 doublings */
1716         if (num_points && (i % 5 == 0)) {
1717             /* loop over all scalars */
1718             for (num = 0; num < num_points; ++num) {
1719                 bits = get_bit(scalars[num], i + 4) << 5;
1720                 bits |= get_bit(scalars[num], i + 3) << 4;
1721                 bits |= get_bit(scalars[num], i + 2) << 3;
1722                 bits |= get_bit(scalars[num], i + 1) << 2;
1723                 bits |= get_bit(scalars[num], i) << 1;
1724                 bits |= get_bit(scalars[num], i - 1);
1725                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1726
1727                 /*
1728                  * select the point to add or subtract, in constant time
1729                  */
1730                 select_point(digit, 17, pre_comp[num], tmp);
1731                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1732                                                * point */
1733                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1734                 felem_contract(tmp[1], ftmp);
1735
1736                 if (!skip) {
1737                     point_add(nq[0], nq[1], nq[2],
1738                               nq[0], nq[1], nq[2],
1739                               mixed, tmp[0], tmp[1], tmp[2]);
1740                 } else {
1741                     smallfelem_expand(nq[0], tmp[0]);
1742                     smallfelem_expand(nq[1], tmp[1]);
1743                     smallfelem_expand(nq[2], tmp[2]);
1744                     skip = 0;
1745                 }
1746             }
1747         }
1748     }
1749     felem_assign(x_out, nq[0]);
1750     felem_assign(y_out, nq[1]);
1751     felem_assign(z_out, nq[2]);
1752 }
1753
1754 /* Precomputation for the group generator. */
1755 typedef struct {
1756     smallfelem g_pre_comp[2][16][3];
1757     int references;
1758 } NISTP256_PRE_COMP;
1759
1760 const EC_METHOD *EC_GFp_nistp256_method(void)
1761 {
1762     static const EC_METHOD ret = {
1763         EC_FLAGS_DEFAULT_OCT,
1764         NID_X9_62_prime_field,
1765         ec_GFp_nistp256_group_init,
1766         ec_GFp_simple_group_finish,
1767         ec_GFp_simple_group_clear_finish,
1768         ec_GFp_nist_group_copy,
1769         ec_GFp_nistp256_group_set_curve,
1770         ec_GFp_simple_group_get_curve,
1771         ec_GFp_simple_group_get_degree,
1772         ec_GFp_simple_group_check_discriminant,
1773         ec_GFp_simple_point_init,
1774         ec_GFp_simple_point_finish,
1775         ec_GFp_simple_point_clear_finish,
1776         ec_GFp_simple_point_copy,
1777         ec_GFp_simple_point_set_to_infinity,
1778         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1779         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1780         ec_GFp_simple_point_set_affine_coordinates,
1781         ec_GFp_nistp256_point_get_affine_coordinates,
1782         0 /* point_set_compressed_coordinates */ ,
1783         0 /* point2oct */ ,
1784         0 /* oct2point */ ,
1785         ec_GFp_simple_add,
1786         ec_GFp_simple_dbl,
1787         ec_GFp_simple_invert,
1788         ec_GFp_simple_is_at_infinity,
1789         ec_GFp_simple_is_on_curve,
1790         ec_GFp_simple_cmp,
1791         ec_GFp_simple_make_affine,
1792         ec_GFp_simple_points_make_affine,
1793         ec_GFp_nistp256_points_mul,
1794         ec_GFp_nistp256_precompute_mult,
1795         ec_GFp_nistp256_have_precompute_mult,
1796         ec_GFp_nist_field_mul,
1797         ec_GFp_nist_field_sqr,
1798         0 /* field_div */ ,
1799         0 /* field_encode */ ,
1800         0 /* field_decode */ ,
1801         0                       /* field_set_to_one */
1802     };
1803
1804     return &ret;
1805 }
1806
1807 /******************************************************************************/
1808 /*
1809  * FUNCTIONS TO MANAGE PRECOMPUTATION
1810  */
1811
1812 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1813 {
1814     NISTP256_PRE_COMP *ret = NULL;
1815     ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
1816     if (!ret) {
1817         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1818         return ret;
1819     }
1820     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1821     ret->references = 1;
1822     return ret;
1823 }
1824
1825 static void *nistp256_pre_comp_dup(void *src_)
1826 {
1827     NISTP256_PRE_COMP *src = src_;
1828
1829     /* no need to actually copy, these objects never change! */
1830     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1831
1832     return src_;
1833 }
1834
1835 static void nistp256_pre_comp_free(void *pre_)
1836 {
1837     int i;
1838     NISTP256_PRE_COMP *pre = pre_;
1839
1840     if (!pre)
1841         return;
1842
1843     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1844     if (i > 0)
1845         return;
1846
1847     OPENSSL_free(pre);
1848 }
1849
1850 static void nistp256_pre_comp_clear_free(void *pre_)
1851 {
1852     int i;
1853     NISTP256_PRE_COMP *pre = pre_;
1854
1855     if (!pre)
1856         return;
1857
1858     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1859     if (i > 0)
1860         return;
1861
1862     OPENSSL_cleanse(pre, sizeof *pre);
1863     OPENSSL_free(pre);
1864 }
1865
1866 /******************************************************************************/
1867 /*
1868  * OPENSSL EC_METHOD FUNCTIONS
1869  */
1870
1871 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1872 {
1873     int ret;
1874     ret = ec_GFp_simple_group_init(group);
1875     group->a_is_minus3 = 1;
1876     return ret;
1877 }
1878
1879 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1880                                     const BIGNUM *a, const BIGNUM *b,
1881                                     BN_CTX *ctx)
1882 {
1883     int ret = 0;
1884     BN_CTX *new_ctx = NULL;
1885     BIGNUM *curve_p, *curve_a, *curve_b;
1886
1887     if (ctx == NULL)
1888         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1889             return 0;
1890     BN_CTX_start(ctx);
1891     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1892         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1893         ((curve_b = BN_CTX_get(ctx)) == NULL))
1894         goto err;
1895     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1896     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1897     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1898     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1899         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1900               EC_R_WRONG_CURVE_PARAMETERS);
1901         goto err;
1902     }
1903     group->field_mod_func = BN_nist_mod_256;
1904     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1905  err:
1906     BN_CTX_end(ctx);
1907     if (new_ctx != NULL)
1908         BN_CTX_free(new_ctx);
1909     return ret;
1910 }
1911
1912 /*
1913  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1914  * (X/Z^2, Y/Z^3)
1915  */
1916 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1917                                                  const EC_POINT *point,
1918                                                  BIGNUM *x, BIGNUM *y,
1919                                                  BN_CTX *ctx)
1920 {
1921     felem z1, z2, x_in, y_in;
1922     smallfelem x_out, y_out;
1923     longfelem tmp;
1924
1925     if (EC_POINT_is_at_infinity(group, point)) {
1926         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1927               EC_R_POINT_AT_INFINITY);
1928         return 0;
1929     }
1930     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1931         (!BN_to_felem(z1, &point->Z)))
1932         return 0;
1933     felem_inv(z2, z1);
1934     felem_square(tmp, z2);
1935     felem_reduce(z1, tmp);
1936     felem_mul(tmp, x_in, z1);
1937     felem_reduce(x_in, tmp);
1938     felem_contract(x_out, x_in);
1939     if (x != NULL) {
1940         if (!smallfelem_to_BN(x, x_out)) {
1941             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1942                   ERR_R_BN_LIB);
1943             return 0;
1944         }
1945     }
1946     felem_mul(tmp, z1, z2);
1947     felem_reduce(z1, tmp);
1948     felem_mul(tmp, y_in, z1);
1949     felem_reduce(y_in, tmp);
1950     felem_contract(y_out, y_in);
1951     if (y != NULL) {
1952         if (!smallfelem_to_BN(y, y_out)) {
1953             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1954                   ERR_R_BN_LIB);
1955             return 0;
1956         }
1957     }
1958     return 1;
1959 }
1960
1961 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1962 static void make_points_affine(size_t num, smallfelem points[][3],
1963                                smallfelem tmp_smallfelems[])
1964 {
1965     /*
1966      * Runs in constant time, unless an input is the point at infinity (which
1967      * normally shouldn't happen).
1968      */
1969     ec_GFp_nistp_points_make_affine_internal(num,
1970                                              points,
1971                                              sizeof(smallfelem),
1972                                              tmp_smallfelems,
1973                                              (void (*)(void *))smallfelem_one,
1974                                              (int (*)(const void *))
1975                                              smallfelem_is_zero_int,
1976                                              (void (*)(void *, const void *))
1977                                              smallfelem_assign,
1978                                              (void (*)(void *, const void *))
1979                                              smallfelem_square_contract,
1980                                              (void (*)
1981                                               (void *, const void *,
1982                                                const void *))
1983                                              smallfelem_mul_contract,
1984                                              (void (*)(void *, const void *))
1985                                              smallfelem_inv_contract,
1986                                              /* nothing to contract */
1987                                              (void (*)(void *, const void *))
1988                                              smallfelem_assign);
1989 }
1990
1991 /*
1992  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1993  * values Result is stored in r (r can equal one of the inputs).
1994  */
1995 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1996                                const BIGNUM *scalar, size_t num,
1997                                const EC_POINT *points[],
1998                                const BIGNUM *scalars[], BN_CTX *ctx)
1999 {
2000     int ret = 0;
2001     int j;
2002     int mixed = 0;
2003     BN_CTX *new_ctx = NULL;
2004     BIGNUM *x, *y, *z, *tmp_scalar;
2005     felem_bytearray g_secret;
2006     felem_bytearray *secrets = NULL;
2007     smallfelem(*pre_comp)[17][3] = NULL;
2008     smallfelem *tmp_smallfelems = NULL;
2009     felem_bytearray tmp;
2010     unsigned i, num_bytes;
2011     int have_pre_comp = 0;
2012     size_t num_points = num;
2013     smallfelem x_in, y_in, z_in;
2014     felem x_out, y_out, z_out;
2015     NISTP256_PRE_COMP *pre = NULL;
2016     const smallfelem(*g_pre_comp)[16][3] = NULL;
2017     EC_POINT *generator = NULL;
2018     const EC_POINT *p = NULL;
2019     const BIGNUM *p_scalar = NULL;
2020
2021     if (ctx == NULL)
2022         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2023             return 0;
2024     BN_CTX_start(ctx);
2025     if (((x = BN_CTX_get(ctx)) == NULL) ||
2026         ((y = BN_CTX_get(ctx)) == NULL) ||
2027         ((z = BN_CTX_get(ctx)) == NULL) ||
2028         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2029         goto err;
2030
2031     if (scalar != NULL) {
2032         pre = EC_EX_DATA_get_data(group->extra_data,
2033                                   nistp256_pre_comp_dup,
2034                                   nistp256_pre_comp_free,
2035                                   nistp256_pre_comp_clear_free);
2036         if (pre)
2037             /* we have precomputation, try to use it */
2038             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2039         else
2040             /* try to use the standard precomputation */
2041             g_pre_comp = &gmul[0];
2042         generator = EC_POINT_new(group);
2043         if (generator == NULL)
2044             goto err;
2045         /* get the generator from precomputation */
2046         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2047             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2048             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2049             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2050             goto err;
2051         }
2052         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2053                                                       generator, x, y, z,
2054                                                       ctx))
2055             goto err;
2056         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2057             /* precomputation matches generator */
2058             have_pre_comp = 1;
2059         else
2060             /*
2061              * we don't have valid precomputation: treat the generator as a
2062              * random point
2063              */
2064             num_points++;
2065     }
2066     if (num_points > 0) {
2067         if (num_points >= 3) {
2068             /*
2069              * unless we precompute multiples for just one or two points,
2070              * converting those into affine form is time well spent
2071              */
2072             mixed = 1;
2073         }
2074         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2075         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2076         if (mixed)
2077             tmp_smallfelems =
2078                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2079         if ((secrets == NULL) || (pre_comp == NULL)
2080             || (mixed && (tmp_smallfelems == NULL))) {
2081             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2082             goto err;
2083         }
2084
2085         /*
2086          * we treat NULL scalars as 0, and NULL points as points at infinity,
2087          * i.e., they contribute nothing to the linear combination
2088          */
2089         memset(secrets, 0, num_points * sizeof(felem_bytearray));
2090         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2091         for (i = 0; i < num_points; ++i) {
2092             if (i == num)
2093                 /*
2094                  * we didn't have a valid precomputation, so we pick the
2095                  * generator
2096                  */
2097             {
2098                 p = EC_GROUP_get0_generator(group);
2099                 p_scalar = scalar;
2100             } else
2101                 /* the i^th point */
2102             {
2103                 p = points[i];
2104                 p_scalar = scalars[i];
2105             }
2106             if ((p_scalar != NULL) && (p != NULL)) {
2107                 /* reduce scalar to 0 <= scalar < 2^256 */
2108                 if ((BN_num_bits(p_scalar) > 256)
2109                     || (BN_is_negative(p_scalar))) {
2110                     /*
2111                      * this is an unusual input, and we don't guarantee
2112                      * constant-timeness
2113                      */
2114                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2115                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2116                         goto err;
2117                     }
2118                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2119                 } else
2120                     num_bytes = BN_bn2bin(p_scalar, tmp);
2121                 flip_endian(secrets[i], tmp, num_bytes);
2122                 /* precompute multiples */
2123                 if ((!BN_to_felem(x_out, &p->X)) ||
2124                     (!BN_to_felem(y_out, &p->Y)) ||
2125                     (!BN_to_felem(z_out, &p->Z)))
2126                     goto err;
2127                 felem_shrink(pre_comp[i][1][0], x_out);
2128                 felem_shrink(pre_comp[i][1][1], y_out);
2129                 felem_shrink(pre_comp[i][1][2], z_out);
2130                 for (j = 2; j <= 16; ++j) {
2131                     if (j & 1) {
2132                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2133                                         pre_comp[i][j][2], pre_comp[i][1][0],
2134                                         pre_comp[i][1][1], pre_comp[i][1][2],
2135                                         pre_comp[i][j - 1][0],
2136                                         pre_comp[i][j - 1][1],
2137                                         pre_comp[i][j - 1][2]);
2138                     } else {
2139                         point_double_small(pre_comp[i][j][0],
2140                                            pre_comp[i][j][1],
2141                                            pre_comp[i][j][2],
2142                                            pre_comp[i][j / 2][0],
2143                                            pre_comp[i][j / 2][1],
2144                                            pre_comp[i][j / 2][2]);
2145                     }
2146                 }
2147             }
2148         }
2149         if (mixed)
2150             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2151     }
2152
2153     /* the scalar for the generator */
2154     if ((scalar != NULL) && (have_pre_comp)) {
2155         memset(g_secret, 0, sizeof(g_secret));
2156         /* reduce scalar to 0 <= scalar < 2^256 */
2157         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2158             /*
2159              * this is an unusual input, and we don't guarantee
2160              * constant-timeness
2161              */
2162             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2163                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2164                 goto err;
2165             }
2166             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2167         } else
2168             num_bytes = BN_bn2bin(scalar, tmp);
2169         flip_endian(g_secret, tmp, num_bytes);
2170         /* do the multiplication with generator precomputation */
2171         batch_mul(x_out, y_out, z_out,
2172                   (const felem_bytearray(*))secrets, num_points,
2173                   g_secret,
2174                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2175     } else
2176         /* do the multiplication without generator precomputation */
2177         batch_mul(x_out, y_out, z_out,
2178                   (const felem_bytearray(*))secrets, num_points,
2179                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2180     /* reduce the output to its unique minimal representation */
2181     felem_contract(x_in, x_out);
2182     felem_contract(y_in, y_out);
2183     felem_contract(z_in, z_out);
2184     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2185         (!smallfelem_to_BN(z, z_in))) {
2186         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2187         goto err;
2188     }
2189     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2190
2191  err:
2192     BN_CTX_end(ctx);
2193     if (generator != NULL)
2194         EC_POINT_free(generator);
2195     if (new_ctx != NULL)
2196         BN_CTX_free(new_ctx);
2197     if (secrets != NULL)
2198         OPENSSL_free(secrets);
2199     if (pre_comp != NULL)
2200         OPENSSL_free(pre_comp);
2201     if (tmp_smallfelems != NULL)
2202         OPENSSL_free(tmp_smallfelems);
2203     return ret;
2204 }
2205
2206 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2207 {
2208     int ret = 0;
2209     NISTP256_PRE_COMP *pre = NULL;
2210     int i, j;
2211     BN_CTX *new_ctx = NULL;
2212     BIGNUM *x, *y;
2213     EC_POINT *generator = NULL;
2214     smallfelem tmp_smallfelems[32];
2215     felem x_tmp, y_tmp, z_tmp;
2216
2217     /* throw away old precomputation */
2218     EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2219                          nistp256_pre_comp_free,
2220                          nistp256_pre_comp_clear_free);
2221     if (ctx == NULL)
2222         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2223             return 0;
2224     BN_CTX_start(ctx);
2225     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2226         goto err;
2227     /* get the generator */
2228     if (group->generator == NULL)
2229         goto err;
2230     generator = EC_POINT_new(group);
2231     if (generator == NULL)
2232         goto err;
2233     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2234     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2235     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2236         goto err;
2237     if ((pre = nistp256_pre_comp_new()) == NULL)
2238         goto err;
2239     /*
2240      * if the generator is the standard one, use built-in precomputation
2241      */
2242     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2243         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2244         ret = 1;
2245         goto err;
2246     }
2247     if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2248         (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2249         (!BN_to_felem(z_tmp, &group->generator->Z)))
2250         goto err;
2251     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2252     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2253     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2254     /*
2255      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2256      * 2^160*G, 2^224*G for the second one
2257      */
2258     for (i = 1; i <= 8; i <<= 1) {
2259         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2260                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2261                            pre->g_pre_comp[0][i][1],
2262                            pre->g_pre_comp[0][i][2]);
2263         for (j = 0; j < 31; ++j) {
2264             point_double_small(pre->g_pre_comp[1][i][0],
2265                                pre->g_pre_comp[1][i][1],
2266                                pre->g_pre_comp[1][i][2],
2267                                pre->g_pre_comp[1][i][0],
2268                                pre->g_pre_comp[1][i][1],
2269                                pre->g_pre_comp[1][i][2]);
2270         }
2271         if (i == 8)
2272             break;
2273         point_double_small(pre->g_pre_comp[0][2 * i][0],
2274                            pre->g_pre_comp[0][2 * i][1],
2275                            pre->g_pre_comp[0][2 * i][2],
2276                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2277                            pre->g_pre_comp[1][i][2]);
2278         for (j = 0; j < 31; ++j) {
2279             point_double_small(pre->g_pre_comp[0][2 * i][0],
2280                                pre->g_pre_comp[0][2 * i][1],
2281                                pre->g_pre_comp[0][2 * i][2],
2282                                pre->g_pre_comp[0][2 * i][0],
2283                                pre->g_pre_comp[0][2 * i][1],
2284                                pre->g_pre_comp[0][2 * i][2]);
2285         }
2286     }
2287     for (i = 0; i < 2; i++) {
2288         /* g_pre_comp[i][0] is the point at infinity */
2289         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2290         /* the remaining multiples */
2291         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2292         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2293                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2294                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2295                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2296                         pre->g_pre_comp[i][2][2]);
2297         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2298         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2299                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2300                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2301                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2302                         pre->g_pre_comp[i][2][2]);
2303         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2304         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2305                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2306                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2307                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2308                         pre->g_pre_comp[i][4][2]);
2309         /*
2310          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2311          */
2312         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2313                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2314                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2315                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2316                         pre->g_pre_comp[i][2][2]);
2317         for (j = 1; j < 8; ++j) {
2318             /* odd multiples: add G resp. 2^32*G */
2319             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2320                             pre->g_pre_comp[i][2 * j + 1][1],
2321                             pre->g_pre_comp[i][2 * j + 1][2],
2322                             pre->g_pre_comp[i][2 * j][0],
2323                             pre->g_pre_comp[i][2 * j][1],
2324                             pre->g_pre_comp[i][2 * j][2],
2325                             pre->g_pre_comp[i][1][0],
2326                             pre->g_pre_comp[i][1][1],
2327                             pre->g_pre_comp[i][1][2]);
2328         }
2329     }
2330     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2331
2332     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2333                              nistp256_pre_comp_free,
2334                              nistp256_pre_comp_clear_free))
2335         goto err;
2336     ret = 1;
2337     pre = NULL;
2338  err:
2339     BN_CTX_end(ctx);
2340     if (generator != NULL)
2341         EC_POINT_free(generator);
2342     if (new_ctx != NULL)
2343         BN_CTX_free(new_ctx);
2344     if (pre)
2345         nistp256_pre_comp_free(pre);
2346     return ret;
2347 }
2348
2349 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2350 {
2351     if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2352                             nistp256_pre_comp_free,
2353                             nistp256_pre_comp_clear_free)
2354         != NULL)
2355         return 1;
2356     else
2357         return 0;
2358 }
2359 #else
2360 static void *dummy = &dummy;
2361 #endif