crypto/ec/ecp_nistp256.c

   1 /* crypto/ec/ecp_nistp256.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 typedef __int128_t int128_t;
  42 # else
  43 #  error "Need GCC 3.1 or later to define type uint128_t"
  44 # endif
  45
  46 typedef uint8_t u8;
  47 typedef uint32_t u32;
  48 typedef uint64_t u64;
  49 typedef int64_t s64;
  50
  51 /*
  52  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
  53  * can serialise an element of this field into 32 bytes. We call this an
  54  * felem_bytearray.
  55  */
  56
  57 typedef u8 felem_bytearray[32];
  58
  59 /*
  60  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  61  * values are big-endian.
  62  */
  63 static const felem_bytearray nistp256_curve_params[5] = {
  64     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  65      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  66      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  68     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  69      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  70      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
  72     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
  73      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  74      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  75      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  76     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  77      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  78      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  79      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  80     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  81      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  82      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  83      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  84 };
  85
  86 /*-
  87  * The representation of field elements.
  88  * ------------------------------------
  89  *
  90  * We represent field elements with either four 128-bit values, eight 128-bit
  91  * values, or four 64-bit values. The field element represented is:
  92  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  93  * or:
  94  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
  95  *
  96  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  97  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  98  * with the least significant bits of the next.
  99  *
 100  * A field element with four limbs is an 'felem'. One with eight limbs is a
 101  * 'longfelem'
 102  *
 103  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 104  * values are used as intermediate values before multiplication.
 105  */
 106
 107 # define NLIMBS 4
 108
 109 typedef uint128_t limb;
 110 typedef limb felem[NLIMBS];
 111 typedef limb longfelem[NLIMBS * 2];
 112 typedef u64 smallfelem[NLIMBS];
 113
 114 /* This is the value of the prime as four 64-bit words, little-endian. */
 115 static const u64 kPrime[4] =
 116     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 117 static const u64 bottom63bits = 0x7ffffffffffffffful;
 118
 119 /*
 120  * bin32_to_felem takes a little-endian byte array and converts it into felem
 121  * form. This assumes that the CPU is little-endian.
 122  */
 123 static void bin32_to_felem(felem out, const u8 in[32])
 124 {
 125     out[0] = *((u64 *)&in[0]);
 126     out[1] = *((u64 *)&in[8]);
 127     out[2] = *((u64 *)&in[16]);
 128     out[3] = *((u64 *)&in[24]);
 129 }
 130
 131 /*
 132  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 133  * endian, 32 byte array. This assumes that the CPU is little-endian.
 134  */
 135 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 136 {
 137     *((u64 *)&out[0]) = in[0];
 138     *((u64 *)&out[8]) = in[1];
 139     *((u64 *)&out[16]) = in[2];
 140     *((u64 *)&out[24]) = in[3];
 141 }
 142
 143 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 144 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 145 {
 146     unsigned i;
 147     for (i = 0; i < len; ++i)
 148         out[i] = in[len - 1 - i];
 149 }
 150
 151 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 152 static int BN_to_felem(felem out, const BIGNUM *bn)
 153 {
 154     felem_bytearray b_in;
 155     felem_bytearray b_out;
 156     unsigned num_bytes;
 157
 158     /* BN_bn2bin eats leading zeroes */
 159     memset(b_out, 0, sizeof(b_out));
 160     num_bytes = BN_num_bytes(bn);
 161     if (num_bytes > sizeof b_out) {
 162         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 163         return 0;
 164     }
 165     if (BN_is_negative(bn)) {
 166         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 167         return 0;
 168     }
 169     num_bytes = BN_bn2bin(bn, b_in);
 170     flip_endian(b_out, b_in, num_bytes);
 171     bin32_to_felem(out, b_out);
 172     return 1;
 173 }
 174
 175 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 176 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 177 {
 178     felem_bytearray b_in, b_out;
 179     smallfelem_to_bin32(b_in, in);
 180     flip_endian(b_out, b_in, sizeof b_out);
 181     return BN_bin2bn(b_out, sizeof b_out, out);
 182 }
 183
 184 /*-
 185  * Field operations
 186  * ----------------
 187  */
 188
 189 static void smallfelem_one(smallfelem out)
 190 {
 191     out[0] = 1;
 192     out[1] = 0;
 193     out[2] = 0;
 194     out[3] = 0;
 195 }
 196
 197 static void smallfelem_assign(smallfelem out, const smallfelem in)
 198 {
 199     out[0] = in[0];
 200     out[1] = in[1];
 201     out[2] = in[2];
 202     out[3] = in[3];
 203 }
 204
 205 static void felem_assign(felem out, const felem in)
 206 {
 207     out[0] = in[0];
 208     out[1] = in[1];
 209     out[2] = in[2];
 210     out[3] = in[3];
 211 }
 212
 213 /* felem_sum sets out = out + in. */
 214 static void felem_sum(felem out, const felem in)
 215 {
 216     out[0] += in[0];
 217     out[1] += in[1];
 218     out[2] += in[2];
 219     out[3] += in[3];
 220 }
 221
 222 /* felem_small_sum sets out = out + in. */
 223 static void felem_small_sum(felem out, const smallfelem in)
 224 {
 225     out[0] += in[0];
 226     out[1] += in[1];
 227     out[2] += in[2];
 228     out[3] += in[3];
 229 }
 230
 231 /* felem_scalar sets out = out * scalar */
 232 static void felem_scalar(felem out, const u64 scalar)
 233 {
 234     out[0] *= scalar;
 235     out[1] *= scalar;
 236     out[2] *= scalar;
 237     out[3] *= scalar;
 238 }
 239
 240 /* longfelem_scalar sets out = out * scalar */
 241 static void longfelem_scalar(longfelem out, const u64 scalar)
 242 {
 243     out[0] *= scalar;
 244     out[1] *= scalar;
 245     out[2] *= scalar;
 246     out[3] *= scalar;
 247     out[4] *= scalar;
 248     out[5] *= scalar;
 249     out[6] *= scalar;
 250     out[7] *= scalar;
 251 }
 252
 253 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 254 # define two105 (((limb)1) << 105)
 255 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 256
 257 /* zero105 is 0 mod p */
 258 static const felem zero105 =
 259     { two105m41m9, two105, two105m41p9, two105m41p9 };
 260
 261 /*-
 262  * smallfelem_neg sets |out| to |-small|
 263  * On exit:
 264  *   out[i] < out[i] + 2^105
 265  */
 266 static void smallfelem_neg(felem out, const smallfelem small)
 267 {
 268     /* In order to prevent underflow, we subtract from 0 mod p. */
 269     out[0] = zero105[0] - small[0];
 270     out[1] = zero105[1] - small[1];
 271     out[2] = zero105[2] - small[2];
 272     out[3] = zero105[3] - small[3];
 273 }
 274
 275 /*-
 276  * felem_diff subtracts |in| from |out|
 277  * On entry:
 278  *   in[i] < 2^104
 279  * On exit:
 280  *   out[i] < out[i] + 2^105
 281  */
 282 static void felem_diff(felem out, const felem in)
 283 {
 284     /*
 285      * In order to prevent underflow, we add 0 mod p before subtracting.
 286      */
 287     out[0] += zero105[0];
 288     out[1] += zero105[1];
 289     out[2] += zero105[2];
 290     out[3] += zero105[3];
 291
 292     out[0] -= in[0];
 293     out[1] -= in[1];
 294     out[2] -= in[2];
 295     out[3] -= in[3];
 296 }
 297
 298 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 299 # define two107 (((limb)1) << 107)
 300 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 301
 302 /* zero107 is 0 mod p */
 303 static const felem zero107 =
 304     { two107m43m11, two107, two107m43p11, two107m43p11 };
 305
 306 /*-
 307  * An alternative felem_diff for larger inputs |in|
 308  * felem_diff_zero107 subtracts |in| from |out|
 309  * On entry:
 310  *   in[i] < 2^106
 311  * On exit:
 312  *   out[i] < out[i] + 2^107
 313  */
 314 static void felem_diff_zero107(felem out, const felem in)
 315 {
 316     /*
 317      * In order to prevent underflow, we add 0 mod p before subtracting.
 318      */
 319     out[0] += zero107[0];
 320     out[1] += zero107[1];
 321     out[2] += zero107[2];
 322     out[3] += zero107[3];
 323
 324     out[0] -= in[0];
 325     out[1] -= in[1];
 326     out[2] -= in[2];
 327     out[3] -= in[3];
 328 }
 329
 330 /*-
 331  * longfelem_diff subtracts |in| from |out|
 332  * On entry:
 333  *   in[i] < 7*2^67
 334  * On exit:
 335  *   out[i] < out[i] + 2^70 + 2^40
 336  */
 337 static void longfelem_diff(longfelem out, const longfelem in)
 338 {
 339     static const limb two70m8p6 =
 340         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 341     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 342     static const limb two70 = (((limb) 1) << 70);
 343     static const limb two70m40m38p6 =
 344         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 345         (((limb) 1) << 6);
 346     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 347
 348     /* add 0 mod p to avoid underflow */
 349     out[0] += two70m8p6;
 350     out[1] += two70p40;
 351     out[2] += two70;
 352     out[3] += two70m40m38p6;
 353     out[4] += two70m6;
 354     out[5] += two70m6;
 355     out[6] += two70m6;
 356     out[7] += two70m6;
 357
 358     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 359     out[0] -= in[0];
 360     out[1] -= in[1];
 361     out[2] -= in[2];
 362     out[3] -= in[3];
 363     out[4] -= in[4];
 364     out[5] -= in[5];
 365     out[6] -= in[6];
 366     out[7] -= in[7];
 367 }
 368
 369 # define two64m0 (((limb)1) << 64) - 1
 370 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 371 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 372 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 373
 374 /* zero110 is 0 mod p */
 375 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 376
 377 /*-
 378  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 379  * minimal as the value may be greater than p.
 380  *
 381  * On entry:
 382  *   in[i] < 2^109
 383  * On exit:
 384  *   out[i] < 2^64
 385  */
 386 static void felem_shrink(smallfelem out, const felem in)
 387 {
 388     felem tmp;
 389     u64 a, b, mask;
 390     s64 high, low;
 391     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 392
 393     /* Carry 2->3 */
 394     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 395     /* tmp[3] < 2^110 */
 396
 397     tmp[2] = zero110[2] + (u64)in[2];
 398     tmp[0] = zero110[0] + in[0];
 399     tmp[1] = zero110[1] + in[1];
 400     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 401
 402     /*
 403      * We perform two partial reductions where we eliminate the high-word of
 404      * tmp[3]. We don't update the other words till the end.
 405      */
 406     a = tmp[3] >> 64;           /* a < 2^46 */
 407     tmp[3] = (u64)tmp[3];
 408     tmp[3] -= a;
 409     tmp[3] += ((limb) a) << 32;
 410     /* tmp[3] < 2^79 */
 411
 412     b = a;
 413     a = tmp[3] >> 64;           /* a < 2^15 */
 414     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 415     tmp[3] = (u64)tmp[3];
 416     tmp[3] -= a;
 417     tmp[3] += ((limb) a) << 32;
 418     /* tmp[3] < 2^64 + 2^47 */
 419
 420     /*
 421      * This adjusts the other two words to complete the two partial
 422      * reductions.
 423      */
 424     tmp[0] += b;
 425     tmp[1] -= (((limb) b) << 32);
 426
 427     /*
 428      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 429      * conditionally subtract kPrime if tmp[3] is large enough.
 430      */
 431     high = tmp[3] >> 64;
 432     /* As tmp[3] < 2^65, high is either 1 or 0 */
 433     high <<= 63;
 434     high >>= 63;
 435     /*-
 436      * high is:
 437      *   all ones   if the high word of tmp[3] is 1
 438      *   all zeros  if the high word of tmp[3] if 0 */
 439     low = tmp[3];
 440     mask = low >> 63;
 441     /*-
 442      * mask is:
 443      *   all ones   if the MSB of low is 1
 444      *   all zeros  if the MSB of low if 0 */
 445     low &= bottom63bits;
 446     low -= kPrime3Test;
 447     /* if low was greater than kPrime3Test then the MSB is zero */
 448     low = ~low;
 449     low >>= 63;
 450     /*-
 451      * low is:
 452      *   all ones   if low was > kPrime3Test
 453      *   all zeros  if low was <= kPrime3Test */
 454     mask = (mask & low) | high;
 455     tmp[0] -= mask & kPrime[0];
 456     tmp[1] -= mask & kPrime[1];
 457     /* kPrime[2] is zero, so omitted */
 458     tmp[3] -= mask & kPrime[3];
 459     /* tmp[3] < 2**64 - 2**32 + 1 */
 460
 461     tmp[1] += ((u64)(tmp[0] >> 64));
 462     tmp[0] = (u64)tmp[0];
 463     tmp[2] += ((u64)(tmp[1] >> 64));
 464     tmp[1] = (u64)tmp[1];
 465     tmp[3] += ((u64)(tmp[2] >> 64));
 466     tmp[2] = (u64)tmp[2];
 467     /* tmp[i] < 2^64 */
 468
 469     out[0] = tmp[0];
 470     out[1] = tmp[1];
 471     out[2] = tmp[2];
 472     out[3] = tmp[3];
 473 }
 474
 475 /* smallfelem_expand converts a smallfelem to an felem */
 476 static void smallfelem_expand(felem out, const smallfelem in)
 477 {
 478     out[0] = in[0];
 479     out[1] = in[1];
 480     out[2] = in[2];
 481     out[3] = in[3];
 482 }
 483
 484 /*-
 485  * smallfelem_square sets |out| = |small|^2
 486  * On entry:
 487  *   small[i] < 2^64
 488  * On exit:
 489  *   out[i] < 7 * 2^64 < 2^67
 490  */
 491 static void smallfelem_square(longfelem out, const smallfelem small)
 492 {
 493     limb a;
 494     u64 high, low;
 495
 496     a = ((uint128_t) small[0]) * small[0];
 497     low = a;
 498     high = a >> 64;
 499     out[0] = low;
 500     out[1] = high;
 501
 502     a = ((uint128_t) small[0]) * small[1];
 503     low = a;
 504     high = a >> 64;
 505     out[1] += low;
 506     out[1] += low;
 507     out[2] = high;
 508
 509     a = ((uint128_t) small[0]) * small[2];
 510     low = a;
 511     high = a >> 64;
 512     out[2] += low;
 513     out[2] *= 2;
 514     out[3] = high;
 515
 516     a = ((uint128_t) small[0]) * small[3];
 517     low = a;
 518     high = a >> 64;
 519     out[3] += low;
 520     out[4] = high;
 521
 522     a = ((uint128_t) small[1]) * small[2];
 523     low = a;
 524     high = a >> 64;
 525     out[3] += low;
 526     out[3] *= 2;
 527     out[4] += high;
 528
 529     a = ((uint128_t) small[1]) * small[1];
 530     low = a;
 531     high = a >> 64;
 532     out[2] += low;
 533     out[3] += high;
 534
 535     a = ((uint128_t) small[1]) * small[3];
 536     low = a;
 537     high = a >> 64;
 538     out[4] += low;
 539     out[4] *= 2;
 540     out[5] = high;
 541
 542     a = ((uint128_t) small[2]) * small[3];
 543     low = a;
 544     high = a >> 64;
 545     out[5] += low;
 546     out[5] *= 2;
 547     out[6] = high;
 548     out[6] += high;
 549
 550     a = ((uint128_t) small[2]) * small[2];
 551     low = a;
 552     high = a >> 64;
 553     out[4] += low;
 554     out[5] += high;
 555
 556     a = ((uint128_t) small[3]) * small[3];
 557     low = a;
 558     high = a >> 64;
 559     out[6] += low;
 560     out[7] = high;
 561 }
 562
 563 /*-
 564  * felem_square sets |out| = |in|^2
 565  * On entry:
 566  *   in[i] < 2^109
 567  * On exit:
 568  *   out[i] < 7 * 2^64 < 2^67
 569  */
 570 static void felem_square(longfelem out, const felem in)
 571 {
 572     u64 small[4];
 573     felem_shrink(small, in);
 574     smallfelem_square(out, small);
 575 }
 576
 577 /*-
 578  * smallfelem_mul sets |out| = |small1| * |small2|
 579  * On entry:
 580  *   small1[i] < 2^64
 581  *   small2[i] < 2^64
 582  * On exit:
 583  *   out[i] < 7 * 2^64 < 2^67
 584  */
 585 static void smallfelem_mul(longfelem out, const smallfelem small1,
 586                            const smallfelem small2)
 587 {
 588     limb a;
 589     u64 high, low;
 590
 591     a = ((uint128_t) small1[0]) * small2[0];
 592     low = a;
 593     high = a >> 64;
 594     out[0] = low;
 595     out[1] = high;
 596
 597     a = ((uint128_t) small1[0]) * small2[1];
 598     low = a;
 599     high = a >> 64;
 600     out[1] += low;
 601     out[2] = high;
 602
 603     a = ((uint128_t) small1[1]) * small2[0];
 604     low = a;
 605     high = a >> 64;
 606     out[1] += low;
 607     out[2] += high;
 608
 609     a = ((uint128_t) small1[0]) * small2[2];
 610     low = a;
 611     high = a >> 64;
 612     out[2] += low;
 613     out[3] = high;
 614
 615     a = ((uint128_t) small1[1]) * small2[1];
 616     low = a;
 617     high = a >> 64;
 618     out[2] += low;
 619     out[3] += high;
 620
 621     a = ((uint128_t) small1[2]) * small2[0];
 622     low = a;
 623     high = a >> 64;
 624     out[2] += low;
 625     out[3] += high;
 626
 627     a = ((uint128_t) small1[0]) * small2[3];
 628     low = a;
 629     high = a >> 64;
 630     out[3] += low;
 631     out[4] = high;
 632
 633     a = ((uint128_t) small1[1]) * small2[2];
 634     low = a;
 635     high = a >> 64;
 636     out[3] += low;
 637     out[4] += high;
 638
 639     a = ((uint128_t) small1[2]) * small2[1];
 640     low = a;
 641     high = a >> 64;
 642     out[3] += low;
 643     out[4] += high;
 644
 645     a = ((uint128_t) small1[3]) * small2[0];
 646     low = a;
 647     high = a >> 64;
 648     out[3] += low;
 649     out[4] += high;
 650
 651     a = ((uint128_t) small1[1]) * small2[3];
 652     low = a;
 653     high = a >> 64;
 654     out[4] += low;
 655     out[5] = high;
 656
 657     a = ((uint128_t) small1[2]) * small2[2];
 658     low = a;
 659     high = a >> 64;
 660     out[4] += low;
 661     out[5] += high;
 662
 663     a = ((uint128_t) small1[3]) * small2[1];
 664     low = a;
 665     high = a >> 64;
 666     out[4] += low;
 667     out[5] += high;
 668
 669     a = ((uint128_t) small1[2]) * small2[3];
 670     low = a;
 671     high = a >> 64;
 672     out[5] += low;
 673     out[6] = high;
 674
 675     a = ((uint128_t) small1[3]) * small2[2];
 676     low = a;
 677     high = a >> 64;
 678     out[5] += low;
 679     out[6] += high;
 680
 681     a = ((uint128_t) small1[3]) * small2[3];
 682     low = a;
 683     high = a >> 64;
 684     out[6] += low;
 685     out[7] = high;
 686 }
 687
 688 /*-
 689  * felem_mul sets |out| = |in1| * |in2|
 690  * On entry:
 691  *   in1[i] < 2^109
 692  *   in2[i] < 2^109
 693  * On exit:
 694  *   out[i] < 7 * 2^64 < 2^67
 695  */
 696 static void felem_mul(longfelem out, const felem in1, const felem in2)
 697 {
 698     smallfelem small1, small2;
 699     felem_shrink(small1, in1);
 700     felem_shrink(small2, in2);
 701     smallfelem_mul(out, small1, small2);
 702 }
 703
 704 /*-
 705  * felem_small_mul sets |out| = |small1| * |in2|
 706  * On entry:
 707  *   small1[i] < 2^64
 708  *   in2[i] < 2^109
 709  * On exit:
 710  *   out[i] < 7 * 2^64 < 2^67
 711  */
 712 static void felem_small_mul(longfelem out, const smallfelem small1,
 713                             const felem in2)
 714 {
 715     smallfelem small2;
 716     felem_shrink(small2, in2);
 717     smallfelem_mul(out, small1, small2);
 718 }
 719
 720 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 721 # define two100 (((limb)1) << 100)
 722 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 723 /* zero100 is 0 mod p */
 724 static const felem zero100 =
 725     { two100m36m4, two100, two100m36p4, two100m36p4 };
 726
 727 /*-
 728  * Internal function for the different flavours of felem_reduce.
 729  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 730  * On entry:
 731  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 732  *   out[1] >= in[7] + 2^32*in[4]
 733  *   out[2] >= in[5] + 2^32*in[5]
 734  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 735  * On exit:
 736  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 737  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 738  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 739  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 740  */
 741 static void felem_reduce_(felem out, const longfelem in)
 742 {
 743     int128_t c;
 744     /* combine common terms from below */
 745     c = in[4] + (in[5] << 32);
 746     out[0] += c;
 747     out[3] -= c;
 748
 749     c = in[5] - in[7];
 750     out[1] += c;
 751     out[2] -= c;
 752
 753     /* the remaining terms */
 754     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 755     out[1] -= (in[4] << 32);
 756     out[3] += (in[4] << 32);
 757
 758     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 759     out[2] -= (in[5] << 32);
 760
 761     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 762     out[0] -= in[6];
 763     out[0] -= (in[6] << 32);
 764     out[1] += (in[6] << 33);
 765     out[2] += (in[6] * 2);
 766     out[3] -= (in[6] << 32);
 767
 768     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 769     out[0] -= in[7];
 770     out[0] -= (in[7] << 32);
 771     out[2] += (in[7] << 33);
 772     out[3] += (in[7] * 3);
 773 }
 774
 775 /*-
 776  * felem_reduce converts a longfelem into an felem.
 777  * To be called directly after felem_square or felem_mul.
 778  * On entry:
 779  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 780  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 781  * On exit:
 782  *   out[i] < 2^101
 783  */
 784 static void felem_reduce(felem out, const longfelem in)
 785 {
 786     out[0] = zero100[0] + in[0];
 787     out[1] = zero100[1] + in[1];
 788     out[2] = zero100[2] + in[2];
 789     out[3] = zero100[3] + in[3];
 790
 791     felem_reduce_(out, in);
 792
 793     /*-
 794      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 795      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 796      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 797      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 798      *
 799      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 800      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 801      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 802      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 803      */
 804 }
 805
 806 /*-
 807  * felem_reduce_zero105 converts a larger longfelem into an felem.
 808  * On entry:
 809  *   in[0] < 2^71
 810  * On exit:
 811  *   out[i] < 2^106
 812  */
 813 static void felem_reduce_zero105(felem out, const longfelem in)
 814 {
 815     out[0] = zero105[0] + in[0];
 816     out[1] = zero105[1] + in[1];
 817     out[2] = zero105[2] + in[2];
 818     out[3] = zero105[3] + in[3];
 819
 820     felem_reduce_(out, in);
 821
 822     /*-
 823      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 824      * out[1] > 2^105 - 2^71 - 2^103 > 0
 825      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 826      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 827      *
 828      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 829      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 831      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 832      */
 833 }
 834
 835 /*
 836  * subtract_u64 sets *result = *result - v and *carry to one if the
 837  * subtraction underflowed.
 838  */
 839 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 840 {
 841     uint128_t r = *result;
 842     r -= v;
 843     *carry = (r >> 64) & 1;
 844     *result = (u64)r;
 845 }
 846
 847 /*
 848  * felem_contract converts |in| to its unique, minimal representation. On
 849  * entry: in[i] < 2^109
 850  */
 851 static void felem_contract(smallfelem out, const felem in)
 852 {
 853     unsigned i;
 854     u64 all_equal_so_far = 0, result = 0, carry;
 855
 856     felem_shrink(out, in);
 857     /* small is minimal except that the value might be > p */
 858
 859     all_equal_so_far--;
 860     /*
 861      * We are doing a constant time test if out >= kPrime. We need to compare
 862      * each u64, from most-significant to least significant. For each one, if
 863      * all words so far have been equal (m is all ones) then a non-equal
 864      * result is the answer. Otherwise we continue.
 865      */
 866     for (i = 3; i < 4; i--) {
 867         u64 equal;
 868         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 869         /*
 870          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 871          * will all be set.
 872          */
 873         result |= all_equal_so_far & ((u64)(a >> 64));
 874
 875         /*
 876          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 877          * decrement will make it all ones.
 878          */
 879         equal = kPrime[i] ^ out[i];
 880         equal--;
 881         equal &= equal << 32;
 882         equal &= equal << 16;
 883         equal &= equal << 8;
 884         equal &= equal << 4;
 885         equal &= equal << 2;
 886         equal &= equal << 1;
 887         equal = ((s64) equal) >> 63;
 888
 889         all_equal_so_far &= equal;
 890     }
 891
 892     /*
 893      * if all_equal_so_far is still all ones then the two values are equal
 894      * and so out >= kPrime is true.
 895      */
 896     result |= all_equal_so_far;
 897
 898     /* if out >= kPrime then we subtract kPrime. */
 899     subtract_u64(&out[0], &carry, result & kPrime[0]);
 900     subtract_u64(&out[1], &carry, carry);
 901     subtract_u64(&out[2], &carry, carry);
 902     subtract_u64(&out[3], &carry, carry);
 903
 904     subtract_u64(&out[1], &carry, result & kPrime[1]);
 905     subtract_u64(&out[2], &carry, carry);
 906     subtract_u64(&out[3], &carry, carry);
 907
 908     subtract_u64(&out[2], &carry, result & kPrime[2]);
 909     subtract_u64(&out[3], &carry, carry);
 910
 911     subtract_u64(&out[3], &carry, result & kPrime[3]);
 912 }
 913
 914 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 915 {
 916     longfelem longtmp;
 917     felem tmp;
 918
 919     smallfelem_square(longtmp, in);
 920     felem_reduce(tmp, longtmp);
 921     felem_contract(out, tmp);
 922 }
 923
 924 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 925                                     const smallfelem in2)
 926 {
 927     longfelem longtmp;
 928     felem tmp;
 929
 930     smallfelem_mul(longtmp, in1, in2);
 931     felem_reduce(tmp, longtmp);
 932     felem_contract(out, tmp);
 933 }
 934
 935 /*-
 936  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 937  * otherwise.
 938  * On entry:
 939  *   small[i] < 2^64
 940  */
 941 static limb smallfelem_is_zero(const smallfelem small)
 942 {
 943     limb result;
 944     u64 is_p;
 945
 946     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 947     is_zero--;
 948     is_zero &= is_zero << 32;
 949     is_zero &= is_zero << 16;
 950     is_zero &= is_zero << 8;
 951     is_zero &= is_zero << 4;
 952     is_zero &= is_zero << 2;
 953     is_zero &= is_zero << 1;
 954     is_zero = ((s64) is_zero) >> 63;
 955
 956     is_p = (small[0] ^ kPrime[0]) |
 957         (small[1] ^ kPrime[1]) |
 958         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 959     is_p--;
 960     is_p &= is_p << 32;
 961     is_p &= is_p << 16;
 962     is_p &= is_p << 8;
 963     is_p &= is_p << 4;
 964     is_p &= is_p << 2;
 965     is_p &= is_p << 1;
 966     is_p = ((s64) is_p) >> 63;
 967
 968     is_zero |= is_p;
 969
 970     result = is_zero;
 971     result |= ((limb) is_zero) << 64;
 972     return result;
 973 }
 974
 975 static int smallfelem_is_zero_int(const smallfelem small)
 976 {
 977     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 978 }
 979
 980 /*-
 981  * felem_inv calculates |out| = |in|^{-1}
 982  *
 983  * Based on Fermat's Little Theorem:
 984  *   a^p = a (mod p)
 985  *   a^{p-1} = 1 (mod p)
 986  *   a^{p-2} = a^{-1} (mod p)
 987  */
 988 static void felem_inv(felem out, const felem in)
 989 {
 990     felem ftmp, ftmp2;
 991     /* each e_I will hold |in|^{2^I - 1} */
 992     felem e2, e4, e8, e16, e32, e64;
 993     longfelem tmp;
 994     unsigned i;
 995
 996     felem_square(tmp, in);
 997     felem_reduce(ftmp, tmp);    /* 2^1 */
 998     felem_mul(tmp, in, ftmp);
 999     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1000     felem_assign(e2, ftmp);
1001     felem_square(tmp, ftmp);
1002     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1003     felem_square(tmp, ftmp);
1004     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1005     felem_mul(tmp, ftmp, e2);
1006     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1007     felem_assign(e4, ftmp);
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1010     felem_square(tmp, ftmp);
1011     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1012     felem_square(tmp, ftmp);
1013     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1014     felem_square(tmp, ftmp);
1015     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1016     felem_mul(tmp, ftmp, e4);
1017     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1018     felem_assign(e8, ftmp);
1019     for (i = 0; i < 8; i++) {
1020         felem_square(tmp, ftmp);
1021         felem_reduce(ftmp, tmp);
1022     }                           /* 2^16 - 2^8 */
1023     felem_mul(tmp, ftmp, e8);
1024     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1025     felem_assign(e16, ftmp);
1026     for (i = 0; i < 16; i++) {
1027         felem_square(tmp, ftmp);
1028         felem_reduce(ftmp, tmp);
1029     }                           /* 2^32 - 2^16 */
1030     felem_mul(tmp, ftmp, e16);
1031     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1032     felem_assign(e32, ftmp);
1033     for (i = 0; i < 32; i++) {
1034         felem_square(tmp, ftmp);
1035         felem_reduce(ftmp, tmp);
1036     }                           /* 2^64 - 2^32 */
1037     felem_assign(e64, ftmp);
1038     felem_mul(tmp, ftmp, in);
1039     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1040     for (i = 0; i < 192; i++) {
1041         felem_square(tmp, ftmp);
1042         felem_reduce(ftmp, tmp);
1043     }                           /* 2^256 - 2^224 + 2^192 */
1044
1045     felem_mul(tmp, e64, e32);
1046     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1047     for (i = 0; i < 16; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^80 - 2^16 */
1051     felem_mul(tmp, ftmp2, e16);
1052     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1053     for (i = 0; i < 8; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^88 - 2^8 */
1057     felem_mul(tmp, ftmp2, e8);
1058     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1059     for (i = 0; i < 4; i++) {
1060         felem_square(tmp, ftmp2);
1061         felem_reduce(ftmp2, tmp);
1062     }                           /* 2^92 - 2^4 */
1063     felem_mul(tmp, ftmp2, e4);
1064     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1069     felem_mul(tmp, ftmp2, e2);
1070     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1071     felem_square(tmp, ftmp2);
1072     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1073     felem_square(tmp, ftmp2);
1074     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1075     felem_mul(tmp, ftmp2, in);
1076     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1077
1078     felem_mul(tmp, ftmp2, ftmp);
1079     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1080 }
1081
1082 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1083 {
1084     felem tmp;
1085
1086     smallfelem_expand(tmp, in);
1087     felem_inv(tmp, tmp);
1088     felem_contract(out, tmp);
1089 }
1090
1091 /*-
1092  * Group operations
1093  * ----------------
1094  *
1095  * Building on top of the field operations we have the operations on the
1096  * elliptic curve group itself. Points on the curve are represented in Jacobian
1097  * coordinates
1098  */
1099
1100 /*-
1101  * point_double calculates 2*(x_in, y_in, z_in)
1102  *
1103  * The method is taken from:
1104  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1105  *
1106  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1107  * while x_out == y_in is not (maybe this works, but it's not tested).
1108  */
1109 static void
1110 point_double(felem x_out, felem y_out, felem z_out,
1111              const felem x_in, const felem y_in, const felem z_in)
1112 {
1113     longfelem tmp, tmp2;
1114     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1115     smallfelem small1, small2;
1116
1117     felem_assign(ftmp, x_in);
1118     /* ftmp[i] < 2^106 */
1119     felem_assign(ftmp2, x_in);
1120     /* ftmp2[i] < 2^106 */
1121
1122     /* delta = z^2 */
1123     felem_square(tmp, z_in);
1124     felem_reduce(delta, tmp);
1125     /* delta[i] < 2^101 */
1126
1127     /* gamma = y^2 */
1128     felem_square(tmp, y_in);
1129     felem_reduce(gamma, tmp);
1130     /* gamma[i] < 2^101 */
1131     felem_shrink(small1, gamma);
1132
1133     /* beta = x*gamma */
1134     felem_small_mul(tmp, small1, x_in);
1135     felem_reduce(beta, tmp);
1136     /* beta[i] < 2^101 */
1137
1138     /* alpha = 3*(x-delta)*(x+delta) */
1139     felem_diff(ftmp, delta);
1140     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1141     felem_sum(ftmp2, delta);
1142     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1143     felem_scalar(ftmp2, 3);
1144     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1145     felem_mul(tmp, ftmp, ftmp2);
1146     felem_reduce(alpha, tmp);
1147     /* alpha[i] < 2^101 */
1148     felem_shrink(small2, alpha);
1149
1150     /* x' = alpha^2 - 8*beta */
1151     smallfelem_square(tmp, small2);
1152     felem_reduce(x_out, tmp);
1153     felem_assign(ftmp, beta);
1154     felem_scalar(ftmp, 8);
1155     /* ftmp[i] < 8 * 2^101 = 2^104 */
1156     felem_diff(x_out, ftmp);
1157     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1158
1159     /* z' = (y + z)^2 - gamma - delta */
1160     felem_sum(delta, gamma);
1161     /* delta[i] < 2^101 + 2^101 = 2^102 */
1162     felem_assign(ftmp, y_in);
1163     felem_sum(ftmp, z_in);
1164     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1165     felem_square(tmp, ftmp);
1166     felem_reduce(z_out, tmp);
1167     felem_diff(z_out, delta);
1168     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1169
1170     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1171     felem_scalar(beta, 4);
1172     /* beta[i] < 4 * 2^101 = 2^103 */
1173     felem_diff_zero107(beta, x_out);
1174     /* beta[i] < 2^107 + 2^103 < 2^108 */
1175     felem_small_mul(tmp, small2, beta);
1176     /* tmp[i] < 7 * 2^64 < 2^67 */
1177     smallfelem_square(tmp2, small1);
1178     /* tmp2[i] < 7 * 2^64 */
1179     longfelem_scalar(tmp2, 8);
1180     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1181     longfelem_diff(tmp, tmp2);
1182     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1183     felem_reduce_zero105(y_out, tmp);
1184     /* y_out[i] < 2^106 */
1185 }
1186
1187 /*
1188  * point_double_small is the same as point_double, except that it operates on
1189  * smallfelems
1190  */
1191 static void
1192 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1193                    const smallfelem x_in, const smallfelem y_in,
1194                    const smallfelem z_in)
1195 {
1196     felem felem_x_out, felem_y_out, felem_z_out;
1197     felem felem_x_in, felem_y_in, felem_z_in;
1198
1199     smallfelem_expand(felem_x_in, x_in);
1200     smallfelem_expand(felem_y_in, y_in);
1201     smallfelem_expand(felem_z_in, z_in);
1202     point_double(felem_x_out, felem_y_out, felem_z_out,
1203                  felem_x_in, felem_y_in, felem_z_in);
1204     felem_shrink(x_out, felem_x_out);
1205     felem_shrink(y_out, felem_y_out);
1206     felem_shrink(z_out, felem_z_out);
1207 }
1208
1209 /* copy_conditional copies in to out iff mask is all ones. */
1210 static void copy_conditional(felem out, const felem in, limb mask)
1211 {
1212     unsigned i;
1213     for (i = 0; i < NLIMBS; ++i) {
1214         const limb tmp = mask & (in[i] ^ out[i]);
1215         out[i] ^= tmp;
1216     }
1217 }
1218
1219 /* copy_small_conditional copies in to out iff mask is all ones. */
1220 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1221 {
1222     unsigned i;
1223     const u64 mask64 = mask;
1224     for (i = 0; i < NLIMBS; ++i) {
1225         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1226     }
1227 }
1228
1229 /*-
1230  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1231  *
1232  * The method is taken from:
1233  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1234  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1235  *
1236  * This function includes a branch for checking whether the two input points
1237  * are equal, (while not equal to the point at infinity). This case never
1238  * happens during single point multiplication, so there is no timing leak for
1239  * ECDH or ECDSA signing.
1240  */
1241 static void point_add(felem x3, felem y3, felem z3,
1242                       const felem x1, const felem y1, const felem z1,
1243                       const int mixed, const smallfelem x2,
1244                       const smallfelem y2, const smallfelem z2)
1245 {
1246     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1247     longfelem tmp, tmp2;
1248     smallfelem small1, small2, small3, small4, small5;
1249     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1250
1251     felem_shrink(small3, z1);
1252
1253     z1_is_zero = smallfelem_is_zero(small3);
1254     z2_is_zero = smallfelem_is_zero(z2);
1255
1256     /* ftmp = z1z1 = z1**2 */
1257     smallfelem_square(tmp, small3);
1258     felem_reduce(ftmp, tmp);
1259     /* ftmp[i] < 2^101 */
1260     felem_shrink(small1, ftmp);
1261
1262     if (!mixed) {
1263         /* ftmp2 = z2z2 = z2**2 */
1264         smallfelem_square(tmp, z2);
1265         felem_reduce(ftmp2, tmp);
1266         /* ftmp2[i] < 2^101 */
1267         felem_shrink(small2, ftmp2);
1268
1269         felem_shrink(small5, x1);
1270
1271         /* u1 = ftmp3 = x1*z2z2 */
1272         smallfelem_mul(tmp, small5, small2);
1273         felem_reduce(ftmp3, tmp);
1274         /* ftmp3[i] < 2^101 */
1275
1276         /* ftmp5 = z1 + z2 */
1277         felem_assign(ftmp5, z1);
1278         felem_small_sum(ftmp5, z2);
1279         /* ftmp5[i] < 2^107 */
1280
1281         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1282         felem_square(tmp, ftmp5);
1283         felem_reduce(ftmp5, tmp);
1284         /* ftmp2 = z2z2 + z1z1 */
1285         felem_sum(ftmp2, ftmp);
1286         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1287         felem_diff(ftmp5, ftmp2);
1288         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1289
1290         /* ftmp2 = z2 * z2z2 */
1291         smallfelem_mul(tmp, small2, z2);
1292         felem_reduce(ftmp2, tmp);
1293
1294         /* s1 = ftmp2 = y1 * z2**3 */
1295         felem_mul(tmp, y1, ftmp2);
1296         felem_reduce(ftmp6, tmp);
1297         /* ftmp6[i] < 2^101 */
1298     } else {
1299         /*
1300          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1301          */
1302
1303         /* u1 = ftmp3 = x1*z2z2 */
1304         felem_assign(ftmp3, x1);
1305         /* ftmp3[i] < 2^106 */
1306
1307         /* ftmp5 = 2z1z2 */
1308         felem_assign(ftmp5, z1);
1309         felem_scalar(ftmp5, 2);
1310         /* ftmp5[i] < 2*2^106 = 2^107 */
1311
1312         /* s1 = ftmp2 = y1 * z2**3 */
1313         felem_assign(ftmp6, y1);
1314         /* ftmp6[i] < 2^106 */
1315     }
1316
1317     /* u2 = x2*z1z1 */
1318     smallfelem_mul(tmp, x2, small1);
1319     felem_reduce(ftmp4, tmp);
1320
1321     /* h = ftmp4 = u2 - u1 */
1322     felem_diff_zero107(ftmp4, ftmp3);
1323     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1324     felem_shrink(small4, ftmp4);
1325
1326     x_equal = smallfelem_is_zero(small4);
1327
1328     /* z_out = ftmp5 * h */
1329     felem_small_mul(tmp, small4, ftmp5);
1330     felem_reduce(z_out, tmp);
1331     /* z_out[i] < 2^101 */
1332
1333     /* ftmp = z1 * z1z1 */
1334     smallfelem_mul(tmp, small1, small3);
1335     felem_reduce(ftmp, tmp);
1336
1337     /* s2 = tmp = y2 * z1**3 */
1338     felem_small_mul(tmp, y2, ftmp);
1339     felem_reduce(ftmp5, tmp);
1340
1341     /* r = ftmp5 = (s2 - s1)*2 */
1342     felem_diff_zero107(ftmp5, ftmp6);
1343     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1344     felem_scalar(ftmp5, 2);
1345     /* ftmp5[i] < 2^109 */
1346     felem_shrink(small1, ftmp5);
1347     y_equal = smallfelem_is_zero(small1);
1348
1349     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1350         point_double(x3, y3, z3, x1, y1, z1);
1351         return;
1352     }
1353
1354     /* I = ftmp = (2h)**2 */
1355     felem_assign(ftmp, ftmp4);
1356     felem_scalar(ftmp, 2);
1357     /* ftmp[i] < 2*2^108 = 2^109 */
1358     felem_square(tmp, ftmp);
1359     felem_reduce(ftmp, tmp);
1360
1361     /* J = ftmp2 = h * I */
1362     felem_mul(tmp, ftmp4, ftmp);
1363     felem_reduce(ftmp2, tmp);
1364
1365     /* V = ftmp4 = U1 * I */
1366     felem_mul(tmp, ftmp3, ftmp);
1367     felem_reduce(ftmp4, tmp);
1368
1369     /* x_out = r**2 - J - 2V */
1370     smallfelem_square(tmp, small1);
1371     felem_reduce(x_out, tmp);
1372     felem_assign(ftmp3, ftmp4);
1373     felem_scalar(ftmp4, 2);
1374     felem_sum(ftmp4, ftmp2);
1375     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1376     felem_diff(x_out, ftmp4);
1377     /* x_out[i] < 2^105 + 2^101 */
1378
1379     /* y_out = r(V-x_out) - 2 * s1 * J */
1380     felem_diff_zero107(ftmp3, x_out);
1381     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1382     felem_small_mul(tmp, small1, ftmp3);
1383     felem_mul(tmp2, ftmp6, ftmp2);
1384     longfelem_scalar(tmp2, 2);
1385     /* tmp2[i] < 2*2^67 = 2^68 */
1386     longfelem_diff(tmp, tmp2);
1387     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1388     felem_reduce_zero105(y_out, tmp);
1389     /* y_out[i] < 2^106 */
1390
1391     copy_small_conditional(x_out, x2, z1_is_zero);
1392     copy_conditional(x_out, x1, z2_is_zero);
1393     copy_small_conditional(y_out, y2, z1_is_zero);
1394     copy_conditional(y_out, y1, z2_is_zero);
1395     copy_small_conditional(z_out, z2, z1_is_zero);
1396     copy_conditional(z_out, z1, z2_is_zero);
1397     felem_assign(x3, x_out);
1398     felem_assign(y3, y_out);
1399     felem_assign(z3, z_out);
1400 }
1401
1402 /*
1403  * point_add_small is the same as point_add, except that it operates on
1404  * smallfelems
1405  */
1406 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1407                             smallfelem x1, smallfelem y1, smallfelem z1,
1408                             smallfelem x2, smallfelem y2, smallfelem z2)
1409 {
1410     felem felem_x3, felem_y3, felem_z3;
1411     felem felem_x1, felem_y1, felem_z1;
1412     smallfelem_expand(felem_x1, x1);
1413     smallfelem_expand(felem_y1, y1);
1414     smallfelem_expand(felem_z1, z1);
1415     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1416               x2, y2, z2);
1417     felem_shrink(x3, felem_x3);
1418     felem_shrink(y3, felem_y3);
1419     felem_shrink(z3, felem_z3);
1420 }
1421
1422 /*-
1423  * Base point pre computation
1424  * --------------------------
1425  *
1426  * Two different sorts of precomputed tables are used in the following code.
1427  * Each contain various points on the curve, where each point is three field
1428  * elements (x, y, z).
1429  *
1430  * For the base point table, z is usually 1 (0 for the point at infinity).
1431  * This table has 2 * 16 elements, starting with the following:
1432  * index | bits    | point
1433  * ------+---------+------------------------------
1434  *     0 | 0 0 0 0 | 0G
1435  *     1 | 0 0 0 1 | 1G
1436  *     2 | 0 0 1 0 | 2^64G
1437  *     3 | 0 0 1 1 | (2^64 + 1)G
1438  *     4 | 0 1 0 0 | 2^128G
1439  *     5 | 0 1 0 1 | (2^128 + 1)G
1440  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1441  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1442  *     8 | 1 0 0 0 | 2^192G
1443  *     9 | 1 0 0 1 | (2^192 + 1)G
1444  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1445  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1446  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1447  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1448  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1449  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1450  * followed by a copy of this with each element multiplied by 2^32.
1451  *
1452  * The reason for this is so that we can clock bits into four different
1453  * locations when doing simple scalar multiplies against the base point,
1454  * and then another four locations using the second 16 elements.
1455  *
1456  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1457
1458 /* gmul is the table of precomputed base points */
1459 static const smallfelem gmul[2][16][3] = {
1460     {{{0, 0, 0, 0},
1461       {0, 0, 0, 0},
1462       {0, 0, 0, 0}},
1463      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1464        0x6b17d1f2e12c4247},
1465       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1466        0x4fe342e2fe1a7f9b},
1467       {1, 0, 0, 0}},
1468      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1469        0x0fa822bc2811aaa5},
1470       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1471        0xbff44ae8f5dba80d},
1472       {1, 0, 0, 0}},
1473      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1474        0x300a4bbc89d6726f},
1475       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1476        0x72aac7e0d09b4644},
1477       {1, 0, 0, 0}},
1478      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1479        0x447d739beedb5e67},
1480       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1481        0x2d4825ab834131ee},
1482       {1, 0, 0, 0}},
1483      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1484        0xef9519328a9c72ff},
1485       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1486        0x611e9fc37dbb2c9b},
1487       {1, 0, 0, 0}},
1488      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1489        0x550663797b51f5d8},
1490       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1491        0x157164848aecb851},
1492       {1, 0, 0, 0}},
1493      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1494        0xeb5d7745b21141ea},
1495       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1496        0xeafd72ebdbecc17b},
1497       {1, 0, 0, 0}},
1498      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1499        0xa6d39677a7849276},
1500       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1501        0x674f84749b0b8816},
1502       {1, 0, 0, 0}},
1503      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1504        0x4e769e7672c9ddad},
1505       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1506        0x42b99082de830663},
1507       {1, 0, 0, 0}},
1508      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1509        0x78878ef61c6ce04d},
1510       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1511        0xb6cb3f5d7b72c321},
1512       {1, 0, 0, 0}},
1513      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1514        0x0c88bc4d716b1287},
1515       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1516        0xdd5ddea3f3901dc6},
1517       {1, 0, 0, 0}},
1518      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1519        0x68f344af6b317466},
1520       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1521        0x31b9c405f8540a20},
1522       {1, 0, 0, 0}},
1523      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1524        0x4052bf4b6f461db9},
1525       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1526        0xfecf4d5190b0fc61},
1527       {1, 0, 0, 0}},
1528      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1529        0x1eddbae2c802e41a},
1530       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1531        0x43104d86560ebcfc},
1532       {1, 0, 0, 0}},
1533      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1534        0xb48e26b484f7a21c},
1535       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1536        0xfac015404d4d3dab},
1537       {1, 0, 0, 0}}},
1538     {{{0, 0, 0, 0},
1539       {0, 0, 0, 0},
1540       {0, 0, 0, 0}},
1541      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1542        0x7fe36b40af22af89},
1543       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1544        0xe697d45825b63624},
1545       {1, 0, 0, 0}},
1546      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1547        0x4a5b506612a677a6},
1548       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1549        0xeb13461ceac089f1},
1550       {1, 0, 0, 0}},
1551      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1552        0x0781b8291c6a220a},
1553       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1554        0x690cde8df0151593},
1555       {1, 0, 0, 0}},
1556      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1557        0x8a535f566ec73617},
1558       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1559        0x0455c08468b08bd7},
1560       {1, 0, 0, 0}},
1561      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1562        0x06bada7ab77f8276},
1563       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1564        0x5b476dfd0e6cb18a},
1565       {1, 0, 0, 0}},
1566      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1567        0x3e29864e8a2ec908},
1568       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1569        0x239b90ea3dc31e7e},
1570       {1, 0, 0, 0}},
1571      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1572        0x820f4dd949f72ff7},
1573       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1574        0x140406ec783a05ec},
1575       {1, 0, 0, 0}},
1576      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1577        0x68f6b8542783dfee},
1578       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1579        0xcbe1feba92e40ce6},
1580       {1, 0, 0, 0}},
1581      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1582        0xd0b2f94d2f420109},
1583       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1584        0x971459828b0719e5},
1585       {1, 0, 0, 0}},
1586      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1587        0x961610004a866aba},
1588       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1589        0x7acb9fadcee75e44},
1590       {1, 0, 0, 0}},
1591      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1592        0x24eb9acca333bf5b},
1593       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1594        0x69f891c5acd079cc},
1595       {1, 0, 0, 0}},
1596      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1597        0xe51f547c5972a107},
1598       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1599        0x1c309a2b25bb1387},
1600       {1, 0, 0, 0}},
1601      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1602        0x20b87b8aa2c4e503},
1603       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1604        0xf5c6fa49919776be},
1605       {1, 0, 0, 0}},
1606      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1607        0x1ed7d1b9332010b9},
1608       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1609        0x3a2b03f03217257a},
1610       {1, 0, 0, 0}},
1611      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1612        0x15fee545c78dd9f6},
1613       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1614        0x4ab5b6b2b8753f81},
1615       {1, 0, 0, 0}}}
1616 };
1617
1618 /*
1619  * select_point selects the |idx|th point from a precomputation table and
1620  * copies it to out.
1621  */
1622 static void select_point(const u64 idx, unsigned int size,
1623                          const smallfelem pre_comp[16][3], smallfelem out[3])
1624 {
1625     unsigned i, j;
1626     u64 *outlimbs = &out[0][0];
1627
1628     memset(out, 0, sizeof(*out) * 3);
1629
1630     for (i = 0; i < size; i++) {
1631         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1632         u64 mask = i ^ idx;
1633         mask |= mask >> 4;
1634         mask |= mask >> 2;
1635         mask |= mask >> 1;
1636         mask &= 1;
1637         mask--;
1638         for (j = 0; j < NLIMBS * 3; j++)
1639             outlimbs[j] |= inlimbs[j] & mask;
1640     }
1641 }
1642
1643 /* get_bit returns the |i|th bit in |in| */
1644 static char get_bit(const felem_bytearray in, int i)
1645 {
1646     if ((i < 0) || (i >= 256))
1647         return 0;
1648     return (in[i >> 3] >> (i & 7)) & 1;
1649 }
1650
1651 /*
1652  * Interleaved point multiplication using precomputed point multiples: The
1653  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1654  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1655  * generator, using certain (large) precomputed multiples in g_pre_comp.
1656  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1657  */
1658 static void batch_mul(felem x_out, felem y_out, felem z_out,
1659                       const felem_bytearray scalars[],
1660                       const unsigned num_points, const u8 *g_scalar,
1661                       const int mixed, const smallfelem pre_comp[][17][3],
1662                       const smallfelem g_pre_comp[2][16][3])
1663 {
1664     int i, skip;
1665     unsigned num, gen_mul = (g_scalar != NULL);
1666     felem nq[3], ftmp;
1667     smallfelem tmp[3];
1668     u64 bits;
1669     u8 sign, digit;
1670
1671     /* set nq to the point at infinity */
1672     memset(nq, 0, sizeof(nq));
1673
1674     /*
1675      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1676      * of the generator (two in each of the last 32 rounds) and additions of
1677      * other points multiples (every 5th round).
1678      */
1679     skip = 1;                   /* save two point operations in the first
1680                                  * round */
1681     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1682         /* double */
1683         if (!skip)
1684             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1685
1686         /* add multiples of the generator */
1687         if (gen_mul && (i <= 31)) {
1688             /* first, look 32 bits upwards */
1689             bits = get_bit(g_scalar, i + 224) << 3;
1690             bits |= get_bit(g_scalar, i + 160) << 2;
1691             bits |= get_bit(g_scalar, i + 96) << 1;
1692             bits |= get_bit(g_scalar, i + 32);
1693             /* select the point to add, in constant time */
1694             select_point(bits, 16, g_pre_comp[1], tmp);
1695
1696             if (!skip) {
1697                 /* Arg 1 below is for "mixed" */
1698                 point_add(nq[0], nq[1], nq[2],
1699                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1700             } else {
1701                 smallfelem_expand(nq[0], tmp[0]);
1702                 smallfelem_expand(nq[1], tmp[1]);
1703                 smallfelem_expand(nq[2], tmp[2]);
1704                 skip = 0;
1705             }
1706
1707             /* second, look at the current position */
1708             bits = get_bit(g_scalar, i + 192) << 3;
1709             bits |= get_bit(g_scalar, i + 128) << 2;
1710             bits |= get_bit(g_scalar, i + 64) << 1;
1711             bits |= get_bit(g_scalar, i);
1712             /* select the point to add, in constant time */
1713             select_point(bits, 16, g_pre_comp[0], tmp);
1714             /* Arg 1 below is for "mixed" */
1715             point_add(nq[0], nq[1], nq[2],
1716                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1717         }
1718
1719         /* do other additions every 5 doublings */
1720         if (num_points && (i % 5 == 0)) {
1721             /* loop over all scalars */
1722             for (num = 0; num < num_points; ++num) {
1723                 bits = get_bit(scalars[num], i + 4) << 5;
1724                 bits |= get_bit(scalars[num], i + 3) << 4;
1725                 bits |= get_bit(scalars[num], i + 2) << 3;
1726                 bits |= get_bit(scalars[num], i + 1) << 2;
1727                 bits |= get_bit(scalars[num], i) << 1;
1728                 bits |= get_bit(scalars[num], i - 1);
1729                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1730
1731                 /*
1732                  * select the point to add or subtract, in constant time
1733                  */
1734                 select_point(digit, 17, pre_comp[num], tmp);
1735                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1736                                                * point */
1737                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1738                 felem_contract(tmp[1], ftmp);
1739
1740                 if (!skip) {
1741                     point_add(nq[0], nq[1], nq[2],
1742                               nq[0], nq[1], nq[2],
1743                               mixed, tmp[0], tmp[1], tmp[2]);
1744                 } else {
1745                     smallfelem_expand(nq[0], tmp[0]);
1746                     smallfelem_expand(nq[1], tmp[1]);
1747                     smallfelem_expand(nq[2], tmp[2]);
1748                     skip = 0;
1749                 }
1750             }
1751         }
1752     }
1753     felem_assign(x_out, nq[0]);
1754     felem_assign(y_out, nq[1]);
1755     felem_assign(z_out, nq[2]);
1756 }
1757
1758 /* Precomputation for the group generator. */
1759 typedef struct {
1760     smallfelem g_pre_comp[2][16][3];
1761     int references;
1762 } NISTP256_PRE_COMP;
1763
1764 const EC_METHOD *EC_GFp_nistp256_method(void)
1765 {
1766     static const EC_METHOD ret = {
1767         EC_FLAGS_DEFAULT_OCT,
1768         NID_X9_62_prime_field,
1769         ec_GFp_nistp256_group_init,
1770         ec_GFp_simple_group_finish,
1771         ec_GFp_simple_group_clear_finish,
1772         ec_GFp_nist_group_copy,
1773         ec_GFp_nistp256_group_set_curve,
1774         ec_GFp_simple_group_get_curve,
1775         ec_GFp_simple_group_get_degree,
1776         ec_GFp_simple_group_check_discriminant,
1777         ec_GFp_simple_point_init,
1778         ec_GFp_simple_point_finish,
1779         ec_GFp_simple_point_clear_finish,
1780         ec_GFp_simple_point_copy,
1781         ec_GFp_simple_point_set_to_infinity,
1782         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1783         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1784         ec_GFp_simple_point_set_affine_coordinates,
1785         ec_GFp_nistp256_point_get_affine_coordinates,
1786         0 /* point_set_compressed_coordinates */ ,
1787         0 /* point2oct */ ,
1788         0 /* oct2point */ ,
1789         ec_GFp_simple_add,
1790         ec_GFp_simple_dbl,
1791         ec_GFp_simple_invert,
1792         ec_GFp_simple_is_at_infinity,
1793         ec_GFp_simple_is_on_curve,
1794         ec_GFp_simple_cmp,
1795         ec_GFp_simple_make_affine,
1796         ec_GFp_simple_points_make_affine,
1797         ec_GFp_nistp256_points_mul,
1798         ec_GFp_nistp256_precompute_mult,
1799         ec_GFp_nistp256_have_precompute_mult,
1800         ec_GFp_nist_field_mul,
1801         ec_GFp_nist_field_sqr,
1802         0 /* field_div */ ,
1803         0 /* field_encode */ ,
1804         0 /* field_decode */ ,
1805         0                       /* field_set_to_one */
1806     };
1807
1808     return &ret;
1809 }
1810
1811 /******************************************************************************/
1812 /*
1813  * FUNCTIONS TO MANAGE PRECOMPUTATION
1814  */
1815
1816 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1817 {
1818     NISTP256_PRE_COMP *ret = NULL;
1819     ret = OPENSSL_malloc(sizeof(*ret));
1820     if (!ret) {
1821         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1822         return ret;
1823     }
1824     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1825     ret->references = 1;
1826     return ret;
1827 }
1828
1829 static void *nistp256_pre_comp_dup(void *src_)
1830 {
1831     NISTP256_PRE_COMP *src = src_;
1832
1833     /* no need to actually copy, these objects never change! */
1834     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1835
1836     return src_;
1837 }
1838
1839 static void nistp256_pre_comp_free(void *pre_)
1840 {
1841     int i;
1842     NISTP256_PRE_COMP *pre = pre_;
1843
1844     if (!pre)
1845         return;
1846
1847     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1848     if (i > 0)
1849         return;
1850
1851     OPENSSL_free(pre);
1852 }
1853
1854 static void nistp256_pre_comp_clear_free(void *pre_)
1855 {
1856     int i;
1857     NISTP256_PRE_COMP *pre = pre_;
1858
1859     if (!pre)
1860         return;
1861
1862     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1863     if (i > 0)
1864         return;
1865
1866     OPENSSL_clear_free(pre, sizeof(*pre));
1867 }
1868
1869 /******************************************************************************/
1870 /*
1871  * OPENSSL EC_METHOD FUNCTIONS
1872  */
1873
1874 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1875 {
1876     int ret;
1877     ret = ec_GFp_simple_group_init(group);
1878     group->a_is_minus3 = 1;
1879     return ret;
1880 }
1881
1882 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1883                                     const BIGNUM *a, const BIGNUM *b,
1884                                     BN_CTX *ctx)
1885 {
1886     int ret = 0;
1887     BN_CTX *new_ctx = NULL;
1888     BIGNUM *curve_p, *curve_a, *curve_b;
1889
1890     if (ctx == NULL)
1891         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1892             return 0;
1893     BN_CTX_start(ctx);
1894     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1895         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1896         ((curve_b = BN_CTX_get(ctx)) == NULL))
1897         goto err;
1898     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1899     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1900     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1901     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1902         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1903               EC_R_WRONG_CURVE_PARAMETERS);
1904         goto err;
1905     }
1906     group->field_mod_func = BN_nist_mod_256;
1907     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1908  err:
1909     BN_CTX_end(ctx);
1910     BN_CTX_free(new_ctx);
1911     return ret;
1912 }
1913
1914 /*
1915  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1916  * (X/Z^2, Y/Z^3)
1917  */
1918 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1919                                                  const EC_POINT *point,
1920                                                  BIGNUM *x, BIGNUM *y,
1921                                                  BN_CTX *ctx)
1922 {
1923     felem z1, z2, x_in, y_in;
1924     smallfelem x_out, y_out;
1925     longfelem tmp;
1926
1927     if (EC_POINT_is_at_infinity(group, point)) {
1928         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1929               EC_R_POINT_AT_INFINITY);
1930         return 0;
1931     }
1932     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1933         (!BN_to_felem(z1, point->Z)))
1934         return 0;
1935     felem_inv(z2, z1);
1936     felem_square(tmp, z2);
1937     felem_reduce(z1, tmp);
1938     felem_mul(tmp, x_in, z1);
1939     felem_reduce(x_in, tmp);
1940     felem_contract(x_out, x_in);
1941     if (x != NULL) {
1942         if (!smallfelem_to_BN(x, x_out)) {
1943             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1944                   ERR_R_BN_LIB);
1945             return 0;
1946         }
1947     }
1948     felem_mul(tmp, z1, z2);
1949     felem_reduce(z1, tmp);
1950     felem_mul(tmp, y_in, z1);
1951     felem_reduce(y_in, tmp);
1952     felem_contract(y_out, y_in);
1953     if (y != NULL) {
1954         if (!smallfelem_to_BN(y, y_out)) {
1955             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1956                   ERR_R_BN_LIB);
1957             return 0;
1958         }
1959     }
1960     return 1;
1961 }
1962
1963 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1964 static void make_points_affine(size_t num, smallfelem points[][3],
1965                                smallfelem tmp_smallfelems[])
1966 {
1967     /*
1968      * Runs in constant time, unless an input is the point at infinity (which
1969      * normally shouldn't happen).
1970      */
1971     ec_GFp_nistp_points_make_affine_internal(num,
1972                                              points,
1973                                              sizeof(smallfelem),
1974                                              tmp_smallfelems,
1975                                              (void (*)(void *))smallfelem_one,
1976                                              (int (*)(const void *))
1977                                              smallfelem_is_zero_int,
1978                                              (void (*)(void *, const void *))
1979                                              smallfelem_assign,
1980                                              (void (*)(void *, const void *))
1981                                              smallfelem_square_contract,
1982                                              (void (*)
1983                                               (void *, const void *,
1984                                                const void *))
1985                                              smallfelem_mul_contract,
1986                                              (void (*)(void *, const void *))
1987                                              smallfelem_inv_contract,
1988                                              /* nothing to contract */
1989                                              (void (*)(void *, const void *))
1990                                              smallfelem_assign);
1991 }
1992
1993 /*
1994  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1995  * values Result is stored in r (r can equal one of the inputs).
1996  */
1997 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1998                                const BIGNUM *scalar, size_t num,
1999                                const EC_POINT *points[],
2000                                const BIGNUM *scalars[], BN_CTX *ctx)
2001 {
2002     int ret = 0;
2003     int j;
2004     int mixed = 0;
2005     BN_CTX *new_ctx = NULL;
2006     BIGNUM *x, *y, *z, *tmp_scalar;
2007     felem_bytearray g_secret;
2008     felem_bytearray *secrets = NULL;
2009     smallfelem (*pre_comp)[17][3] = NULL;
2010     smallfelem *tmp_smallfelems = NULL;
2011     felem_bytearray tmp;
2012     unsigned i, num_bytes;
2013     int have_pre_comp = 0;
2014     size_t num_points = num;
2015     smallfelem x_in, y_in, z_in;
2016     felem x_out, y_out, z_out;
2017     NISTP256_PRE_COMP *pre = NULL;
2018     const smallfelem(*g_pre_comp)[16][3] = NULL;
2019     EC_POINT *generator = NULL;
2020     const EC_POINT *p = NULL;
2021     const BIGNUM *p_scalar = NULL;
2022
2023     if (ctx == NULL)
2024         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2025             return 0;
2026     BN_CTX_start(ctx);
2027     if (((x = BN_CTX_get(ctx)) == NULL) ||
2028         ((y = BN_CTX_get(ctx)) == NULL) ||
2029         ((z = BN_CTX_get(ctx)) == NULL) ||
2030         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2031         goto err;
2032
2033     if (scalar != NULL) {
2034         pre = EC_EX_DATA_get_data(group->extra_data,
2035                                   nistp256_pre_comp_dup,
2036                                   nistp256_pre_comp_free,
2037                                   nistp256_pre_comp_clear_free);
2038         if (pre)
2039             /* we have precomputation, try to use it */
2040             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2041         else
2042             /* try to use the standard precomputation */
2043             g_pre_comp = &gmul[0];
2044         generator = EC_POINT_new(group);
2045         if (generator == NULL)
2046             goto err;
2047         /* get the generator from precomputation */
2048         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2049             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2050             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2051             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2052             goto err;
2053         }
2054         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2055                                                       generator, x, y, z,
2056                                                       ctx))
2057             goto err;
2058         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2059             /* precomputation matches generator */
2060             have_pre_comp = 1;
2061         else
2062             /*
2063              * we don't have valid precomputation: treat the generator as a
2064              * random point
2065              */
2066             num_points++;
2067     }
2068     if (num_points > 0) {
2069         if (num_points >= 3) {
2070             /*
2071              * unless we precompute multiples for just one or two points,
2072              * converting those into affine form is time well spent
2073              */
2074             mixed = 1;
2075         }
2076         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2077         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2078         if (mixed)
2079             tmp_smallfelems =
2080               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2081         if ((secrets == NULL) || (pre_comp == NULL)
2082             || (mixed && (tmp_smallfelems == NULL))) {
2083             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2084             goto err;
2085         }
2086
2087         /*
2088          * we treat NULL scalars as 0, and NULL points as points at infinity,
2089          * i.e., they contribute nothing to the linear combination
2090          */
2091         memset(secrets, 0, sizeof(*secrets) * num_points);
2092         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2093         for (i = 0; i < num_points; ++i) {
2094             if (i == num)
2095                 /*
2096                  * we didn't have a valid precomputation, so we pick the
2097                  * generator
2098                  */
2099             {
2100                 p = EC_GROUP_get0_generator(group);
2101                 p_scalar = scalar;
2102             } else
2103                 /* the i^th point */
2104             {
2105                 p = points[i];
2106                 p_scalar = scalars[i];
2107             }
2108             if ((p_scalar != NULL) && (p != NULL)) {
2109                 /* reduce scalar to 0 <= scalar < 2^256 */
2110                 if ((BN_num_bits(p_scalar) > 256)
2111                     || (BN_is_negative(p_scalar))) {
2112                     /*
2113                      * this is an unusual input, and we don't guarantee
2114                      * constant-timeness
2115                      */
2116                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2117                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2118                         goto err;
2119                     }
2120                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2121                 } else
2122                     num_bytes = BN_bn2bin(p_scalar, tmp);
2123                 flip_endian(secrets[i], tmp, num_bytes);
2124                 /* precompute multiples */
2125                 if ((!BN_to_felem(x_out, p->X)) ||
2126                     (!BN_to_felem(y_out, p->Y)) ||
2127                     (!BN_to_felem(z_out, p->Z)))
2128                     goto err;
2129                 felem_shrink(pre_comp[i][1][0], x_out);
2130                 felem_shrink(pre_comp[i][1][1], y_out);
2131                 felem_shrink(pre_comp[i][1][2], z_out);
2132                 for (j = 2; j <= 16; ++j) {
2133                     if (j & 1) {
2134                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2135                                         pre_comp[i][j][2], pre_comp[i][1][0],
2136                                         pre_comp[i][1][1], pre_comp[i][1][2],
2137                                         pre_comp[i][j - 1][0],
2138                                         pre_comp[i][j - 1][1],
2139                                         pre_comp[i][j - 1][2]);
2140                     } else {
2141                         point_double_small(pre_comp[i][j][0],
2142                                            pre_comp[i][j][1],
2143                                            pre_comp[i][j][2],
2144                                            pre_comp[i][j / 2][0],
2145                                            pre_comp[i][j / 2][1],
2146                                            pre_comp[i][j / 2][2]);
2147                     }
2148                 }
2149             }
2150         }
2151         if (mixed)
2152             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2153     }
2154
2155     /* the scalar for the generator */
2156     if ((scalar != NULL) && (have_pre_comp)) {
2157         memset(g_secret, 0, sizeof(g_secret));
2158         /* reduce scalar to 0 <= scalar < 2^256 */
2159         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2160             /*
2161              * this is an unusual input, and we don't guarantee
2162              * constant-timeness
2163              */
2164             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2165                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2166                 goto err;
2167             }
2168             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2169         } else
2170             num_bytes = BN_bn2bin(scalar, tmp);
2171         flip_endian(g_secret, tmp, num_bytes);
2172         /* do the multiplication with generator precomputation */
2173         batch_mul(x_out, y_out, z_out,
2174                   (const felem_bytearray(*))secrets, num_points,
2175                   g_secret,
2176                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2177     } else
2178         /* do the multiplication without generator precomputation */
2179         batch_mul(x_out, y_out, z_out,
2180                   (const felem_bytearray(*))secrets, num_points,
2181                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2182     /* reduce the output to its unique minimal representation */
2183     felem_contract(x_in, x_out);
2184     felem_contract(y_in, y_out);
2185     felem_contract(z_in, z_out);
2186     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2187         (!smallfelem_to_BN(z, z_in))) {
2188         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2189         goto err;
2190     }
2191     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2192
2193  err:
2194     BN_CTX_end(ctx);
2195     EC_POINT_free(generator);
2196     BN_CTX_free(new_ctx);
2197     OPENSSL_free(secrets);
2198     OPENSSL_free(pre_comp);
2199     OPENSSL_free(tmp_smallfelems);
2200     return ret;
2201 }
2202
2203 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2204 {
2205     int ret = 0;
2206     NISTP256_PRE_COMP *pre = NULL;
2207     int i, j;
2208     BN_CTX *new_ctx = NULL;
2209     BIGNUM *x, *y;
2210     EC_POINT *generator = NULL;
2211     smallfelem tmp_smallfelems[32];
2212     felem x_tmp, y_tmp, z_tmp;
2213
2214     /* throw away old precomputation */
2215     EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2216                          nistp256_pre_comp_free,
2217                          nistp256_pre_comp_clear_free);
2218     if (ctx == NULL)
2219         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2220             return 0;
2221     BN_CTX_start(ctx);
2222     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2223         goto err;
2224     /* get the generator */
2225     if (group->generator == NULL)
2226         goto err;
2227     generator = EC_POINT_new(group);
2228     if (generator == NULL)
2229         goto err;
2230     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2231     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2232     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2233         goto err;
2234     if ((pre = nistp256_pre_comp_new()) == NULL)
2235         goto err;
2236     /*
2237      * if the generator is the standard one, use built-in precomputation
2238      */
2239     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2240         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2241         ret = 1;
2242         goto err;
2243     }
2244     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2245         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2246         (!BN_to_felem(z_tmp, group->generator->Z)))
2247         goto err;
2248     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2249     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2250     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2251     /*
2252      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2253      * 2^160*G, 2^224*G for the second one
2254      */
2255     for (i = 1; i <= 8; i <<= 1) {
2256         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2257                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2258                            pre->g_pre_comp[0][i][1],
2259                            pre->g_pre_comp[0][i][2]);
2260         for (j = 0; j < 31; ++j) {
2261             point_double_small(pre->g_pre_comp[1][i][0],
2262                                pre->g_pre_comp[1][i][1],
2263                                pre->g_pre_comp[1][i][2],
2264                                pre->g_pre_comp[1][i][0],
2265                                pre->g_pre_comp[1][i][1],
2266                                pre->g_pre_comp[1][i][2]);
2267         }
2268         if (i == 8)
2269             break;
2270         point_double_small(pre->g_pre_comp[0][2 * i][0],
2271                            pre->g_pre_comp[0][2 * i][1],
2272                            pre->g_pre_comp[0][2 * i][2],
2273                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2274                            pre->g_pre_comp[1][i][2]);
2275         for (j = 0; j < 31; ++j) {
2276             point_double_small(pre->g_pre_comp[0][2 * i][0],
2277                                pre->g_pre_comp[0][2 * i][1],
2278                                pre->g_pre_comp[0][2 * i][2],
2279                                pre->g_pre_comp[0][2 * i][0],
2280                                pre->g_pre_comp[0][2 * i][1],
2281                                pre->g_pre_comp[0][2 * i][2]);
2282         }
2283     }
2284     for (i = 0; i < 2; i++) {
2285         /* g_pre_comp[i][0] is the point at infinity */
2286         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2287         /* the remaining multiples */
2288         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2289         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2290                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2291                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2292                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2293                         pre->g_pre_comp[i][2][2]);
2294         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2295         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2296                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2297                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2298                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2299                         pre->g_pre_comp[i][2][2]);
2300         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2301         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2302                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2303                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2304                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2305                         pre->g_pre_comp[i][4][2]);
2306         /*
2307          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2308          */
2309         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2310                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2311                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2312                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2313                         pre->g_pre_comp[i][2][2]);
2314         for (j = 1; j < 8; ++j) {
2315             /* odd multiples: add G resp. 2^32*G */
2316             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2317                             pre->g_pre_comp[i][2 * j + 1][1],
2318                             pre->g_pre_comp[i][2 * j + 1][2],
2319                             pre->g_pre_comp[i][2 * j][0],
2320                             pre->g_pre_comp[i][2 * j][1],
2321                             pre->g_pre_comp[i][2 * j][2],
2322                             pre->g_pre_comp[i][1][0],
2323                             pre->g_pre_comp[i][1][1],
2324                             pre->g_pre_comp[i][1][2]);
2325         }
2326     }
2327     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2328
2329     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2330                              nistp256_pre_comp_free,
2331                              nistp256_pre_comp_clear_free))
2332         goto err;
2333     ret = 1;
2334     pre = NULL;
2335  err:
2336     BN_CTX_end(ctx);
2337     EC_POINT_free(generator);
2338     BN_CTX_free(new_ctx);
2339     nistp256_pre_comp_free(pre);
2340     return ret;
2341 }
2342
2343 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2344 {
2345     if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2346                             nistp256_pre_comp_free,
2347                             nistp256_pre_comp_clear_free)
2348         != NULL)
2349         return 1;
2350     else
2351         return 0;
2352 }
2353 #else
2354 static void *dummy = &dummy;
2355 #endif