crypto/ec/ecp_nistp256.c

   1 /*
   2  * Written by Adam Langley (Google) for the OpenSSL project
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  22  *
  23  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  24  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  25  * work which got its smarts from Daniel J. Bernstein's work on the same.
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  30 NON_EMPTY_TRANSLATION_UNIT
  31 #else
  32
  33 # include <stdint.h>
  34 # include <string.h>
  35 # include <openssl/err.h>
  36 # include "ec_lcl.h"
  37
  /*
   * 128-bit integer types. __uint128_t/__int128_t are nonstandard and are
   * only provided by gcc on 64-bit platforms, so even with a new enough gcc
   * these typedefs will not compile on a 32-bit target.
   */
  # if !defined(__GNUC__) || __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 1)
  #  error "Need GCC 3.1 or later to define type uint128_t"
  # endif
  typedef __uint128_t uint128_t;
  typedef __int128_t int128_t;

  /* Short names for the fixed-width integer types used throughout. */
  typedef uint8_t u8;
  typedef uint32_t u32;
  typedef uint64_t u64;
  typedef int64_t s64;

  /*
   * The underlying field is GF(2^256 - 2^224 + 2^192 + 2^96 - 1). An element
   * of it serialises into 32 bytes; such a buffer is a felem_bytearray.
   */
  typedef u8 felem_bytearray[32];
  59
  60 /*
  61  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  62  * values are big-endian.
  63  */
  64 static const felem_bytearray nistp256_curve_params[5] = { /* rows: p, a, b, x, y */
  65     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  66      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  67      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  69     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  70      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  71      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
  73     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
  74      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  75      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  76      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  77     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  78      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  79      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  80      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  81     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  82      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  83      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  84      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  85 };
  86
  87 /*-
  88  * The representation of field elements.
  89  * ------------------------------------
  90  *
  91  * We represent field elements with either four 128-bit values, eight 128-bit
  92  * values, or four 64-bit values. The field element represented is:
  93  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  94  * or:
  95  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
  96  *
  97  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  98  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  99  * with the least significant bits of the next.
 100  *
 101  * A field element with four limbs is an 'felem'. One with eight limbs is a
 102  * 'longfelem'
 103  *
 104  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 105  * values are used as intermediate values before multiplication.
 106  */
 107
 108 # define NLIMBS 4
 109
 110 typedef uint128_t limb;
 111 typedef limb felem[NLIMBS];
 112 typedef limb longfelem[NLIMBS * 2];
 113 typedef u64 smallfelem[NLIMBS];
 114
 115 /* This is the value of the prime as four 64-bit words, little-endian. */
 116 static const u64 kPrime[4] =
 117     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 118 static const u64 bottom63bits = 0x7ffffffffffffffful;
 119
 120 /*
 121  * bin32_to_felem takes a little-endian byte array and converts it into felem
 122  * form. This assumes that the CPU is little-endian.
 123  */
 124 static void bin32_to_felem(felem out, const u8 in[32])
 125 {
 126     out[0] = *((u64 *)&in[0]);
 127     out[1] = *((u64 *)&in[8]);
 128     out[2] = *((u64 *)&in[16]);
 129     out[3] = *((u64 *)&in[24]);
 130 }
 131
 132 /*
 133  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 134  * endian, 32 byte array. This assumes that the CPU is little-endian.
 135  */
 136 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 137 {
 138     *((u64 *)&out[0]) = in[0];
 139     *((u64 *)&out[8]) = in[1];
 140     *((u64 *)&out[16]) = in[2];
 141     *((u64 *)&out[24]) = in[3];
 142 }
 143
 144 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 145 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 146 {
 147     unsigned i;
 148     for (i = 0; i < len; ++i)
 149         out[i] = in[len - 1 - i];
 150 }
 151
 152 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 153 static int BN_to_felem(felem out, const BIGNUM *bn)
 154 {
 155     felem_bytearray b_in;
 156     felem_bytearray b_out;
 157     unsigned num_bytes;
 158
 159     /* BN_bn2bin eats leading zeroes */
 160     memset(b_out, 0, sizeof(b_out));
 161     num_bytes = BN_num_bytes(bn);
 162     if (num_bytes > sizeof b_out) {
 163         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 164         return 0;
 165     }
 166     if (BN_is_negative(bn)) {
 167         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 168         return 0;
 169     }
 170     num_bytes = BN_bn2bin(bn, b_in);
 171     flip_endian(b_out, b_in, num_bytes);
 172     bin32_to_felem(out, b_out);
 173     return 1;
 174 }
 175
 176 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 177 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 178 {
 179     felem_bytearray b_in, b_out;
 180     smallfelem_to_bin32(b_in, in);
 181     flip_endian(b_out, b_in, sizeof b_out);
 182     return BN_bin2bn(b_out, sizeof b_out, out);
 183 }
 184
 185 /*-
 186  * Field operations
 187  * ----------------
 188  */
 189
 190 static void smallfelem_one(smallfelem out)
 191 {
 192     out[0] = 1;
 193     out[1] = 0;
 194     out[2] = 0;
 195     out[3] = 0;
 196 }
 197
 198 static void smallfelem_assign(smallfelem out, const smallfelem in)
 199 {
 200     out[0] = in[0];
 201     out[1] = in[1];
 202     out[2] = in[2];
 203     out[3] = in[3];
 204 }
 205
 206 static void felem_assign(felem out, const felem in)
 207 {
 208     out[0] = in[0];
 209     out[1] = in[1];
 210     out[2] = in[2];
 211     out[3] = in[3];
 212 }
 213
 214 /* felem_sum sets out = out + in. */
 215 static void felem_sum(felem out, const felem in)
 216 {
 217     out[0] += in[0];
 218     out[1] += in[1];
 219     out[2] += in[2];
 220     out[3] += in[3];
 221 }
 222
 223 /* felem_small_sum sets out = out + in. */
 224 static void felem_small_sum(felem out, const smallfelem in)
 225 {
 226     out[0] += in[0];
 227     out[1] += in[1];
 228     out[2] += in[2];
 229     out[3] += in[3];
 230 }
 231
 232 /* felem_scalar sets out = out * scalar */
 233 static void felem_scalar(felem out, const u64 scalar)
 234 {
 235     out[0] *= scalar;
 236     out[1] *= scalar;
 237     out[2] *= scalar;
 238     out[3] *= scalar;
 239 }
 240
 241 /* longfelem_scalar sets out = out * scalar */
 242 static void longfelem_scalar(longfelem out, const u64 scalar)
 243 {
 244     out[0] *= scalar;
 245     out[1] *= scalar;
 246     out[2] *= scalar;
 247     out[3] *= scalar;
 248     out[4] *= scalar;
 249     out[5] *= scalar;
 250     out[6] *= scalar;
 251     out[7] *= scalar;
 252 }
 253
 254 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 255 # define two105 (((limb)1) << 105)
 256 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 257
 258 /* zero105 is 0 mod p */
 259 static const felem zero105 =
 260     { two105m41m9, two105, two105m41p9, two105m41p9 };
 261
 262 /*-
 263  * smallfelem_neg sets |out| to |-small|
 264  * On exit:
 265  *   out[i] < out[i] + 2^105
 266  */
 267 static void smallfelem_neg(felem out, const smallfelem small)
 268 {
 269     /* In order to prevent underflow, we subtract from 0 mod p. */
 270     out[0] = zero105[0] - small[0];
 271     out[1] = zero105[1] - small[1];
 272     out[2] = zero105[2] - small[2];
 273     out[3] = zero105[3] - small[3];
 274 }
 275
 276 /*-
 277  * felem_diff subtracts |in| from |out|
 278  * On entry:
 279  *   in[i] < 2^104
 280  * On exit:
 281  *   out[i] < out[i] + 2^105
 282  */
 283 static void felem_diff(felem out, const felem in)
 284 {
 285     /*
 286      * In order to prevent underflow, we add 0 mod p before subtracting.
 287      */
 288     out[0] += zero105[0];
 289     out[1] += zero105[1];
 290     out[2] += zero105[2];
 291     out[3] += zero105[3];
 292
 293     out[0] -= in[0];
 294     out[1] -= in[1];
 295     out[2] -= in[2];
 296     out[3] -= in[3];
 297 }
 298
 299 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 300 # define two107 (((limb)1) << 107)
 301 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 302
 303 /* zero107 is 0 mod p */
 304 static const felem zero107 =
 305     { two107m43m11, two107, two107m43p11, two107m43p11 };
 306
 307 /*-
 308  * An alternative felem_diff for larger inputs |in|
 309  * felem_diff_zero107 subtracts |in| from |out|
 310  * On entry:
 311  *   in[i] < 2^106
 312  * On exit:
 313  *   out[i] < out[i] + 2^107
 314  */
 315 static void felem_diff_zero107(felem out, const felem in)
 316 {
 317     /*
 318      * In order to prevent underflow, we add 0 mod p before subtracting.
 319      */
 320     out[0] += zero107[0];
 321     out[1] += zero107[1];
 322     out[2] += zero107[2];
 323     out[3] += zero107[3];
 324
 325     out[0] -= in[0];
 326     out[1] -= in[1];
 327     out[2] -= in[2];
 328     out[3] -= in[3];
 329 }
 330
 331 /*-
 332  * longfelem_diff subtracts |in| from |out|
 333  * On entry:
 334  *   in[i] < 7*2^67
 335  * On exit:
 336  *   out[i] < out[i] + 2^70 + 2^40
 337  */
 338 static void longfelem_diff(longfelem out, const longfelem in)
 339 {
 340     static const limb two70m8p6 =
 341         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 342     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 343     static const limb two70 = (((limb) 1) << 70);
 344     static const limb two70m40m38p6 =
 345         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 346         (((limb) 1) << 6);
 347     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 348
 349     /* add 0 mod p to avoid underflow */
 350     out[0] += two70m8p6;
 351     out[1] += two70p40;
 352     out[2] += two70;
 353     out[3] += two70m40m38p6;
 354     out[4] += two70m6;
 355     out[5] += two70m6;
 356     out[6] += two70m6;
 357     out[7] += two70m6;
 358
 359     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 360     out[0] -= in[0];
 361     out[1] -= in[1];
 362     out[2] -= in[2];
 363     out[3] -= in[3];
 364     out[4] -= in[4];
 365     out[5] -= in[5];
 366     out[6] -= in[6];
 367     out[7] -= in[7];
 368 }
 369
 370 # define two64m0 (((limb)1) << 64) - 1
 371 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 372 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 373 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 374
 375 /* zero110 is 0 mod p */
 376 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 377
 378 /*-
 379  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 380  * minimal as the value may be greater than p.
 381  *
 382  * On entry:
 383  *   in[i] < 2^109
 384  * On exit:
 385  *   out[i] < 2^64
 386  */
 387 static void felem_shrink(smallfelem out, const felem in)
 388 {
 389     felem tmp;
 390     u64 a, b, mask;
 391     s64 high, low;
 392     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 393
 394     /* Carry 2->3 */
 395     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 396     /* tmp[3] < 2^110 */
 397
 398     tmp[2] = zero110[2] + (u64)in[2];
 399     tmp[0] = zero110[0] + in[0];
 400     tmp[1] = zero110[1] + in[1];
 401     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 402
 403     /*
 404      * We perform two partial reductions where we eliminate the high-word of
 405      * tmp[3]. We don't update the other words till the end.
 406      */
 407     a = tmp[3] >> 64;           /* a < 2^46 */
 408     tmp[3] = (u64)tmp[3];
 409     tmp[3] -= a;
 410     tmp[3] += ((limb) a) << 32;
 411     /* tmp[3] < 2^79 */
 412
 413     b = a;
 414     a = tmp[3] >> 64;           /* a < 2^15 */
 415     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 416     tmp[3] = (u64)tmp[3];
 417     tmp[3] -= a;
 418     tmp[3] += ((limb) a) << 32;
 419     /* tmp[3] < 2^64 + 2^47 */
 420
 421     /*
 422      * This adjusts the other two words to complete the two partial
 423      * reductions.
 424      */
 425     tmp[0] += b;
 426     tmp[1] -= (((limb) b) << 32);
 427
 428     /*
 429      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 430      * conditionally subtract kPrime if tmp[3] is large enough.
 431      */
 432     high = tmp[3] >> 64;
 433     /* As tmp[3] < 2^65, high is either 1 or 0 */
 434     high <<= 63;
 435     high >>= 63;
 436     /*-
 437      * high is:
 438      *   all ones   if the high word of tmp[3] is 1
 439      *   all zeros  if the high word of tmp[3] if 0 */
 440     low = tmp[3];
 441     mask = low >> 63;
 442     /*-
 443      * mask is:
 444      *   all ones   if the MSB of low is 1
 445      *   all zeros  if the MSB of low if 0 */
 446     low &= bottom63bits;
 447     low -= kPrime3Test;
 448     /* if low was greater than kPrime3Test then the MSB is zero */
 449     low = ~low;
 450     low >>= 63;
 451     /*-
 452      * low is:
 453      *   all ones   if low was > kPrime3Test
 454      *   all zeros  if low was <= kPrime3Test */
 455     mask = (mask & low) | high;
 456     tmp[0] -= mask & kPrime[0];
 457     tmp[1] -= mask & kPrime[1];
 458     /* kPrime[2] is zero, so omitted */
 459     tmp[3] -= mask & kPrime[3];
 460     /* tmp[3] < 2**64 - 2**32 + 1 */
 461
 462     tmp[1] += ((u64)(tmp[0] >> 64));
 463     tmp[0] = (u64)tmp[0];
 464     tmp[2] += ((u64)(tmp[1] >> 64));
 465     tmp[1] = (u64)tmp[1];
 466     tmp[3] += ((u64)(tmp[2] >> 64));
 467     tmp[2] = (u64)tmp[2];
 468     /* tmp[i] < 2^64 */
 469
 470     out[0] = tmp[0];
 471     out[1] = tmp[1];
 472     out[2] = tmp[2];
 473     out[3] = tmp[3];
 474 }
 475
 476 /* smallfelem_expand converts a smallfelem to an felem */
 477 static void smallfelem_expand(felem out, const smallfelem in)
 478 {
 479     out[0] = in[0];
 480     out[1] = in[1];
 481     out[2] = in[2];
 482     out[3] = in[3];
 483 }
 484
 485 /*-
 486  * smallfelem_square sets |out| = |small|^2
 487  * On entry:
 488  *   small[i] < 2^64
 489  * On exit:
 490  *   out[i] < 7 * 2^64 < 2^67
 491  */
 492 static void smallfelem_square(longfelem out, const smallfelem small)
 493 {
 494     limb a;
 495     u64 high, low;
 496
 497     a = ((uint128_t) small[0]) * small[0];
 498     low = a;
 499     high = a >> 64;
 500     out[0] = low;
 501     out[1] = high;
 502
 503     a = ((uint128_t) small[0]) * small[1];
 504     low = a;
 505     high = a >> 64;
 506     out[1] += low;
 507     out[1] += low;
 508     out[2] = high;
 509
 510     a = ((uint128_t) small[0]) * small[2];
 511     low = a;
 512     high = a >> 64;
 513     out[2] += low;
 514     out[2] *= 2;
 515     out[3] = high;
 516
 517     a = ((uint128_t) small[0]) * small[3];
 518     low = a;
 519     high = a >> 64;
 520     out[3] += low;
 521     out[4] = high;
 522
 523     a = ((uint128_t) small[1]) * small[2];
 524     low = a;
 525     high = a >> 64;
 526     out[3] += low;
 527     out[3] *= 2;
 528     out[4] += high;
 529
 530     a = ((uint128_t) small[1]) * small[1];
 531     low = a;
 532     high = a >> 64;
 533     out[2] += low;
 534     out[3] += high;
 535
 536     a = ((uint128_t) small[1]) * small[3];
 537     low = a;
 538     high = a >> 64;
 539     out[4] += low;
 540     out[4] *= 2;
 541     out[5] = high;
 542
 543     a = ((uint128_t) small[2]) * small[3];
 544     low = a;
 545     high = a >> 64;
 546     out[5] += low;
 547     out[5] *= 2;
 548     out[6] = high;
 549     out[6] += high;
 550
 551     a = ((uint128_t) small[2]) * small[2];
 552     low = a;
 553     high = a >> 64;
 554     out[4] += low;
 555     out[5] += high;
 556
 557     a = ((uint128_t) small[3]) * small[3];
 558     low = a;
 559     high = a >> 64;
 560     out[6] += low;
 561     out[7] = high;
 562 }
 563
 564 /*-
 565  * felem_square sets |out| = |in|^2
 566  * On entry:
 567  *   in[i] < 2^109
 568  * On exit:
 569  *   out[i] < 7 * 2^64 < 2^67
 570  */
 571 static void felem_square(longfelem out, const felem in)
 572 {
 573     u64 small[4];
 574     felem_shrink(small, in);
 575     smallfelem_square(out, small);
 576 }
 577
 578 /*-
 579  * smallfelem_mul sets |out| = |small1| * |small2|
 580  * On entry:
 581  *   small1[i] < 2^64
 582  *   small2[i] < 2^64
 583  * On exit:
 584  *   out[i] < 7 * 2^64 < 2^67
 585  */
 586 static void smallfelem_mul(longfelem out, const smallfelem small1,
 587                            const smallfelem small2)
 588 {
 589     limb a;
 590     u64 high, low;
 591
 592     a = ((uint128_t) small1[0]) * small2[0];
 593     low = a;
 594     high = a >> 64;
 595     out[0] = low;
 596     out[1] = high;
 597
 598     a = ((uint128_t) small1[0]) * small2[1];
 599     low = a;
 600     high = a >> 64;
 601     out[1] += low;
 602     out[2] = high;
 603
 604     a = ((uint128_t) small1[1]) * small2[0];
 605     low = a;
 606     high = a >> 64;
 607     out[1] += low;
 608     out[2] += high;
 609
 610     a = ((uint128_t) small1[0]) * small2[2];
 611     low = a;
 612     high = a >> 64;
 613     out[2] += low;
 614     out[3] = high;
 615
 616     a = ((uint128_t) small1[1]) * small2[1];
 617     low = a;
 618     high = a >> 64;
 619     out[2] += low;
 620     out[3] += high;
 621
 622     a = ((uint128_t) small1[2]) * small2[0];
 623     low = a;
 624     high = a >> 64;
 625     out[2] += low;
 626     out[3] += high;
 627
 628     a = ((uint128_t) small1[0]) * small2[3];
 629     low = a;
 630     high = a >> 64;
 631     out[3] += low;
 632     out[4] = high;
 633
 634     a = ((uint128_t) small1[1]) * small2[2];
 635     low = a;
 636     high = a >> 64;
 637     out[3] += low;
 638     out[4] += high;
 639
 640     a = ((uint128_t) small1[2]) * small2[1];
 641     low = a;
 642     high = a >> 64;
 643     out[3] += low;
 644     out[4] += high;
 645
 646     a = ((uint128_t) small1[3]) * small2[0];
 647     low = a;
 648     high = a >> 64;
 649     out[3] += low;
 650     out[4] += high;
 651
 652     a = ((uint128_t) small1[1]) * small2[3];
 653     low = a;
 654     high = a >> 64;
 655     out[4] += low;
 656     out[5] = high;
 657
 658     a = ((uint128_t) small1[2]) * small2[2];
 659     low = a;
 660     high = a >> 64;
 661     out[4] += low;
 662     out[5] += high;
 663
 664     a = ((uint128_t) small1[3]) * small2[1];
 665     low = a;
 666     high = a >> 64;
 667     out[4] += low;
 668     out[5] += high;
 669
 670     a = ((uint128_t) small1[2]) * small2[3];
 671     low = a;
 672     high = a >> 64;
 673     out[5] += low;
 674     out[6] = high;
 675
 676     a = ((uint128_t) small1[3]) * small2[2];
 677     low = a;
 678     high = a >> 64;
 679     out[5] += low;
 680     out[6] += high;
 681
 682     a = ((uint128_t) small1[3]) * small2[3];
 683     low = a;
 684     high = a >> 64;
 685     out[6] += low;
 686     out[7] = high;
 687 }
 688
 689 /*-
 690  * felem_mul sets |out| = |in1| * |in2|
 691  * On entry:
 692  *   in1[i] < 2^109
 693  *   in2[i] < 2^109
 694  * On exit:
 695  *   out[i] < 7 * 2^64 < 2^67
 696  */
 697 static void felem_mul(longfelem out, const felem in1, const felem in2)
 698 {
 699     smallfelem small1, small2;
 700     felem_shrink(small1, in1);
 701     felem_shrink(small2, in2);
 702     smallfelem_mul(out, small1, small2);
 703 }
 704
 705 /*-
 706  * felem_small_mul sets |out| = |small1| * |in2|
 707  * On entry:
 708  *   small1[i] < 2^64
 709  *   in2[i] < 2^109
 710  * On exit:
 711  *   out[i] < 7 * 2^64 < 2^67
 712  */
 713 static void felem_small_mul(longfelem out, const smallfelem small1,
 714                             const felem in2)
 715 {
 716     smallfelem small2;
 717     felem_shrink(small2, in2);
 718     smallfelem_mul(out, small1, small2);
 719 }
 720
 721 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 722 # define two100 (((limb)1) << 100)
 723 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 724 /* zero100 is 0 mod p */
 725 static const felem zero100 =
 726     { two100m36m4, two100, two100m36p4, two100m36p4 };
 727
 728 /*-
 729  * Internal function for the different flavours of felem_reduce.
 730  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 731  * On entry:
 732  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 733  *   out[1] >= in[7] + 2^32*in[4]
 734  *   out[2] >= in[5] + 2^32*in[5]
 735  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 736  * On exit:
 737  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 738  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 739  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 740  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 741  */
 742 static void felem_reduce_(felem out, const longfelem in)
 743 {
 744     int128_t c;
 745     /* combine common terms from below */
 746     c = in[4] + (in[5] << 32);
 747     out[0] += c;
 748     out[3] -= c;
 749
 750     c = in[5] - in[7];
 751     out[1] += c;
 752     out[2] -= c;
 753
 754     /* the remaining terms */
 755     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 756     out[1] -= (in[4] << 32);
 757     out[3] += (in[4] << 32);
 758
 759     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 760     out[2] -= (in[5] << 32);
 761
 762     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 763     out[0] -= in[6];
 764     out[0] -= (in[6] << 32);
 765     out[1] += (in[6] << 33);
 766     out[2] += (in[6] * 2);
 767     out[3] -= (in[6] << 32);
 768
 769     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 770     out[0] -= in[7];
 771     out[0] -= (in[7] << 32);
 772     out[2] += (in[7] << 33);
 773     out[3] += (in[7] * 3);
 774 }
 775
 776 /*-
 777  * felem_reduce converts a longfelem into an felem.
 778  * To be called directly after felem_square or felem_mul.
 779  * On entry:
 780  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 781  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 782  * On exit:
 783  *   out[i] < 2^101
 784  */
 785 static void felem_reduce(felem out, const longfelem in)
 786 {
 787     out[0] = zero100[0] + in[0];
 788     out[1] = zero100[1] + in[1];
 789     out[2] = zero100[2] + in[2];
 790     out[3] = zero100[3] + in[3];
 791
 792     felem_reduce_(out, in);
 793
 794     /*-
 795      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 796      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 797      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 798      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 799      *
 800      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 801      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 802      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 803      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 804      */
 805 }
 806
 807 /*-
 808  * felem_reduce_zero105 converts a larger longfelem into an felem.
 809  * On entry:
 810  *   in[0] < 2^71
 811  * On exit:
 812  *   out[i] < 2^106
 813  */
 814 static void felem_reduce_zero105(felem out, const longfelem in)
 815 {
 816     out[0] = zero105[0] + in[0];
 817     out[1] = zero105[1] + in[1];
 818     out[2] = zero105[2] + in[2];
 819     out[3] = zero105[3] + in[3];
 820
 821     felem_reduce_(out, in);
 822
 823     /*-
 824      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 825      * out[1] > 2^105 - 2^71 - 2^103 > 0
 826      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 827      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 828      *
 829      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 831      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 832      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 833      */
 834 }
 835
 836 /*
 837  * subtract_u64 sets *result = *result - v and *carry to one if the
 838  * subtraction underflowed.
 839  */
 840 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 841 {
 842     uint128_t r = *result;
 843     r -= v;
 844     *carry = (r >> 64) & 1;
 845     *result = (u64)r;
 846 }
 847
 848 /*
 849  * felem_contract converts |in| to its unique, minimal representation. On
 850  * entry: in[i] < 2^109
 851  */
 852 static void felem_contract(smallfelem out, const felem in)
 853 {
 854     unsigned i;
 855     u64 all_equal_so_far = 0, result = 0, carry;
 856
 857     felem_shrink(out, in);
 858     /* small is minimal except that the value might be > p */
 859
 860     all_equal_so_far--;
 861     /*
 862      * We are doing a constant time test if out >= kPrime. We need to compare
 863      * each u64, from most-significant to least significant. For each one, if
 864      * all words so far have been equal (m is all ones) then a non-equal
 865      * result is the answer. Otherwise we continue.
 866      */
 867     for (i = 3; i < 4; i--) {
 868         u64 equal;
 869         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 870         /*
 871          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 872          * will all be set.
 873          */
 874         result |= all_equal_so_far & ((u64)(a >> 64));
 875
 876         /*
 877          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 878          * decrement will make it all ones.
 879          */
 880         equal = kPrime[i] ^ out[i];
 881         equal--;
 882         equal &= equal << 32;
 883         equal &= equal << 16;
 884         equal &= equal << 8;
 885         equal &= equal << 4;
 886         equal &= equal << 2;
 887         equal &= equal << 1;
 888         equal = ((s64) equal) >> 63;
 889
 890         all_equal_so_far &= equal;
 891     }
 892
 893     /*
 894      * if all_equal_so_far is still all ones then the two values are equal
 895      * and so out >= kPrime is true.
 896      */
 897     result |= all_equal_so_far;
 898
 899     /* if out >= kPrime then we subtract kPrime. */
 900     subtract_u64(&out[0], &carry, result & kPrime[0]);
 901     subtract_u64(&out[1], &carry, carry);
 902     subtract_u64(&out[2], &carry, carry);
 903     subtract_u64(&out[3], &carry, carry);
 904
 905     subtract_u64(&out[1], &carry, result & kPrime[1]);
 906     subtract_u64(&out[2], &carry, carry);
 907     subtract_u64(&out[3], &carry, carry);
 908
 909     subtract_u64(&out[2], &carry, result & kPrime[2]);
 910     subtract_u64(&out[3], &carry, carry);
 911
 912     subtract_u64(&out[3], &carry, result & kPrime[3]);
 913 }
 914
 915 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 916 {
 917     longfelem longtmp;
 918     felem tmp;
 919
 920     smallfelem_square(longtmp, in);
 921     felem_reduce(tmp, longtmp);
 922     felem_contract(out, tmp);
 923 }
 924
 925 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 926                                     const smallfelem in2)
 927 {
 928     longfelem longtmp;
 929     felem tmp;
 930
 931     smallfelem_mul(longtmp, in1, in2);
 932     felem_reduce(tmp, longtmp);
 933     felem_contract(out, tmp);
 934 }
 935
 936 /*-
 937  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 938  * otherwise.
 939  * On entry:
 940  *   small[i] < 2^64
 941  */
 942 static limb smallfelem_is_zero(const smallfelem small)
 943 {
 944     limb result;
 945     u64 is_p;
 946
 947     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 948     is_zero--;
 949     is_zero &= is_zero << 32;
 950     is_zero &= is_zero << 16;
 951     is_zero &= is_zero << 8;
 952     is_zero &= is_zero << 4;
 953     is_zero &= is_zero << 2;
 954     is_zero &= is_zero << 1;
 955     is_zero = ((s64) is_zero) >> 63;
 956
 957     is_p = (small[0] ^ kPrime[0]) |
 958         (small[1] ^ kPrime[1]) |
 959         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 960     is_p--;
 961     is_p &= is_p << 32;
 962     is_p &= is_p << 16;
 963     is_p &= is_p << 8;
 964     is_p &= is_p << 4;
 965     is_p &= is_p << 2;
 966     is_p &= is_p << 1;
 967     is_p = ((s64) is_p) >> 63;
 968
 969     is_zero |= is_p;
 970
 971     result = is_zero;
 972     result |= ((limb) is_zero) << 64;
 973     return result;
 974 }
 975
 976 static int smallfelem_is_zero_int(const smallfelem small)
 977 {
 978     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 979 }
 980
 981 /*-
 982  * felem_inv calculates |out| = |in|^{-1}
 983  *
 984  * Based on Fermat's Little Theorem:
 985  *   a^p = a (mod p)
 986  *   a^{p-1} = 1 (mod p)
 987  *   a^{p-2} = a^{-1} (mod p)
 988  */
 989 static void felem_inv(felem out, const felem in)
 990 {
 991     felem ftmp, ftmp2;
 992     /* each e_I will hold |in|^{2^I - 1} */
 993     felem e2, e4, e8, e16, e32, e64;
 994     longfelem tmp;
 995     unsigned i;
 996
 997     felem_square(tmp, in);
 998     felem_reduce(ftmp, tmp);    /* 2^1 */
 999     felem_mul(tmp, in, ftmp);
1000     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1001     felem_assign(e2, ftmp);
1002     felem_square(tmp, ftmp);
1003     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1004     felem_square(tmp, ftmp);
1005     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1006     felem_mul(tmp, ftmp, e2);
1007     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1008     felem_assign(e4, ftmp);
1009     felem_square(tmp, ftmp);
1010     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1011     felem_square(tmp, ftmp);
1012     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1013     felem_square(tmp, ftmp);
1014     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1015     felem_square(tmp, ftmp);
1016     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1017     felem_mul(tmp, ftmp, e4);
1018     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1019     felem_assign(e8, ftmp);
1020     for (i = 0; i < 8; i++) {
1021         felem_square(tmp, ftmp);
1022         felem_reduce(ftmp, tmp);
1023     }                           /* 2^16 - 2^8 */
1024     felem_mul(tmp, ftmp, e8);
1025     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1026     felem_assign(e16, ftmp);
1027     for (i = 0; i < 16; i++) {
1028         felem_square(tmp, ftmp);
1029         felem_reduce(ftmp, tmp);
1030     }                           /* 2^32 - 2^16 */
1031     felem_mul(tmp, ftmp, e16);
1032     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1033     felem_assign(e32, ftmp);
1034     for (i = 0; i < 32; i++) {
1035         felem_square(tmp, ftmp);
1036         felem_reduce(ftmp, tmp);
1037     }                           /* 2^64 - 2^32 */
1038     felem_assign(e64, ftmp);
1039     felem_mul(tmp, ftmp, in);
1040     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1041     for (i = 0; i < 192; i++) {
1042         felem_square(tmp, ftmp);
1043         felem_reduce(ftmp, tmp);
1044     }                           /* 2^256 - 2^224 + 2^192 */
1045
1046     felem_mul(tmp, e64, e32);
1047     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1048     for (i = 0; i < 16; i++) {
1049         felem_square(tmp, ftmp2);
1050         felem_reduce(ftmp2, tmp);
1051     }                           /* 2^80 - 2^16 */
1052     felem_mul(tmp, ftmp2, e16);
1053     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1054     for (i = 0; i < 8; i++) {
1055         felem_square(tmp, ftmp2);
1056         felem_reduce(ftmp2, tmp);
1057     }                           /* 2^88 - 2^8 */
1058     felem_mul(tmp, ftmp2, e8);
1059     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1060     for (i = 0; i < 4; i++) {
1061         felem_square(tmp, ftmp2);
1062         felem_reduce(ftmp2, tmp);
1063     }                           /* 2^92 - 2^4 */
1064     felem_mul(tmp, ftmp2, e4);
1065     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1066     felem_square(tmp, ftmp2);
1067     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1068     felem_square(tmp, ftmp2);
1069     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1070     felem_mul(tmp, ftmp2, e2);
1071     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1072     felem_square(tmp, ftmp2);
1073     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1074     felem_square(tmp, ftmp2);
1075     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1076     felem_mul(tmp, ftmp2, in);
1077     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1078
1079     felem_mul(tmp, ftmp2, ftmp);
1080     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1081 }
1082
1083 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1084 {
1085     felem tmp;
1086
1087     smallfelem_expand(tmp, in);
1088     felem_inv(tmp, tmp);
1089     felem_contract(out, tmp);
1090 }
1091
1092 /*-
1093  * Group operations
1094  * ----------------
1095  *
 * Building on top of the field operations we have the operations on the
 * elliptic curve group itself. Points on the curve are represented in
 * Jacobian coordinates.
1099  */
1100
/*-
 * point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 * Note that x_out is written before y_in and z_in are read for z', so an
 * output must not alias a *different* input coordinate.
 *
 * The "[i] < 2^k" comments in the body record the bound on each limb that
 * the following operation relies on; the statement order must be preserved
 * for those bounds to hold.
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    longfelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
    smallfelem small1, small2;

    /* take private copies of x so that x_out may alias x_in */
    felem_assign(ftmp, x_in);
    /* ftmp[i] < 2^106 */
    felem_assign(ftmp2, x_in);
    /* ftmp2[i] < 2^106 */

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);
    /* delta[i] < 2^101 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);
    /* gamma[i] < 2^101 */
    felem_shrink(small1, gamma);

    /* beta = x*gamma */
    felem_small_mul(tmp, small1, x_in);
    felem_reduce(beta, tmp);
    /* beta[i] < 2^101 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff(ftmp, delta);
    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
    felem_sum(ftmp2, delta);
    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
    felem_scalar(ftmp2, 3);
    /* ftmp2[i] < 3 * 2^107 < 2^109 */
    felem_mul(tmp, ftmp, ftmp2);
    felem_reduce(alpha, tmp);
    /* alpha[i] < 2^101 */
    felem_shrink(small2, alpha);

    /* x' = alpha^2 - 8*beta */
    smallfelem_square(tmp, small2);
    felem_reduce(x_out, tmp);
    felem_assign(ftmp, beta);
    felem_scalar(ftmp, 8);
    /* ftmp[i] < 8 * 2^101 = 2^104 */
    felem_diff(x_out, ftmp);
    /* x_out[i] < 2^105 + 2^101 < 2^106 */

    /* z' = (y + z)^2 - gamma - delta */
    felem_sum(delta, gamma);
    /* delta[i] < 2^101 + 2^101 = 2^102 */
    felem_assign(ftmp, y_in);
    felem_sum(ftmp, z_in);
    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
    felem_square(tmp, ftmp);
    felem_reduce(z_out, tmp);
    felem_diff(z_out, delta);
    /* z_out[i] < 2^105 + 2^101 < 2^106 */

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar(beta, 4);
    /* beta[i] < 4 * 2^101 = 2^103 */
    felem_diff_zero107(beta, x_out);
    /* beta[i] < 2^107 + 2^103 < 2^108 */
    felem_small_mul(tmp, small2, beta);
    /* tmp[i] < 7 * 2^64 < 2^67 */
    smallfelem_square(tmp2, small1);
    /* tmp2[i] < 7 * 2^64 */
    longfelem_scalar(tmp2, 8);
    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
    longfelem_diff(tmp, tmp2);
    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
    felem_reduce_zero105(y_out, tmp);
    /* y_out[i] < 2^106 */
}
1187
1188 /*
1189  * point_double_small is the same as point_double, except that it operates on
1190  * smallfelems
1191  */
1192 static void
1193 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1194                    const smallfelem x_in, const smallfelem y_in,
1195                    const smallfelem z_in)
1196 {
1197     felem felem_x_out, felem_y_out, felem_z_out;
1198     felem felem_x_in, felem_y_in, felem_z_in;
1199
1200     smallfelem_expand(felem_x_in, x_in);
1201     smallfelem_expand(felem_y_in, y_in);
1202     smallfelem_expand(felem_z_in, z_in);
1203     point_double(felem_x_out, felem_y_out, felem_z_out,
1204                  felem_x_in, felem_y_in, felem_z_in);
1205     felem_shrink(x_out, felem_x_out);
1206     felem_shrink(y_out, felem_y_out);
1207     felem_shrink(z_out, felem_z_out);
1208 }
1209
1210 /* copy_conditional copies in to out iff mask is all ones. */
1211 static void copy_conditional(felem out, const felem in, limb mask)
1212 {
1213     unsigned i;
1214     for (i = 0; i < NLIMBS; ++i) {
1215         const limb tmp = mask & (in[i] ^ out[i]);
1216         out[i] ^= tmp;
1217     }
1218 }
1219
1220 /* copy_small_conditional copies in to out iff mask is all ones. */
1221 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1222 {
1223     unsigned i;
1224     const u64 mask64 = mask;
1225     for (i = 0; i < NLIMBS; ++i) {
1226         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1227     }
1228 }
1229
1230 /*-
1231  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1232  *
1233  * The method is taken from:
1234  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1235  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1236  *
1237  * This function includes a branch for checking whether the two input points
1238  * are equal, (while not equal to the point at infinity). This case never
1239  * happens during single point multiplication, so there is no timing leak for
1240  * ECDH or ECDSA signing.
1241  */
1242 static void point_add(felem x3, felem y3, felem z3,
1243                       const felem x1, const felem y1, const felem z1,
1244                       const int mixed, const smallfelem x2,
1245                       const smallfelem y2, const smallfelem z2)
1246 {
1247     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1248     longfelem tmp, tmp2;
1249     smallfelem small1, small2, small3, small4, small5;
1250     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1251
1252     felem_shrink(small3, z1);
1253
1254     z1_is_zero = smallfelem_is_zero(small3);
1255     z2_is_zero = smallfelem_is_zero(z2);
1256
1257     /* ftmp = z1z1 = z1**2 */
1258     smallfelem_square(tmp, small3);
1259     felem_reduce(ftmp, tmp);
1260     /* ftmp[i] < 2^101 */
1261     felem_shrink(small1, ftmp);
1262
1263     if (!mixed) {
1264         /* ftmp2 = z2z2 = z2**2 */
1265         smallfelem_square(tmp, z2);
1266         felem_reduce(ftmp2, tmp);
1267         /* ftmp2[i] < 2^101 */
1268         felem_shrink(small2, ftmp2);
1269
1270         felem_shrink(small5, x1);
1271
1272         /* u1 = ftmp3 = x1*z2z2 */
1273         smallfelem_mul(tmp, small5, small2);
1274         felem_reduce(ftmp3, tmp);
1275         /* ftmp3[i] < 2^101 */
1276
1277         /* ftmp5 = z1 + z2 */
1278         felem_assign(ftmp5, z1);
1279         felem_small_sum(ftmp5, z2);
1280         /* ftmp5[i] < 2^107 */
1281
1282         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1283         felem_square(tmp, ftmp5);
1284         felem_reduce(ftmp5, tmp);
1285         /* ftmp2 = z2z2 + z1z1 */
1286         felem_sum(ftmp2, ftmp);
1287         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1288         felem_diff(ftmp5, ftmp2);
1289         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1290
1291         /* ftmp2 = z2 * z2z2 */
1292         smallfelem_mul(tmp, small2, z2);
1293         felem_reduce(ftmp2, tmp);
1294
1295         /* s1 = ftmp2 = y1 * z2**3 */
1296         felem_mul(tmp, y1, ftmp2);
1297         felem_reduce(ftmp6, tmp);
1298         /* ftmp6[i] < 2^101 */
1299     } else {
1300         /*
1301          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1302          */
1303
1304         /* u1 = ftmp3 = x1*z2z2 */
1305         felem_assign(ftmp3, x1);
1306         /* ftmp3[i] < 2^106 */
1307
1308         /* ftmp5 = 2z1z2 */
1309         felem_assign(ftmp5, z1);
1310         felem_scalar(ftmp5, 2);
1311         /* ftmp5[i] < 2*2^106 = 2^107 */
1312
1313         /* s1 = ftmp2 = y1 * z2**3 */
1314         felem_assign(ftmp6, y1);
1315         /* ftmp6[i] < 2^106 */
1316     }
1317
1318     /* u2 = x2*z1z1 */
1319     smallfelem_mul(tmp, x2, small1);
1320     felem_reduce(ftmp4, tmp);
1321
1322     /* h = ftmp4 = u2 - u1 */
1323     felem_diff_zero107(ftmp4, ftmp3);
1324     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1325     felem_shrink(small4, ftmp4);
1326
1327     x_equal = smallfelem_is_zero(small4);
1328
1329     /* z_out = ftmp5 * h */
1330     felem_small_mul(tmp, small4, ftmp5);
1331     felem_reduce(z_out, tmp);
1332     /* z_out[i] < 2^101 */
1333
1334     /* ftmp = z1 * z1z1 */
1335     smallfelem_mul(tmp, small1, small3);
1336     felem_reduce(ftmp, tmp);
1337
1338     /* s2 = tmp = y2 * z1**3 */
1339     felem_small_mul(tmp, y2, ftmp);
1340     felem_reduce(ftmp5, tmp);
1341
1342     /* r = ftmp5 = (s2 - s1)*2 */
1343     felem_diff_zero107(ftmp5, ftmp6);
1344     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1345     felem_scalar(ftmp5, 2);
1346     /* ftmp5[i] < 2^109 */
1347     felem_shrink(small1, ftmp5);
1348     y_equal = smallfelem_is_zero(small1);
1349
1350     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1351         point_double(x3, y3, z3, x1, y1, z1);
1352         return;
1353     }
1354
1355     /* I = ftmp = (2h)**2 */
1356     felem_assign(ftmp, ftmp4);
1357     felem_scalar(ftmp, 2);
1358     /* ftmp[i] < 2*2^108 = 2^109 */
1359     felem_square(tmp, ftmp);
1360     felem_reduce(ftmp, tmp);
1361
1362     /* J = ftmp2 = h * I */
1363     felem_mul(tmp, ftmp4, ftmp);
1364     felem_reduce(ftmp2, tmp);
1365
1366     /* V = ftmp4 = U1 * I */
1367     felem_mul(tmp, ftmp3, ftmp);
1368     felem_reduce(ftmp4, tmp);
1369
1370     /* x_out = r**2 - J - 2V */
1371     smallfelem_square(tmp, small1);
1372     felem_reduce(x_out, tmp);
1373     felem_assign(ftmp3, ftmp4);
1374     felem_scalar(ftmp4, 2);
1375     felem_sum(ftmp4, ftmp2);
1376     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1377     felem_diff(x_out, ftmp4);
1378     /* x_out[i] < 2^105 + 2^101 */
1379
1380     /* y_out = r(V-x_out) - 2 * s1 * J */
1381     felem_diff_zero107(ftmp3, x_out);
1382     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1383     felem_small_mul(tmp, small1, ftmp3);
1384     felem_mul(tmp2, ftmp6, ftmp2);
1385     longfelem_scalar(tmp2, 2);
1386     /* tmp2[i] < 2*2^67 = 2^68 */
1387     longfelem_diff(tmp, tmp2);
1388     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1389     felem_reduce_zero105(y_out, tmp);
1390     /* y_out[i] < 2^106 */
1391
1392     copy_small_conditional(x_out, x2, z1_is_zero);
1393     copy_conditional(x_out, x1, z2_is_zero);
1394     copy_small_conditional(y_out, y2, z1_is_zero);
1395     copy_conditional(y_out, y1, z2_is_zero);
1396     copy_small_conditional(z_out, z2, z1_is_zero);
1397     copy_conditional(z_out, z1, z2_is_zero);
1398     felem_assign(x3, x_out);
1399     felem_assign(y3, y_out);
1400     felem_assign(z3, z_out);
1401 }
1402
1403 /*
1404  * point_add_small is the same as point_add, except that it operates on
1405  * smallfelems
1406  */
1407 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1408                             smallfelem x1, smallfelem y1, smallfelem z1,
1409                             smallfelem x2, smallfelem y2, smallfelem z2)
1410 {
1411     felem felem_x3, felem_y3, felem_z3;
1412     felem felem_x1, felem_y1, felem_z1;
1413     smallfelem_expand(felem_x1, x1);
1414     smallfelem_expand(felem_y1, y1);
1415     smallfelem_expand(felem_z1, z1);
1416     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1417               x2, y2, z2);
1418     felem_shrink(x3, felem_x3);
1419     felem_shrink(y3, felem_y3);
1420     felem_shrink(z3, felem_z3);
1421 }
1422
1423 /*-
1424  * Base point pre computation
1425  * --------------------------
1426  *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
1430  *
1431  * For the base point table, z is usually 1 (0 for the point at infinity).
1432  * This table has 2 * 16 elements, starting with the following:
1433  * index | bits    | point
1434  * ------+---------+------------------------------
1435  *     0 | 0 0 0 0 | 0G
1436  *     1 | 0 0 0 1 | 1G
1437  *     2 | 0 0 1 0 | 2^64G
1438  *     3 | 0 0 1 1 | (2^64 + 1)G
1439  *     4 | 0 1 0 0 | 2^128G
1440  *     5 | 0 1 0 1 | (2^128 + 1)G
1441  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1442  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1443  *     8 | 1 0 0 0 | 2^192G
1444  *     9 | 1 0 0 1 | (2^192 + 1)G
1445  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1446  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1447  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1448  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1449  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1450  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1451  * followed by a copy of this with each element multiplied by 2^32.
1452  *
1453  * The reason for this is so that we can clock bits into four different
1454  * locations when doing simple scalar multiplies against the base point,
1455  * and then another four locations using the second 16 elements.
1456  *
 * Tables for other points have table[i] = iP for i in 0 .. 16. */
1458
/*
 * gmul is the table of precomputed base points.
 *
 * gmul[0][i] is the sum of the multiples 1G, 2^64G, 2^128G and 2^192G
 * selected by bits 0..3 of i; gmul[1][i] is the same point multiplied by
 * 2^32. Every point is stored as (x, y, z), each a smallfelem written least
 * significant limb first, with z = 1 except for entry 0 (the point at
 * infinity), which is all zeros.
 */
static const smallfelem gmul[2][16][3] = {
    /* gmul[0]: combinations of 1G, 2^64G, 2^128G, 2^192G */
    {{{0, 0, 0, 0},
      {0, 0, 0, 0},
      {0, 0, 0, 0}},
     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
       0x6b17d1f2e12c4247},
      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
       0x4fe342e2fe1a7f9b},
      {1, 0, 0, 0}},
     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
       0x0fa822bc2811aaa5},
      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
       0xbff44ae8f5dba80d},
      {1, 0, 0, 0}},
     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
       0x300a4bbc89d6726f},
      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
       0x72aac7e0d09b4644},
      {1, 0, 0, 0}},
     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
       0x447d739beedb5e67},
      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
       0x2d4825ab834131ee},
      {1, 0, 0, 0}},
     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
       0xef9519328a9c72ff},
      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
       0x611e9fc37dbb2c9b},
      {1, 0, 0, 0}},
     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
       0x550663797b51f5d8},
      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
       0x157164848aecb851},
      {1, 0, 0, 0}},
     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
       0xeb5d7745b21141ea},
      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
       0xeafd72ebdbecc17b},
      {1, 0, 0, 0}},
     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
       0xa6d39677a7849276},
      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
       0x674f84749b0b8816},
      {1, 0, 0, 0}},
     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
       0x4e769e7672c9ddad},
      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
       0x42b99082de830663},
      {1, 0, 0, 0}},
     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
       0x78878ef61c6ce04d},
      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
       0xb6cb3f5d7b72c321},
      {1, 0, 0, 0}},
     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
       0x0c88bc4d716b1287},
      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
       0xdd5ddea3f3901dc6},
      {1, 0, 0, 0}},
     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
       0x68f344af6b317466},
      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
       0x31b9c405f8540a20},
      {1, 0, 0, 0}},
     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
       0x4052bf4b6f461db9},
      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
       0xfecf4d5190b0fc61},
      {1, 0, 0, 0}},
     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
       0x1eddbae2c802e41a},
      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
       0x43104d86560ebcfc},
      {1, 0, 0, 0}},
     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
       0xb48e26b484f7a21c},
      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
       0xfac015404d4d3dab},
      {1, 0, 0, 0}}},
    /* gmul[1]: the same combinations, each multiplied by 2^32 */
    {{{0, 0, 0, 0},
      {0, 0, 0, 0},
      {0, 0, 0, 0}},
     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
       0x7fe36b40af22af89},
      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
       0xe697d45825b63624},
      {1, 0, 0, 0}},
     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
       0x4a5b506612a677a6},
      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
       0xeb13461ceac089f1},
      {1, 0, 0, 0}},
     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
       0x0781b8291c6a220a},
      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
       0x690cde8df0151593},
      {1, 0, 0, 0}},
     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
       0x8a535f566ec73617},
      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
       0x0455c08468b08bd7},
      {1, 0, 0, 0}},
     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
       0x06bada7ab77f8276},
      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
       0x5b476dfd0e6cb18a},
      {1, 0, 0, 0}},
     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
       0x3e29864e8a2ec908},
      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
       0x239b90ea3dc31e7e},
      {1, 0, 0, 0}},
     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
       0x820f4dd949f72ff7},
      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
       0x140406ec783a05ec},
      {1, 0, 0, 0}},
     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
       0x68f6b8542783dfee},
      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
       0xcbe1feba92e40ce6},
      {1, 0, 0, 0}},
     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
       0xd0b2f94d2f420109},
      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
       0x971459828b0719e5},
      {1, 0, 0, 0}},
     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
       0x961610004a866aba},
      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
       0x7acb9fadcee75e44},
      {1, 0, 0, 0}},
     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
       0x24eb9acca333bf5b},
      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
       0x69f891c5acd079cc},
      {1, 0, 0, 0}},
     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
       0xe51f547c5972a107},
      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
       0x1c309a2b25bb1387},
      {1, 0, 0, 0}},
     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
       0x20b87b8aa2c4e503},
      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
       0xf5c6fa49919776be},
      {1, 0, 0, 0}},
     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
       0x1ed7d1b9332010b9},
      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
       0x3a2b03f03217257a},
      {1, 0, 0, 0}},
     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
       0x15fee545c78dd9f6},
      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
       0x4ab5b6b2b8753f81},
      {1, 0, 0, 0}}}
};
1618
1619 /*
1620  * select_point selects the |idx|th point from a precomputation table and
1621  * copies it to out.
1622  */
1623 static void select_point(const u64 idx, unsigned int size,
1624                          const smallfelem pre_comp[16][3], smallfelem out[3])
1625 {
1626     unsigned i, j;
1627     u64 *outlimbs = &out[0][0];
1628
1629     memset(out, 0, sizeof(*out) * 3);
1630
1631     for (i = 0; i < size; i++) {
1632         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1633         u64 mask = i ^ idx;
1634         mask |= mask >> 4;
1635         mask |= mask >> 2;
1636         mask |= mask >> 1;
1637         mask &= 1;
1638         mask--;
1639         for (j = 0; j < NLIMBS * 3; j++)
1640             outlimbs[j] |= inlimbs[j] & mask;
1641     }
1642 }
1643
1644 /* get_bit returns the |i|th bit in |in| */
1645 static char get_bit(const felem_bytearray in, int i)
1646 {
1647     if ((i < 0) || (i >= 256))
1648         return 0;
1649     return (in[i >> 3] >> (i & 7)) & 1;
1650 }
1651
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out.
 *
 * |num_points| is the number of entries in scalars[] and pre_comp[]; it may
 * be 0, in which case only the generator multiple is computed and the loop
 * runs over 32 rounds instead of 256. |mixed| is passed through to point_add
 * for the additions of pre_comp[] entries.
 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const smallfelem pre_comp[][17][3],
                      const smallfelem g_pre_comp[2][16][3])
{
    int i, skip;
    unsigned num, gen_mul = (g_scalar != NULL);
    felem nq[3], ftmp;
    smallfelem tmp[3];
    u64 bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (two in each of the last 32 rounds) and additions of
     * other points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 255 : 31); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 31)) {
            /* first, look 32 bits upwards */
            bits = get_bit(g_scalar, i + 224) << 3;
            bits |= get_bit(g_scalar, i + 160) << 2;
            bits |= get_bit(g_scalar, i + 96) << 1;
            bits |= get_bit(g_scalar, i + 32);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[1], tmp);

            if (!skip) {
                /* Arg 1 below is for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* nq is still infinity: just load the selected point */
                smallfelem_expand(nq[0], tmp[0]);
                smallfelem_expand(nq[1], tmp[1]);
                smallfelem_expand(nq[2], tmp[2]);
                skip = 0;
            }

            /* second, look at the current position */
            bits = get_bit(g_scalar, i + 192) << 3;
            bits |= get_bit(g_scalar, i + 128) << 2;
            bits |= get_bit(g_scalar, i + 64) << 1;
            bits |= get_bit(g_scalar, i);
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp[0], tmp);
            /* Arg 1 below is for "mixed" */
            point_add(nq[0], nq[1], nq[2],
                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /*
                 * Gather a 6-bit window: bits i+4 .. i plus the bit below
                 * (get_bit returns 0 for positions outside 0..255).
                 */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
                                               * point */
                /*
                 * The mask is all ones when sign == 0, which restores the
                 * original Y; otherwise the negated Y is kept.
                 */
                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
                felem_contract(tmp[1], ftmp);

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* nq is still infinity: just load the selected point */
                    smallfelem_expand(nq[0], tmp[0]);
                    smallfelem_expand(nq[1], tmp[1]);
                    smallfelem_expand(nq[2], tmp[2]);
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1758
/*
 * Precomputation for the group generator: a table with the same dimensions
 * as gmul, plus a reference count.
 */
struct nistp256_pre_comp_st {
    smallfelem g_pre_comp[2][16][3];
    /*
     * Reference count. NOTE(review): presumably NISTP256_PRE_COMP is a
     * typedef of this struct declared in a header; if so this is the field
     * adjusted by EC_nistp256_pre_comp_dup/_free - confirm.
     */
    int references;
};
1764
/*
 * EC_GFp_nistp256_method returns a pointer to the static EC_METHOD for this
 * implementation. The table is positional: the ec_GFp_nistp256_* entries are
 * the operations specialised in this file, the ec_GFp_simple_* and
 * ec_GFp_nist_* entries are shared generic routines, and a 0 entry leaves
 * the corresponding slot unset.
 */
const EC_METHOD *EC_GFp_nistp256_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp256_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp256_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        0, /* group_order_bits */
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp256_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp256_points_mul,
        ec_GFp_nistp256_precompute_mult,
        ec_GFp_nistp256_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0                       /* field_set_to_one */
    };

    return &ret;
}
1812
1813 /******************************************************************************/
1814 /*
1815  * FUNCTIONS TO MANAGE PRECOMPUTATION
1816  */
1817
1818 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1819 {
1820     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1821
1822     if (ret == NULL) {
1823         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1824         return ret;
1825     }
1826
1827     ret->references = 1;
1828     return ret;
1829 }
1830
1831 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1832 {
1833     if (p != NULL)
1834         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1835     return p;
1836 }
1837
1838 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1839 {
1840     if (pre == NULL
1841             || CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1842         return;
1843     OPENSSL_free(pre);
1844 }
1845
1846 /******************************************************************************/
1847 /*
1848  * OPENSSL EC_METHOD FUNCTIONS
1849  */
1850
1851 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1852 {
1853     int ret;
1854     ret = ec_GFp_simple_group_init(group);
1855     group->a_is_minus3 = 1;
1856     return ret;
1857 }
1858
1859 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1860                                     const BIGNUM *a, const BIGNUM *b,
1861                                     BN_CTX *ctx)
1862 {
1863     int ret = 0;
1864     BN_CTX *new_ctx = NULL;
1865     BIGNUM *curve_p, *curve_a, *curve_b;
1866
1867     if (ctx == NULL)
1868         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1869             return 0;
1870     BN_CTX_start(ctx);
1871     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1872         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1873         ((curve_b = BN_CTX_get(ctx)) == NULL))
1874         goto err;
1875     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1876     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1877     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1878     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1879         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1880               EC_R_WRONG_CURVE_PARAMETERS);
1881         goto err;
1882     }
1883     group->field_mod_func = BN_nist_mod_256;
1884     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1885  err:
1886     BN_CTX_end(ctx);
1887     BN_CTX_free(new_ctx);
1888     return ret;
1889 }
1890
1891 /*
1892  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1893  * (X/Z^2, Y/Z^3)
1894  */
1895 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1896                                                  const EC_POINT *point,
1897                                                  BIGNUM *x, BIGNUM *y,
1898                                                  BN_CTX *ctx)
1899 {
1900     felem z1, z2, x_in, y_in;
1901     smallfelem x_out, y_out;
1902     longfelem tmp;
1903
1904     if (EC_POINT_is_at_infinity(group, point)) {
1905         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1906               EC_R_POINT_AT_INFINITY);
1907         return 0;
1908     }
1909     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1910         (!BN_to_felem(z1, point->Z)))
1911         return 0;
1912     felem_inv(z2, z1);
1913     felem_square(tmp, z2);
1914     felem_reduce(z1, tmp);
1915     felem_mul(tmp, x_in, z1);
1916     felem_reduce(x_in, tmp);
1917     felem_contract(x_out, x_in);
1918     if (x != NULL) {
1919         if (!smallfelem_to_BN(x, x_out)) {
1920             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1921                   ERR_R_BN_LIB);
1922             return 0;
1923         }
1924     }
1925     felem_mul(tmp, z1, z2);
1926     felem_reduce(z1, tmp);
1927     felem_mul(tmp, y_in, z1);
1928     felem_reduce(y_in, tmp);
1929     felem_contract(y_out, y_in);
1930     if (y != NULL) {
1931         if (!smallfelem_to_BN(y, y_out)) {
1932             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1933                   ERR_R_BN_LIB);
1934             return 0;
1935         }
1936     }
1937     return 1;
1938 }
1939
1940 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1941 static void make_points_affine(size_t num, smallfelem points[][3],
1942                                smallfelem tmp_smallfelems[])
1943 {
1944     /*
1945      * Runs in constant time, unless an input is the point at infinity (which
1946      * normally shouldn't happen).
1947      */
1948     ec_GFp_nistp_points_make_affine_internal(num,
1949                                              points,
1950                                              sizeof(smallfelem),
1951                                              tmp_smallfelems,
1952                                              (void (*)(void *))smallfelem_one,
1953                                              (int (*)(const void *))
1954                                              smallfelem_is_zero_int,
1955                                              (void (*)(void *, const void *))
1956                                              smallfelem_assign,
1957                                              (void (*)(void *, const void *))
1958                                              smallfelem_square_contract,
1959                                              (void (*)
1960                                               (void *, const void *,
1961                                                const void *))
1962                                              smallfelem_mul_contract,
1963                                              (void (*)(void *, const void *))
1964                                              smallfelem_inv_contract,
1965                                              /* nothing to contract */
1966                                              (void (*)(void *, const void *))
1967                                              smallfelem_assign);
1968 }
1969
1970 /*
1971  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1972  * values Result is stored in r (r can equal one of the inputs).
1973  */
1974 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1975                                const BIGNUM *scalar, size_t num,
1976                                const EC_POINT *points[],
1977                                const BIGNUM *scalars[], BN_CTX *ctx)
1978 {
1979     int ret = 0;
1980     int j;
1981     int mixed = 0;
1982     BN_CTX *new_ctx = NULL;
1983     BIGNUM *x, *y, *z, *tmp_scalar;
1984     felem_bytearray g_secret;
1985     felem_bytearray *secrets = NULL;
1986     smallfelem (*pre_comp)[17][3] = NULL;
1987     smallfelem *tmp_smallfelems = NULL;
1988     felem_bytearray tmp;
1989     unsigned i, num_bytes;
1990     int have_pre_comp = 0;
1991     size_t num_points = num;
1992     smallfelem x_in, y_in, z_in;
1993     felem x_out, y_out, z_out;
1994     NISTP256_PRE_COMP *pre = NULL;
1995     const smallfelem(*g_pre_comp)[16][3] = NULL;
1996     EC_POINT *generator = NULL;
1997     const EC_POINT *p = NULL;
1998     const BIGNUM *p_scalar = NULL;
1999
2000     if (ctx == NULL)
2001         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2002             return 0;
2003     BN_CTX_start(ctx);
2004     if (((x = BN_CTX_get(ctx)) == NULL) ||
2005         ((y = BN_CTX_get(ctx)) == NULL) ||
2006         ((z = BN_CTX_get(ctx)) == NULL) ||
2007         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2008         goto err;
2009
2010     if (scalar != NULL) {
2011         pre = group->pre_comp.nistp256;
2012         if (pre)
2013             /* we have precomputation, try to use it */
2014             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2015         else
2016             /* try to use the standard precomputation */
2017             g_pre_comp = &gmul[0];
2018         generator = EC_POINT_new(group);
2019         if (generator == NULL)
2020             goto err;
2021         /* get the generator from precomputation */
2022         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2023             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2024             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2025             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2026             goto err;
2027         }
2028         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2029                                                       generator, x, y, z,
2030                                                       ctx))
2031             goto err;
2032         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2033             /* precomputation matches generator */
2034             have_pre_comp = 1;
2035         else
2036             /*
2037              * we don't have valid precomputation: treat the generator as a
2038              * random point
2039              */
2040             num_points++;
2041     }
2042     if (num_points > 0) {
2043         if (num_points >= 3) {
2044             /*
2045              * unless we precompute multiples for just one or two points,
2046              * converting those into affine form is time well spent
2047              */
2048             mixed = 1;
2049         }
2050         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2051         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2052         if (mixed)
2053             tmp_smallfelems =
2054               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2055         if ((secrets == NULL) || (pre_comp == NULL)
2056             || (mixed && (tmp_smallfelems == NULL))) {
2057             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2058             goto err;
2059         }
2060
2061         /*
2062          * we treat NULL scalars as 0, and NULL points as points at infinity,
2063          * i.e., they contribute nothing to the linear combination
2064          */
2065         memset(secrets, 0, sizeof(*secrets) * num_points);
2066         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2067         for (i = 0; i < num_points; ++i) {
2068             if (i == num)
2069                 /*
2070                  * we didn't have a valid precomputation, so we pick the
2071                  * generator
2072                  */
2073             {
2074                 p = EC_GROUP_get0_generator(group);
2075                 p_scalar = scalar;
2076             } else
2077                 /* the i^th point */
2078             {
2079                 p = points[i];
2080                 p_scalar = scalars[i];
2081             }
2082             if ((p_scalar != NULL) && (p != NULL)) {
2083                 /* reduce scalar to 0 <= scalar < 2^256 */
2084                 if ((BN_num_bits(p_scalar) > 256)
2085                     || (BN_is_negative(p_scalar))) {
2086                     /*
2087                      * this is an unusual input, and we don't guarantee
2088                      * constant-timeness
2089                      */
2090                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2091                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2092                         goto err;
2093                     }
2094                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2095                 } else
2096                     num_bytes = BN_bn2bin(p_scalar, tmp);
2097                 flip_endian(secrets[i], tmp, num_bytes);
2098                 /* precompute multiples */
2099                 if ((!BN_to_felem(x_out, p->X)) ||
2100                     (!BN_to_felem(y_out, p->Y)) ||
2101                     (!BN_to_felem(z_out, p->Z)))
2102                     goto err;
2103                 felem_shrink(pre_comp[i][1][0], x_out);
2104                 felem_shrink(pre_comp[i][1][1], y_out);
2105                 felem_shrink(pre_comp[i][1][2], z_out);
2106                 for (j = 2; j <= 16; ++j) {
2107                     if (j & 1) {
2108                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2109                                         pre_comp[i][j][2], pre_comp[i][1][0],
2110                                         pre_comp[i][1][1], pre_comp[i][1][2],
2111                                         pre_comp[i][j - 1][0],
2112                                         pre_comp[i][j - 1][1],
2113                                         pre_comp[i][j - 1][2]);
2114                     } else {
2115                         point_double_small(pre_comp[i][j][0],
2116                                            pre_comp[i][j][1],
2117                                            pre_comp[i][j][2],
2118                                            pre_comp[i][j / 2][0],
2119                                            pre_comp[i][j / 2][1],
2120                                            pre_comp[i][j / 2][2]);
2121                     }
2122                 }
2123             }
2124         }
2125         if (mixed)
2126             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2127     }
2128
2129     /* the scalar for the generator */
2130     if ((scalar != NULL) && (have_pre_comp)) {
2131         memset(g_secret, 0, sizeof(g_secret));
2132         /* reduce scalar to 0 <= scalar < 2^256 */
2133         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2134             /*
2135              * this is an unusual input, and we don't guarantee
2136              * constant-timeness
2137              */
2138             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2139                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2140                 goto err;
2141             }
2142             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2143         } else
2144             num_bytes = BN_bn2bin(scalar, tmp);
2145         flip_endian(g_secret, tmp, num_bytes);
2146         /* do the multiplication with generator precomputation */
2147         batch_mul(x_out, y_out, z_out,
2148                   (const felem_bytearray(*))secrets, num_points,
2149                   g_secret,
2150                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2151     } else
2152         /* do the multiplication without generator precomputation */
2153         batch_mul(x_out, y_out, z_out,
2154                   (const felem_bytearray(*))secrets, num_points,
2155                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2156     /* reduce the output to its unique minimal representation */
2157     felem_contract(x_in, x_out);
2158     felem_contract(y_in, y_out);
2159     felem_contract(z_in, z_out);
2160     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2161         (!smallfelem_to_BN(z, z_in))) {
2162         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2163         goto err;
2164     }
2165     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2166
2167  err:
2168     BN_CTX_end(ctx);
2169     EC_POINT_free(generator);
2170     BN_CTX_free(new_ctx);
2171     OPENSSL_free(secrets);
2172     OPENSSL_free(pre_comp);
2173     OPENSSL_free(tmp_smallfelems);
2174     return ret;
2175 }
2176
2177 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2178 {
2179     int ret = 0;
2180     NISTP256_PRE_COMP *pre = NULL;
2181     int i, j;
2182     BN_CTX *new_ctx = NULL;
2183     BIGNUM *x, *y;
2184     EC_POINT *generator = NULL;
2185     smallfelem tmp_smallfelems[32];
2186     felem x_tmp, y_tmp, z_tmp;
2187
2188     /* throw away old precomputation */
2189     EC_pre_comp_free(group);
2190     if (ctx == NULL)
2191         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2192             return 0;
2193     BN_CTX_start(ctx);
2194     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2195         goto err;
2196     /* get the generator */
2197     if (group->generator == NULL)
2198         goto err;
2199     generator = EC_POINT_new(group);
2200     if (generator == NULL)
2201         goto err;
2202     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2203     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2204     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2205         goto err;
2206     if ((pre = nistp256_pre_comp_new()) == NULL)
2207         goto err;
2208     /*
2209      * if the generator is the standard one, use built-in precomputation
2210      */
2211     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2212         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2213         goto done;
2214     }
2215     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2216         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2217         (!BN_to_felem(z_tmp, group->generator->Z)))
2218         goto err;
2219     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2220     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2221     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2222     /*
2223      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2224      * 2^160*G, 2^224*G for the second one
2225      */
2226     for (i = 1; i <= 8; i <<= 1) {
2227         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2228                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2229                            pre->g_pre_comp[0][i][1],
2230                            pre->g_pre_comp[0][i][2]);
2231         for (j = 0; j < 31; ++j) {
2232             point_double_small(pre->g_pre_comp[1][i][0],
2233                                pre->g_pre_comp[1][i][1],
2234                                pre->g_pre_comp[1][i][2],
2235                                pre->g_pre_comp[1][i][0],
2236                                pre->g_pre_comp[1][i][1],
2237                                pre->g_pre_comp[1][i][2]);
2238         }
2239         if (i == 8)
2240             break;
2241         point_double_small(pre->g_pre_comp[0][2 * i][0],
2242                            pre->g_pre_comp[0][2 * i][1],
2243                            pre->g_pre_comp[0][2 * i][2],
2244                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2245                            pre->g_pre_comp[1][i][2]);
2246         for (j = 0; j < 31; ++j) {
2247             point_double_small(pre->g_pre_comp[0][2 * i][0],
2248                                pre->g_pre_comp[0][2 * i][1],
2249                                pre->g_pre_comp[0][2 * i][2],
2250                                pre->g_pre_comp[0][2 * i][0],
2251                                pre->g_pre_comp[0][2 * i][1],
2252                                pre->g_pre_comp[0][2 * i][2]);
2253         }
2254     }
2255     for (i = 0; i < 2; i++) {
2256         /* g_pre_comp[i][0] is the point at infinity */
2257         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2258         /* the remaining multiples */
2259         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2260         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2261                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2262                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2263                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2264                         pre->g_pre_comp[i][2][2]);
2265         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2266         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2267                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2268                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2269                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2270                         pre->g_pre_comp[i][2][2]);
2271         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2272         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2273                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2274                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2275                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2276                         pre->g_pre_comp[i][4][2]);
2277         /*
2278          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2279          */
2280         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2281                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2282                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2283                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2284                         pre->g_pre_comp[i][2][2]);
2285         for (j = 1; j < 8; ++j) {
2286             /* odd multiples: add G resp. 2^32*G */
2287             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2288                             pre->g_pre_comp[i][2 * j + 1][1],
2289                             pre->g_pre_comp[i][2 * j + 1][2],
2290                             pre->g_pre_comp[i][2 * j][0],
2291                             pre->g_pre_comp[i][2 * j][1],
2292                             pre->g_pre_comp[i][2 * j][2],
2293                             pre->g_pre_comp[i][1][0],
2294                             pre->g_pre_comp[i][1][1],
2295                             pre->g_pre_comp[i][1][2]);
2296         }
2297     }
2298     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2299
2300  done:
2301     SETPRECOMP(group, nistp256, pre);
2302     pre = NULL;
2303     ret = 1;
2304
2305  err:
2306     BN_CTX_end(ctx);
2307     EC_POINT_free(generator);
2308     BN_CTX_free(new_ctx);
2309     EC_nistp256_pre_comp_free(pre);
2310     return ret;
2311 }
2312
2313 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2314 {
2315     return HAVEPRECOMP(group, nistp256);
2316 }
2317 #endif