crypto/ec/ecp_nistp256.c

   1 /* crypto/ec/ecp_nistp256.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 typedef __int128_t int128_t;
  42 # else
  43 #  error "Need GCC 3.1 or later to define type uint128_t"
  44 # endif
  45
  46 typedef uint8_t u8;
  47 typedef uint32_t u32;
  48 typedef uint64_t u64;
  49 typedef int64_t s64;
  50
  51 /*
  52  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
  53  * can serialise an element of this field into 32 bytes. We call this an
  54  * felem_bytearray.
  55  */
  56
  57 typedef u8 felem_bytearray[32];
  58
  59 /*
  60  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  61  * values are big-endian.
  62  */
  63 static const felem_bytearray nistp256_curve_params[5] = {
  64     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  65      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  66      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  68     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  69      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  70      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
  72     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
  73      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  74      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  75      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  76     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  77      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  78      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  79      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  80     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  81      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  82      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  83      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  84 };
  85
  86 /*-
  87  * The representation of field elements.
  88  * ------------------------------------
  89  *
  90  * We represent field elements with either four 128-bit values, eight 128-bit
  91  * values, or four 64-bit values. The field element represented is:
  92  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  93  * or:
  94  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
  95  *
  96  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  97  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  98  * with the least significant bits of the next.
  99  *
 100  * A field element with four limbs is an 'felem'. One with eight limbs is a
 101  * 'longfelem'
 102  *
 103  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 104  * values are used as intermediate values before multiplication.
 105  */
 106
 107 # define NLIMBS 4
 108
 109 typedef uint128_t limb;
 110 typedef limb felem[NLIMBS];
 111 typedef limb longfelem[NLIMBS * 2];
 112 typedef u64 smallfelem[NLIMBS];
 113
 114 /* This is the value of the prime as four 64-bit words, little-endian. */
 115 static const u64 kPrime[4] =
 116     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 117 static const u64 bottom63bits = 0x7ffffffffffffffful;
 118
 119 /*
 120  * bin32_to_felem takes a little-endian byte array and converts it into felem
 121  * form. This assumes that the CPU is little-endian.
 122  */
 123 static void bin32_to_felem(felem out, const u8 in[32])
 124 {
 125     out[0] = *((u64 *)&in[0]);
 126     out[1] = *((u64 *)&in[8]);
 127     out[2] = *((u64 *)&in[16]);
 128     out[3] = *((u64 *)&in[24]);
 129 }
 130
 131 /*
 132  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 133  * endian, 32 byte array. This assumes that the CPU is little-endian.
 134  */
 135 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 136 {
 137     *((u64 *)&out[0]) = in[0];
 138     *((u64 *)&out[8]) = in[1];
 139     *((u64 *)&out[16]) = in[2];
 140     *((u64 *)&out[24]) = in[3];
 141 }
 142
 143 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 144 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 145 {
 146     unsigned i;
 147     for (i = 0; i < len; ++i)
 148         out[i] = in[len - 1 - i];
 149 }
 150
 151 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 152 static int BN_to_felem(felem out, const BIGNUM *bn)
 153 {
 154     felem_bytearray b_in;
 155     felem_bytearray b_out;
 156     unsigned num_bytes;
 157
 158     /* BN_bn2bin eats leading zeroes */
 159     memset(b_out, 0, sizeof(b_out));
 160     num_bytes = BN_num_bytes(bn);
 161     if (num_bytes > sizeof b_out) {
 162         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 163         return 0;
 164     }
 165     if (BN_is_negative(bn)) {
 166         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 167         return 0;
 168     }
 169     num_bytes = BN_bn2bin(bn, b_in);
 170     flip_endian(b_out, b_in, num_bytes);
 171     bin32_to_felem(out, b_out);
 172     return 1;
 173 }
 174
 175 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 176 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 177 {
 178     felem_bytearray b_in, b_out;
 179     smallfelem_to_bin32(b_in, in);
 180     flip_endian(b_out, b_in, sizeof b_out);
 181     return BN_bin2bn(b_out, sizeof b_out, out);
 182 }
 183
 184 /*-
 185  * Field operations
 186  * ----------------
 187  */
 188
 189 static void smallfelem_one(smallfelem out)
 190 {
 191     out[0] = 1;
 192     out[1] = 0;
 193     out[2] = 0;
 194     out[3] = 0;
 195 }
 196
 197 static void smallfelem_assign(smallfelem out, const smallfelem in)
 198 {
 199     out[0] = in[0];
 200     out[1] = in[1];
 201     out[2] = in[2];
 202     out[3] = in[3];
 203 }
 204
 205 static void felem_assign(felem out, const felem in)
 206 {
 207     out[0] = in[0];
 208     out[1] = in[1];
 209     out[2] = in[2];
 210     out[3] = in[3];
 211 }
 212
 213 /* felem_sum sets out = out + in. */
 214 static void felem_sum(felem out, const felem in)
 215 {
 216     out[0] += in[0];
 217     out[1] += in[1];
 218     out[2] += in[2];
 219     out[3] += in[3];
 220 }
 221
 222 /* felem_small_sum sets out = out + in. */
 223 static void felem_small_sum(felem out, const smallfelem in)
 224 {
 225     out[0] += in[0];
 226     out[1] += in[1];
 227     out[2] += in[2];
 228     out[3] += in[3];
 229 }
 230
 231 /* felem_scalar sets out = out * scalar */
 232 static void felem_scalar(felem out, const u64 scalar)
 233 {
 234     out[0] *= scalar;
 235     out[1] *= scalar;
 236     out[2] *= scalar;
 237     out[3] *= scalar;
 238 }
 239
 240 /* longfelem_scalar sets out = out * scalar */
 241 static void longfelem_scalar(longfelem out, const u64 scalar)
 242 {
 243     out[0] *= scalar;
 244     out[1] *= scalar;
 245     out[2] *= scalar;
 246     out[3] *= scalar;
 247     out[4] *= scalar;
 248     out[5] *= scalar;
 249     out[6] *= scalar;
 250     out[7] *= scalar;
 251 }
 252
 253 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 254 # define two105 (((limb)1) << 105)
 255 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 256
 257 /* zero105 is 0 mod p */
 258 static const felem zero105 =
 259     { two105m41m9, two105, two105m41p9, two105m41p9 };
 260
 261 /*-
 262  * smallfelem_neg sets |out| to |-small|
 263  * On exit:
 264  *   out[i] < out[i] + 2^105
 265  */
 266 static void smallfelem_neg(felem out, const smallfelem small)
 267 {
 268     /* In order to prevent underflow, we subtract from 0 mod p. */
 269     out[0] = zero105[0] - small[0];
 270     out[1] = zero105[1] - small[1];
 271     out[2] = zero105[2] - small[2];
 272     out[3] = zero105[3] - small[3];
 273 }
 274
 275 /*-
 276  * felem_diff subtracts |in| from |out|
 277  * On entry:
 278  *   in[i] < 2^104
 279  * On exit:
 280  *   out[i] < out[i] + 2^105
 281  */
 282 static void felem_diff(felem out, const felem in)
 283 {
 284     /*
 285      * In order to prevent underflow, we add 0 mod p before subtracting.
 286      */
 287     out[0] += zero105[0];
 288     out[1] += zero105[1];
 289     out[2] += zero105[2];
 290     out[3] += zero105[3];
 291
 292     out[0] -= in[0];
 293     out[1] -= in[1];
 294     out[2] -= in[2];
 295     out[3] -= in[3];
 296 }
 297
 298 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 299 # define two107 (((limb)1) << 107)
 300 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 301
 302 /* zero107 is 0 mod p */
 303 static const felem zero107 =
 304     { two107m43m11, two107, two107m43p11, two107m43p11 };
 305
 306 /*-
 307  * An alternative felem_diff for larger inputs |in|
 308  * felem_diff_zero107 subtracts |in| from |out|
 309  * On entry:
 310  *   in[i] < 2^106
 311  * On exit:
 312  *   out[i] < out[i] + 2^107
 313  */
 314 static void felem_diff_zero107(felem out, const felem in)
 315 {
 316     /*
 317      * In order to prevent underflow, we add 0 mod p before subtracting.
 318      */
 319     out[0] += zero107[0];
 320     out[1] += zero107[1];
 321     out[2] += zero107[2];
 322     out[3] += zero107[3];
 323
 324     out[0] -= in[0];
 325     out[1] -= in[1];
 326     out[2] -= in[2];
 327     out[3] -= in[3];
 328 }
 329
 330 /*-
 331  * longfelem_diff subtracts |in| from |out|
 332  * On entry:
 333  *   in[i] < 7*2^67
 334  * On exit:
 335  *   out[i] < out[i] + 2^70 + 2^40
 336  */
 337 static void longfelem_diff(longfelem out, const longfelem in)
 338 {
 339     static const limb two70m8p6 =
 340         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 341     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 342     static const limb two70 = (((limb) 1) << 70);
 343     static const limb two70m40m38p6 =
 344         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 345         (((limb) 1) << 6);
 346     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 347
 348     /* add 0 mod p to avoid underflow */
 349     out[0] += two70m8p6;
 350     out[1] += two70p40;
 351     out[2] += two70;
 352     out[3] += two70m40m38p6;
 353     out[4] += two70m6;
 354     out[5] += two70m6;
 355     out[6] += two70m6;
 356     out[7] += two70m6;
 357
 358     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 359     out[0] -= in[0];
 360     out[1] -= in[1];
 361     out[2] -= in[2];
 362     out[3] -= in[3];
 363     out[4] -= in[4];
 364     out[5] -= in[5];
 365     out[6] -= in[6];
 366     out[7] -= in[7];
 367 }
 368
 369 # define two64m0 (((limb)1) << 64) - 1
 370 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 371 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 372 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 373
 374 /* zero110 is 0 mod p */
 375 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 376
 377 /*-
 378  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 379  * minimal as the value may be greater than p.
 380  *
 381  * On entry:
 382  *   in[i] < 2^109
 383  * On exit:
 384  *   out[i] < 2^64
 385  */
 386 static void felem_shrink(smallfelem out, const felem in)
 387 {
 388     felem tmp;
 389     u64 a, b, mask;
 390     s64 high, low;
 391     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 392
 393     /* Carry 2->3 */
 394     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 395     /* tmp[3] < 2^110 */
 396
 397     tmp[2] = zero110[2] + (u64)in[2];
 398     tmp[0] = zero110[0] + in[0];
 399     tmp[1] = zero110[1] + in[1];
 400     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 401
 402     /*
 403      * We perform two partial reductions where we eliminate the high-word of
 404      * tmp[3]. We don't update the other words till the end.
 405      */
 406     a = tmp[3] >> 64;           /* a < 2^46 */
 407     tmp[3] = (u64)tmp[3];
 408     tmp[3] -= a;
 409     tmp[3] += ((limb) a) << 32;
 410     /* tmp[3] < 2^79 */
 411
 412     b = a;
 413     a = tmp[3] >> 64;           /* a < 2^15 */
 414     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 415     tmp[3] = (u64)tmp[3];
 416     tmp[3] -= a;
 417     tmp[3] += ((limb) a) << 32;
 418     /* tmp[3] < 2^64 + 2^47 */
 419
 420     /*
 421      * This adjusts the other two words to complete the two partial
 422      * reductions.
 423      */
 424     tmp[0] += b;
 425     tmp[1] -= (((limb) b) << 32);
 426
 427     /*
 428      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 429      * conditionally subtract kPrime if tmp[3] is large enough.
 430      */
 431     high = tmp[3] >> 64;
 432     /* As tmp[3] < 2^65, high is either 1 or 0 */
 433     high <<= 63;
 434     high >>= 63;
 435     /*-
 436      * high is:
 437      *   all ones   if the high word of tmp[3] is 1
 438      *   all zeros  if the high word of tmp[3] if 0 */
 439     low = tmp[3];
 440     mask = low >> 63;
 441     /*-
 442      * mask is:
 443      *   all ones   if the MSB of low is 1
 444      *   all zeros  if the MSB of low if 0 */
 445     low &= bottom63bits;
 446     low -= kPrime3Test;
 447     /* if low was greater than kPrime3Test then the MSB is zero */
 448     low = ~low;
 449     low >>= 63;
 450     /*-
 451      * low is:
 452      *   all ones   if low was > kPrime3Test
 453      *   all zeros  if low was <= kPrime3Test */
 454     mask = (mask & low) | high;
 455     tmp[0] -= mask & kPrime[0];
 456     tmp[1] -= mask & kPrime[1];
 457     /* kPrime[2] is zero, so omitted */
 458     tmp[3] -= mask & kPrime[3];
 459     /* tmp[3] < 2**64 - 2**32 + 1 */
 460
 461     tmp[1] += ((u64)(tmp[0] >> 64));
 462     tmp[0] = (u64)tmp[0];
 463     tmp[2] += ((u64)(tmp[1] >> 64));
 464     tmp[1] = (u64)tmp[1];
 465     tmp[3] += ((u64)(tmp[2] >> 64));
 466     tmp[2] = (u64)tmp[2];
 467     /* tmp[i] < 2^64 */
 468
 469     out[0] = tmp[0];
 470     out[1] = tmp[1];
 471     out[2] = tmp[2];
 472     out[3] = tmp[3];
 473 }
 474
 475 /* smallfelem_expand converts a smallfelem to an felem */
 476 static void smallfelem_expand(felem out, const smallfelem in)
 477 {
 478     out[0] = in[0];
 479     out[1] = in[1];
 480     out[2] = in[2];
 481     out[3] = in[3];
 482 }
 483
 484 /*-
 485  * smallfelem_square sets |out| = |small|^2
 486  * On entry:
 487  *   small[i] < 2^64
 488  * On exit:
 489  *   out[i] < 7 * 2^64 < 2^67
 490  */
 491 static void smallfelem_square(longfelem out, const smallfelem small)
 492 {
 493     limb a;
 494     u64 high, low;
 495
 496     a = ((uint128_t) small[0]) * small[0];
 497     low = a;
 498     high = a >> 64;
 499     out[0] = low;
 500     out[1] = high;
 501
 502     a = ((uint128_t) small[0]) * small[1];
 503     low = a;
 504     high = a >> 64;
 505     out[1] += low;
 506     out[1] += low;
 507     out[2] = high;
 508
 509     a = ((uint128_t) small[0]) * small[2];
 510     low = a;
 511     high = a >> 64;
 512     out[2] += low;
 513     out[2] *= 2;
 514     out[3] = high;
 515
 516     a = ((uint128_t) small[0]) * small[3];
 517     low = a;
 518     high = a >> 64;
 519     out[3] += low;
 520     out[4] = high;
 521
 522     a = ((uint128_t) small[1]) * small[2];
 523     low = a;
 524     high = a >> 64;
 525     out[3] += low;
 526     out[3] *= 2;
 527     out[4] += high;
 528
 529     a = ((uint128_t) small[1]) * small[1];
 530     low = a;
 531     high = a >> 64;
 532     out[2] += low;
 533     out[3] += high;
 534
 535     a = ((uint128_t) small[1]) * small[3];
 536     low = a;
 537     high = a >> 64;
 538     out[4] += low;
 539     out[4] *= 2;
 540     out[5] = high;
 541
 542     a = ((uint128_t) small[2]) * small[3];
 543     low = a;
 544     high = a >> 64;
 545     out[5] += low;
 546     out[5] *= 2;
 547     out[6] = high;
 548     out[6] += high;
 549
 550     a = ((uint128_t) small[2]) * small[2];
 551     low = a;
 552     high = a >> 64;
 553     out[4] += low;
 554     out[5] += high;
 555
 556     a = ((uint128_t) small[3]) * small[3];
 557     low = a;
 558     high = a >> 64;
 559     out[6] += low;
 560     out[7] = high;
 561 }
 562
 563 /*-
 564  * felem_square sets |out| = |in|^2
 565  * On entry:
 566  *   in[i] < 2^109
 567  * On exit:
 568  *   out[i] < 7 * 2^64 < 2^67
 569  */
 570 static void felem_square(longfelem out, const felem in)
 571 {
 572     u64 small[4];
 573     felem_shrink(small, in);
 574     smallfelem_square(out, small);
 575 }
 576
 577 /*-
 578  * smallfelem_mul sets |out| = |small1| * |small2|
 579  * On entry:
 580  *   small1[i] < 2^64
 581  *   small2[i] < 2^64
 582  * On exit:
 583  *   out[i] < 7 * 2^64 < 2^67
 584  */
 585 static void smallfelem_mul(longfelem out, const smallfelem small1,
 586                            const smallfelem small2)
 587 {
 588     limb a;
 589     u64 high, low;
 590
 591     a = ((uint128_t) small1[0]) * small2[0];
 592     low = a;
 593     high = a >> 64;
 594     out[0] = low;
 595     out[1] = high;
 596
 597     a = ((uint128_t) small1[0]) * small2[1];
 598     low = a;
 599     high = a >> 64;
 600     out[1] += low;
 601     out[2] = high;
 602
 603     a = ((uint128_t) small1[1]) * small2[0];
 604     low = a;
 605     high = a >> 64;
 606     out[1] += low;
 607     out[2] += high;
 608
 609     a = ((uint128_t) small1[0]) * small2[2];
 610     low = a;
 611     high = a >> 64;
 612     out[2] += low;
 613     out[3] = high;
 614
 615     a = ((uint128_t) small1[1]) * small2[1];
 616     low = a;
 617     high = a >> 64;
 618     out[2] += low;
 619     out[3] += high;
 620
 621     a = ((uint128_t) small1[2]) * small2[0];
 622     low = a;
 623     high = a >> 64;
 624     out[2] += low;
 625     out[3] += high;
 626
 627     a = ((uint128_t) small1[0]) * small2[3];
 628     low = a;
 629     high = a >> 64;
 630     out[3] += low;
 631     out[4] = high;
 632
 633     a = ((uint128_t) small1[1]) * small2[2];
 634     low = a;
 635     high = a >> 64;
 636     out[3] += low;
 637     out[4] += high;
 638
 639     a = ((uint128_t) small1[2]) * small2[1];
 640     low = a;
 641     high = a >> 64;
 642     out[3] += low;
 643     out[4] += high;
 644
 645     a = ((uint128_t) small1[3]) * small2[0];
 646     low = a;
 647     high = a >> 64;
 648     out[3] += low;
 649     out[4] += high;
 650
 651     a = ((uint128_t) small1[1]) * small2[3];
 652     low = a;
 653     high = a >> 64;
 654     out[4] += low;
 655     out[5] = high;
 656
 657     a = ((uint128_t) small1[2]) * small2[2];
 658     low = a;
 659     high = a >> 64;
 660     out[4] += low;
 661     out[5] += high;
 662
 663     a = ((uint128_t) small1[3]) * small2[1];
 664     low = a;
 665     high = a >> 64;
 666     out[4] += low;
 667     out[5] += high;
 668
 669     a = ((uint128_t) small1[2]) * small2[3];
 670     low = a;
 671     high = a >> 64;
 672     out[5] += low;
 673     out[6] = high;
 674
 675     a = ((uint128_t) small1[3]) * small2[2];
 676     low = a;
 677     high = a >> 64;
 678     out[5] += low;
 679     out[6] += high;
 680
 681     a = ((uint128_t) small1[3]) * small2[3];
 682     low = a;
 683     high = a >> 64;
 684     out[6] += low;
 685     out[7] = high;
 686 }
 687
 688 /*-
 689  * felem_mul sets |out| = |in1| * |in2|
 690  * On entry:
 691  *   in1[i] < 2^109
 692  *   in2[i] < 2^109
 693  * On exit:
 694  *   out[i] < 7 * 2^64 < 2^67
 695  */
 696 static void felem_mul(longfelem out, const felem in1, const felem in2)
 697 {
 698     smallfelem small1, small2;
 699     felem_shrink(small1, in1);
 700     felem_shrink(small2, in2);
 701     smallfelem_mul(out, small1, small2);
 702 }
 703
 704 /*-
 705  * felem_small_mul sets |out| = |small1| * |in2|
 706  * On entry:
 707  *   small1[i] < 2^64
 708  *   in2[i] < 2^109
 709  * On exit:
 710  *   out[i] < 7 * 2^64 < 2^67
 711  */
 712 static void felem_small_mul(longfelem out, const smallfelem small1,
 713                             const felem in2)
 714 {
 715     smallfelem small2;
 716     felem_shrink(small2, in2);
 717     smallfelem_mul(out, small1, small2);
 718 }
 719
 720 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 721 # define two100 (((limb)1) << 100)
 722 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 723 /* zero100 is 0 mod p */
 724 static const felem zero100 =
 725     { two100m36m4, two100, two100m36p4, two100m36p4 };
 726
 727 /*-
 728  * Internal function for the different flavours of felem_reduce.
 729  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 730  * On entry:
 731  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 732  *   out[1] >= in[7] + 2^32*in[4]
 733  *   out[2] >= in[5] + 2^32*in[5]
 734  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 735  * On exit:
 736  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 737  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 738  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 739  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 740  */
 741 static void felem_reduce_(felem out, const longfelem in)
 742 {
 743     int128_t c;
 744     /* combine common terms from below */
 745     c = in[4] + (in[5] << 32);
 746     out[0] += c;
 747     out[3] -= c;
 748
 749     c = in[5] - in[7];
 750     out[1] += c;
 751     out[2] -= c;
 752
 753     /* the remaining terms */
 754     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 755     out[1] -= (in[4] << 32);
 756     out[3] += (in[4] << 32);
 757
 758     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 759     out[2] -= (in[5] << 32);
 760
 761     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 762     out[0] -= in[6];
 763     out[0] -= (in[6] << 32);
 764     out[1] += (in[6] << 33);
 765     out[2] += (in[6] * 2);
 766     out[3] -= (in[6] << 32);
 767
 768     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 769     out[0] -= in[7];
 770     out[0] -= (in[7] << 32);
 771     out[2] += (in[7] << 33);
 772     out[3] += (in[7] * 3);
 773 }
 774
 775 /*-
 776  * felem_reduce converts a longfelem into an felem.
 777  * To be called directly after felem_square or felem_mul.
 778  * On entry:
 779  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 780  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 781  * On exit:
 782  *   out[i] < 2^101
 783  */
 784 static void felem_reduce(felem out, const longfelem in)
 785 {
 786     out[0] = zero100[0] + in[0];
 787     out[1] = zero100[1] + in[1];
 788     out[2] = zero100[2] + in[2];
 789     out[3] = zero100[3] + in[3];
 790
 791     felem_reduce_(out, in);
 792
 793     /*-
 794      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 795      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 796      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 797      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 798      *
 799      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 800      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 801      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 802      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 803      */
 804 }
 805
 806 /*-
 807  * felem_reduce_zero105 converts a larger longfelem into an felem.
 808  * On entry:
 809  *   in[0] < 2^71
 810  * On exit:
 811  *   out[i] < 2^106
 812  */
 813 static void felem_reduce_zero105(felem out, const longfelem in)
 814 {
 815     out[0] = zero105[0] + in[0];
 816     out[1] = zero105[1] + in[1];
 817     out[2] = zero105[2] + in[2];
 818     out[3] = zero105[3] + in[3];
 819
 820     felem_reduce_(out, in);
 821
 822     /*-
 823      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 824      * out[1] > 2^105 - 2^71 - 2^103 > 0
 825      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 826      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 827      *
 828      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 829      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 831      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 832      */
 833 }
 834
 835 /*
 836  * subtract_u64 sets *result = *result - v and *carry to one if the
 837  * subtraction underflowed.
 838  */
 839 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 840 {
 841     uint128_t r = *result;
 842     r -= v;
 843     *carry = (r >> 64) & 1;
 844     *result = (u64)r;
 845 }
 846
 847 /*
 848  * felem_contract converts |in| to its unique, minimal representation. On
 849  * entry: in[i] < 2^109
 850  */
 851 static void felem_contract(smallfelem out, const felem in)
 852 {
 853     unsigned i;
 854     u64 all_equal_so_far = 0, result = 0, carry;
 855
 856     felem_shrink(out, in);
 857     /* small is minimal except that the value might be > p */
 858
 859     all_equal_so_far--;
 860     /*
 861      * We are doing a constant time test if out >= kPrime. We need to compare
 862      * each u64, from most-significant to least significant. For each one, if
 863      * all words so far have been equal (m is all ones) then a non-equal
 864      * result is the answer. Otherwise we continue.
 865      */
 866     for (i = 3; i < 4; i--) {
 867         u64 equal;
 868         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 869         /*
 870          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 871          * will all be set.
 872          */
 873         result |= all_equal_so_far & ((u64)(a >> 64));
 874
 875         /*
 876          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 877          * decrement will make it all ones.
 878          */
 879         equal = kPrime[i] ^ out[i];
 880         equal--;
 881         equal &= equal << 32;
 882         equal &= equal << 16;
 883         equal &= equal << 8;
 884         equal &= equal << 4;
 885         equal &= equal << 2;
 886         equal &= equal << 1;
 887         equal = ((s64) equal) >> 63;
 888
 889         all_equal_so_far &= equal;
 890     }
 891
 892     /*
 893      * if all_equal_so_far is still all ones then the two values are equal
 894      * and so out >= kPrime is true.
 895      */
 896     result |= all_equal_so_far;
 897
 898     /* if out >= kPrime then we subtract kPrime. */
 899     subtract_u64(&out[0], &carry, result & kPrime[0]);
 900     subtract_u64(&out[1], &carry, carry);
 901     subtract_u64(&out[2], &carry, carry);
 902     subtract_u64(&out[3], &carry, carry);
 903
 904     subtract_u64(&out[1], &carry, result & kPrime[1]);
 905     subtract_u64(&out[2], &carry, carry);
 906     subtract_u64(&out[3], &carry, carry);
 907
 908     subtract_u64(&out[2], &carry, result & kPrime[2]);
 909     subtract_u64(&out[3], &carry, carry);
 910
 911     subtract_u64(&out[3], &carry, result & kPrime[3]);
 912 }
 913
 914 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 915 {
 916     longfelem longtmp;
 917     felem tmp;
 918
 919     smallfelem_square(longtmp, in);
 920     felem_reduce(tmp, longtmp);
 921     felem_contract(out, tmp);
 922 }
 923
 924 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 925                                     const smallfelem in2)
 926 {
 927     longfelem longtmp;
 928     felem tmp;
 929
 930     smallfelem_mul(longtmp, in1, in2);
 931     felem_reduce(tmp, longtmp);
 932     felem_contract(out, tmp);
 933 }
 934
 935 /*-
 936  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 937  * otherwise.
 938  * On entry:
 939  *   small[i] < 2^64
 940  */
 941 static limb smallfelem_is_zero(const smallfelem small)
 942 {
 943     limb result;
 944     u64 is_p;
 945
 946     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 947     is_zero--;
 948     is_zero &= is_zero << 32;
 949     is_zero &= is_zero << 16;
 950     is_zero &= is_zero << 8;
 951     is_zero &= is_zero << 4;
 952     is_zero &= is_zero << 2;
 953     is_zero &= is_zero << 1;
 954     is_zero = ((s64) is_zero) >> 63;
 955
 956     is_p = (small[0] ^ kPrime[0]) |
 957         (small[1] ^ kPrime[1]) |
 958         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 959     is_p--;
 960     is_p &= is_p << 32;
 961     is_p &= is_p << 16;
 962     is_p &= is_p << 8;
 963     is_p &= is_p << 4;
 964     is_p &= is_p << 2;
 965     is_p &= is_p << 1;
 966     is_p = ((s64) is_p) >> 63;
 967
 968     is_zero |= is_p;
 969
 970     result = is_zero;
 971     result |= ((limb) is_zero) << 64;
 972     return result;
 973 }
 974
 975 static int smallfelem_is_zero_int(const smallfelem small)
 976 {
 977     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 978 }
 979
 980 /*-
 981  * felem_inv calculates |out| = |in|^{-1}
 982  *
 983  * Based on Fermat's Little Theorem:
 984  *   a^p = a (mod p)
 985  *   a^{p-1} = 1 (mod p)
 986  *   a^{p-2} = a^{-1} (mod p)
 987  */
 988 static void felem_inv(felem out, const felem in)
 989 {
 990     felem ftmp, ftmp2;
 991     /* each e_I will hold |in|^{2^I - 1} */
 992     felem e2, e4, e8, e16, e32, e64;
 993     longfelem tmp;
 994     unsigned i;
 995
 996     felem_square(tmp, in);
 997     felem_reduce(ftmp, tmp);    /* 2^1 */
 998     felem_mul(tmp, in, ftmp);
 999     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1000     felem_assign(e2, ftmp);
1001     felem_square(tmp, ftmp);
1002     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1003     felem_square(tmp, ftmp);
1004     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1005     felem_mul(tmp, ftmp, e2);
1006     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1007     felem_assign(e4, ftmp);
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1010     felem_square(tmp, ftmp);
1011     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1012     felem_square(tmp, ftmp);
1013     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1014     felem_square(tmp, ftmp);
1015     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1016     felem_mul(tmp, ftmp, e4);
1017     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1018     felem_assign(e8, ftmp);
1019     for (i = 0; i < 8; i++) {
1020         felem_square(tmp, ftmp);
1021         felem_reduce(ftmp, tmp);
1022     }                           /* 2^16 - 2^8 */
1023     felem_mul(tmp, ftmp, e8);
1024     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1025     felem_assign(e16, ftmp);
1026     for (i = 0; i < 16; i++) {
1027         felem_square(tmp, ftmp);
1028         felem_reduce(ftmp, tmp);
1029     }                           /* 2^32 - 2^16 */
1030     felem_mul(tmp, ftmp, e16);
1031     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1032     felem_assign(e32, ftmp);
1033     for (i = 0; i < 32; i++) {
1034         felem_square(tmp, ftmp);
1035         felem_reduce(ftmp, tmp);
1036     }                           /* 2^64 - 2^32 */
1037     felem_assign(e64, ftmp);
1038     felem_mul(tmp, ftmp, in);
1039     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1040     for (i = 0; i < 192; i++) {
1041         felem_square(tmp, ftmp);
1042         felem_reduce(ftmp, tmp);
1043     }                           /* 2^256 - 2^224 + 2^192 */
1044
1045     felem_mul(tmp, e64, e32);
1046     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1047     for (i = 0; i < 16; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^80 - 2^16 */
1051     felem_mul(tmp, ftmp2, e16);
1052     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1053     for (i = 0; i < 8; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^88 - 2^8 */
1057     felem_mul(tmp, ftmp2, e8);
1058     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1059     for (i = 0; i < 4; i++) {
1060         felem_square(tmp, ftmp2);
1061         felem_reduce(ftmp2, tmp);
1062     }                           /* 2^92 - 2^4 */
1063     felem_mul(tmp, ftmp2, e4);
1064     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1069     felem_mul(tmp, ftmp2, e2);
1070     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1071     felem_square(tmp, ftmp2);
1072     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1073     felem_square(tmp, ftmp2);
1074     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1075     felem_mul(tmp, ftmp2, in);
1076     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1077
1078     felem_mul(tmp, ftmp2, ftmp);
1079     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1080 }
1081
1082 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1083 {
1084     felem tmp;
1085
1086     smallfelem_expand(tmp, in);
1087     felem_inv(tmp, tmp);
1088     felem_contract(out, tmp);
1089 }
1090
1091 /*-
1092  * Group operations
1093  * ----------------
1094  *
1095  * Building on top of the field operations we have the operations on the
1096  * elliptic curve group itself. Points on the curve are represented in Jacobian
1097  * coordinates
1098  */
1099
1100 /*-
1101  * point_double calculates 2*(x_in, y_in, z_in)
1102  *
1103  * The method is taken from:
1104  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1105  *
1106  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1107  * while x_out == y_in is not (maybe this works, but it's not tested).
1108  */
1109 static void
1110 point_double(felem x_out, felem y_out, felem z_out,
1111              const felem x_in, const felem y_in, const felem z_in)
1112 {
1113     longfelem tmp, tmp2;
1114     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1115     smallfelem small1, small2;
1116
1117     felem_assign(ftmp, x_in);
1118     /* ftmp[i] < 2^106 */
1119     felem_assign(ftmp2, x_in);
1120     /* ftmp2[i] < 2^106 */
1121
1122     /* delta = z^2 */
1123     felem_square(tmp, z_in);
1124     felem_reduce(delta, tmp);
1125     /* delta[i] < 2^101 */
1126
1127     /* gamma = y^2 */
1128     felem_square(tmp, y_in);
1129     felem_reduce(gamma, tmp);
1130     /* gamma[i] < 2^101 */
1131     felem_shrink(small1, gamma);
1132
1133     /* beta = x*gamma */
1134     felem_small_mul(tmp, small1, x_in);
1135     felem_reduce(beta, tmp);
1136     /* beta[i] < 2^101 */
1137
1138     /* alpha = 3*(x-delta)*(x+delta) */
1139     felem_diff(ftmp, delta);
1140     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1141     felem_sum(ftmp2, delta);
1142     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1143     felem_scalar(ftmp2, 3);
1144     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1145     felem_mul(tmp, ftmp, ftmp2);
1146     felem_reduce(alpha, tmp);
1147     /* alpha[i] < 2^101 */
1148     felem_shrink(small2, alpha);
1149
1150     /* x' = alpha^2 - 8*beta */
1151     smallfelem_square(tmp, small2);
1152     felem_reduce(x_out, tmp);
1153     felem_assign(ftmp, beta);
1154     felem_scalar(ftmp, 8);
1155     /* ftmp[i] < 8 * 2^101 = 2^104 */
1156     felem_diff(x_out, ftmp);
1157     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1158
1159     /* z' = (y + z)^2 - gamma - delta */
1160     felem_sum(delta, gamma);
1161     /* delta[i] < 2^101 + 2^101 = 2^102 */
1162     felem_assign(ftmp, y_in);
1163     felem_sum(ftmp, z_in);
1164     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1165     felem_square(tmp, ftmp);
1166     felem_reduce(z_out, tmp);
1167     felem_diff(z_out, delta);
1168     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1169
1170     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1171     felem_scalar(beta, 4);
1172     /* beta[i] < 4 * 2^101 = 2^103 */
1173     felem_diff_zero107(beta, x_out);
1174     /* beta[i] < 2^107 + 2^103 < 2^108 */
1175     felem_small_mul(tmp, small2, beta);
1176     /* tmp[i] < 7 * 2^64 < 2^67 */
1177     smallfelem_square(tmp2, small1);
1178     /* tmp2[i] < 7 * 2^64 */
1179     longfelem_scalar(tmp2, 8);
1180     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1181     longfelem_diff(tmp, tmp2);
1182     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1183     felem_reduce_zero105(y_out, tmp);
1184     /* y_out[i] < 2^106 */
1185 }
1186
1187 /*
1188  * point_double_small is the same as point_double, except that it operates on
1189  * smallfelems
1190  */
1191 static void
1192 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1193                    const smallfelem x_in, const smallfelem y_in,
1194                    const smallfelem z_in)
1195 {
1196     felem felem_x_out, felem_y_out, felem_z_out;
1197     felem felem_x_in, felem_y_in, felem_z_in;
1198
1199     smallfelem_expand(felem_x_in, x_in);
1200     smallfelem_expand(felem_y_in, y_in);
1201     smallfelem_expand(felem_z_in, z_in);
1202     point_double(felem_x_out, felem_y_out, felem_z_out,
1203                  felem_x_in, felem_y_in, felem_z_in);
1204     felem_shrink(x_out, felem_x_out);
1205     felem_shrink(y_out, felem_y_out);
1206     felem_shrink(z_out, felem_z_out);
1207 }
1208
1209 /* copy_conditional copies in to out iff mask is all ones. */
1210 static void copy_conditional(felem out, const felem in, limb mask)
1211 {
1212     unsigned i;
1213     for (i = 0; i < NLIMBS; ++i) {
1214         const limb tmp = mask & (in[i] ^ out[i]);
1215         out[i] ^= tmp;
1216     }
1217 }
1218
1219 /* copy_small_conditional copies in to out iff mask is all ones. */
1220 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1221 {
1222     unsigned i;
1223     const u64 mask64 = mask;
1224     for (i = 0; i < NLIMBS; ++i) {
1225         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1226     }
1227 }
1228
1229 /*-
1230  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1231  *
1232  * The method is taken from:
1233  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1234  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1235  *
1236  * This function includes a branch for checking whether the two input points
1237  * are equal, (while not equal to the point at infinity). This case never
1238  * happens during single point multiplication, so there is no timing leak for
1239  * ECDH or ECDSA signing.
1240  */
1241 static void point_add(felem x3, felem y3, felem z3,
1242                       const felem x1, const felem y1, const felem z1,
1243                       const int mixed, const smallfelem x2,
1244                       const smallfelem y2, const smallfelem z2)
1245 {
1246     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1247     longfelem tmp, tmp2;
1248     smallfelem small1, small2, small3, small4, small5;
1249     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1250
1251     felem_shrink(small3, z1);
1252
1253     z1_is_zero = smallfelem_is_zero(small3);
1254     z2_is_zero = smallfelem_is_zero(z2);
1255
1256     /* ftmp = z1z1 = z1**2 */
1257     smallfelem_square(tmp, small3);
1258     felem_reduce(ftmp, tmp);
1259     /* ftmp[i] < 2^101 */
1260     felem_shrink(small1, ftmp);
1261
1262     if (!mixed) {
1263         /* ftmp2 = z2z2 = z2**2 */
1264         smallfelem_square(tmp, z2);
1265         felem_reduce(ftmp2, tmp);
1266         /* ftmp2[i] < 2^101 */
1267         felem_shrink(small2, ftmp2);
1268
1269         felem_shrink(small5, x1);
1270
1271         /* u1 = ftmp3 = x1*z2z2 */
1272         smallfelem_mul(tmp, small5, small2);
1273         felem_reduce(ftmp3, tmp);
1274         /* ftmp3[i] < 2^101 */
1275
1276         /* ftmp5 = z1 + z2 */
1277         felem_assign(ftmp5, z1);
1278         felem_small_sum(ftmp5, z2);
1279         /* ftmp5[i] < 2^107 */
1280
1281         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1282         felem_square(tmp, ftmp5);
1283         felem_reduce(ftmp5, tmp);
1284         /* ftmp2 = z2z2 + z1z1 */
1285         felem_sum(ftmp2, ftmp);
1286         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1287         felem_diff(ftmp5, ftmp2);
1288         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1289
1290         /* ftmp2 = z2 * z2z2 */
1291         smallfelem_mul(tmp, small2, z2);
1292         felem_reduce(ftmp2, tmp);
1293
1294         /* s1 = ftmp2 = y1 * z2**3 */
1295         felem_mul(tmp, y1, ftmp2);
1296         felem_reduce(ftmp6, tmp);
1297         /* ftmp6[i] < 2^101 */
1298     } else {
1299         /*
1300          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1301          */
1302
1303         /* u1 = ftmp3 = x1*z2z2 */
1304         felem_assign(ftmp3, x1);
1305         /* ftmp3[i] < 2^106 */
1306
1307         /* ftmp5 = 2z1z2 */
1308         felem_assign(ftmp5, z1);
1309         felem_scalar(ftmp5, 2);
1310         /* ftmp5[i] < 2*2^106 = 2^107 */
1311
1312         /* s1 = ftmp2 = y1 * z2**3 */
1313         felem_assign(ftmp6, y1);
1314         /* ftmp6[i] < 2^106 */
1315     }
1316
1317     /* u2 = x2*z1z1 */
1318     smallfelem_mul(tmp, x2, small1);
1319     felem_reduce(ftmp4, tmp);
1320
1321     /* h = ftmp4 = u2 - u1 */
1322     felem_diff_zero107(ftmp4, ftmp3);
1323     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1324     felem_shrink(small4, ftmp4);
1325
1326     x_equal = smallfelem_is_zero(small4);
1327
1328     /* z_out = ftmp5 * h */
1329     felem_small_mul(tmp, small4, ftmp5);
1330     felem_reduce(z_out, tmp);
1331     /* z_out[i] < 2^101 */
1332
1333     /* ftmp = z1 * z1z1 */
1334     smallfelem_mul(tmp, small1, small3);
1335     felem_reduce(ftmp, tmp);
1336
1337     /* s2 = tmp = y2 * z1**3 */
1338     felem_small_mul(tmp, y2, ftmp);
1339     felem_reduce(ftmp5, tmp);
1340
1341     /* r = ftmp5 = (s2 - s1)*2 */
1342     felem_diff_zero107(ftmp5, ftmp6);
1343     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1344     felem_scalar(ftmp5, 2);
1345     /* ftmp5[i] < 2^109 */
1346     felem_shrink(small1, ftmp5);
1347     y_equal = smallfelem_is_zero(small1);
1348
1349     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1350         point_double(x3, y3, z3, x1, y1, z1);
1351         return;
1352     }
1353
1354     /* I = ftmp = (2h)**2 */
1355     felem_assign(ftmp, ftmp4);
1356     felem_scalar(ftmp, 2);
1357     /* ftmp[i] < 2*2^108 = 2^109 */
1358     felem_square(tmp, ftmp);
1359     felem_reduce(ftmp, tmp);
1360
1361     /* J = ftmp2 = h * I */
1362     felem_mul(tmp, ftmp4, ftmp);
1363     felem_reduce(ftmp2, tmp);
1364
1365     /* V = ftmp4 = U1 * I */
1366     felem_mul(tmp, ftmp3, ftmp);
1367     felem_reduce(ftmp4, tmp);
1368
1369     /* x_out = r**2 - J - 2V */
1370     smallfelem_square(tmp, small1);
1371     felem_reduce(x_out, tmp);
1372     felem_assign(ftmp3, ftmp4);
1373     felem_scalar(ftmp4, 2);
1374     felem_sum(ftmp4, ftmp2);
1375     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1376     felem_diff(x_out, ftmp4);
1377     /* x_out[i] < 2^105 + 2^101 */
1378
1379     /* y_out = r(V-x_out) - 2 * s1 * J */
1380     felem_diff_zero107(ftmp3, x_out);
1381     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1382     felem_small_mul(tmp, small1, ftmp3);
1383     felem_mul(tmp2, ftmp6, ftmp2);
1384     longfelem_scalar(tmp2, 2);
1385     /* tmp2[i] < 2*2^67 = 2^68 */
1386     longfelem_diff(tmp, tmp2);
1387     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1388     felem_reduce_zero105(y_out, tmp);
1389     /* y_out[i] < 2^106 */
1390
1391     copy_small_conditional(x_out, x2, z1_is_zero);
1392     copy_conditional(x_out, x1, z2_is_zero);
1393     copy_small_conditional(y_out, y2, z1_is_zero);
1394     copy_conditional(y_out, y1, z2_is_zero);
1395     copy_small_conditional(z_out, z2, z1_is_zero);
1396     copy_conditional(z_out, z1, z2_is_zero);
1397     felem_assign(x3, x_out);
1398     felem_assign(y3, y_out);
1399     felem_assign(z3, z_out);
1400 }
1401
1402 /*
1403  * point_add_small is the same as point_add, except that it operates on
1404  * smallfelems
1405  */
1406 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1407                             smallfelem x1, smallfelem y1, smallfelem z1,
1408                             smallfelem x2, smallfelem y2, smallfelem z2)
1409 {
1410     felem felem_x3, felem_y3, felem_z3;
1411     felem felem_x1, felem_y1, felem_z1;
1412     smallfelem_expand(felem_x1, x1);
1413     smallfelem_expand(felem_y1, y1);
1414     smallfelem_expand(felem_z1, z1);
1415     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1416               x2, y2, z2);
1417     felem_shrink(x3, felem_x3);
1418     felem_shrink(y3, felem_y3);
1419     felem_shrink(z3, felem_z3);
1420 }
1421
1422 /*-
1423  * Base point pre computation
1424  * --------------------------
1425  *
1426  * Two different sorts of precomputed tables are used in the following code.
1427  * Each contain various points on the curve, where each point is three field
1428  * elements (x, y, z).
1429  *
1430  * For the base point table, z is usually 1 (0 for the point at infinity).
1431  * This table has 2 * 16 elements, starting with the following:
1432  * index | bits    | point
1433  * ------+---------+------------------------------
1434  *     0 | 0 0 0 0 | 0G
1435  *     1 | 0 0 0 1 | 1G
1436  *     2 | 0 0 1 0 | 2^64G
1437  *     3 | 0 0 1 1 | (2^64 + 1)G
1438  *     4 | 0 1 0 0 | 2^128G
1439  *     5 | 0 1 0 1 | (2^128 + 1)G
1440  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1441  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1442  *     8 | 1 0 0 0 | 2^192G
1443  *     9 | 1 0 0 1 | (2^192 + 1)G
1444  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1445  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1446  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1447  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1448  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1449  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1450  * followed by a copy of this with each element multiplied by 2^32.
1451  *
1452  * The reason for this is so that we can clock bits into four different
1453  * locations when doing simple scalar multiplies against the base point,
1454  * and then another four locations using the second 16 elements.
1455  *
1456  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1457
1458 /* gmul is the table of precomputed base points */
1459 static const smallfelem gmul[2][16][3] = {
1460     {{{0, 0, 0, 0},
1461       {0, 0, 0, 0},
1462       {0, 0, 0, 0}},
1463      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1464        0x6b17d1f2e12c4247},
1465       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1466        0x4fe342e2fe1a7f9b},
1467       {1, 0, 0, 0}},
1468      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1469        0x0fa822bc2811aaa5},
1470       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1471        0xbff44ae8f5dba80d},
1472       {1, 0, 0, 0}},
1473      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1474        0x300a4bbc89d6726f},
1475       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1476        0x72aac7e0d09b4644},
1477       {1, 0, 0, 0}},
1478      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1479        0x447d739beedb5e67},
1480       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1481        0x2d4825ab834131ee},
1482       {1, 0, 0, 0}},
1483      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1484        0xef9519328a9c72ff},
1485       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1486        0x611e9fc37dbb2c9b},
1487       {1, 0, 0, 0}},
1488      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1489        0x550663797b51f5d8},
1490       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1491        0x157164848aecb851},
1492       {1, 0, 0, 0}},
1493      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1494        0xeb5d7745b21141ea},
1495       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1496        0xeafd72ebdbecc17b},
1497       {1, 0, 0, 0}},
1498      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1499        0xa6d39677a7849276},
1500       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1501        0x674f84749b0b8816},
1502       {1, 0, 0, 0}},
1503      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1504        0x4e769e7672c9ddad},
1505       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1506        0x42b99082de830663},
1507       {1, 0, 0, 0}},
1508      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1509        0x78878ef61c6ce04d},
1510       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1511        0xb6cb3f5d7b72c321},
1512       {1, 0, 0, 0}},
1513      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1514        0x0c88bc4d716b1287},
1515       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1516        0xdd5ddea3f3901dc6},
1517       {1, 0, 0, 0}},
1518      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1519        0x68f344af6b317466},
1520       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1521        0x31b9c405f8540a20},
1522       {1, 0, 0, 0}},
1523      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1524        0x4052bf4b6f461db9},
1525       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1526        0xfecf4d5190b0fc61},
1527       {1, 0, 0, 0}},
1528      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1529        0x1eddbae2c802e41a},
1530       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1531        0x43104d86560ebcfc},
1532       {1, 0, 0, 0}},
1533      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1534        0xb48e26b484f7a21c},
1535       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1536        0xfac015404d4d3dab},
1537       {1, 0, 0, 0}}},
1538     {{{0, 0, 0, 0},
1539       {0, 0, 0, 0},
1540       {0, 0, 0, 0}},
1541      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1542        0x7fe36b40af22af89},
1543       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1544        0xe697d45825b63624},
1545       {1, 0, 0, 0}},
1546      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1547        0x4a5b506612a677a6},
1548       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1549        0xeb13461ceac089f1},
1550       {1, 0, 0, 0}},
1551      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1552        0x0781b8291c6a220a},
1553       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1554        0x690cde8df0151593},
1555       {1, 0, 0, 0}},
1556      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1557        0x8a535f566ec73617},
1558       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1559        0x0455c08468b08bd7},
1560       {1, 0, 0, 0}},
1561      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1562        0x06bada7ab77f8276},
1563       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1564        0x5b476dfd0e6cb18a},
1565       {1, 0, 0, 0}},
1566      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1567        0x3e29864e8a2ec908},
1568       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1569        0x239b90ea3dc31e7e},
1570       {1, 0, 0, 0}},
1571      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1572        0x820f4dd949f72ff7},
1573       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1574        0x140406ec783a05ec},
1575       {1, 0, 0, 0}},
1576      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1577        0x68f6b8542783dfee},
1578       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1579        0xcbe1feba92e40ce6},
1580       {1, 0, 0, 0}},
1581      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1582        0xd0b2f94d2f420109},
1583       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1584        0x971459828b0719e5},
1585       {1, 0, 0, 0}},
1586      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1587        0x961610004a866aba},
1588       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1589        0x7acb9fadcee75e44},
1590       {1, 0, 0, 0}},
1591      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1592        0x24eb9acca333bf5b},
1593       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1594        0x69f891c5acd079cc},
1595       {1, 0, 0, 0}},
1596      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1597        0xe51f547c5972a107},
1598       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1599        0x1c309a2b25bb1387},
1600       {1, 0, 0, 0}},
1601      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1602        0x20b87b8aa2c4e503},
1603       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1604        0xf5c6fa49919776be},
1605       {1, 0, 0, 0}},
1606      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1607        0x1ed7d1b9332010b9},
1608       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1609        0x3a2b03f03217257a},
1610       {1, 0, 0, 0}},
1611      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1612        0x15fee545c78dd9f6},
1613       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1614        0x4ab5b6b2b8753f81},
1615       {1, 0, 0, 0}}}
1616 };
1617
1618 /*
1619  * select_point selects the |idx|th point from a precomputation table and
1620  * copies it to out.
1621  */
1622 static void select_point(const u64 idx, unsigned int size,
1623                          const smallfelem pre_comp[16][3], smallfelem out[3])
1624 {
1625     unsigned i, j;
1626     u64 *outlimbs = &out[0][0];
1627
1628     memset(out, 0, sizeof(*out) * 3);
1629
1630     for (i = 0; i < size; i++) {
1631         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1632         u64 mask = i ^ idx;
1633         mask |= mask >> 4;
1634         mask |= mask >> 2;
1635         mask |= mask >> 1;
1636         mask &= 1;
1637         mask--;
1638         for (j = 0; j < NLIMBS * 3; j++)
1639             outlimbs[j] |= inlimbs[j] & mask;
1640     }
1641 }
1642
1643 /* get_bit returns the |i|th bit in |in| */
1644 static char get_bit(const felem_bytearray in, int i)
1645 {
1646     if ((i < 0) || (i >= 256))
1647         return 0;
1648     return (in[i >> 3] >> (i & 7)) & 1;
1649 }
1650
1651 /*
1652  * Interleaved point multiplication using precomputed point multiples: The
1653  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1654  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1655  * generator, using certain (large) precomputed multiples in g_pre_comp.
1656  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1657  */
1658 static void batch_mul(felem x_out, felem y_out, felem z_out,
1659                       const felem_bytearray scalars[],
1660                       const unsigned num_points, const u8 *g_scalar,
1661                       const int mixed, const smallfelem pre_comp[][17][3],
1662                       const smallfelem g_pre_comp[2][16][3])
1663 {
1664     int i, skip;
1665     unsigned num, gen_mul = (g_scalar != NULL);
1666     felem nq[3], ftmp;
1667     smallfelem tmp[3];
1668     u64 bits;
1669     u8 sign, digit;
1670
1671     /* set nq to the point at infinity */
1672     memset(nq, 0, sizeof(nq));
1673
1674     /*
1675      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1676      * of the generator (two in each of the last 32 rounds) and additions of
1677      * other points multiples (every 5th round).
1678      */
1679     skip = 1;                   /* save two point operations in the first
1680                                  * round */
1681     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1682         /* double */
1683         if (!skip)
1684             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1685
1686         /* add multiples of the generator */
1687         if (gen_mul && (i <= 31)) {
1688             /* first, look 32 bits upwards */
1689             bits = get_bit(g_scalar, i + 224) << 3;
1690             bits |= get_bit(g_scalar, i + 160) << 2;
1691             bits |= get_bit(g_scalar, i + 96) << 1;
1692             bits |= get_bit(g_scalar, i + 32);
1693             /* select the point to add, in constant time */
1694             select_point(bits, 16, g_pre_comp[1], tmp);
1695
1696             if (!skip) {
1697                 /* Arg 1 below is for "mixed" */
1698                 point_add(nq[0], nq[1], nq[2],
1699                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1700             } else {
1701                 smallfelem_expand(nq[0], tmp[0]);
1702                 smallfelem_expand(nq[1], tmp[1]);
1703                 smallfelem_expand(nq[2], tmp[2]);
1704                 skip = 0;
1705             }
1706
1707             /* second, look at the current position */
1708             bits = get_bit(g_scalar, i + 192) << 3;
1709             bits |= get_bit(g_scalar, i + 128) << 2;
1710             bits |= get_bit(g_scalar, i + 64) << 1;
1711             bits |= get_bit(g_scalar, i);
1712             /* select the point to add, in constant time */
1713             select_point(bits, 16, g_pre_comp[0], tmp);
1714             /* Arg 1 below is for "mixed" */
1715             point_add(nq[0], nq[1], nq[2],
1716                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1717         }
1718
1719         /* do other additions every 5 doublings */
1720         if (num_points && (i % 5 == 0)) {
1721             /* loop over all scalars */
1722             for (num = 0; num < num_points; ++num) {
1723                 bits = get_bit(scalars[num], i + 4) << 5;
1724                 bits |= get_bit(scalars[num], i + 3) << 4;
1725                 bits |= get_bit(scalars[num], i + 2) << 3;
1726                 bits |= get_bit(scalars[num], i + 1) << 2;
1727                 bits |= get_bit(scalars[num], i) << 1;
1728                 bits |= get_bit(scalars[num], i - 1);
1729                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1730
1731                 /*
1732                  * select the point to add or subtract, in constant time
1733                  */
1734                 select_point(digit, 17, pre_comp[num], tmp);
1735                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1736                                                * point */
1737                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1738                 felem_contract(tmp[1], ftmp);
1739
1740                 if (!skip) {
1741                     point_add(nq[0], nq[1], nq[2],
1742                               nq[0], nq[1], nq[2],
1743                               mixed, tmp[0], tmp[1], tmp[2]);
1744                 } else {
1745                     smallfelem_expand(nq[0], tmp[0]);
1746                     smallfelem_expand(nq[1], tmp[1]);
1747                     smallfelem_expand(nq[2], tmp[2]);
1748                     skip = 0;
1749                 }
1750             }
1751         }
1752     }
1753     felem_assign(x_out, nq[0]);
1754     felem_assign(y_out, nq[1]);
1755     felem_assign(z_out, nq[2]);
1756 }
1757
1758 /* Precomputation for the group generator. */
1759 struct nistp256_pre_comp_st {
1760     smallfelem g_pre_comp[2][16][3];
1761     int references;
1762 };
1763
1764 const EC_METHOD *EC_GFp_nistp256_method(void)
1765 {
1766     static const EC_METHOD ret = {
1767         EC_FLAGS_DEFAULT_OCT,
1768         NID_X9_62_prime_field,
1769         ec_GFp_nistp256_group_init,
1770         ec_GFp_simple_group_finish,
1771         ec_GFp_simple_group_clear_finish,
1772         ec_GFp_nist_group_copy,
1773         ec_GFp_nistp256_group_set_curve,
1774         ec_GFp_simple_group_get_curve,
1775         ec_GFp_simple_group_get_degree,
1776         ec_GFp_simple_group_check_discriminant,
1777         ec_GFp_simple_point_init,
1778         ec_GFp_simple_point_finish,
1779         ec_GFp_simple_point_clear_finish,
1780         ec_GFp_simple_point_copy,
1781         ec_GFp_simple_point_set_to_infinity,
1782         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1783         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1784         ec_GFp_simple_point_set_affine_coordinates,
1785         ec_GFp_nistp256_point_get_affine_coordinates,
1786         0 /* point_set_compressed_coordinates */ ,
1787         0 /* point2oct */ ,
1788         0 /* oct2point */ ,
1789         ec_GFp_simple_add,
1790         ec_GFp_simple_dbl,
1791         ec_GFp_simple_invert,
1792         ec_GFp_simple_is_at_infinity,
1793         ec_GFp_simple_is_on_curve,
1794         ec_GFp_simple_cmp,
1795         ec_GFp_simple_make_affine,
1796         ec_GFp_simple_points_make_affine,
1797         ec_GFp_nistp256_points_mul,
1798         ec_GFp_nistp256_precompute_mult,
1799         ec_GFp_nistp256_have_precompute_mult,
1800         ec_GFp_nist_field_mul,
1801         ec_GFp_nist_field_sqr,
1802         0 /* field_div */ ,
1803         0 /* field_encode */ ,
1804         0 /* field_decode */ ,
1805         0                       /* field_set_to_one */
1806     };
1807
1808     return &ret;
1809 }
1810
1811 /******************************************************************************/
1812 /*
1813  * FUNCTIONS TO MANAGE PRECOMPUTATION
1814  */
1815
1816 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1817 {
1818     NISTP256_PRE_COMP *ret = NULL;
1819     ret = OPENSSL_malloc(sizeof(*ret));
1820     if (ret == NULL) {
1821         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1822         return ret;
1823     }
1824     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1825     ret->references = 1;
1826     return ret;
1827 }
1828
1829 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1830 {
1831     if (p != NULL)
1832         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1833     return p;
1834 }
1835
1836 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1837 {
1838     if (pre == NULL
1839             || CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1840         return;
1841     OPENSSL_free(pre);
1842 }
1843
1844 /******************************************************************************/
1845 /*
1846  * OPENSSL EC_METHOD FUNCTIONS
1847  */
1848
1849 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1850 {
1851     int ret;
1852     ret = ec_GFp_simple_group_init(group);
1853     group->a_is_minus3 = 1;
1854     return ret;
1855 }
1856
1857 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1858                                     const BIGNUM *a, const BIGNUM *b,
1859                                     BN_CTX *ctx)
1860 {
1861     int ret = 0;
1862     BN_CTX *new_ctx = NULL;
1863     BIGNUM *curve_p, *curve_a, *curve_b;
1864
1865     if (ctx == NULL)
1866         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1867             return 0;
1868     BN_CTX_start(ctx);
1869     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1870         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1871         ((curve_b = BN_CTX_get(ctx)) == NULL))
1872         goto err;
1873     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1874     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1875     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1876     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1877         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1878               EC_R_WRONG_CURVE_PARAMETERS);
1879         goto err;
1880     }
1881     group->field_mod_func = BN_nist_mod_256;
1882     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1883  err:
1884     BN_CTX_end(ctx);
1885     BN_CTX_free(new_ctx);
1886     return ret;
1887 }
1888
1889 /*
1890  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1891  * (X/Z^2, Y/Z^3)
1892  */
1893 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1894                                                  const EC_POINT *point,
1895                                                  BIGNUM *x, BIGNUM *y,
1896                                                  BN_CTX *ctx)
1897 {
1898     felem z1, z2, x_in, y_in;
1899     smallfelem x_out, y_out;
1900     longfelem tmp;
1901
1902     if (EC_POINT_is_at_infinity(group, point)) {
1903         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1904               EC_R_POINT_AT_INFINITY);
1905         return 0;
1906     }
1907     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1908         (!BN_to_felem(z1, point->Z)))
1909         return 0;
1910     felem_inv(z2, z1);
1911     felem_square(tmp, z2);
1912     felem_reduce(z1, tmp);
1913     felem_mul(tmp, x_in, z1);
1914     felem_reduce(x_in, tmp);
1915     felem_contract(x_out, x_in);
1916     if (x != NULL) {
1917         if (!smallfelem_to_BN(x, x_out)) {
1918             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1919                   ERR_R_BN_LIB);
1920             return 0;
1921         }
1922     }
1923     felem_mul(tmp, z1, z2);
1924     felem_reduce(z1, tmp);
1925     felem_mul(tmp, y_in, z1);
1926     felem_reduce(y_in, tmp);
1927     felem_contract(y_out, y_in);
1928     if (y != NULL) {
1929         if (!smallfelem_to_BN(y, y_out)) {
1930             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1931                   ERR_R_BN_LIB);
1932             return 0;
1933         }
1934     }
1935     return 1;
1936 }
1937
1938 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1939 static void make_points_affine(size_t num, smallfelem points[][3],
1940                                smallfelem tmp_smallfelems[])
1941 {
1942     /*
1943      * Runs in constant time, unless an input is the point at infinity (which
1944      * normally shouldn't happen).
1945      */
1946     ec_GFp_nistp_points_make_affine_internal(num,
1947                                              points,
1948                                              sizeof(smallfelem),
1949                                              tmp_smallfelems,
1950                                              (void (*)(void *))smallfelem_one,
1951                                              (int (*)(const void *))
1952                                              smallfelem_is_zero_int,
1953                                              (void (*)(void *, const void *))
1954                                              smallfelem_assign,
1955                                              (void (*)(void *, const void *))
1956                                              smallfelem_square_contract,
1957                                              (void (*)
1958                                               (void *, const void *,
1959                                                const void *))
1960                                              smallfelem_mul_contract,
1961                                              (void (*)(void *, const void *))
1962                                              smallfelem_inv_contract,
1963                                              /* nothing to contract */
1964                                              (void (*)(void *, const void *))
1965                                              smallfelem_assign);
1966 }
1967
1968 /*
1969  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1970  * values Result is stored in r (r can equal one of the inputs).
1971  */
1972 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1973                                const BIGNUM *scalar, size_t num,
1974                                const EC_POINT *points[],
1975                                const BIGNUM *scalars[], BN_CTX *ctx)
1976 {
1977     int ret = 0;
1978     int j;
1979     int mixed = 0;
1980     BN_CTX *new_ctx = NULL;
1981     BIGNUM *x, *y, *z, *tmp_scalar;
1982     felem_bytearray g_secret;
1983     felem_bytearray *secrets = NULL;
1984     smallfelem (*pre_comp)[17][3] = NULL;
1985     smallfelem *tmp_smallfelems = NULL;
1986     felem_bytearray tmp;
1987     unsigned i, num_bytes;
1988     int have_pre_comp = 0;
1989     size_t num_points = num;
1990     smallfelem x_in, y_in, z_in;
1991     felem x_out, y_out, z_out;
1992     NISTP256_PRE_COMP *pre = NULL;
1993     const smallfelem(*g_pre_comp)[16][3] = NULL;
1994     EC_POINT *generator = NULL;
1995     const EC_POINT *p = NULL;
1996     const BIGNUM *p_scalar = NULL;
1997
1998     if (ctx == NULL)
1999         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2000             return 0;
2001     BN_CTX_start(ctx);
2002     if (((x = BN_CTX_get(ctx)) == NULL) ||
2003         ((y = BN_CTX_get(ctx)) == NULL) ||
2004         ((z = BN_CTX_get(ctx)) == NULL) ||
2005         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2006         goto err;
2007
2008     if (scalar != NULL) {
2009         pre = group->pre_comp.nistp256;
2010         if (pre)
2011             /* we have precomputation, try to use it */
2012             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2013         else
2014             /* try to use the standard precomputation */
2015             g_pre_comp = &gmul[0];
2016         generator = EC_POINT_new(group);
2017         if (generator == NULL)
2018             goto err;
2019         /* get the generator from precomputation */
2020         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2021             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2022             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2023             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2024             goto err;
2025         }
2026         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2027                                                       generator, x, y, z,
2028                                                       ctx))
2029             goto err;
2030         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2031             /* precomputation matches generator */
2032             have_pre_comp = 1;
2033         else
2034             /*
2035              * we don't have valid precomputation: treat the generator as a
2036              * random point
2037              */
2038             num_points++;
2039     }
2040     if (num_points > 0) {
2041         if (num_points >= 3) {
2042             /*
2043              * unless we precompute multiples for just one or two points,
2044              * converting those into affine form is time well spent
2045              */
2046             mixed = 1;
2047         }
2048         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2049         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2050         if (mixed)
2051             tmp_smallfelems =
2052               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2053         if ((secrets == NULL) || (pre_comp == NULL)
2054             || (mixed && (tmp_smallfelems == NULL))) {
2055             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2056             goto err;
2057         }
2058
2059         /*
2060          * we treat NULL scalars as 0, and NULL points as points at infinity,
2061          * i.e., they contribute nothing to the linear combination
2062          */
2063         memset(secrets, 0, sizeof(*secrets) * num_points);
2064         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2065         for (i = 0; i < num_points; ++i) {
2066             if (i == num)
2067                 /*
2068                  * we didn't have a valid precomputation, so we pick the
2069                  * generator
2070                  */
2071             {
2072                 p = EC_GROUP_get0_generator(group);
2073                 p_scalar = scalar;
2074             } else
2075                 /* the i^th point */
2076             {
2077                 p = points[i];
2078                 p_scalar = scalars[i];
2079             }
2080             if ((p_scalar != NULL) && (p != NULL)) {
2081                 /* reduce scalar to 0 <= scalar < 2^256 */
2082                 if ((BN_num_bits(p_scalar) > 256)
2083                     || (BN_is_negative(p_scalar))) {
2084                     /*
2085                      * this is an unusual input, and we don't guarantee
2086                      * constant-timeness
2087                      */
2088                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2089                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2090                         goto err;
2091                     }
2092                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2093                 } else
2094                     num_bytes = BN_bn2bin(p_scalar, tmp);
2095                 flip_endian(secrets[i], tmp, num_bytes);
2096                 /* precompute multiples */
2097                 if ((!BN_to_felem(x_out, p->X)) ||
2098                     (!BN_to_felem(y_out, p->Y)) ||
2099                     (!BN_to_felem(z_out, p->Z)))
2100                     goto err;
2101                 felem_shrink(pre_comp[i][1][0], x_out);
2102                 felem_shrink(pre_comp[i][1][1], y_out);
2103                 felem_shrink(pre_comp[i][1][2], z_out);
2104                 for (j = 2; j <= 16; ++j) {
2105                     if (j & 1) {
2106                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2107                                         pre_comp[i][j][2], pre_comp[i][1][0],
2108                                         pre_comp[i][1][1], pre_comp[i][1][2],
2109                                         pre_comp[i][j - 1][0],
2110                                         pre_comp[i][j - 1][1],
2111                                         pre_comp[i][j - 1][2]);
2112                     } else {
2113                         point_double_small(pre_comp[i][j][0],
2114                                            pre_comp[i][j][1],
2115                                            pre_comp[i][j][2],
2116                                            pre_comp[i][j / 2][0],
2117                                            pre_comp[i][j / 2][1],
2118                                            pre_comp[i][j / 2][2]);
2119                     }
2120                 }
2121             }
2122         }
2123         if (mixed)
2124             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2125     }
2126
2127     /* the scalar for the generator */
2128     if ((scalar != NULL) && (have_pre_comp)) {
2129         memset(g_secret, 0, sizeof(g_secret));
2130         /* reduce scalar to 0 <= scalar < 2^256 */
2131         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2132             /*
2133              * this is an unusual input, and we don't guarantee
2134              * constant-timeness
2135              */
2136             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2137                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2138                 goto err;
2139             }
2140             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2141         } else
2142             num_bytes = BN_bn2bin(scalar, tmp);
2143         flip_endian(g_secret, tmp, num_bytes);
2144         /* do the multiplication with generator precomputation */
2145         batch_mul(x_out, y_out, z_out,
2146                   (const felem_bytearray(*))secrets, num_points,
2147                   g_secret,
2148                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2149     } else
2150         /* do the multiplication without generator precomputation */
2151         batch_mul(x_out, y_out, z_out,
2152                   (const felem_bytearray(*))secrets, num_points,
2153                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2154     /* reduce the output to its unique minimal representation */
2155     felem_contract(x_in, x_out);
2156     felem_contract(y_in, y_out);
2157     felem_contract(z_in, z_out);
2158     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2159         (!smallfelem_to_BN(z, z_in))) {
2160         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2161         goto err;
2162     }
2163     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2164
2165  err:
2166     BN_CTX_end(ctx);
2167     EC_POINT_free(generator);
2168     BN_CTX_free(new_ctx);
2169     OPENSSL_free(secrets);
2170     OPENSSL_free(pre_comp);
2171     OPENSSL_free(tmp_smallfelems);
2172     return ret;
2173 }
2174
2175 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2176 {
2177     int ret = 0;
2178     NISTP256_PRE_COMP *pre = NULL;
2179     int i, j;
2180     BN_CTX *new_ctx = NULL;
2181     BIGNUM *x, *y;
2182     EC_POINT *generator = NULL;
2183     smallfelem tmp_smallfelems[32];
2184     felem x_tmp, y_tmp, z_tmp;
2185
2186     /* throw away old precomputation */
2187     EC_nistp256_pre_comp_free(group->pre_comp.nistp256);
2188     group->pre_comp.nistp256 = NULL;
2189     if (ctx == NULL)
2190         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2191             return 0;
2192     BN_CTX_start(ctx);
2193     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2194         goto err;
2195     /* get the generator */
2196     if (group->generator == NULL)
2197         goto err;
2198     generator = EC_POINT_new(group);
2199     if (generator == NULL)
2200         goto err;
2201     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2202     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2203     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2204         goto err;
2205     if ((pre = nistp256_pre_comp_new()) == NULL)
2206         goto err;
2207     /*
2208      * if the generator is the standard one, use built-in precomputation
2209      */
2210     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2211         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2212         ret = 1;
2213         goto err;
2214     }
2215     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2216         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2217         (!BN_to_felem(z_tmp, group->generator->Z)))
2218         goto err;
2219     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2220     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2221     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2222     /*
2223      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2224      * 2^160*G, 2^224*G for the second one
2225      */
2226     for (i = 1; i <= 8; i <<= 1) {
2227         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2228                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2229                            pre->g_pre_comp[0][i][1],
2230                            pre->g_pre_comp[0][i][2]);
2231         for (j = 0; j < 31; ++j) {
2232             point_double_small(pre->g_pre_comp[1][i][0],
2233                                pre->g_pre_comp[1][i][1],
2234                                pre->g_pre_comp[1][i][2],
2235                                pre->g_pre_comp[1][i][0],
2236                                pre->g_pre_comp[1][i][1],
2237                                pre->g_pre_comp[1][i][2]);
2238         }
2239         if (i == 8)
2240             break;
2241         point_double_small(pre->g_pre_comp[0][2 * i][0],
2242                            pre->g_pre_comp[0][2 * i][1],
2243                            pre->g_pre_comp[0][2 * i][2],
2244                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2245                            pre->g_pre_comp[1][i][2]);
2246         for (j = 0; j < 31; ++j) {
2247             point_double_small(pre->g_pre_comp[0][2 * i][0],
2248                                pre->g_pre_comp[0][2 * i][1],
2249                                pre->g_pre_comp[0][2 * i][2],
2250                                pre->g_pre_comp[0][2 * i][0],
2251                                pre->g_pre_comp[0][2 * i][1],
2252                                pre->g_pre_comp[0][2 * i][2]);
2253         }
2254     }
2255     for (i = 0; i < 2; i++) {
2256         /* g_pre_comp[i][0] is the point at infinity */
2257         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2258         /* the remaining multiples */
2259         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2260         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2261                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2262                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2263                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2264                         pre->g_pre_comp[i][2][2]);
2265         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2266         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2267                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2268                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2269                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2270                         pre->g_pre_comp[i][2][2]);
2271         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2272         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2273                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2274                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2275                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2276                         pre->g_pre_comp[i][4][2]);
2277         /*
2278          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2279          */
2280         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2281                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2282                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2283                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2284                         pre->g_pre_comp[i][2][2]);
2285         for (j = 1; j < 8; ++j) {
2286             /* odd multiples: add G resp. 2^32*G */
2287             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2288                             pre->g_pre_comp[i][2 * j + 1][1],
2289                             pre->g_pre_comp[i][2 * j + 1][2],
2290                             pre->g_pre_comp[i][2 * j][0],
2291                             pre->g_pre_comp[i][2 * j][1],
2292                             pre->g_pre_comp[i][2 * j][2],
2293                             pre->g_pre_comp[i][1][0],
2294                             pre->g_pre_comp[i][1][1],
2295                             pre->g_pre_comp[i][1][2]);
2296         }
2297     }
2298     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2299
2300     SETPRECOMP(group, nistp256, pre);
2301     pre = NULL;
2302     ret = 1;
2303
2304  err:
2305     BN_CTX_end(ctx);
2306     EC_POINT_free(generator);
2307     BN_CTX_free(new_ctx);
2308     EC_nistp256_pre_comp_free(pre);
2309     return ret;
2310 }
2311
2312 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2313 {
2314     return HAVEPRECOMP(group, nistp256);
2315 }
2316 #else
2317 static void *dummy = &dummy;
2318 #endif