crypto/ec/ecp_nistp256.c

   1 /*
   2  * Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/opensslconf.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <stdint.h>
  40 # include <string.h>
  41 # include <openssl/err.h>
  42 # include "ec_lcl.h"
  43
# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
typedef __int128_t int128_t;    /* signed counterpart of uint128_t */
# else
#  error "Need GCC 3.1 or later to define type uint128_t"
# endif

/* Short aliases for the fixed-width unsigned types from <stdint.h>. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
  56
/*
 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
 * can serialise an element of this field into 32 bytes. We call this a
 * felem_bytearray.
 */

typedef u8 felem_bytearray[32];
  64
/*
 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
 * values are big-endian.
 *
 * The five rows are, in order: the prime p, the coefficients a and b, and
 * the x and y values labelled below.
 */
static const felem_bytearray nistp256_curve_params[5] = {
    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
};
  91
/*-
 * The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with either four 128-bit values, eight 128-bit
 * values, or four 64-bit values. The field element represented is:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
 * or:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
 *
 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 * apart, but are 128-bits wide, the most significant bits of each limb overlap
 * with the least significant bits of the next.
 *
 * A field element with four limbs is an 'felem'. One with eight limbs is a
 * 'longfelem'.
 *
 * A field element with four, 64-bit values is called a 'smallfelem'. Small
 * values are used as intermediate values before multiplication.
 */

# define NLIMBS 4

typedef uint128_t limb;                 /* one 128-bit limb */
typedef limb felem[NLIMBS];             /* four limbs */
typedef limb longfelem[NLIMBS * 2];     /* eight limbs */
typedef u64 smallfelem[NLIMBS];         /* four 64-bit words */
 119
/* This is the value of the prime as four 64-bit words, little-endian. */
static const u64 kPrime[4] =
    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
/* Mask that keeps the low 63 bits of a u64 and clears its top bit. */
static const u64 bottom63bits = 0x7ffffffffffffffful;
 124
 125 /*
 126  * bin32_to_felem takes a little-endian byte array and converts it into felem
 127  * form. This assumes that the CPU is little-endian.
 128  */
 129 static void bin32_to_felem(felem out, const u8 in[32])
 130 {
 131     out[0] = *((u64 *)&in[0]);
 132     out[1] = *((u64 *)&in[8]);
 133     out[2] = *((u64 *)&in[16]);
 134     out[3] = *((u64 *)&in[24]);
 135 }
 136
 137 /*
 138  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 139  * endian, 32 byte array. This assumes that the CPU is little-endian.
 140  */
 141 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 142 {
 143     *((u64 *)&out[0]) = in[0];
 144     *((u64 *)&out[8]) = in[1];
 145     *((u64 *)&out[16]) = in[2];
 146     *((u64 *)&out[24]) = in[3];
 147 }
 148
 149 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 150 static int BN_to_felem(felem out, const BIGNUM *bn)
 151 {
 152     felem_bytearray b_out;
 153     int num_bytes;
 154
 155     if (BN_is_negative(bn)) {
 156         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 157         return 0;
 158     }
 159     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 160     if (num_bytes < 0) {
 161         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 162         return 0;
 163     }
 164     bin32_to_felem(out, b_out);
 165     return 1;
 166 }
 167
 168 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 169 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 170 {
 171     felem_bytearray b_out;
 172     smallfelem_to_bin32(b_out, in);
 173     return BN_lebin2bn(b_out, sizeof(b_out), out);
 174 }
 175
 176 /*-
 177  * Field operations
 178  * ----------------
 179  */
 180
 181 static void smallfelem_one(smallfelem out)
 182 {
 183     out[0] = 1;
 184     out[1] = 0;
 185     out[2] = 0;
 186     out[3] = 0;
 187 }
 188
 189 static void smallfelem_assign(smallfelem out, const smallfelem in)
 190 {
 191     out[0] = in[0];
 192     out[1] = in[1];
 193     out[2] = in[2];
 194     out[3] = in[3];
 195 }
 196
 197 static void felem_assign(felem out, const felem in)
 198 {
 199     out[0] = in[0];
 200     out[1] = in[1];
 201     out[2] = in[2];
 202     out[3] = in[3];
 203 }
 204
 205 /* felem_sum sets out = out + in. */
 206 static void felem_sum(felem out, const felem in)
 207 {
 208     out[0] += in[0];
 209     out[1] += in[1];
 210     out[2] += in[2];
 211     out[3] += in[3];
 212 }
 213
 214 /* felem_small_sum sets out = out + in. */
 215 static void felem_small_sum(felem out, const smallfelem in)
 216 {
 217     out[0] += in[0];
 218     out[1] += in[1];
 219     out[2] += in[2];
 220     out[3] += in[3];
 221 }
 222
 223 /* felem_scalar sets out = out * scalar */
 224 static void felem_scalar(felem out, const u64 scalar)
 225 {
 226     out[0] *= scalar;
 227     out[1] *= scalar;
 228     out[2] *= scalar;
 229     out[3] *= scalar;
 230 }
 231
 232 /* longfelem_scalar sets out = out * scalar */
 233 static void longfelem_scalar(longfelem out, const u64 scalar)
 234 {
 235     out[0] *= scalar;
 236     out[1] *= scalar;
 237     out[2] *= scalar;
 238     out[3] *= scalar;
 239     out[4] *= scalar;
 240     out[5] *= scalar;
 241     out[6] *= scalar;
 242     out[7] *= scalar;
 243 }
 244
 245 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 246 # define two105 (((limb)1) << 105)
 247 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 248
 249 /* zero105 is 0 mod p */
 250 static const felem zero105 =
 251     { two105m41m9, two105, two105m41p9, two105m41p9 };
 252
 253 /*-
 254  * smallfelem_neg sets |out| to |-small|
 255  * On exit:
 256  *   out[i] < out[i] + 2^105
 257  */
 258 static void smallfelem_neg(felem out, const smallfelem small)
 259 {
 260     /* In order to prevent underflow, we subtract from 0 mod p. */
 261     out[0] = zero105[0] - small[0];
 262     out[1] = zero105[1] - small[1];
 263     out[2] = zero105[2] - small[2];
 264     out[3] = zero105[3] - small[3];
 265 }
 266
 267 /*-
 268  * felem_diff subtracts |in| from |out|
 269  * On entry:
 270  *   in[i] < 2^104
 271  * On exit:
 272  *   out[i] < out[i] + 2^105
 273  */
 274 static void felem_diff(felem out, const felem in)
 275 {
 276     /*
 277      * In order to prevent underflow, we add 0 mod p before subtracting.
 278      */
 279     out[0] += zero105[0];
 280     out[1] += zero105[1];
 281     out[2] += zero105[2];
 282     out[3] += zero105[3];
 283
 284     out[0] -= in[0];
 285     out[1] -= in[1];
 286     out[2] -= in[2];
 287     out[3] -= in[3];
 288 }
 289
 290 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 291 # define two107 (((limb)1) << 107)
 292 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 293
 294 /* zero107 is 0 mod p */
 295 static const felem zero107 =
 296     { two107m43m11, two107, two107m43p11, two107m43p11 };
 297
 298 /*-
 299  * An alternative felem_diff for larger inputs |in|
 300  * felem_diff_zero107 subtracts |in| from |out|
 301  * On entry:
 302  *   in[i] < 2^106
 303  * On exit:
 304  *   out[i] < out[i] + 2^107
 305  */
 306 static void felem_diff_zero107(felem out, const felem in)
 307 {
 308     /*
 309      * In order to prevent underflow, we add 0 mod p before subtracting.
 310      */
 311     out[0] += zero107[0];
 312     out[1] += zero107[1];
 313     out[2] += zero107[2];
 314     out[3] += zero107[3];
 315
 316     out[0] -= in[0];
 317     out[1] -= in[1];
 318     out[2] -= in[2];
 319     out[3] -= in[3];
 320 }
 321
 322 /*-
 323  * longfelem_diff subtracts |in| from |out|
 324  * On entry:
 325  *   in[i] < 7*2^67
 326  * On exit:
 327  *   out[i] < out[i] + 2^70 + 2^40
 328  */
 329 static void longfelem_diff(longfelem out, const longfelem in)
 330 {
 331     static const limb two70m8p6 =
 332         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 333     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 334     static const limb two70 = (((limb) 1) << 70);
 335     static const limb two70m40m38p6 =
 336         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 337         (((limb) 1) << 6);
 338     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 339
 340     /* add 0 mod p to avoid underflow */
 341     out[0] += two70m8p6;
 342     out[1] += two70p40;
 343     out[2] += two70;
 344     out[3] += two70m40m38p6;
 345     out[4] += two70m6;
 346     out[5] += two70m6;
 347     out[6] += two70m6;
 348     out[7] += two70m6;
 349
 350     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 351     out[0] -= in[0];
 352     out[1] -= in[1];
 353     out[2] -= in[2];
 354     out[3] -= in[3];
 355     out[4] -= in[4];
 356     out[5] -= in[5];
 357     out[6] -= in[6];
 358     out[7] -= in[7];
 359 }
 360
 361 # define two64m0 (((limb)1) << 64) - 1
 362 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 363 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 364 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 365
 366 /* zero110 is 0 mod p */
 367 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 368
/*-
 * felem_shrink converts an felem into a smallfelem. The result isn't quite
 * minimal as the value may be greater than p.
 *
 * The body contains no branches or loops; every conditional step is done
 * with all-ones/all-zeros masks.
 *
 * On entry:
 *   in[i] < 2^109
 * On exit:
 *   out[i] < 2^64
 */
static void felem_shrink(smallfelem out, const felem in)
{
    felem tmp;
    u64 a, b, mask;
    u64 high, low;
    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */

    /*
     * zero110 is 0 mod p, so adding it does not change the value mod p; it
     * gives the limbs headroom for the subtractions below.
     */

    /* Carry 2->3 */
    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
    /* tmp[3] < 2^110 */

    tmp[2] = zero110[2] + (u64)in[2];
    tmp[0] = zero110[0] + in[0];
    tmp[1] = zero110[1] + in[1];
    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */

    /*
     * We perform two partial reductions where we eliminate the high-word of
     * tmp[3]. We don't update the other words till the end.
     *
     * The high word |a| of tmp[3] has weight 2^256, and
     * 2^256 = 2^224 - 2^192 - 2^96 + 1 (mod p). So a*2^256 is replaced by
     * +a*2^32 and -a in tmp[3] (weight 2^192), and later by -a*2^32 in
     * tmp[1] and +a in tmp[0].
     */
    a = tmp[3] >> 64;           /* a < 2^46 */
    tmp[3] = (u64)tmp[3];
    tmp[3] -= a;
    tmp[3] += ((limb) a) << 32;
    /* tmp[3] < 2^79 */

    b = a;
    a = tmp[3] >> 64;           /* a < 2^15 */
    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
    tmp[3] = (u64)tmp[3];
    tmp[3] -= a;
    tmp[3] += ((limb) a) << 32;
    /* tmp[3] < 2^64 + 2^47 */

    /*
     * This adjusts the other two words to complete the two partial
     * reductions. |b| is the sum of both eliminated high words.
     */
    tmp[0] += b;
    tmp[1] -= (((limb) b) << 32);

    /*
     * In order to make space in tmp[3] for the carry from 2 -> 3, we
     * conditionally subtract kPrime if tmp[3] is large enough.
     */
    high = (u64)(tmp[3] >> 64);
    /* As tmp[3] < 2^65, high is either 1 or 0 */
    high = 0 - high;
    /*-
     * high is:
     *   all ones   if the high word of tmp[3] is 1
     *   all zeros  if the high word of tmp[3] is 0
     */
    low = (u64)tmp[3];
    mask = 0 - (low >> 63);
    /*-
     * mask is:
     *   all ones   if the MSB of low is 1
     *   all zeros  if the MSB of low is 0
     */
    low &= bottom63bits;
    low -= kPrime3Test;
    /* if low was at least kPrime3Test then the MSB is zero */
    low = ~low;
    low = 0 - (low >> 63);
    /*-
     * low is:
     *   all ones   if low was >= kPrime3Test
     *   all zeros  if low was < kPrime3Test
     */
    /*
     * Subtract kPrime when the low word of tmp[3] is >= kPrime[3] (its MSB
     * is set and its low 63 bits are >= kPrime3Test), or when the high word
     * of tmp[3] is set.
     */
    mask = (mask & low) | high;
    tmp[0] -= mask & kPrime[0];
    tmp[1] -= mask & kPrime[1];
    /* kPrime[2] is zero, so omitted */
    tmp[3] -= mask & kPrime[3];
    /* tmp[3] < 2**64 - 2**32 + 1 */

    /* Propagate carries 0 -> 1 -> 2 -> 3, leaving 64 bits in each limb. */
    tmp[1] += ((u64)(tmp[0] >> 64));
    tmp[0] = (u64)tmp[0];
    tmp[2] += ((u64)(tmp[1] >> 64));
    tmp[1] = (u64)tmp[1];
    tmp[3] += ((u64)(tmp[2] >> 64));
    tmp[2] = (u64)tmp[2];
    /* tmp[i] < 2^64 */

    out[0] = tmp[0];
    out[1] = tmp[1];
    out[2] = tmp[2];
    out[3] = tmp[3];
}
 468
 469 /* smallfelem_expand converts a smallfelem to an felem */
 470 static void smallfelem_expand(felem out, const smallfelem in)
 471 {
 472     out[0] = in[0];
 473     out[1] = in[1];
 474     out[2] = in[2];
 475     out[3] = in[3];
 476 }
 477
 478 /*-
 479  * smallfelem_square sets |out| = |small|^2
 480  * On entry:
 481  *   small[i] < 2^64
 482  * On exit:
 483  *   out[i] < 7 * 2^64 < 2^67
 484  */
 485 static void smallfelem_square(longfelem out, const smallfelem small)
 486 {
 487     limb a;
 488     u64 high, low;
 489
 490     a = ((uint128_t) small[0]) * small[0];
 491     low = a;
 492     high = a >> 64;
 493     out[0] = low;
 494     out[1] = high;
 495
 496     a = ((uint128_t) small[0]) * small[1];
 497     low = a;
 498     high = a >> 64;
 499     out[1] += low;
 500     out[1] += low;
 501     out[2] = high;
 502
 503     a = ((uint128_t) small[0]) * small[2];
 504     low = a;
 505     high = a >> 64;
 506     out[2] += low;
 507     out[2] *= 2;
 508     out[3] = high;
 509
 510     a = ((uint128_t) small[0]) * small[3];
 511     low = a;
 512     high = a >> 64;
 513     out[3] += low;
 514     out[4] = high;
 515
 516     a = ((uint128_t) small[1]) * small[2];
 517     low = a;
 518     high = a >> 64;
 519     out[3] += low;
 520     out[3] *= 2;
 521     out[4] += high;
 522
 523     a = ((uint128_t) small[1]) * small[1];
 524     low = a;
 525     high = a >> 64;
 526     out[2] += low;
 527     out[3] += high;
 528
 529     a = ((uint128_t) small[1]) * small[3];
 530     low = a;
 531     high = a >> 64;
 532     out[4] += low;
 533     out[4] *= 2;
 534     out[5] = high;
 535
 536     a = ((uint128_t) small[2]) * small[3];
 537     low = a;
 538     high = a >> 64;
 539     out[5] += low;
 540     out[5] *= 2;
 541     out[6] = high;
 542     out[6] += high;
 543
 544     a = ((uint128_t) small[2]) * small[2];
 545     low = a;
 546     high = a >> 64;
 547     out[4] += low;
 548     out[5] += high;
 549
 550     a = ((uint128_t) small[3]) * small[3];
 551     low = a;
 552     high = a >> 64;
 553     out[6] += low;
 554     out[7] = high;
 555 }
 556
 557 /*-
 558  * felem_square sets |out| = |in|^2
 559  * On entry:
 560  *   in[i] < 2^109
 561  * On exit:
 562  *   out[i] < 7 * 2^64 < 2^67
 563  */
 564 static void felem_square(longfelem out, const felem in)
 565 {
 566     u64 small[4];
 567     felem_shrink(small, in);
 568     smallfelem_square(out, small);
 569 }
 570
 571 /*-
 572  * smallfelem_mul sets |out| = |small1| * |small2|
 573  * On entry:
 574  *   small1[i] < 2^64
 575  *   small2[i] < 2^64
 576  * On exit:
 577  *   out[i] < 7 * 2^64 < 2^67
 578  */
 579 static void smallfelem_mul(longfelem out, const smallfelem small1,
 580                            const smallfelem small2)
 581 {
 582     limb a;
 583     u64 high, low;
 584
 585     a = ((uint128_t) small1[0]) * small2[0];
 586     low = a;
 587     high = a >> 64;
 588     out[0] = low;
 589     out[1] = high;
 590
 591     a = ((uint128_t) small1[0]) * small2[1];
 592     low = a;
 593     high = a >> 64;
 594     out[1] += low;
 595     out[2] = high;
 596
 597     a = ((uint128_t) small1[1]) * small2[0];
 598     low = a;
 599     high = a >> 64;
 600     out[1] += low;
 601     out[2] += high;
 602
 603     a = ((uint128_t) small1[0]) * small2[2];
 604     low = a;
 605     high = a >> 64;
 606     out[2] += low;
 607     out[3] = high;
 608
 609     a = ((uint128_t) small1[1]) * small2[1];
 610     low = a;
 611     high = a >> 64;
 612     out[2] += low;
 613     out[3] += high;
 614
 615     a = ((uint128_t) small1[2]) * small2[0];
 616     low = a;
 617     high = a >> 64;
 618     out[2] += low;
 619     out[3] += high;
 620
 621     a = ((uint128_t) small1[0]) * small2[3];
 622     low = a;
 623     high = a >> 64;
 624     out[3] += low;
 625     out[4] = high;
 626
 627     a = ((uint128_t) small1[1]) * small2[2];
 628     low = a;
 629     high = a >> 64;
 630     out[3] += low;
 631     out[4] += high;
 632
 633     a = ((uint128_t) small1[2]) * small2[1];
 634     low = a;
 635     high = a >> 64;
 636     out[3] += low;
 637     out[4] += high;
 638
 639     a = ((uint128_t) small1[3]) * small2[0];
 640     low = a;
 641     high = a >> 64;
 642     out[3] += low;
 643     out[4] += high;
 644
 645     a = ((uint128_t) small1[1]) * small2[3];
 646     low = a;
 647     high = a >> 64;
 648     out[4] += low;
 649     out[5] = high;
 650
 651     a = ((uint128_t) small1[2]) * small2[2];
 652     low = a;
 653     high = a >> 64;
 654     out[4] += low;
 655     out[5] += high;
 656
 657     a = ((uint128_t) small1[3]) * small2[1];
 658     low = a;
 659     high = a >> 64;
 660     out[4] += low;
 661     out[5] += high;
 662
 663     a = ((uint128_t) small1[2]) * small2[3];
 664     low = a;
 665     high = a >> 64;
 666     out[5] += low;
 667     out[6] = high;
 668
 669     a = ((uint128_t) small1[3]) * small2[2];
 670     low = a;
 671     high = a >> 64;
 672     out[5] += low;
 673     out[6] += high;
 674
 675     a = ((uint128_t) small1[3]) * small2[3];
 676     low = a;
 677     high = a >> 64;
 678     out[6] += low;
 679     out[7] = high;
 680 }
 681
 682 /*-
 683  * felem_mul sets |out| = |in1| * |in2|
 684  * On entry:
 685  *   in1[i] < 2^109
 686  *   in2[i] < 2^109
 687  * On exit:
 688  *   out[i] < 7 * 2^64 < 2^67
 689  */
 690 static void felem_mul(longfelem out, const felem in1, const felem in2)
 691 {
 692     smallfelem small1, small2;
 693     felem_shrink(small1, in1);
 694     felem_shrink(small2, in2);
 695     smallfelem_mul(out, small1, small2);
 696 }
 697
 698 /*-
 699  * felem_small_mul sets |out| = |small1| * |in2|
 700  * On entry:
 701  *   small1[i] < 2^64
 702  *   in2[i] < 2^109
 703  * On exit:
 704  *   out[i] < 7 * 2^64 < 2^67
 705  */
 706 static void felem_small_mul(longfelem out, const smallfelem small1,
 707                             const felem in2)
 708 {
 709     smallfelem small2;
 710     felem_shrink(small2, in2);
 711     smallfelem_mul(out, small1, small2);
 712 }
 713
 714 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 715 # define two100 (((limb)1) << 100)
 716 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 717 /* zero100 is 0 mod p */
 718 static const felem zero100 =
 719     { two100m36m4, two100, two100m36p4, two100m36p4 };
 720
 721 /*-
 722  * Internal function for the different flavours of felem_reduce.
 723  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 724  * On entry:
 725  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 726  *   out[1] >= in[7] + 2^32*in[4]
 727  *   out[2] >= in[5] + 2^32*in[5]
 728  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 729  * On exit:
 730  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 731  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 732  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 733  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 734  */
 735 static void felem_reduce_(felem out, const longfelem in)
 736 {
 737     int128_t c;
 738     /* combine common terms from below */
 739     c = in[4] + (in[5] << 32);
 740     out[0] += c;
 741     out[3] -= c;
 742
 743     c = in[5] - in[7];
 744     out[1] += c;
 745     out[2] -= c;
 746
 747     /* the remaining terms */
 748     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 749     out[1] -= (in[4] << 32);
 750     out[3] += (in[4] << 32);
 751
 752     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 753     out[2] -= (in[5] << 32);
 754
 755     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 756     out[0] -= in[6];
 757     out[0] -= (in[6] << 32);
 758     out[1] += (in[6] << 33);
 759     out[2] += (in[6] * 2);
 760     out[3] -= (in[6] << 32);
 761
 762     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 763     out[0] -= in[7];
 764     out[0] -= (in[7] << 32);
 765     out[2] += (in[7] << 33);
 766     out[3] += (in[7] * 3);
 767 }
 768
 769 /*-
 770  * felem_reduce converts a longfelem into an felem.
 771  * To be called directly after felem_square or felem_mul.
 772  * On entry:
 773  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 774  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 775  * On exit:
 776  *   out[i] < 2^101
 777  */
 778 static void felem_reduce(felem out, const longfelem in)
 779 {
 780     out[0] = zero100[0] + in[0];
 781     out[1] = zero100[1] + in[1];
 782     out[2] = zero100[2] + in[2];
 783     out[3] = zero100[3] + in[3];
 784
 785     felem_reduce_(out, in);
 786
 787     /*-
 788      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 789      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 790      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 791      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 792      *
 793      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 794      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 795      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 796      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 797      */
 798 }
 799
 800 /*-
 801  * felem_reduce_zero105 converts a larger longfelem into an felem.
 802  * On entry:
 803  *   in[0] < 2^71
 804  * On exit:
 805  *   out[i] < 2^106
 806  */
 807 static void felem_reduce_zero105(felem out, const longfelem in)
 808 {
 809     out[0] = zero105[0] + in[0];
 810     out[1] = zero105[1] + in[1];
 811     out[2] = zero105[2] + in[2];
 812     out[3] = zero105[3] + in[3];
 813
 814     felem_reduce_(out, in);
 815
 816     /*-
 817      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 818      * out[1] > 2^105 - 2^71 - 2^103 > 0
 819      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 820      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 821      *
 822      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 823      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 824      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 825      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 826      */
 827 }
 828
 829 /*
 830  * subtract_u64 sets *result = *result - v and *carry to one if the
 831  * subtraction underflowed.
 832  */
 833 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 834 {
 835     uint128_t r = *result;
 836     r -= v;
 837     *carry = (r >> 64) & 1;
 838     *result = (u64)r;
 839 }
 840
 841 /*
 842  * felem_contract converts |in| to its unique, minimal representation. On
 843  * entry: in[i] < 2^109
 844  */
 845 static void felem_contract(smallfelem out, const felem in)
 846 {
 847     unsigned i;
 848     u64 all_equal_so_far = 0, result = 0, carry;
 849
 850     felem_shrink(out, in);
 851     /* small is minimal except that the value might be > p */
 852
 853     all_equal_so_far--;
 854     /*
 855      * We are doing a constant time test if out >= kPrime. We need to compare
 856      * each u64, from most-significant to least significant. For each one, if
 857      * all words so far have been equal (m is all ones) then a non-equal
 858      * result is the answer. Otherwise we continue.
 859      */
 860     for (i = 3; i < 4; i--) {
 861         u64 equal;
 862         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 863         /*
 864          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 865          * will all be set.
 866          */
 867         result |= all_equal_so_far & ((u64)(a >> 64));
 868
 869         /*
 870          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 871          * decrement will make it all ones.
 872          */
 873         equal = kPrime[i] ^ out[i];
 874         equal--;
 875         equal &= equal << 32;
 876         equal &= equal << 16;
 877         equal &= equal << 8;
 878         equal &= equal << 4;
 879         equal &= equal << 2;
 880         equal &= equal << 1;
 881         equal = 0 - (equal >> 63);
 882
 883         all_equal_so_far &= equal;
 884     }
 885
 886     /*
 887      * if all_equal_so_far is still all ones then the two values are equal
 888      * and so out >= kPrime is true.
 889      */
 890     result |= all_equal_so_far;
 891
 892     /* if out >= kPrime then we subtract kPrime. */
 893     subtract_u64(&out[0], &carry, result & kPrime[0]);
 894     subtract_u64(&out[1], &carry, carry);
 895     subtract_u64(&out[2], &carry, carry);
 896     subtract_u64(&out[3], &carry, carry);
 897
 898     subtract_u64(&out[1], &carry, result & kPrime[1]);
 899     subtract_u64(&out[2], &carry, carry);
 900     subtract_u64(&out[3], &carry, carry);
 901
 902     subtract_u64(&out[2], &carry, result & kPrime[2]);
 903     subtract_u64(&out[3], &carry, carry);
 904
 905     subtract_u64(&out[3], &carry, result & kPrime[3]);
 906 }
 907
 908 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 909 {
 910     longfelem longtmp;
 911     felem tmp;
 912
 913     smallfelem_square(longtmp, in);
 914     felem_reduce(tmp, longtmp);
 915     felem_contract(out, tmp);
 916 }
 917
 918 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 919                                     const smallfelem in2)
 920 {
 921     longfelem longtmp;
 922     felem tmp;
 923
 924     smallfelem_mul(longtmp, in1, in2);
 925     felem_reduce(tmp, longtmp);
 926     felem_contract(out, tmp);
 927 }
 928
 929 /*-
 930  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 931  * otherwise.
 932  * On entry:
 933  *   small[i] < 2^64
 934  */
 935 static limb smallfelem_is_zero(const smallfelem small)
 936 {
 937     limb result;
 938     u64 is_p;
 939
 940     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 941     is_zero--;
 942     is_zero &= is_zero << 32;
 943     is_zero &= is_zero << 16;
 944     is_zero &= is_zero << 8;
 945     is_zero &= is_zero << 4;
 946     is_zero &= is_zero << 2;
 947     is_zero &= is_zero << 1;
 948     is_zero = 0 - (is_zero >> 63);
 949
 950     is_p = (small[0] ^ kPrime[0]) |
 951         (small[1] ^ kPrime[1]) |
 952         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 953     is_p--;
 954     is_p &= is_p << 32;
 955     is_p &= is_p << 16;
 956     is_p &= is_p << 8;
 957     is_p &= is_p << 4;
 958     is_p &= is_p << 2;
 959     is_p &= is_p << 1;
 960     is_p = 0 - (is_p >> 63);
 961
 962     is_zero |= is_p;
 963
 964     result = is_zero;
 965     result |= ((limb) is_zero) << 64;
 966     return result;
 967 }
 968
 969 static int smallfelem_is_zero_int(const void *small)
 970 {
 971     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 972 }
 973
 974 /*-
 975  * felem_inv calculates |out| = |in|^{-1}
 976  *
 977  * Based on Fermat's Little Theorem:
 978  *   a^p = a (mod p)
 979  *   a^{p-1} = 1 (mod p)
 980  *   a^{p-2} = a^{-1} (mod p)
 981  */
 982 static void felem_inv(felem out, const felem in)
 983 {
 984     felem ftmp, ftmp2;
 985     /* each e_I will hold |in|^{2^I - 1} */
 986     felem e2, e4, e8, e16, e32, e64;
 987     longfelem tmp;
 988     unsigned i;
 989
 990     felem_square(tmp, in);
 991     felem_reduce(ftmp, tmp);    /* 2^1 */
 992     felem_mul(tmp, in, ftmp);
 993     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 994     felem_assign(e2, ftmp);
 995     felem_square(tmp, ftmp);
 996     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 997     felem_square(tmp, ftmp);
 998     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
 999     felem_mul(tmp, ftmp, e2);
1000     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1001     felem_assign(e4, ftmp);
1002     felem_square(tmp, ftmp);
1003     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1004     felem_square(tmp, ftmp);
1005     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1006     felem_square(tmp, ftmp);
1007     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1010     felem_mul(tmp, ftmp, e4);
1011     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1012     felem_assign(e8, ftmp);
1013     for (i = 0; i < 8; i++) {
1014         felem_square(tmp, ftmp);
1015         felem_reduce(ftmp, tmp);
1016     }                           /* 2^16 - 2^8 */
1017     felem_mul(tmp, ftmp, e8);
1018     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1019     felem_assign(e16, ftmp);
1020     for (i = 0; i < 16; i++) {
1021         felem_square(tmp, ftmp);
1022         felem_reduce(ftmp, tmp);
1023     }                           /* 2^32 - 2^16 */
1024     felem_mul(tmp, ftmp, e16);
1025     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1026     felem_assign(e32, ftmp);
1027     for (i = 0; i < 32; i++) {
1028         felem_square(tmp, ftmp);
1029         felem_reduce(ftmp, tmp);
1030     }                           /* 2^64 - 2^32 */
1031     felem_assign(e64, ftmp);
1032     felem_mul(tmp, ftmp, in);
1033     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1034     for (i = 0; i < 192; i++) {
1035         felem_square(tmp, ftmp);
1036         felem_reduce(ftmp, tmp);
1037     }                           /* 2^256 - 2^224 + 2^192 */
1038
1039     felem_mul(tmp, e64, e32);
1040     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1041     for (i = 0; i < 16; i++) {
1042         felem_square(tmp, ftmp2);
1043         felem_reduce(ftmp2, tmp);
1044     }                           /* 2^80 - 2^16 */
1045     felem_mul(tmp, ftmp2, e16);
1046     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1047     for (i = 0; i < 8; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^88 - 2^8 */
1051     felem_mul(tmp, ftmp2, e8);
1052     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1053     for (i = 0; i < 4; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^92 - 2^4 */
1057     felem_mul(tmp, ftmp2, e4);
1058     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1059     felem_square(tmp, ftmp2);
1060     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1061     felem_square(tmp, ftmp2);
1062     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1063     felem_mul(tmp, ftmp2, e2);
1064     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1069     felem_mul(tmp, ftmp2, in);
1070     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1071
1072     felem_mul(tmp, ftmp2, ftmp);
1073     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1074 }
1075
1076 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1077 {
1078     felem tmp;
1079
1080     smallfelem_expand(tmp, in);
1081     felem_inv(tmp, tmp);
1082     felem_contract(out, tmp);
1083 }
1084
1085 /*-
1086  * Group operations
1087  * ----------------
1088  *
1089  * Building on top of the field operations we have the operations on the
1090  * elliptic curve group itself. Points on the curve are represented in Jacobian
1091  * coordinates.
1092  */
1093
1094 /*-
1095  * point_double calculates 2*(x_in, y_in, z_in)
1096  *
1097  * The method is taken from:
1098  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1099  *
1100  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1101  * while x_out == y_in is not (maybe this works, but it's not tested).
      * Note that x_out is written before y_in and z_in are read for the z'
      * step, so x_out must not alias either of those inputs.
      *
      * The "< 2^k" comments in the body record the upper bound on each limb of
      * the value that was just computed.
1102  */
1103 static void
1104 point_double(felem x_out, felem y_out, felem z_out,
1105              const felem x_in, const felem y_in, const felem z_in)
1106 {
1107     longfelem tmp, tmp2;
1108     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1109     smallfelem small1, small2;
1110 
         /* two working copies of x: ftmp becomes x - delta, ftmp2 x + delta */
1111     felem_assign(ftmp, x_in);
1112     /* ftmp[i] < 2^106 */
1113     felem_assign(ftmp2, x_in);
1114     /* ftmp2[i] < 2^106 */
1115 
1116     /* delta = z^2 */
1117     felem_square(tmp, z_in);
1118     felem_reduce(delta, tmp);
1119     /* delta[i] < 2^101 */
1120 
1121     /* gamma = y^2 */
1122     felem_square(tmp, y_in);
1123     felem_reduce(gamma, tmp);
1124     /* gamma[i] < 2^101 */
         /* small1 = gamma in smallfelem form; reused for beta and gamma^2 */
1125     felem_shrink(small1, gamma);
1126 
1127     /* beta = x*gamma */
1128     felem_small_mul(tmp, small1, x_in);
1129     felem_reduce(beta, tmp);
1130     /* beta[i] < 2^101 */
1131 
1132     /* alpha = 3*(x-delta)*(x+delta) */
1133     felem_diff(ftmp, delta);
1134     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1135     felem_sum(ftmp2, delta);
1136     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1137     felem_scalar(ftmp2, 3);
1138     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1139     felem_mul(tmp, ftmp, ftmp2);
1140     felem_reduce(alpha, tmp);
1141     /* alpha[i] < 2^101 */
         /* small2 = alpha in smallfelem form; reused for alpha^2 and y' */
1142     felem_shrink(small2, alpha);
1143 
1144     /* x' = alpha^2 - 8*beta */
1145     smallfelem_square(tmp, small2);
1146     felem_reduce(x_out, tmp);
1147     felem_assign(ftmp, beta);
1148     felem_scalar(ftmp, 8);
1149     /* ftmp[i] < 8 * 2^101 = 2^104 */
1150     felem_diff(x_out, ftmp);
1151     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1152 
1153     /* z' = (y + z)^2 - gamma - delta */
         /* delta is overwritten with gamma + delta; it is not needed again */
1154     felem_sum(delta, gamma);
1155     /* delta[i] < 2^101 + 2^101 = 2^102 */
1156     felem_assign(ftmp, y_in);
1157     felem_sum(ftmp, z_in);
1158     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1159     felem_square(tmp, ftmp);
1160     felem_reduce(z_out, tmp);
1161     felem_diff(z_out, delta);
1162     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1163 
1164     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1165     felem_scalar(beta, 4);
1166     /* beta[i] < 4 * 2^101 = 2^103 */
1167     felem_diff_zero107(beta, x_out);
1168     /* beta[i] < 2^107 + 2^103 < 2^108 */
1169     felem_small_mul(tmp, small2, beta);
1170     /* tmp[i] < 7 * 2^64 < 2^67 */
1171     smallfelem_square(tmp2, small1);
1172     /* tmp2[i] < 7 * 2^64 */
1173     longfelem_scalar(tmp2, 8);
1174     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
         /* the subtraction is done on the unreduced longfelems, then reduced */
1175     longfelem_diff(tmp, tmp2);
1176     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1177     felem_reduce_zero105(y_out, tmp);
1178     /* y_out[i] < 2^106 */
1179 }
1180
1181 /*
1182  * point_double_small is the same as point_double, except that it operates on
1183  * smallfelems
1184  */
1185 static void
1186 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1187                    const smallfelem x_in, const smallfelem y_in,
1188                    const smallfelem z_in)
1189 {
1190     felem felem_x_out, felem_y_out, felem_z_out;
1191     felem felem_x_in, felem_y_in, felem_z_in;
1192
1193     smallfelem_expand(felem_x_in, x_in);
1194     smallfelem_expand(felem_y_in, y_in);
1195     smallfelem_expand(felem_z_in, z_in);
1196     point_double(felem_x_out, felem_y_out, felem_z_out,
1197                  felem_x_in, felem_y_in, felem_z_in);
1198     felem_shrink(x_out, felem_x_out);
1199     felem_shrink(y_out, felem_y_out);
1200     felem_shrink(z_out, felem_z_out);
1201 }
1202
1203 /* copy_conditional copies in to out iff mask is all ones. */
1204 static void copy_conditional(felem out, const felem in, limb mask)
1205 {
1206     unsigned i;
1207     for (i = 0; i < NLIMBS; ++i) {
1208         const limb tmp = mask & (in[i] ^ out[i]);
1209         out[i] ^= tmp;
1210     }
1211 }
1212
1213 /* copy_small_conditional copies in to out iff mask is all ones. */
1214 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1215 {
1216     unsigned i;
1217     const u64 mask64 = mask;
1218     for (i = 0; i < NLIMBS; ++i) {
1219         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1220     }
1221 }
1222
1223 /*-
1224  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1225  *
1226  * The method is taken from:
1227  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1228  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1229  *
1230  * This function includes a branch for checking whether the two input points
1231  * are equal, (while not equal to the point at infinity). This case never
1232  * happens during single point multiplication, so there is no timing leak for
1233  * ECDH or ECDSA signing.
1234  */
1235 static void point_add(felem x3, felem y3, felem z3,
1236                       const felem x1, const felem y1, const felem z1,
1237                       const int mixed, const smallfelem x2,
1238                       const smallfelem y2, const smallfelem z2)
1239 {
1240     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1241     longfelem tmp, tmp2;
1242     smallfelem small1, small2, small3, small4, small5;
1243     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1244
1245     felem_shrink(small3, z1);
1246
1247     z1_is_zero = smallfelem_is_zero(small3);
1248     z2_is_zero = smallfelem_is_zero(z2);
1249
1250     /* ftmp = z1z1 = z1**2 */
1251     smallfelem_square(tmp, small3);
1252     felem_reduce(ftmp, tmp);
1253     /* ftmp[i] < 2^101 */
1254     felem_shrink(small1, ftmp);
1255
1256     if (!mixed) {
1257         /* ftmp2 = z2z2 = z2**2 */
1258         smallfelem_square(tmp, z2);
1259         felem_reduce(ftmp2, tmp);
1260         /* ftmp2[i] < 2^101 */
1261         felem_shrink(small2, ftmp2);
1262
1263         felem_shrink(small5, x1);
1264
1265         /* u1 = ftmp3 = x1*z2z2 */
1266         smallfelem_mul(tmp, small5, small2);
1267         felem_reduce(ftmp3, tmp);
1268         /* ftmp3[i] < 2^101 */
1269
1270         /* ftmp5 = z1 + z2 */
1271         felem_assign(ftmp5, z1);
1272         felem_small_sum(ftmp5, z2);
1273         /* ftmp5[i] < 2^107 */
1274
1275         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1276         felem_square(tmp, ftmp5);
1277         felem_reduce(ftmp5, tmp);
1278         /* ftmp2 = z2z2 + z1z1 */
1279         felem_sum(ftmp2, ftmp);
1280         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1281         felem_diff(ftmp5, ftmp2);
1282         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1283
1284         /* ftmp2 = z2 * z2z2 */
1285         smallfelem_mul(tmp, small2, z2);
1286         felem_reduce(ftmp2, tmp);
1287
1288         /* s1 = ftmp2 = y1 * z2**3 */
1289         felem_mul(tmp, y1, ftmp2);
1290         felem_reduce(ftmp6, tmp);
1291         /* ftmp6[i] < 2^101 */
1292     } else {
1293         /*
1294          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1295          */
1296
1297         /* u1 = ftmp3 = x1*z2z2 */
1298         felem_assign(ftmp3, x1);
1299         /* ftmp3[i] < 2^106 */
1300
1301         /* ftmp5 = 2z1z2 */
1302         felem_assign(ftmp5, z1);
1303         felem_scalar(ftmp5, 2);
1304         /* ftmp5[i] < 2*2^106 = 2^107 */
1305
1306         /* s1 = ftmp2 = y1 * z2**3 */
1307         felem_assign(ftmp6, y1);
1308         /* ftmp6[i] < 2^106 */
1309     }
1310
1311     /* u2 = x2*z1z1 */
1312     smallfelem_mul(tmp, x2, small1);
1313     felem_reduce(ftmp4, tmp);
1314
1315     /* h = ftmp4 = u2 - u1 */
1316     felem_diff_zero107(ftmp4, ftmp3);
1317     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1318     felem_shrink(small4, ftmp4);
1319
1320     x_equal = smallfelem_is_zero(small4);
1321
1322     /* z_out = ftmp5 * h */
1323     felem_small_mul(tmp, small4, ftmp5);
1324     felem_reduce(z_out, tmp);
1325     /* z_out[i] < 2^101 */
1326
1327     /* ftmp = z1 * z1z1 */
1328     smallfelem_mul(tmp, small1, small3);
1329     felem_reduce(ftmp, tmp);
1330
1331     /* s2 = tmp = y2 * z1**3 */
1332     felem_small_mul(tmp, y2, ftmp);
1333     felem_reduce(ftmp5, tmp);
1334
1335     /* r = ftmp5 = (s2 - s1)*2 */
1336     felem_diff_zero107(ftmp5, ftmp6);
1337     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1338     felem_scalar(ftmp5, 2);
1339     /* ftmp5[i] < 2^109 */
1340     felem_shrink(small1, ftmp5);
1341     y_equal = smallfelem_is_zero(small1);
1342
1343     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1344         point_double(x3, y3, z3, x1, y1, z1);
1345         return;
1346     }
1347
1348     /* I = ftmp = (2h)**2 */
1349     felem_assign(ftmp, ftmp4);
1350     felem_scalar(ftmp, 2);
1351     /* ftmp[i] < 2*2^108 = 2^109 */
1352     felem_square(tmp, ftmp);
1353     felem_reduce(ftmp, tmp);
1354
1355     /* J = ftmp2 = h * I */
1356     felem_mul(tmp, ftmp4, ftmp);
1357     felem_reduce(ftmp2, tmp);
1358
1359     /* V = ftmp4 = U1 * I */
1360     felem_mul(tmp, ftmp3, ftmp);
1361     felem_reduce(ftmp4, tmp);
1362
1363     /* x_out = r**2 - J - 2V */
1364     smallfelem_square(tmp, small1);
1365     felem_reduce(x_out, tmp);
1366     felem_assign(ftmp3, ftmp4);
1367     felem_scalar(ftmp4, 2);
1368     felem_sum(ftmp4, ftmp2);
1369     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1370     felem_diff(x_out, ftmp4);
1371     /* x_out[i] < 2^105 + 2^101 */
1372
1373     /* y_out = r(V-x_out) - 2 * s1 * J */
1374     felem_diff_zero107(ftmp3, x_out);
1375     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1376     felem_small_mul(tmp, small1, ftmp3);
1377     felem_mul(tmp2, ftmp6, ftmp2);
1378     longfelem_scalar(tmp2, 2);
1379     /* tmp2[i] < 2*2^67 = 2^68 */
1380     longfelem_diff(tmp, tmp2);
1381     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1382     felem_reduce_zero105(y_out, tmp);
1383     /* y_out[i] < 2^106 */
1384
1385     copy_small_conditional(x_out, x2, z1_is_zero);
1386     copy_conditional(x_out, x1, z2_is_zero);
1387     copy_small_conditional(y_out, y2, z1_is_zero);
1388     copy_conditional(y_out, y1, z2_is_zero);
1389     copy_small_conditional(z_out, z2, z1_is_zero);
1390     copy_conditional(z_out, z1, z2_is_zero);
1391     felem_assign(x3, x_out);
1392     felem_assign(y3, y_out);
1393     felem_assign(z3, z_out);
1394 }
1395
1396 /*
1397  * point_add_small is the same as point_add, except that it operates on
1398  * smallfelems
1399  */
1400 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1401                             smallfelem x1, smallfelem y1, smallfelem z1,
1402                             smallfelem x2, smallfelem y2, smallfelem z2)
1403 {
1404     felem felem_x3, felem_y3, felem_z3;
1405     felem felem_x1, felem_y1, felem_z1;
1406     smallfelem_expand(felem_x1, x1);
1407     smallfelem_expand(felem_y1, y1);
1408     smallfelem_expand(felem_z1, z1);
1409     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1410               x2, y2, z2);
1411     felem_shrink(x3, felem_x3);
1412     felem_shrink(y3, felem_y3);
1413     felem_shrink(z3, felem_z3);
1414 }
1415
1416 /*-
1417  * Base point pre computation
1418  * --------------------------
1419  *
1420  * Two different sorts of precomputed tables are used in the following code.
1421  * Each contains various points on the curve, where each point is three field
1422  * elements (x, y, z).
1423  *
1424  * For the base point table, z is usually 1 (0 for the point at infinity).
1425  * This table has 2 * 16 elements, starting with the following:
1426  * index | bits    | point
1427  * ------+---------+------------------------------
1428  *     0 | 0 0 0 0 | 0G
1429  *     1 | 0 0 0 1 | 1G
1430  *     2 | 0 0 1 0 | 2^64G
1431  *     3 | 0 0 1 1 | (2^64 + 1)G
1432  *     4 | 0 1 0 0 | 2^128G
1433  *     5 | 0 1 0 1 | (2^128 + 1)G
1434  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1435  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1436  *     8 | 1 0 0 0 | 2^192G
1437  *     9 | 1 0 0 1 | (2^192 + 1)G
1438  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1439  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1440  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1441  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1442  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1443  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1444  * followed by a copy of this with each element multiplied by 2^32.
1445  *
1446  * The reason for this is so that we can clock bits into four different
1447  * locations when doing simple scalar multiplies against the base point,
1448  * and then another four locations using the second 16 elements.
1449  *
1450  * Tables for other points P have table[i] = iP for i in 0 .. 16. */
1451
1452 /*
      * gmul is the table of precomputed base points.
      *
      * It consists of two sub-tables of 16 points each; a point is three
      * smallfelems (x, y, z) written as four 64-bit words, least significant
      * word first (z = 1 appears as {1, 0, 0, 0}). Entry 0 of each sub-table is
      * all zeros, i.e. z = 0, the point at infinity; every other entry has
      * z = 1.
      *
      * The [2][16][3] shape matches the g_pre_comp argument of batch_mul(),
      * which indexes sub-table 0 with scalar bits i, i+64, i+128, i+192 and
      * sub-table 1 with bits i+32, i+96, i+160, i+224.
      */
1453 static const smallfelem gmul[2][16][3] = {
1454     {{{0, 0, 0, 0},
1455       {0, 0, 0, 0},
1456       {0, 0, 0, 0}},
1457      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1458        0x6b17d1f2e12c4247},
1459       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1460        0x4fe342e2fe1a7f9b},
1461       {1, 0, 0, 0}},
1462      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1463        0x0fa822bc2811aaa5},
1464       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1465        0xbff44ae8f5dba80d},
1466       {1, 0, 0, 0}},
1467      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1468        0x300a4bbc89d6726f},
1469       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1470        0x72aac7e0d09b4644},
1471       {1, 0, 0, 0}},
1472      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1473        0x447d739beedb5e67},
1474       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1475        0x2d4825ab834131ee},
1476       {1, 0, 0, 0}},
1477      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1478        0xef9519328a9c72ff},
1479       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1480        0x611e9fc37dbb2c9b},
1481       {1, 0, 0, 0}},
1482      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1483        0x550663797b51f5d8},
1484       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1485        0x157164848aecb851},
1486       {1, 0, 0, 0}},
1487      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1488        0xeb5d7745b21141ea},
1489       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1490        0xeafd72ebdbecc17b},
1491       {1, 0, 0, 0}},
1492      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1493        0xa6d39677a7849276},
1494       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1495        0x674f84749b0b8816},
1496       {1, 0, 0, 0}},
1497      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1498        0x4e769e7672c9ddad},
1499       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1500        0x42b99082de830663},
1501       {1, 0, 0, 0}},
1502      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1503        0x78878ef61c6ce04d},
1504       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1505        0xb6cb3f5d7b72c321},
1506       {1, 0, 0, 0}},
1507      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1508        0x0c88bc4d716b1287},
1509       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1510        0xdd5ddea3f3901dc6},
1511       {1, 0, 0, 0}},
1512      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1513        0x68f344af6b317466},
1514       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1515        0x31b9c405f8540a20},
1516       {1, 0, 0, 0}},
1517      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1518        0x4052bf4b6f461db9},
1519       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1520        0xfecf4d5190b0fc61},
1521       {1, 0, 0, 0}},
1522      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1523        0x1eddbae2c802e41a},
1524       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1525        0x43104d86560ebcfc},
1526       {1, 0, 0, 0}},
1527      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1528        0xb48e26b484f7a21c},
1529       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1530        0xfac015404d4d3dab},
1531       {1, 0, 0, 0}}},
1532     {{{0, 0, 0, 0},
1533       {0, 0, 0, 0},
1534       {0, 0, 0, 0}},
1535      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1536        0x7fe36b40af22af89},
1537       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1538        0xe697d45825b63624},
1539       {1, 0, 0, 0}},
1540      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1541        0x4a5b506612a677a6},
1542       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1543        0xeb13461ceac089f1},
1544       {1, 0, 0, 0}},
1545      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1546        0x0781b8291c6a220a},
1547       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1548        0x690cde8df0151593},
1549       {1, 0, 0, 0}},
1550      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1551        0x8a535f566ec73617},
1552       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1553        0x0455c08468b08bd7},
1554       {1, 0, 0, 0}},
1555      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1556        0x06bada7ab77f8276},
1557       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1558        0x5b476dfd0e6cb18a},
1559       {1, 0, 0, 0}},
1560      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1561        0x3e29864e8a2ec908},
1562       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1563        0x239b90ea3dc31e7e},
1564       {1, 0, 0, 0}},
1565      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1566        0x820f4dd949f72ff7},
1567       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1568        0x140406ec783a05ec},
1569       {1, 0, 0, 0}},
1570      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1571        0x68f6b8542783dfee},
1572       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1573        0xcbe1feba92e40ce6},
1574       {1, 0, 0, 0}},
1575      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1576        0xd0b2f94d2f420109},
1577       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1578        0x971459828b0719e5},
1579       {1, 0, 0, 0}},
1580      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1581        0x961610004a866aba},
1582       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1583        0x7acb9fadcee75e44},
1584       {1, 0, 0, 0}},
1585      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1586        0x24eb9acca333bf5b},
1587       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1588        0x69f891c5acd079cc},
1589       {1, 0, 0, 0}},
1590      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1591        0xe51f547c5972a107},
1592       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1593        0x1c309a2b25bb1387},
1594       {1, 0, 0, 0}},
1595      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1596        0x20b87b8aa2c4e503},
1597       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1598        0xf5c6fa49919776be},
1599       {1, 0, 0, 0}},
1600      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1601        0x1ed7d1b9332010b9},
1602       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1603        0x3a2b03f03217257a},
1604       {1, 0, 0, 0}},
1605      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1606        0x15fee545c78dd9f6},
1607       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1608        0x4ab5b6b2b8753f81},
1609       {1, 0, 0, 0}}}
1610 };
1611
1612 /*
1613  * select_point selects the |idx|th point from a precomputation table and
1614  * copies it to out.
1615  */
1616 static void select_point(const u64 idx, unsigned int size,
1617                          const smallfelem pre_comp[16][3], smallfelem out[3])
1618 {
1619     unsigned i, j;
1620     u64 *outlimbs = &out[0][0];
1621
1622     memset(out, 0, sizeof(*out) * 3);
1623
1624     for (i = 0; i < size; i++) {
1625         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1626         u64 mask = i ^ idx;
1627         mask |= mask >> 4;
1628         mask |= mask >> 2;
1629         mask |= mask >> 1;
1630         mask &= 1;
1631         mask--;
1632         for (j = 0; j < NLIMBS * 3; j++)
1633             outlimbs[j] |= inlimbs[j] & mask;
1634     }
1635 }
1636
1637 /* get_bit returns the |i|th bit in |in| */
1638 static char get_bit(const felem_bytearray in, int i)
1639 {
1640     if ((i < 0) || (i >= 256))
1641         return 0;
1642     return (in[i >> 3] >> (i & 7)) & 1;
1643 }
1644
1645 /*
1646  * Interleaved point multiplication using precomputed point multiples: The
1647  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1648  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1649  * generator, using certain (large) precomputed multiples in g_pre_comp.
1650  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
      *
      * |num_points| is the number of entries in scalars[] and pre_comp[].
      * |mixed| is forwarded to point_add() for the additions of pre_comp[]
      * points; additions of g_pre_comp points always pass 1.
1651  */
1652 static void batch_mul(felem x_out, felem y_out, felem z_out,
1653                       const felem_bytearray scalars[],
1654                       const unsigned num_points, const u8 *g_scalar,
1655                       const int mixed, const smallfelem pre_comp[][17][3],
1656                       const smallfelem g_pre_comp[2][16][3])
1657 {
1658     int i, skip;
1659     unsigned num, gen_mul = (g_scalar != NULL);
1660     felem nq[3], ftmp;
1661     smallfelem tmp[3];
1662     u64 bits;
1663     u8 sign, digit;
1664 
1665     /* set nq to the point at infinity */
1666     memset(nq, 0, sizeof(nq));
1667 
1668     /*
1669      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1670      * of the generator (two in each of the last 32 rounds) and additions of
1671      * other points multiples (every 5th round).
1672      */
1673     skip = 1;                   /* save two point operations in the first
1674                                  * round */
         /*
          * With no other points only the 32 generator rounds are needed, so the
          * loop starts at bit 31 instead of bit 255.
          */
1675     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1676         /* double */
1677         if (!skip)
1678             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1679 
1680         /* add multiples of the generator */
1681         if (gen_mul && (i <= 31)) {
1682             /* first, look 32 bits upwards */
1683             bits = get_bit(g_scalar, i + 224) << 3;
1684             bits |= get_bit(g_scalar, i + 160) << 2;
1685             bits |= get_bit(g_scalar, i + 96) << 1;
1686             bits |= get_bit(g_scalar, i + 32);
1687             /* select the point to add, in constant time */
1688             select_point(bits, 16, g_pre_comp[1], tmp);
1689 
1690             if (!skip) {
1691                 /* Arg 1 below is for "mixed" */
1692                 point_add(nq[0], nq[1], nq[2],
1693                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1694             } else {
                     /* nq is still empty: load the point instead of adding */
1695                 smallfelem_expand(nq[0], tmp[0]);
1696                 smallfelem_expand(nq[1], tmp[1]);
1697                 smallfelem_expand(nq[2], tmp[2]);
1698                 skip = 0;
1699             }
1700 
1701             /* second, look at the current position */
1702             bits = get_bit(g_scalar, i + 192) << 3;
1703             bits |= get_bit(g_scalar, i + 128) << 2;
1704             bits |= get_bit(g_scalar, i + 64) << 1;
1705             bits |= get_bit(g_scalar, i);
1706             /* select the point to add, in constant time */
1707             select_point(bits, 16, g_pre_comp[0], tmp);
1708             /* Arg 1 below is for "mixed" */
1709             point_add(nq[0], nq[1], nq[2],
1710                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1711         }
1712 
1713         /* do other additions every 5 doublings */
1714         if (num_points && (i % 5 == 0)) {
1715             /* loop over all scalars */
1716             for (num = 0; num < num_points; ++num) {
                     /*
                      * Six-bit window: bits i+4 .. i plus bit i-1 of the scalar.
                      * get_bit() returns 0 for positions outside 0 .. 255.
                      */
1717                 bits = get_bit(scalars[num], i + 4) << 5;
1718                 bits |= get_bit(scalars[num], i + 3) << 4;
1719                 bits |= get_bit(scalars[num], i + 2) << 3;
1720                 bits |= get_bit(scalars[num], i + 1) << 2;
1721                 bits |= get_bit(scalars[num], i) << 1;
1722                 bits |= get_bit(scalars[num], i - 1);
1723                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1724 
1725                 /*
1726                  * select the point to add or subtract, in constant time
1727                  */
1728                 select_point(digit, 17, pre_comp[num], tmp);
1729                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1730                                                * point */
                     /*
                      * sign == 0 gives an all-ones mask, which restores the
                      * original Y; any other mask value is not all ones
                      * (presumably sign is 0 or 1, so the mask is then zero and
                      * -Y is kept - confirm against the recoding helper).
                      */
1731                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1732                 felem_contract(tmp[1], ftmp);
1733 
1734                 if (!skip) {
1735                     point_add(nq[0], nq[1], nq[2],
1736                               nq[0], nq[1], nq[2],
1737                               mixed, tmp[0], tmp[1], tmp[2]);
1738                 } else {
                         /* nq is still empty: load the point instead of adding */
1739                     smallfelem_expand(nq[0], tmp[0]);
1740                     smallfelem_expand(nq[1], tmp[1]);
1741                     smallfelem_expand(nq[2], tmp[2]);
1742                     skip = 0;
1743                 }
1744             }
1745         }
1746     }
1747     felem_assign(x_out, nq[0]);
1748     felem_assign(y_out, nq[1]);
1749     felem_assign(z_out, nq[2]);
1750 }
1751
1752 /*
      * Precomputation for the group generator.
      *
      * Objects of this type are reference counted: see
      * EC_nistp256_pre_comp_dup() and EC_nistp256_pre_comp_free().
      */
1753 struct nistp256_pre_comp_st {
         /* point table with the same [2][16][3] shape as gmul */
1754     smallfelem g_pre_comp[2][16][3];
         /* reference count, updated through CRYPTO_atomic_add() */
1755     int references;
         /* lock handed to CRYPTO_atomic_add() when |references| is updated */
1756     CRYPTO_RWLOCK *lock;
1757 };
1758
1759 const EC_METHOD *EC_GFp_nistp256_method(void)
1760 {
         /*
          * The method table is a function-local static const object, so every
          * call returns the same address. The initializer is positional: each
          * entry must stay in the order of the EC_METHOD members (declared
          * elsewhere). Slots set to 0 carry a comment naming the member that is
          * left unset.
          */
1761     static const EC_METHOD ret = {
1762         EC_FLAGS_DEFAULT_OCT,
1763         NID_X9_62_prime_field,
1764         ec_GFp_nistp256_group_init,
1765         ec_GFp_simple_group_finish,
1766         ec_GFp_simple_group_clear_finish,
1767         ec_GFp_nist_group_copy,
1768         ec_GFp_nistp256_group_set_curve,
1769         ec_GFp_simple_group_get_curve,
1770         ec_GFp_simple_group_get_degree,
1771         ec_group_simple_order_bits,
1772         ec_GFp_simple_group_check_discriminant,
1773         ec_GFp_simple_point_init,
1774         ec_GFp_simple_point_finish,
1775         ec_GFp_simple_point_clear_finish,
1776         ec_GFp_simple_point_copy,
1777         ec_GFp_simple_point_set_to_infinity,
1778         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1779         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1780         ec_GFp_simple_point_set_affine_coordinates,
1781         ec_GFp_nistp256_point_get_affine_coordinates,
1782         0 /* point_set_compressed_coordinates */ ,
1783         0 /* point2oct */ ,
1784         0 /* oct2point */ ,
1785         ec_GFp_simple_add,
1786         ec_GFp_simple_dbl,
1787         ec_GFp_simple_invert,
1788         ec_GFp_simple_is_at_infinity,
1789         ec_GFp_simple_is_on_curve,
1790         ec_GFp_simple_cmp,
1791         ec_GFp_simple_make_affine,
1792         ec_GFp_simple_points_make_affine,
1793         ec_GFp_nistp256_points_mul,
1794         ec_GFp_nistp256_precompute_mult,
1795         ec_GFp_nistp256_have_precompute_mult,
1796         ec_GFp_nist_field_mul,
1797         ec_GFp_nist_field_sqr,
1798         0 /* field_div */ ,
1799         ec_GFp_simple_field_inv,
1800         0 /* field_encode */ ,
1801         0 /* field_decode */ ,
1802         0,                      /* field_set_to_one */
1803         ec_key_simple_priv2oct,
1804         ec_key_simple_oct2priv,
1805         0, /* set private */
1806         ec_key_simple_generate_key,
1807         ec_key_simple_check_key,
1808         ec_key_simple_generate_public_key,
1809         0, /* keycopy */
1810         0, /* keyfinish */
1811         ecdh_simple_compute_key
1812     };
1813 
1814     return &ret;
1815 }
1816
1817 /******************************************************************************/
1818 /*
1819  * FUNCTIONS TO MANAGE PRECOMPUTATION
1820  */
1821
1822 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1823 {
1824     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1825
1826     if (ret == NULL) {
1827         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1828         return ret;
1829     }
1830
1831     ret->references = 1;
1832
1833     ret->lock = CRYPTO_THREAD_lock_new();
1834     if (ret->lock == NULL) {
1835         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1836         OPENSSL_free(ret);
1837         return NULL;
1838     }
1839     return ret;
1840 }
1841
1842 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1843 {
1844     int i;
1845     if (p != NULL)
1846         CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
1847     return p;
1848 }
1849
1850 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1851 {
1852     int i;
1853
1854     if (pre == NULL)
1855         return;
1856
1857     CRYPTO_atomic_add(&pre->references, -1, &i, pre->lock);
1858     REF_PRINT_COUNT("EC_nistp256", x);
1859     if (i > 0)
1860         return;
1861     REF_ASSERT_ISNT(i < 0);
1862
1863     CRYPTO_THREAD_lock_free(pre->lock);
1864     OPENSSL_free(pre);
1865 }
1866
1867 /******************************************************************************/
1868 /*
1869  * OPENSSL EC_METHOD FUNCTIONS
1870  */
1871
1872 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1873 {
1874     int ret;
1875     ret = ec_GFp_simple_group_init(group);
1876     group->a_is_minus3 = 1;
1877     return ret;
1878 }
1879
1880 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1881                                     const BIGNUM *a, const BIGNUM *b,
1882                                     BN_CTX *ctx)
1883 {
1884     int ret = 0;
1885     BN_CTX *new_ctx = NULL;
1886     BIGNUM *curve_p, *curve_a, *curve_b;
1887
1888     if (ctx == NULL)
1889         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1890             return 0;
1891     BN_CTX_start(ctx);
1892     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1893         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1894         ((curve_b = BN_CTX_get(ctx)) == NULL))
1895         goto err;
1896     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1897     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1898     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1899     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1900         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1901               EC_R_WRONG_CURVE_PARAMETERS);
1902         goto err;
1903     }
1904     group->field_mod_func = BN_nist_mod_256;
1905     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1906  err:
1907     BN_CTX_end(ctx);
1908     BN_CTX_free(new_ctx);
1909     return ret;
1910 }
1911
1912 /*
1913  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1914  * (X/Z^2, Y/Z^3)
1915  */
1916 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1917                                                  const EC_POINT *point,
1918                                                  BIGNUM *x, BIGNUM *y,
1919                                                  BN_CTX *ctx)
1920 {
1921     felem z1, z2, x_in, y_in;
1922     smallfelem x_out, y_out;
1923     longfelem tmp;
1924
1925     if (EC_POINT_is_at_infinity(group, point)) {
1926         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1927               EC_R_POINT_AT_INFINITY);
1928         return 0;
1929     }
1930     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1931         (!BN_to_felem(z1, point->Z)))
1932         return 0;
1933     felem_inv(z2, z1);
1934     felem_square(tmp, z2);
1935     felem_reduce(z1, tmp);
1936     felem_mul(tmp, x_in, z1);
1937     felem_reduce(x_in, tmp);
1938     felem_contract(x_out, x_in);
1939     if (x != NULL) {
1940         if (!smallfelem_to_BN(x, x_out)) {
1941             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1942                   ERR_R_BN_LIB);
1943             return 0;
1944         }
1945     }
1946     felem_mul(tmp, z1, z2);
1947     felem_reduce(z1, tmp);
1948     felem_mul(tmp, y_in, z1);
1949     felem_reduce(y_in, tmp);
1950     felem_contract(y_out, y_in);
1951     if (y != NULL) {
1952         if (!smallfelem_to_BN(y, y_out)) {
1953             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1954                   ERR_R_BN_LIB);
1955             return 0;
1956         }
1957     }
1958     return 1;
1959 }
1960
1961 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1962 static void make_points_affine(size_t num, smallfelem points[][3],
1963                                smallfelem tmp_smallfelems[])
1964 {
1965     /*
1966      * Runs in constant time, unless an input is the point at infinity (which
1967      * normally shouldn't happen).
1968      */
1969     ec_GFp_nistp_points_make_affine_internal(num,
1970                                              points,
1971                                              sizeof(smallfelem),
1972                                              tmp_smallfelems,
1973                                              (void (*)(void *))smallfelem_one,
1974                                              smallfelem_is_zero_int,
1975                                              (void (*)(void *, const void *))
1976                                              smallfelem_assign,
1977                                              (void (*)(void *, const void *))
1978                                              smallfelem_square_contract,
1979                                              (void (*)
1980                                               (void *, const void *,
1981                                                const void *))
1982                                              smallfelem_mul_contract,
1983                                              (void (*)(void *, const void *))
1984                                              smallfelem_inv_contract,
1985                                              /* nothing to contract */
1986                                              (void (*)(void *, const void *))
1987                                              smallfelem_assign);
1988 }
1989
1990 /*
1991  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1992  * values Result is stored in r (r can equal one of the inputs).
1993  */
1994 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1995                                const BIGNUM *scalar, size_t num,
1996                                const EC_POINT *points[],
1997                                const BIGNUM *scalars[], BN_CTX *ctx)
1998 {
1999     int ret = 0;
2000     int j;
2001     int mixed = 0;
2002     BN_CTX *new_ctx = NULL;
2003     BIGNUM *x, *y, *z, *tmp_scalar;
2004     felem_bytearray g_secret;
2005     felem_bytearray *secrets = NULL;
2006     smallfelem (*pre_comp)[17][3] = NULL;
2007     smallfelem *tmp_smallfelems = NULL;
2008     unsigned i;
2009     int num_bytes;
2010     int have_pre_comp = 0;
2011     size_t num_points = num;
2012     smallfelem x_in, y_in, z_in;
2013     felem x_out, y_out, z_out;
2014     NISTP256_PRE_COMP *pre = NULL;
2015     const smallfelem(*g_pre_comp)[16][3] = NULL;
2016     EC_POINT *generator = NULL;
2017     const EC_POINT *p = NULL;
2018     const BIGNUM *p_scalar = NULL;
2019
2020     if (ctx == NULL)
2021         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2022             return 0;
2023     BN_CTX_start(ctx);
2024     if (((x = BN_CTX_get(ctx)) == NULL) ||
2025         ((y = BN_CTX_get(ctx)) == NULL) ||
2026         ((z = BN_CTX_get(ctx)) == NULL) ||
2027         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2028         goto err;
2029
2030     if (scalar != NULL) {
2031         pre = group->pre_comp.nistp256;
2032         if (pre)
2033             /* we have precomputation, try to use it */
2034             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2035         else
2036             /* try to use the standard precomputation */
2037             g_pre_comp = &gmul[0];
2038         generator = EC_POINT_new(group);
2039         if (generator == NULL)
2040             goto err;
2041         /* get the generator from precomputation */
2042         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2043             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2044             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2045             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2046             goto err;
2047         }
2048         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2049                                                       generator, x, y, z,
2050                                                       ctx))
2051             goto err;
2052         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2053             /* precomputation matches generator */
2054             have_pre_comp = 1;
2055         else
2056             /*
2057              * we don't have valid precomputation: treat the generator as a
2058              * random point
2059              */
2060             num_points++;
2061     }
2062     if (num_points > 0) {
2063         if (num_points >= 3) {
2064             /*
2065              * unless we precompute multiples for just one or two points,
2066              * converting those into affine form is time well spent
2067              */
2068             mixed = 1;
2069         }
2070         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2071         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2072         if (mixed)
2073             tmp_smallfelems =
2074               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2075         if ((secrets == NULL) || (pre_comp == NULL)
2076             || (mixed && (tmp_smallfelems == NULL))) {
2077             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2078             goto err;
2079         }
2080
2081         /*
2082          * we treat NULL scalars as 0, and NULL points as points at infinity,
2083          * i.e., they contribute nothing to the linear combination
2084          */
2085         memset(secrets, 0, sizeof(*secrets) * num_points);
2086         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2087         for (i = 0; i < num_points; ++i) {
2088             if (i == num) {
2089                 /*
2090                  * we didn't have a valid precomputation, so we pick the
2091                  * generator
2092                  */
2093                 p = EC_GROUP_get0_generator(group);
2094                 p_scalar = scalar;
2095             } else {
2096                 /* the i^th point */
2097                 p = points[i];
2098                 p_scalar = scalars[i];
2099             }
2100             if ((p_scalar != NULL) && (p != NULL)) {
2101                 /* reduce scalar to 0 <= scalar < 2^256 */
2102                 if ((BN_num_bits(p_scalar) > 256)
2103                     || (BN_is_negative(p_scalar))) {
2104                     /*
2105                      * this is an unusual input, and we don't guarantee
2106                      * constant-timeness
2107                      */
2108                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2109                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2110                         goto err;
2111                     }
2112                     num_bytes = BN_bn2lebinpad(tmp_scalar,
2113                                                secrets[i], sizeof(secrets[i]));
2114                 } else {
2115                     num_bytes = BN_bn2lebinpad(p_scalar,
2116                                                secrets[i], sizeof(secrets[i]));
2117                 }
2118                 if (num_bytes < 0) {
2119                     ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2120                     goto err;
2121                 }
2122                 /* precompute multiples */
2123                 if ((!BN_to_felem(x_out, p->X)) ||
2124                     (!BN_to_felem(y_out, p->Y)) ||
2125                     (!BN_to_felem(z_out, p->Z)))
2126                     goto err;
2127                 felem_shrink(pre_comp[i][1][0], x_out);
2128                 felem_shrink(pre_comp[i][1][1], y_out);
2129                 felem_shrink(pre_comp[i][1][2], z_out);
2130                 for (j = 2; j <= 16; ++j) {
2131                     if (j & 1) {
2132                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2133                                         pre_comp[i][j][2], pre_comp[i][1][0],
2134                                         pre_comp[i][1][1], pre_comp[i][1][2],
2135                                         pre_comp[i][j - 1][0],
2136                                         pre_comp[i][j - 1][1],
2137                                         pre_comp[i][j - 1][2]);
2138                     } else {
2139                         point_double_small(pre_comp[i][j][0],
2140                                            pre_comp[i][j][1],
2141                                            pre_comp[i][j][2],
2142                                            pre_comp[i][j / 2][0],
2143                                            pre_comp[i][j / 2][1],
2144                                            pre_comp[i][j / 2][2]);
2145                     }
2146                 }
2147             }
2148         }
2149         if (mixed)
2150             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2151     }
2152
2153     /* the scalar for the generator */
2154     if ((scalar != NULL) && (have_pre_comp)) {
2155         memset(g_secret, 0, sizeof(g_secret));
2156         /* reduce scalar to 0 <= scalar < 2^256 */
2157         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2158             /*
2159              * this is an unusual input, and we don't guarantee
2160              * constant-timeness
2161              */
2162             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2163                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2164                 goto err;
2165             }
2166             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2167         } else {
2168             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2169         }
2170         /* do the multiplication with generator precomputation */
2171         batch_mul(x_out, y_out, z_out,
2172                   (const felem_bytearray(*))secrets, num_points,
2173                   g_secret,
2174                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2175     } else {
2176         /* do the multiplication without generator precomputation */
2177         batch_mul(x_out, y_out, z_out,
2178                   (const felem_bytearray(*))secrets, num_points,
2179                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2180     }
2181     /* reduce the output to its unique minimal representation */
2182     felem_contract(x_in, x_out);
2183     felem_contract(y_in, y_out);
2184     felem_contract(z_in, z_out);
2185     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2186         (!smallfelem_to_BN(z, z_in))) {
2187         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2188         goto err;
2189     }
2190     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2191
2192  err:
2193     BN_CTX_end(ctx);
2194     EC_POINT_free(generator);
2195     BN_CTX_free(new_ctx);
2196     OPENSSL_free(secrets);
2197     OPENSSL_free(pre_comp);
2198     OPENSSL_free(tmp_smallfelems);
2199     return ret;
2200 }
2201
2202 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2203 {
2204     int ret = 0;
2205     NISTP256_PRE_COMP *pre = NULL;
2206     int i, j;
2207     BN_CTX *new_ctx = NULL;
2208     BIGNUM *x, *y;
2209     EC_POINT *generator = NULL;
2210     smallfelem tmp_smallfelems[32];
2211     felem x_tmp, y_tmp, z_tmp;
2212
2213     /* throw away old precomputation */
2214     EC_pre_comp_free(group);
2215     if (ctx == NULL)
2216         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2217             return 0;
2218     BN_CTX_start(ctx);
2219     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2220         goto err;
2221     /* get the generator */
2222     if (group->generator == NULL)
2223         goto err;
2224     generator = EC_POINT_new(group);
2225     if (generator == NULL)
2226         goto err;
2227     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2228     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2229     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2230         goto err;
2231     if ((pre = nistp256_pre_comp_new()) == NULL)
2232         goto err;
2233     /*
2234      * if the generator is the standard one, use built-in precomputation
2235      */
2236     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2237         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2238         goto done;
2239     }
2240     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2241         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2242         (!BN_to_felem(z_tmp, group->generator->Z)))
2243         goto err;
2244     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2245     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2246     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2247     /*
2248      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2249      * 2^160*G, 2^224*G for the second one
2250      */
2251     for (i = 1; i <= 8; i <<= 1) {
2252         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2253                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2254                            pre->g_pre_comp[0][i][1],
2255                            pre->g_pre_comp[0][i][2]);
2256         for (j = 0; j < 31; ++j) {
2257             point_double_small(pre->g_pre_comp[1][i][0],
2258                                pre->g_pre_comp[1][i][1],
2259                                pre->g_pre_comp[1][i][2],
2260                                pre->g_pre_comp[1][i][0],
2261                                pre->g_pre_comp[1][i][1],
2262                                pre->g_pre_comp[1][i][2]);
2263         }
2264         if (i == 8)
2265             break;
2266         point_double_small(pre->g_pre_comp[0][2 * i][0],
2267                            pre->g_pre_comp[0][2 * i][1],
2268                            pre->g_pre_comp[0][2 * i][2],
2269                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2270                            pre->g_pre_comp[1][i][2]);
2271         for (j = 0; j < 31; ++j) {
2272             point_double_small(pre->g_pre_comp[0][2 * i][0],
2273                                pre->g_pre_comp[0][2 * i][1],
2274                                pre->g_pre_comp[0][2 * i][2],
2275                                pre->g_pre_comp[0][2 * i][0],
2276                                pre->g_pre_comp[0][2 * i][1],
2277                                pre->g_pre_comp[0][2 * i][2]);
2278         }
2279     }
2280     for (i = 0; i < 2; i++) {
2281         /* g_pre_comp[i][0] is the point at infinity */
2282         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2283         /* the remaining multiples */
2284         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2285         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2286                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2287                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2288                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2289                         pre->g_pre_comp[i][2][2]);
2290         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2291         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2292                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2293                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2294                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2295                         pre->g_pre_comp[i][2][2]);
2296         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2297         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2298                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2299                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2300                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2301                         pre->g_pre_comp[i][4][2]);
2302         /*
2303          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2304          */
2305         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2306                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2307                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2308                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2309                         pre->g_pre_comp[i][2][2]);
2310         for (j = 1; j < 8; ++j) {
2311             /* odd multiples: add G resp. 2^32*G */
2312             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2313                             pre->g_pre_comp[i][2 * j + 1][1],
2314                             pre->g_pre_comp[i][2 * j + 1][2],
2315                             pre->g_pre_comp[i][2 * j][0],
2316                             pre->g_pre_comp[i][2 * j][1],
2317                             pre->g_pre_comp[i][2 * j][2],
2318                             pre->g_pre_comp[i][1][0],
2319                             pre->g_pre_comp[i][1][1],
2320                             pre->g_pre_comp[i][1][2]);
2321         }
2322     }
2323     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2324
2325  done:
2326     SETPRECOMP(group, nistp256, pre);
2327     pre = NULL;
2328     ret = 1;
2329
2330  err:
2331     BN_CTX_end(ctx);
2332     EC_POINT_free(generator);
2333     BN_CTX_free(new_ctx);
2334     EC_nistp256_pre_comp_free(pre);
2335     return ret;
2336 }
2337
2338 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2339 {
2340     return HAVEPRECOMP(group, nistp256);
2341 }
2342 #endif