1 /* crypto/ec/ecp_nistp521.c */
3 * Written by Adam Langley (Google) for the OpenSSL project
5 /* Copyright 2011 Google Inc.
7 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
29 #ifdef EC_NISTP_64_GCC_128
33 #include <openssl/err.h>
36 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
37 /* even with gcc, the typedef won't work for 32-bit platforms */
38 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
40 #error "Need GCC 3.1 or later to define type uint128_t"
47 /* The underlying field.
49 * P521 operates over GF(2^521-1). We can serialise an element of this field
50 * into 66 bytes where the most significant byte contains only a single bit. We
51 * call this an felem_bytearray. */
53 typedef u8 felem_bytearray[66];
55 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
56 * These values are big-endian. */
57 static const felem_bytearray nistp521_curve_params[5] =
59 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
60 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
61 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
62 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
63 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
64 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
65 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
66 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77 {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
78 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
79 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
80 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
81 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
82 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
83 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
84 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
86 {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
87 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
88 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
89 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
90 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
91 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
92 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
93 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
95 {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
96 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
97 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
98 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
99 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
100 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
101 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
102 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
106 /* The representation of field elements.
107 * ------------------------------------
109 * We represent field elements with nine values. These values are either 64 or
110 * 128 bits and the field element represented is:
111 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
112 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
113 * 58 bits apart, but are greater than 58 bits in length, the most significant
114 * bits of each limb overlap with the least significant bits of the next.
116 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
121 typedef uint64_t limb;
122 typedef limb felem[NLIMBS];
123 typedef uint128_t largefelem[NLIMBS];
125 static const limb bottom57bits = 0x1ffffffffffffff;
126 static const limb bottom58bits = 0x3ffffffffffffff;
128 /* bin66_to_felem takes a little-endian byte array and converts it into felem
129 * form. This assumes that the CPU is little-endian. */
130 static void bin66_to_felem(felem out, const u8 in[66])
132 out[0] = (*((limb*) &in[0])) & bottom58bits;
133 out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
134 out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
135 out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
136 out[4] = (*((limb*) &in[29])) & bottom58bits;
137 out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
138 out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
139 out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
140 out[8] = (*((limb*) &in[58])) & bottom57bits;
143 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
144 * array. This assumes that the CPU is little-endian. */
145 static void felem_to_bin66(u8 out[66], const felem in)
148 (*((limb*) &out[0])) = in[0];
149 (*((limb*) &out[7])) |= in[1] << 2;
150 (*((limb*) &out[14])) |= in[2] << 4;
151 (*((limb*) &out[21])) |= in[3] << 6;
152 (*((limb*) &out[29])) = in[4];
153 (*((limb*) &out[36])) |= in[5] << 2;
154 (*((limb*) &out[43])) |= in[6] << 4;
155 (*((limb*) &out[50])) |= in[7] << 6;
156 (*((limb*) &out[58])) = in[8];
159 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
160 static void flip_endian(u8 *out, const u8 *in, unsigned len)
163 for (i = 0; i < len; ++i)
164 out[i] = in[len-1-i];
167 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
168 static int BN_to_felem(felem out, const BIGNUM *bn)
170 felem_bytearray b_in;
171 felem_bytearray b_out;
174 /* BN_bn2bin eats leading zeroes */
175 memset(b_out, 0, sizeof b_out);
176 num_bytes = BN_num_bytes(bn);
177 if (num_bytes > sizeof b_out)
179 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
182 if (BN_is_negative(bn))
184 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
187 num_bytes = BN_bn2bin(bn, b_in);
188 flip_endian(b_out, b_in, num_bytes);
189 bin66_to_felem(out, b_out);
193 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
194 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
196 felem_bytearray b_in, b_out;
197 felem_to_bin66(b_in, in);
198 flip_endian(b_out, b_in, sizeof b_out);
199 return BN_bin2bn(b_out, sizeof b_out, out);
204 * ---------------- */
206 static void felem_one(felem out)
219 static void felem_assign(felem out, const felem in)
232 /* felem_sum64 sets out = out + in. */
233 static void felem_sum64(felem out, const felem in)
246 /* felem_scalar sets out = in * scalar */
247 static void felem_scalar(felem out, const felem in, limb scalar)
249 out[0] = in[0] * scalar;
250 out[1] = in[1] * scalar;
251 out[2] = in[2] * scalar;
252 out[3] = in[3] * scalar;
253 out[4] = in[4] * scalar;
254 out[5] = in[5] * scalar;
255 out[6] = in[6] * scalar;
256 out[7] = in[7] * scalar;
257 out[8] = in[8] * scalar;
260 /* felem_scalar64 sets out = out * scalar */
261 static void felem_scalar64(felem out, limb scalar)
274 /* felem_scalar128 sets out = out * scalar */
275 static void felem_scalar128(largefelem out, limb scalar)
288 /* felem_neg sets |out| to |-in|
290 * in[i] < 2^59 + 2^14
294 static void felem_neg(felem out, const felem in)
296 /* In order to prevent underflow, we subtract from 0 mod p. */
297 static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
298 static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
300 out[0] = two62m3 - in[0];
301 out[1] = two62m2 - in[1];
302 out[2] = two62m2 - in[2];
303 out[3] = two62m2 - in[3];
304 out[4] = two62m2 - in[4];
305 out[5] = two62m2 - in[5];
306 out[6] = two62m2 - in[6];
307 out[7] = two62m2 - in[7];
308 out[8] = two62m2 - in[8];
311 /* felem_diff64 subtracts |in| from |out|
313 * in[i] < 2^59 + 2^14
315 * out[i] < out[i] + 2^62
317 static void felem_diff64(felem out, const felem in)
319 /* In order to prevent underflow, we add 0 mod p before subtracting. */
320 static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
321 static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
323 out[0] += two62m3 - in[0];
324 out[1] += two62m2 - in[1];
325 out[2] += two62m2 - in[2];
326 out[3] += two62m2 - in[3];
327 out[4] += two62m2 - in[4];
328 out[5] += two62m2 - in[5];
329 out[6] += two62m2 - in[6];
330 out[7] += two62m2 - in[7];
331 out[8] += two62m2 - in[8];
334 /* felem_diff_128_64 subtracts |in| from |out|
336 * in[i] < 2^62 + 2^17
338 * out[i] < out[i] + 2^63
340 static void felem_diff_128_64(largefelem out, const felem in)
342 // In order to prevent underflow, we add 0 mod p before subtracting.
343 static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
344 static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
346 out[0] += two63m6 - in[0];
347 out[1] += two63m5 - in[1];
348 out[2] += two63m5 - in[2];
349 out[3] += two63m5 - in[3];
350 out[4] += two63m5 - in[4];
351 out[5] += two63m5 - in[5];
352 out[6] += two63m5 - in[6];
353 out[7] += two63m5 - in[7];
354 out[8] += two63m5 - in[8];
357 /* felem_diff_128_64 subtracts |in| from |out|
361 * out[i] < out[i] + 2^127 - 2^69
363 static void felem_diff128(largefelem out, const largefelem in)
365 // In order to prevent underflow, we add 0 mod p before subtracting.
366 static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
367 static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
369 out[0] += (two127m70 - in[0]);
370 out[1] += (two127m69 - in[1]);
371 out[2] += (two127m69 - in[2]);
372 out[3] += (two127m69 - in[3]);
373 out[4] += (two127m69 - in[4]);
374 out[5] += (two127m69 - in[5]);
375 out[6] += (two127m69 - in[6]);
376 out[7] += (two127m69 - in[7]);
377 out[8] += (two127m69 - in[8]);
380 /* felem_square sets |out| = |in|^2
384 * out[i] < 17 * max(in[i]) * max(in[i])
386 static void felem_square(largefelem out, const felem in)
389 felem_scalar(inx2, in, 2);
390 felem_scalar(inx4, in, 4);
392 /* We have many cases were we want to do
395 * This is obviously just
397 * However, rather than do the doubling on the 128 bit result, we
398 * double one of the inputs to the multiplication by reading from
401 out[0] = ((uint128_t) in[0]) * in[0];
402 out[1] = ((uint128_t) in[0]) * inx2[1];
403 out[2] = ((uint128_t) in[0]) * inx2[2] +
404 ((uint128_t) in[1]) * in[1];
405 out[3] = ((uint128_t) in[0]) * inx2[3] +
406 ((uint128_t) in[1]) * inx2[2];
407 out[4] = ((uint128_t) in[0]) * inx2[4] +
408 ((uint128_t) in[1]) * inx2[3] +
409 ((uint128_t) in[2]) * in[2];
410 out[5] = ((uint128_t) in[0]) * inx2[5] +
411 ((uint128_t) in[1]) * inx2[4] +
412 ((uint128_t) in[2]) * inx2[3];
413 out[6] = ((uint128_t) in[0]) * inx2[6] +
414 ((uint128_t) in[1]) * inx2[5] +
415 ((uint128_t) in[2]) * inx2[4] +
416 ((uint128_t) in[3]) * in[3];
417 out[7] = ((uint128_t) in[0]) * inx2[7] +
418 ((uint128_t) in[1]) * inx2[6] +
419 ((uint128_t) in[2]) * inx2[5] +
420 ((uint128_t) in[3]) * inx2[4];
421 out[8] = ((uint128_t) in[0]) * inx2[8] +
422 ((uint128_t) in[1]) * inx2[7] +
423 ((uint128_t) in[2]) * inx2[6] +
424 ((uint128_t) in[3]) * inx2[5] +
425 ((uint128_t) in[4]) * in[4];
427 /* The remaining limbs fall above 2^521, with the first falling at
428 * 2^522. They correspond to locations one bit up from the limbs
429 * produced above so we would have to multiply by two to align them.
430 * Again, rather than operate on the 128-bit result, we double one of
431 * the inputs to the multiplication. If we want to double for both this
432 * reason, and the reason above, then we end up multiplying by four. */
435 out[0] += ((uint128_t) in[1]) * inx4[8] +
436 ((uint128_t) in[2]) * inx4[7] +
437 ((uint128_t) in[3]) * inx4[6] +
438 ((uint128_t) in[4]) * inx4[5];
441 out[1] += ((uint128_t) in[2]) * inx4[8] +
442 ((uint128_t) in[3]) * inx4[7] +
443 ((uint128_t) in[4]) * inx4[6] +
444 ((uint128_t) in[5]) * inx2[5];
447 out[2] += ((uint128_t) in[3]) * inx4[8] +
448 ((uint128_t) in[4]) * inx4[7] +
449 ((uint128_t) in[5]) * inx4[6];
452 out[3] += ((uint128_t) in[4]) * inx4[8] +
453 ((uint128_t) in[5]) * inx4[7] +
454 ((uint128_t) in[6]) * inx2[6];
457 out[4] += ((uint128_t) in[5]) * inx4[8] +
458 ((uint128_t) in[6]) * inx4[7];
461 out[5] += ((uint128_t) in[6]) * inx4[8] +
462 ((uint128_t) in[7]) * inx2[7];
465 out[6] += ((uint128_t) in[7]) * inx4[8];
468 out[7] += ((uint128_t) in[8]) * inx2[8];
471 /* felem_mul sets |out| = |in1| * |in2|
476 * out[i] < 17 * max(in1[i]) * max(in2[i])
478 static void felem_mul(largefelem out, const felem in1, const felem in2)
481 felem_scalar(in2x2, in2, 2);
483 out[0] = ((uint128_t) in1[0]) * in2[0];
485 out[1] = ((uint128_t) in1[0]) * in2[1] +
486 ((uint128_t) in1[1]) * in2[0];
488 out[2] = ((uint128_t) in1[0]) * in2[2] +
489 ((uint128_t) in1[1]) * in2[1] +
490 ((uint128_t) in1[2]) * in2[0];
492 out[3] = ((uint128_t) in1[0]) * in2[3] +
493 ((uint128_t) in1[1]) * in2[2] +
494 ((uint128_t) in1[2]) * in2[1] +
495 ((uint128_t) in1[3]) * in2[0];
497 out[4] = ((uint128_t) in1[0]) * in2[4] +
498 ((uint128_t) in1[1]) * in2[3] +
499 ((uint128_t) in1[2]) * in2[2] +
500 ((uint128_t) in1[3]) * in2[1] +
501 ((uint128_t) in1[4]) * in2[0];
503 out[5] = ((uint128_t) in1[0]) * in2[5] +
504 ((uint128_t) in1[1]) * in2[4] +
505 ((uint128_t) in1[2]) * in2[3] +
506 ((uint128_t) in1[3]) * in2[2] +
507 ((uint128_t) in1[4]) * in2[1] +
508 ((uint128_t) in1[5]) * in2[0];
510 out[6] = ((uint128_t) in1[0]) * in2[6] +
511 ((uint128_t) in1[1]) * in2[5] +
512 ((uint128_t) in1[2]) * in2[4] +
513 ((uint128_t) in1[3]) * in2[3] +
514 ((uint128_t) in1[4]) * in2[2] +
515 ((uint128_t) in1[5]) * in2[1] +
516 ((uint128_t) in1[6]) * in2[0];
518 out[7] = ((uint128_t) in1[0]) * in2[7] +
519 ((uint128_t) in1[1]) * in2[6] +
520 ((uint128_t) in1[2]) * in2[5] +
521 ((uint128_t) in1[3]) * in2[4] +
522 ((uint128_t) in1[4]) * in2[3] +
523 ((uint128_t) in1[5]) * in2[2] +
524 ((uint128_t) in1[6]) * in2[1] +
525 ((uint128_t) in1[7]) * in2[0];
527 out[8] = ((uint128_t) in1[0]) * in2[8] +
528 ((uint128_t) in1[1]) * in2[7] +
529 ((uint128_t) in1[2]) * in2[6] +
530 ((uint128_t) in1[3]) * in2[5] +
531 ((uint128_t) in1[4]) * in2[4] +
532 ((uint128_t) in1[5]) * in2[3] +
533 ((uint128_t) in1[6]) * in2[2] +
534 ((uint128_t) in1[7]) * in2[1] +
535 ((uint128_t) in1[8]) * in2[0];
537 /* See comment in felem_square about the use of in2x2 here */
539 out[0] += ((uint128_t) in1[1]) * in2x2[8] +
540 ((uint128_t) in1[2]) * in2x2[7] +
541 ((uint128_t) in1[3]) * in2x2[6] +
542 ((uint128_t) in1[4]) * in2x2[5] +
543 ((uint128_t) in1[5]) * in2x2[4] +
544 ((uint128_t) in1[6]) * in2x2[3] +
545 ((uint128_t) in1[7]) * in2x2[2] +
546 ((uint128_t) in1[8]) * in2x2[1];
548 out[1] += ((uint128_t) in1[2]) * in2x2[8] +
549 ((uint128_t) in1[3]) * in2x2[7] +
550 ((uint128_t) in1[4]) * in2x2[6] +
551 ((uint128_t) in1[5]) * in2x2[5] +
552 ((uint128_t) in1[6]) * in2x2[4] +
553 ((uint128_t) in1[7]) * in2x2[3] +
554 ((uint128_t) in1[8]) * in2x2[2];
556 out[2] += ((uint128_t) in1[3]) * in2x2[8] +
557 ((uint128_t) in1[4]) * in2x2[7] +
558 ((uint128_t) in1[5]) * in2x2[6] +
559 ((uint128_t) in1[6]) * in2x2[5] +
560 ((uint128_t) in1[7]) * in2x2[4] +
561 ((uint128_t) in1[8]) * in2x2[3];
563 out[3] += ((uint128_t) in1[4]) * in2x2[8] +
564 ((uint128_t) in1[5]) * in2x2[7] +
565 ((uint128_t) in1[6]) * in2x2[6] +
566 ((uint128_t) in1[7]) * in2x2[5] +
567 ((uint128_t) in1[8]) * in2x2[4];
569 out[4] += ((uint128_t) in1[5]) * in2x2[8] +
570 ((uint128_t) in1[6]) * in2x2[7] +
571 ((uint128_t) in1[7]) * in2x2[6] +
572 ((uint128_t) in1[8]) * in2x2[5];
574 out[5] += ((uint128_t) in1[6]) * in2x2[8] +
575 ((uint128_t) in1[7]) * in2x2[7] +
576 ((uint128_t) in1[8]) * in2x2[6];
578 out[6] += ((uint128_t) in1[7]) * in2x2[8] +
579 ((uint128_t) in1[8]) * in2x2[7];
581 out[7] += ((uint128_t) in1[8]) * in2x2[8];
584 static const limb bottom52bits = 0xfffffffffffff;
586 /* felem_reduce converts a largefelem to an felem.
590 * out[i] < 2^59 + 2^14
592 static void felem_reduce(felem out, const largefelem in)
594 out[0] = ((limb) in[0]) & bottom58bits;
595 out[1] = ((limb) in[1]) & bottom58bits;
596 out[2] = ((limb) in[2]) & bottom58bits;
597 out[3] = ((limb) in[3]) & bottom58bits;
598 out[4] = ((limb) in[4]) & bottom58bits;
599 out[5] = ((limb) in[5]) & bottom58bits;
600 out[6] = ((limb) in[6]) & bottom58bits;
601 out[7] = ((limb) in[7]) & bottom58bits;
602 out[8] = ((limb) in[8]) & bottom58bits;
606 out[1] += ((limb) in[0]) >> 58;
607 out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
608 /* out[1] < 2^58 + 2^6 + 2^58
610 out[2] += ((limb) (in[0] >> 64)) >> 52;
612 out[2] += ((limb) in[1]) >> 58;
613 out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
614 out[3] += ((limb) (in[1] >> 64)) >> 52;
616 out[3] += ((limb) in[2]) >> 58;
617 out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
618 out[4] += ((limb) (in[2] >> 64)) >> 52;
620 out[4] += ((limb) in[3]) >> 58;
621 out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
622 out[5] += ((limb) (in[3] >> 64)) >> 52;
624 out[5] += ((limb) in[4]) >> 58;
625 out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
626 out[6] += ((limb) (in[4] >> 64)) >> 52;
628 out[6] += ((limb) in[5]) >> 58;
629 out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
630 out[7] += ((limb) (in[5] >> 64)) >> 52;
632 out[7] += ((limb) in[6]) >> 58;
633 out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
634 out[8] += ((limb) (in[6] >> 64)) >> 52;
636 out[8] += ((limb) in[7]) >> 58;
637 out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
638 /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
640 u64 overflow1 = ((limb) (in[7] >> 64)) >> 52;
642 overflow1 += ((limb) in[8]) >> 58;
643 overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
644 u64 overflow2 = ((limb) (in[8] >> 64)) >> 52;
646 overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
647 overflow2 <<= 1; /* overflow2 < 2^13 */
649 out[0] += overflow1; /* out[0] < 2^60 */
650 out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
652 out[1] += out[0] >> 58; out[0] &= bottom58bits;
654 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
658 static void felem_square_reduce(felem out, const felem in)
661 felem_square(tmp, in);
662 felem_reduce(out, tmp);
665 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
668 felem_mul(tmp, in1, in2);
669 felem_reduce(out, tmp);
672 /* felem_inv calculates |out| = |in|^{-1}
674 * Based on Fermat's Little Theorem:
676 * a^{p-1} = 1 (mod p)
677 * a^{p-2} = a^{-1} (mod p)
679 static void felem_inv(felem out, const felem in)
681 felem ftmp, ftmp2, ftmp3, ftmp4;
685 felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2^1 */
686 felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
687 felem_assign(ftmp2, ftmp);
688 felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
689 felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
690 felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
692 felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
693 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
694 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
696 felem_assign(ftmp2, ftmp3);
697 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
698 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
699 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
700 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
701 felem_assign(ftmp4, ftmp3);
702 felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
703 felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
704 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
705 felem_assign(ftmp2, ftmp3);
707 for (i = 0; i < 8; i++)
709 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
711 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
712 felem_assign(ftmp2, ftmp3);
714 for (i = 0; i < 16; i++)
716 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
718 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
719 felem_assign(ftmp2, ftmp3);
721 for (i = 0; i < 32; i++)
723 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
725 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
726 felem_assign(ftmp2, ftmp3);
728 for (i = 0; i < 64; i++)
730 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
732 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
733 felem_assign(ftmp2, ftmp3);
735 for (i = 0; i < 128; i++)
737 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
739 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
740 felem_assign(ftmp2, ftmp3);
742 for (i = 0; i < 256; i++)
744 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
746 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
748 for (i = 0; i < 9; i++)
750 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
752 felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
753 felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp); /* 2^512 - 3 */
756 /* This is 2^521-1, expressed as an felem */
757 static const felem kPrime =
759 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
760 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
761 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
764 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
767 * in[i] < 2^59 + 2^14
769 static limb felem_is_zero(const felem in)
773 felem_assign(ftmp, in);
775 ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
777 ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
778 ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
779 ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
780 ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
781 ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
782 ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
783 ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
784 ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
785 /* ftmp[8] < 2^57 + 4 */
787 /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
788 * greater than our bound for ftmp[8]. Therefore we only have to check
789 * if the zero is zero or 2^521-1. */
803 // We know that ftmp[i] < 2^63, therefore the only way that the top bit
804 // can be set is if is_zero was 0 before the decrement.
805 is_zero = ((s64) is_zero) >> 63;
807 is_p = ftmp[0] ^ kPrime[0];
808 is_p |= ftmp[1] ^ kPrime[1];
809 is_p |= ftmp[2] ^ kPrime[2];
810 is_p |= ftmp[3] ^ kPrime[3];
811 is_p |= ftmp[4] ^ kPrime[4];
812 is_p |= ftmp[5] ^ kPrime[5];
813 is_p |= ftmp[6] ^ kPrime[6];
814 is_p |= ftmp[7] ^ kPrime[7];
815 is_p |= ftmp[8] ^ kPrime[8];
818 is_p = ((s64) is_p) >> 63;
824 static int felem_is_zero_int(const felem in)
826 return (int) (felem_is_zero(in) & ((limb)1));
829 /* felem_contract converts |in| to its unique, minimal representation.
831 * in[i] < 2^59 + 2^14
833 static void felem_contract(felem out, const felem in)
835 limb is_p, is_greater, sign;
836 static const limb two58 = ((limb)1) << 58;
838 felem_assign(out, in);
840 out[0] += out[8] >> 57; out[8] &= bottom57bits;
842 out[1] += out[0] >> 58; out[0] &= bottom58bits;
843 out[2] += out[1] >> 58; out[1] &= bottom58bits;
844 out[3] += out[2] >> 58; out[2] &= bottom58bits;
845 out[4] += out[3] >> 58; out[3] &= bottom58bits;
846 out[5] += out[4] >> 58; out[4] &= bottom58bits;
847 out[6] += out[5] >> 58; out[5] &= bottom58bits;
848 out[7] += out[6] >> 58; out[6] &= bottom58bits;
849 out[8] += out[7] >> 58; out[7] &= bottom58bits;
850 /* out[8] < 2^57 + 4 */
852 /* If the value is greater than 2^521-1 then we have to subtract
853 * 2^521-1 out. See the comments in felem_is_zero regarding why we
854 * don't test for other multiples of the prime. */
856 /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
858 is_p = out[0] ^ kPrime[0];
859 is_p |= out[1] ^ kPrime[1];
860 is_p |= out[2] ^ kPrime[2];
861 is_p |= out[3] ^ kPrime[3];
862 is_p |= out[4] ^ kPrime[4];
863 is_p |= out[5] ^ kPrime[5];
864 is_p |= out[6] ^ kPrime[6];
865 is_p |= out[7] ^ kPrime[7];
866 is_p |= out[8] ^ kPrime[8];
875 is_p = ((s64) is_p) >> 63;
878 /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
890 /* In order to test that |out| >= 2^521-1 we need only test if out[8]
891 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
892 is_greater = out[8] >> 57;
893 is_greater |= is_greater << 32;
894 is_greater |= is_greater << 16;
895 is_greater |= is_greater << 8;
896 is_greater |= is_greater << 4;
897 is_greater |= is_greater << 2;
898 is_greater |= is_greater << 1;
899 is_greater = ((s64) is_greater) >> 63;
901 out[0] -= kPrime[0] & is_greater;
902 out[1] -= kPrime[1] & is_greater;
903 out[2] -= kPrime[2] & is_greater;
904 out[3] -= kPrime[3] & is_greater;
905 out[4] -= kPrime[4] & is_greater;
906 out[5] -= kPrime[5] & is_greater;
907 out[6] -= kPrime[6] & is_greater;
908 out[7] -= kPrime[7] & is_greater;
909 out[8] -= kPrime[8] & is_greater;
911 /* Eliminate negative coefficients */
912 sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
913 sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
914 sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
915 sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
916 sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
917 sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
918 sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
919 sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
920 sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
921 sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
922 sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
928 * Building on top of the field operations we have the operations on the
929 * elliptic curve group itself. Points on the curve are represented in Jacobian
932 /* point_double calcuates 2*(x_in, y_in, z_in)
934 * The method is taken from:
935 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
937 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
938 * while x_out == y_in is not (maybe this works, but it's not tested). */
940 point_double(felem x_out, felem y_out, felem z_out,
941 const felem x_in, const felem y_in, const felem z_in)
943 largefelem tmp, tmp2;
944 felem delta, gamma, beta, alpha, ftmp, ftmp2;
946 felem_assign(ftmp, x_in);
947 felem_assign(ftmp2, x_in);
950 felem_square(tmp, z_in);
951 felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
954 felem_square(tmp, y_in);
955 felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
958 felem_mul(tmp, x_in, gamma);
959 felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
961 /* alpha = 3*(x-delta)*(x+delta) */
962 felem_diff64(ftmp, delta);
964 felem_sum64(ftmp2, delta);
965 /* ftmp2[i] < 2^60 + 2^15 */
966 felem_scalar64(ftmp2, 3);
967 /* ftmp2[i] < 3*2^60 + 3*2^15 */
968 felem_mul(tmp, ftmp, ftmp2);
969 /* tmp[i] < 17(3*2^121 + 3*2^76)
970 * = 61*2^121 + 61*2^76
971 * < 64*2^121 + 64*2^76
974 felem_reduce(alpha, tmp);
976 /* x' = alpha^2 - 8*beta */
977 felem_square(tmp, alpha);
980 felem_assign(ftmp, beta);
981 felem_scalar64(ftmp, 8);
982 /* ftmp[i] < 2^62 + 2^17 */
983 felem_diff_128_64(tmp, ftmp);
984 /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
985 felem_reduce(x_out, tmp);
987 /* z' = (y + z)^2 - gamma - delta */
988 felem_sum64(delta, gamma);
989 /* delta[i] < 2^60 + 2^15 */
990 felem_assign(ftmp, y_in);
991 felem_sum64(ftmp, z_in);
992 /* ftmp[i] < 2^60 + 2^15 */
993 felem_square(tmp, ftmp);
994 /* tmp[i] < 17(2^122)
996 felem_diff_128_64(tmp, delta);
997 /* tmp[i] < 2^127 + 2^63 */
998 felem_reduce(z_out, tmp);
1000 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1001 felem_scalar64(beta, 4);
1002 /* beta[i] < 2^61 + 2^16 */
1003 felem_diff64(beta, x_out);
1004 /* beta[i] < 2^61 + 2^60 + 2^16 */
1005 felem_mul(tmp, alpha, beta);
1006 /* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1007 * = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1008 * = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1010 felem_square(tmp2, gamma);
1011 /* tmp2[i] < 17*(2^59 + 2^14)^2
1012 * = 17*(2^118 + 2^74 + 2^28) */
1013 felem_scalar128(tmp2, 8);
1014 /* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1015 * = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1017 felem_diff128(tmp, tmp2);
1018 /* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1019 * = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1020 * 2^74 + 2^69 + 2^34 + 2^30
1022 felem_reduce(y_out, tmp);
1025 /* copy_conditional copies in to out iff mask is all ones. */
1027 copy_conditional(felem out, const felem in, limb mask)
1030 for (i = 0; i < NLIMBS; ++i)
1032 const limb tmp = mask & (in[i] ^ out[i]);
1037 /* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1039 * The method is taken from
1040 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1041 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1043 * This function includes a branch for checking whether the two input points
1044 * are equal (while not equal to the point at infinity). This case never
1045 * happens during single point multiplication, so there is no timing leak for
1046 * ECDH or ECDSA signing. */
1047 static void point_add(felem x3, felem y3, felem z3,
1048 const felem x1, const felem y1, const felem z1,
1049 const int mixed, const felem x2, const felem y2, const felem z2)
1051 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1052 largefelem tmp, tmp2;
1053 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1055 z1_is_zero = felem_is_zero(z1);
1056 z2_is_zero = felem_is_zero(z2);
1058 /* ftmp = z1z1 = z1**2 */
1059 felem_square(tmp, z1);
1060 felem_reduce(ftmp, tmp);
1064 /* ftmp2 = z2z2 = z2**2 */
1065 felem_square(tmp, z2);
1066 felem_reduce(ftmp2, tmp);
1068 /* u1 = ftmp3 = x1*z2z2 */
1069 felem_mul(tmp, x1, ftmp2);
1070 felem_reduce(ftmp3, tmp);
1072 /* ftmp5 = z1 + z2 */
1073 felem_assign(ftmp5, z1);
1074 felem_sum64(ftmp5, z2);
1075 /* ftmp5[i] < 2^61 */
1077 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1078 felem_square(tmp, ftmp5);
1079 /* tmp[i] < 17*2^122 */
1080 felem_diff_128_64(tmp, ftmp);
1081 /* tmp[i] < 17*2^122 + 2^63 */
1082 felem_diff_128_64(tmp, ftmp2);
1083 /* tmp[i] < 17*2^122 + 2^64 */
1084 felem_reduce(ftmp5, tmp);
1086 /* ftmp2 = z2 * z2z2 */
1087 felem_mul(tmp, ftmp2, z2);
1088 felem_reduce(ftmp2, tmp);
1090 /* s1 = ftmp6 = y1 * z2**3 */
1091 felem_mul(tmp, y1, ftmp2);
1092 felem_reduce(ftmp6, tmp);
1096 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1098 /* u1 = ftmp3 = x1*z2z2 */
1099 felem_assign(ftmp3, x1);
1101 /* ftmp5 = 2*z1z2 */
1102 felem_scalar(ftmp5, z1, 2);
1104 /* s1 = ftmp6 = y1 * z2**3 */
1105 felem_assign(ftmp6, y1);
1109 felem_mul(tmp, x2, ftmp);
1110 /* tmp[i] < 17*2^120 */
1112 /* h = ftmp4 = u2 - u1 */
1113 felem_diff_128_64(tmp, ftmp3);
1114 /* tmp[i] < 17*2^120 + 2^63 */
1115 felem_reduce(ftmp4, tmp);
1117 x_equal = felem_is_zero(ftmp4);
1119 /* z_out = ftmp5 * h */
1120 felem_mul(tmp, ftmp5, ftmp4);
1121 felem_reduce(z_out, tmp);
1123 /* ftmp = z1 * z1z1 */
1124 felem_mul(tmp, ftmp, z1);
1125 felem_reduce(ftmp, tmp);
1127 /* s2 = tmp = y2 * z1**3 */
1128 felem_mul(tmp, y2, ftmp);
1129 /* tmp[i] < 17*2^120 */
1131 /* r = ftmp5 = (s2 - s1)*2 */
1132 felem_diff_128_64(tmp, ftmp6);
1133 /* tmp[i] < 17*2^120 + 2^63 */
1134 felem_reduce(ftmp5, tmp);
1135 y_equal = felem_is_zero(ftmp5);
1136 felem_scalar64(ftmp5, 2);
1137 /* ftmp5[i] < 2^61 */
1139 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1141 point_double(x3, y3, z3, x1, y1, z1);
1145 /* I = ftmp = (2h)**2 */
1146 felem_assign(ftmp, ftmp4);
1147 felem_scalar64(ftmp, 2);
1148 /* ftmp[i] < 2^61 */
1149 felem_square(tmp, ftmp);
1150 /* tmp[i] < 17*2^122 */
1151 felem_reduce(ftmp, tmp);
1153 /* J = ftmp2 = h * I */
1154 felem_mul(tmp, ftmp4, ftmp);
1155 felem_reduce(ftmp2, tmp);
1157 /* V = ftmp4 = U1 * I */
1158 felem_mul(tmp, ftmp3, ftmp);
1159 felem_reduce(ftmp4, tmp);
1161 /* x_out = r**2 - J - 2V */
1162 felem_square(tmp, ftmp5);
1163 /* tmp[i] < 17*2^122 */
1164 felem_diff_128_64(tmp, ftmp2);
1165 /* tmp[i] < 17*2^122 + 2^63 */
1166 felem_assign(ftmp3, ftmp4);
1167 felem_scalar64(ftmp4, 2);
1168 /* ftmp4[i] < 2^61 */
1169 felem_diff_128_64(tmp, ftmp4);
1170 /* tmp[i] < 17*2^122 + 2^64 */
1171 felem_reduce(x_out, tmp);
1173 /* y_out = r(V-x_out) - 2 * s1 * J */
1174 felem_diff64(ftmp3, x_out);
1175 /* ftmp3[i] < 2^60 + 2^60
1177 felem_mul(tmp, ftmp5, ftmp3);
1178 /* tmp[i] < 17*2^122 */
1179 felem_mul(tmp2, ftmp6, ftmp2);
1180 /* tmp2[i] < 17*2^120 */
1181 felem_scalar128(tmp2, 2);
1182 /* tmp2[i] < 17*2^121 */
1183 felem_diff128(tmp, tmp2);
1184 /* tmp[i] < 2^127 - 2^69 + 17*2^122
1185 * = 2^126 - 2^122 - 2^6 - 2^2 - 1
1187 felem_reduce(y_out, tmp);
1189 copy_conditional(x_out, x2, z1_is_zero);
1190 copy_conditional(x_out, x1, z2_is_zero);
1191 copy_conditional(y_out, y2, z1_is_zero);
1192 copy_conditional(y_out, y1, z2_is_zero);
1193 copy_conditional(z_out, z2, z1_is_zero);
1194 copy_conditional(z_out, z1, z2_is_zero);
1195 felem_assign(x3, x_out);
1196 felem_assign(y3, y_out);
1197 felem_assign(z3, z_out);
1200 /* Base point pre computation
1201 * --------------------------
1203 * Two different sorts of precomputed tables are used in the following code.
1204 * Each contain various points on the curve, where each point is three field
1205 * elements (x, y, z).
1207 * For the base point table, z is usually 1 (0 for the point at infinity).
1208 * This table has 16 elements:
1209 * index | bits | point
1210 * ------+---------+------------------------------
1213 * 2 | 0 0 1 0 | 2^130G
1214 * 3 | 0 0 1 1 | (2^130 + 1)G
1215 * 4 | 0 1 0 0 | 2^260G
1216 * 5 | 0 1 0 1 | (2^260 + 1)G
1217 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1218 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1219 * 8 | 1 0 0 0 | 2^390G
1220 * 9 | 1 0 0 1 | (2^390 + 1)G
1221 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1222 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1223 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1224 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1225 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1226 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1228 * The reason for this is so that we can clock bits into four different
1229 * locations when doing simple scalar multiplies against the base point.
1231 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1233 /* gmul is the table of precomputed base points */
1234 static const felem gmul[16][3] =
1235 {{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1236 {0, 0, 0, 0, 0, 0, 0, 0, 0},
1237 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1238 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1239 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1240 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1241 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1242 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1243 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1244 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1245 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1246 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1247 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1248 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1249 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1250 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1251 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1252 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1253 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1254 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1255 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1256 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1257 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1258 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1259 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1260 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1261 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1262 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1263 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1264 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1265 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1266 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1267 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1268 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1269 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1270 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1271 0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1272 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1273 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1274 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1275 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1276 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1277 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1278 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1279 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1280 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1281 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1282 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1283 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1284 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1285 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1286 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1287 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1288 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1289 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1290 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1291 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1292 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1293 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1294 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1295 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1296 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1297 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1298 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1299 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1300 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1301 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1302 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1303 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1304 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1305 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1306 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1307 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1308 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1309 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1310 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1311 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1312 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1313 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1314 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1315 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1316 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1317 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1318 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1319 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1320 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1321 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1322 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1323 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1324 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1325 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1326 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1327 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1328 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1329 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1330 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1331 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1332 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1333 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1334 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1335 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1336 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1337 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1338 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1339 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1340 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1341 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1342 {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1344 /* select_point selects the |index|th point from a precomputation table and
1345 * copies it to out. */
1346 static void select_point(const limb index, unsigned int size, const felem pre_comp[size][3],
1350 limb *outlimbs = &out[0][0];
1351 memset(outlimbs, 0, 3 * sizeof(felem));
1353 for (i = 0; i < size; i++)
1355 const limb *inlimbs = &pre_comp[i][0][0];
1356 limb mask = i ^ index;
1362 for (j = 0; j < NLIMBS * 3; j++)
1363 outlimbs[j] |= inlimbs[j] & mask;
1367 /* get_bit returns the |i|th bit in |in| */
1368 static char get_bit(const felem_bytearray in, int i)
1372 return (in[i >> 3] >> (i & 7)) & 1;
1375 /* Interleaved point multiplication using precomputed point multiples:
1376 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1377 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1378 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1379 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1380 static void batch_mul(felem x_out, felem y_out, felem z_out,
1381 const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1382 const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1385 unsigned num, gen_mul = (g_scalar != NULL);
1386 felem nq[3], tmp[4];
1390 /* set nq to the point at infinity */
1391 memset(nq, 0, 3 * sizeof(felem));
1393 /* Loop over all scalars msb-to-lsb, interleaving additions
1394 * of multiples of the generator (last quarter of rounds)
1395 * and additions of other points multiples (every 5th round).
1397 skip = 1; /* save two point operations in the first round */
1398 for (i = (num_points ? 520 : 130); i >= 0; --i)
1402 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1404 /* add multiples of the generator */
1405 if (gen_mul && (i <= 130))
1407 bits = get_bit(g_scalar, i + 390) << 3;
1410 bits |= get_bit(g_scalar, i + 260) << 2;
1411 bits |= get_bit(g_scalar, i + 130) << 1;
1412 bits |= get_bit(g_scalar, i);
1414 /* select the point to add, in constant time */
1415 select_point(bits, 16, g_pre_comp, tmp);
1418 point_add(nq[0], nq[1], nq[2],
1419 nq[0], nq[1], nq[2],
1420 1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1424 memcpy(nq, tmp, 3 * sizeof(felem));
1429 /* do other additions every 5 doublings */
1430 if (num_points && (i % 5 == 0))
1432 /* loop over all scalars */
1433 for (num = 0; num < num_points; ++num)
1435 bits = get_bit(scalars[num], i + 4) << 5;
1436 bits |= get_bit(scalars[num], i + 3) << 4;
1437 bits |= get_bit(scalars[num], i + 2) << 3;
1438 bits |= get_bit(scalars[num], i + 1) << 2;
1439 bits |= get_bit(scalars[num], i) << 1;
1440 bits |= get_bit(scalars[num], i - 1);
1441 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1443 /* select the point to add or subtract, in constant time */
1444 select_point(digit, 17, pre_comp[num], tmp);
1445 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
1446 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1450 point_add(nq[0], nq[1], nq[2],
1451 nq[0], nq[1], nq[2],
1452 mixed, tmp[0], tmp[1], tmp[2]);
1456 memcpy(nq, tmp, 3 * sizeof(felem));
1462 felem_assign(x_out, nq[0]);
1463 felem_assign(y_out, nq[1]);
1464 felem_assign(z_out, nq[2]);
1468 /* Precomputation for the group generator. */
1470 felem g_pre_comp[16][3];
1472 } NISTP521_PRE_COMP;
1474 const EC_METHOD *EC_GFp_nistp521_method(void)
1476 static const EC_METHOD ret = {
1477 EC_FLAGS_DEFAULT_OCT,
1478 NID_X9_62_prime_field,
1479 ec_GFp_nistp521_group_init,
1480 ec_GFp_simple_group_finish,
1481 ec_GFp_simple_group_clear_finish,
1482 ec_GFp_nist_group_copy,
1483 ec_GFp_nistp521_group_set_curve,
1484 ec_GFp_simple_group_get_curve,
1485 ec_GFp_simple_group_get_degree,
1486 ec_GFp_simple_group_check_discriminant,
1487 ec_GFp_simple_point_init,
1488 ec_GFp_simple_point_finish,
1489 ec_GFp_simple_point_clear_finish,
1490 ec_GFp_simple_point_copy,
1491 ec_GFp_simple_point_set_to_infinity,
1492 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1493 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1494 ec_GFp_simple_point_set_affine_coordinates,
1495 ec_GFp_nistp521_point_get_affine_coordinates,
1496 0 /* point_set_compressed_coordinates */,
1501 ec_GFp_simple_invert,
1502 ec_GFp_simple_is_at_infinity,
1503 ec_GFp_simple_is_on_curve,
1505 ec_GFp_simple_make_affine,
1506 ec_GFp_simple_points_make_affine,
1507 ec_GFp_nistp521_points_mul,
1508 ec_GFp_nistp521_precompute_mult,
1509 ec_GFp_nistp521_have_precompute_mult,
1510 ec_GFp_nist_field_mul,
1511 ec_GFp_nist_field_sqr,
1513 0 /* field_encode */,
1514 0 /* field_decode */,
1515 0 /* field_set_to_one */ };
1521 /******************************************************************************/
1522 /* FUNCTIONS TO MANAGE PRECOMPUTATION
1525 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1527 NISTP521_PRE_COMP *ret = NULL;
1528 ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1531 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1534 memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1535 ret->references = 1;
1539 static void *nistp521_pre_comp_dup(void *src_)
1541 NISTP521_PRE_COMP *src = src_;
1543 /* no need to actually copy, these objects never change! */
1544 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1549 static void nistp521_pre_comp_free(void *pre_)
1552 NISTP521_PRE_COMP *pre = pre_;
1557 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1564 static void nistp521_pre_comp_clear_free(void *pre_)
1567 NISTP521_PRE_COMP *pre = pre_;
1572 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1576 OPENSSL_cleanse(pre, sizeof(*pre));
1580 /******************************************************************************/
1581 /* OPENSSL EC_METHOD FUNCTIONS
1584 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1587 ret = ec_GFp_simple_group_init(group);
1588 group->a_is_minus3 = 1;
1592 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1593 const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1596 BN_CTX *new_ctx = NULL;
1597 BIGNUM *curve_p, *curve_a, *curve_b;
1600 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1602 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1603 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1604 ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1605 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1606 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1607 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1608 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1609 (BN_cmp(curve_b, b)))
1611 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1612 EC_R_WRONG_CURVE_PARAMETERS);
1615 group->field_mod_func = BN_nist_mod_521;
1616 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1619 if (new_ctx != NULL)
1620 BN_CTX_free(new_ctx);
1624 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1625 * (X', Y') = (X/Z^2, Y/Z^3) */
1626 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1627 const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1629 felem z1, z2, x_in, y_in, x_out, y_out;
1632 if (EC_POINT_is_at_infinity(group, point))
1634 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1635 EC_R_POINT_AT_INFINITY);
1638 if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1639 (!BN_to_felem(z1, &point->Z))) return 0;
1641 felem_square(tmp, z2); felem_reduce(z1, tmp);
1642 felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1643 felem_contract(x_out, x_in);
1646 if (!felem_to_BN(x, x_out))
1648 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1652 felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1653 felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1654 felem_contract(y_out, y_in);
1657 if (!felem_to_BN(y, y_out))
1659 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1666 static void make_points_affine(size_t num, felem points[num][3], felem tmp_felems[num+1])
1668 /* Runs in constant time, unless an input is the point at infinity
1669 * (which normally shouldn't happen). */
1670 ec_GFp_nistp_points_make_affine_internal(
1675 (void (*)(void *)) felem_one,
1676 (int (*)(const void *)) felem_is_zero_int,
1677 (void (*)(void *, const void *)) felem_assign,
1678 (void (*)(void *, const void *)) felem_square_reduce,
1679 (void (*)(void *, const void *, const void *)) felem_mul_reduce,
1680 (void (*)(void *, const void *)) felem_inv,
1681 (void (*)(void *, const void *)) felem_contract);
1684 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1685 * Result is stored in r (r can equal one of the inputs). */
1686 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1687 const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1688 const BIGNUM *scalars[], BN_CTX *ctx)
1693 BN_CTX *new_ctx = NULL;
1694 BIGNUM *x, *y, *z, *tmp_scalar;
1695 felem_bytearray g_secret;
1696 felem_bytearray *secrets = NULL;
1697 felem (*pre_comp)[17][3] = NULL;
1698 felem *tmp_felems = NULL;
1699 felem_bytearray tmp;
1700 unsigned i, num_bytes;
1701 int have_pre_comp = 0;
1702 size_t num_points = num;
1703 felem x_in, y_in, z_in, x_out, y_out, z_out;
1704 NISTP521_PRE_COMP *pre = NULL;
1705 felem (*g_pre_comp)[3] = NULL;
1706 EC_POINT *generator = NULL;
1707 const EC_POINT *p = NULL;
1708 const BIGNUM *p_scalar = NULL;
1711 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1713 if (((x = BN_CTX_get(ctx)) == NULL) ||
1714 ((y = BN_CTX_get(ctx)) == NULL) ||
1715 ((z = BN_CTX_get(ctx)) == NULL) ||
1716 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1721 pre = EC_EX_DATA_get_data(group->extra_data,
1722 nistp521_pre_comp_dup, nistp521_pre_comp_free,
1723 nistp521_pre_comp_clear_free);
1725 /* we have precomputation, try to use it */
1726 g_pre_comp = &pre->g_pre_comp[0];
1728 /* try to use the standard precomputation */
1729 g_pre_comp = (felem (*)[3]) gmul;
1730 generator = EC_POINT_new(group);
1731 if (generator == NULL)
1733 /* get the generator from precomputation */
1734 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1735 !felem_to_BN(y, g_pre_comp[1][1]) ||
1736 !felem_to_BN(z, g_pre_comp[1][2]))
1738 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1741 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1742 generator, x, y, z, ctx))
1744 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1745 /* precomputation matches generator */
1748 /* we don't have valid precomputation:
1749 * treat the generator as a random point */
1755 if (num_points >= 2)
1757 /* unless we precompute multiples for just one point,
1758 * converting those into affine form is time well spent */
1761 secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1762 pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1764 tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1765 if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1767 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1771 /* we treat NULL scalars as 0, and NULL points as points at infinity,
1772 * i.e., they contribute nothing to the linear combination */
1773 memset(secrets, 0, num_points * sizeof(felem_bytearray));
1774 memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1775 for (i = 0; i < num_points; ++i)
1778 /* we didn't have a valid precomputation, so we pick
1781 p = EC_GROUP_get0_generator(group);
1785 /* the i^th point */
1788 p_scalar = scalars[i];
1790 if ((p_scalar != NULL) && (p != NULL))
1792 /* reduce scalar to 0 <= scalar < 2^521 */
1793 if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1795 /* this is an unusual input, and we don't guarantee
1796 * constant-timeness */
1797 if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1799 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1802 num_bytes = BN_bn2bin(tmp_scalar, tmp);
1805 num_bytes = BN_bn2bin(p_scalar, tmp);
1806 flip_endian(secrets[i], tmp, num_bytes);
1807 /* precompute multiples */
1808 if ((!BN_to_felem(x_out, &p->X)) ||
1809 (!BN_to_felem(y_out, &p->Y)) ||
1810 (!BN_to_felem(z_out, &p->Z))) goto err;
1811 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1812 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1813 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1814 for (j = 2; j <= 16; ++j)
1819 pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1820 pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1821 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1826 pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1827 pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1833 make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1836 /* the scalar for the generator */
1837 if ((scalar != NULL) && (have_pre_comp))
1839 memset(g_secret, 0, sizeof(g_secret));
1840 /* reduce scalar to 0 <= scalar < 2^521 */
1841 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1843 /* this is an unusual input, and we don't guarantee
1844 * constant-timeness */
1845 if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1847 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1850 num_bytes = BN_bn2bin(tmp_scalar, tmp);
1853 num_bytes = BN_bn2bin(scalar, tmp);
1854 flip_endian(g_secret, tmp, num_bytes);
1855 /* do the multiplication with generator precomputation*/
1856 batch_mul(x_out, y_out, z_out,
1857 (const felem_bytearray (*)) secrets, num_points,
1859 mixed, (const felem (*)[17][3]) pre_comp,
1860 (const felem (*)[3]) g_pre_comp);
1863 /* do the multiplication without generator precomputation */
1864 batch_mul(x_out, y_out, z_out,
1865 (const felem_bytearray (*)) secrets, num_points,
1866 NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1867 /* reduce the output to its unique minimal representation */
1868 felem_contract(x_in, x_out);
1869 felem_contract(y_in, y_out);
1870 felem_contract(z_in, z_out);
1871 if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1872 (!felem_to_BN(z, z_in)))
1874 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1877 ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1881 if (generator != NULL)
1882 EC_POINT_free(generator);
1883 if (new_ctx != NULL)
1884 BN_CTX_free(new_ctx);
1885 if (secrets != NULL)
1886 OPENSSL_free(secrets);
1887 if (pre_comp != NULL)
1888 OPENSSL_free(pre_comp);
1889 if (tmp_felems != NULL)
1890 OPENSSL_free(tmp_felems);
1894 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1897 NISTP521_PRE_COMP *pre = NULL;
1899 BN_CTX *new_ctx = NULL;
1901 EC_POINT *generator = NULL;
1902 felem tmp_felems[16];
1904 /* throw away old precomputation */
1905 EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1906 nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1908 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1910 if (((x = BN_CTX_get(ctx)) == NULL) ||
1911 ((y = BN_CTX_get(ctx)) == NULL))
1913 /* get the generator */
1914 if (group->generator == NULL) goto err;
1915 generator = EC_POINT_new(group);
1916 if (generator == NULL)
1918 BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
1919 BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
1920 if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1922 if ((pre = nistp521_pre_comp_new()) == NULL)
1924 /* if the generator is the standard one, use built-in precomputation */
1925 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1927 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1931 if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1932 (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1933 (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1935 /* compute 2^130*G, 2^260*G, 2^390*G */
1936 for (i = 1; i <= 4; i <<= 1)
1938 point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1939 pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1940 pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1941 for (j = 0; j < 129; ++j)
1943 point_double(pre->g_pre_comp[2*i][0],
1944 pre->g_pre_comp[2*i][1],
1945 pre->g_pre_comp[2*i][2],
1946 pre->g_pre_comp[2*i][0],
1947 pre->g_pre_comp[2*i][1],
1948 pre->g_pre_comp[2*i][2]);
1951 /* g_pre_comp[0] is the point at infinity */
1952 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1953 /* the remaining multiples */
1954 /* 2^130*G + 2^260*G */
1955 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1956 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1957 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1958 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1959 pre->g_pre_comp[2][2]);
1960 /* 2^130*G + 2^390*G */
1961 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1962 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1963 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1964 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1965 pre->g_pre_comp[2][2]);
1966 /* 2^260*G + 2^390*G */
1967 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
1968 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
1969 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1970 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
1971 pre->g_pre_comp[4][2]);
1972 /* 2^130*G + 2^260*G + 2^390*G */
1973 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
1974 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
1975 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1976 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1977 pre->g_pre_comp[2][2]);
1978 for (i = 1; i < 8; ++i)
1980 /* odd multiples: add G */
1981 point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
1982 pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
1983 pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
1984 0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
1985 pre->g_pre_comp[1][2]);
1987 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1989 if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
1990 nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
1996 if (generator != NULL)
1997 EC_POINT_free(generator);
1998 if (new_ctx != NULL)
1999 BN_CTX_free(new_ctx);
2001 nistp521_pre_comp_free(pre);
2005 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2007 if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2008 nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2016 static void *dummy=&dummy;