2 * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
10 /* Copyright 2023 IBM Corp.
12 * Licensed under the Apache License, Version 2.0 (the "License");
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
17 * http://www.apache.org/licenses/LICENSE-2.0
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an "AS IS" BASIS,
21 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
27 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29 * acceleration of felem_{square,mul} as supported in these files.
32 #include <openssl/e_os2.h>
35 #include <openssl/err.h>
38 #include "internal/numbers.h"
41 # error "Your compiler doesn't appear to support 128-bit integer types"
48 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49 * can serialize an element of this field into 48 bytes. We call this an
/* A serialized field element: 48 big- or little-endian bytes (384 bits). */
53 typedef u8 felem_bytearray[48];
56 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57 * These values are big-endian.
/*
 * Curve domain parameters, in order: prime p, coefficient a (= -3 mod p),
 * coefficient b, generator x, generator y.  All big-endian, 48 bytes each.
 * NOTE(review): the embedded original line numbers jump here (e.g. 49 -> 53),
 * so the extraction has dropped lines (closing comment markers, braces);
 * verify against the upstream file.
 */
59 static const felem_bytearray nistp384_curve_params[5] = {
60 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68 {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72 {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76 {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
83 * The representation of field elements.
84 * ------------------------------------
86 * We represent field elements with seven values. These values are either 64 or
87 * 128 bits and the field element represented is:
88 * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p)
89 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90 * 56 bits apart, but are greater than 56 bits in length, the most significant
91 * bits of each limb overlap with the least significant bits of the next
93 * This representation is considered to be 'redundant' in the sense that
94 * intermediate values can each contain more than a 56-bit value in each limb.
95 * Reduction causes all but the final limb to be reduced to contain a value less
96 * than 2^56, with the final value represented allowed to be larger than 2^384,
97 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98 * reduced value must of course be congruent to the unreduced value.
100 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101 * 'widefelem', featuring enough bits to store the result of a multiplication
102 * and even some further arithmetic without need for immediate reduction.
/* 64-bit limb; an felem is NLIMBS (7) of these. */
107 typedef uint64_t limb;
/* 128-bit limb for unreduced products (uint128_t is compiler-provided). */
108 typedef uint128_t widelimb;
/* Byte-aligned limb type: allows unaligned loads in (de)serialisation
 * without undefined behaviour from alignment assumptions. */
109 typedef limb limb_aX __attribute((__aligned__(1)));
110 typedef limb felem[NLIMBS];
/* 2*NLIMBS-1 = 13 wide limbs: enough for a full 7x7 limb product. */
111 typedef widelimb widefelem[2*NLIMBS-1];
/* Mask selecting the low 56 bits of a limb. */
113 static const limb bottom56bits = 0xffffffffffffff;
115 /* Helper functions (de)serialising reduced field elements in little endian */
/*
 * bin48_to_felem: unpack 48 little-endian bytes into seven 56-bit limbs.
 * Each limb k < 6 is loaded as a (possibly unaligned) 64-bit word starting
 * at byte 7*k and masked to its low 56 bits; the final limb gets the last
 * 6 bytes (48 bits) via memmove.
 * NOTE(review): original line numbers jump 116 -> 119 — the elided lines
 * presumably include the opening brace (and possibly an initialisation of
 * |out|, needed so out[6]'s top bytes are defined) — verify upstream.
 */
116 static void bin48_to_felem(felem out, const u8 in[48])
119 out[0] = (*((limb *) & in[0])) & bottom56bits;
120 out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
121 out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
122 out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
123 out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
124 out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
125 memmove(&out[6], &in[42], 6);
/*
 * felem_to_bin48: inverse of the above — pack seven 56-bit limbs into 48
 * little-endian bytes.  The |= at 7-byte offsets merges each limb's 56 bits
 * with the overlapping neighbour's bytes; this only works if |out| starts
 * zeroed.  NOTE(review): numbering jumps 128 -> 131, so the zeroing of
 * |out| (likely a memset) is among the elided lines — confirm upstream.
 */
128 static void felem_to_bin48(u8 out[48], const felem in)
131 (*((limb *) & out[0])) |= (in[0] & bottom56bits);
132 (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits);
133 (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
134 (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
135 (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
136 (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
137 memmove(&out[42], &in[6], 6);
140 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
/*
 * Returns 1 on success, 0 on failure (negative input or value too large
 * for 48 bytes — both reported as EC_R_BIGNUM_OUT_OF_RANGE).
 * NOTE(review): the return statements and the num_bytes declaration/check
 * are among the lines elided by the extraction (numbering jumps 147 -> 150,
 * 150 -> 152, 155 -> 159).
 */
141 static int BN_to_felem(felem out, const BIGNUM *bn)
143 felem_bytearray b_out;
/* Reject negative values: the field representation is unsigned. */
146 if (BN_is_negative(bn)) {
147 ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
/* BN_bn2lebinpad serialises little-endian, zero-padded to 48 bytes;
 * it fails (returns < 0) if bn needs more than 48 bytes. */
150 num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
152 ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
155 bin48_to_felem(out, b_out);
159 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
/* Serialise |in| to 48 LE bytes, then let BN_lebin2bn build the BIGNUM
 * (allocating one if |out| is NULL, per the BN API). */
160 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
162 felem_bytearray b_out;
164 felem_to_bin48(b_out, in);
165 return BN_lebin2bn(b_out, sizeof(b_out), out);
/*
 * felem_one sets out = 1 (limb 0 is set to 1 on an elided line — the
 * numbering jumps 173 -> 176; only the zeroing of limbs 1..6 is visible).
 */
173 static void felem_one(felem out)
176 memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
/* felem_assign: plain limb-wise copy, out = in. */
179 static void felem_assign(felem out, const felem in)
181 memcpy(out, in, sizeof(felem));
184 /* felem_sum64 sets out = out + in. */
/* No reduction: callers must track limb growth (each limb can gain 1 bit). */
185 static void felem_sum64(felem out, const felem in)
/* Loop body (out[i] += in[i];) is on an elided line — verify upstream. */
189 for (i = 0; i < NLIMBS; i++)
193 /* felem_scalar sets out = in * scalar */
/* Limb-wise multiply; callers must ensure no 64-bit overflow. */
194 static void felem_scalar(felem out, const felem in, limb scalar)
198 for (i = 0; i < NLIMBS; i++)
199 out[i] = in[i] * scalar;
202 /* felem_scalar64 sets out = out * scalar */
/* In-place variant; loop body elided (numbering jumps 207 -> 211). */
203 static void felem_scalar64(felem out, limb scalar)
207 for (i = 0; i < NLIMBS; i++)
211 /* felem_scalar128 sets out = out * scalar */
/* Same as felem_scalar64 but over all 13 wide limbs; body elided. */
212 static void felem_scalar128(widefelem out, limb scalar)
216 for (i = 0; i < 2*NLIMBS-1; i++)
221 * felem_neg sets |out| to |-in|
223 * in[i] < 2^60 - 2^29
227 static void felem_neg(felem out, const felem in)
230 * In order to prevent underflow, we add a multiple of p before subtracting.
231 * Use telescopic sums to represent 2^12 * p redundantly with each limb
232 * of the form 2^60 + ...
/*
 * Per-limb constants of the telescoped 2^12 * p.  Names encode the value,
 * e.g. two60m52m4 = 2^60 - 2^52 - 2^4.  NOTE(review): the continuation
 * lines of most initialisers are elided (numbering jumps 234 -> 237,
 * 240 -> 243, 243 -> 246) — verify the subtracted terms upstream.
 */
234 static const limb two60m52m4 = (((limb) 1) << 60)
237 static const limb two60p44m12 = (((limb) 1) << 60)
239 - (((limb) 1) << 12);
240 static const limb two60m28m4 = (((limb) 1) << 60)
243 static const limb two60m4 = (((limb) 1) << 60)
/* out = (2^12 * p) - in, limb by limb; since in[i] < 2^60 - 2^29 this
 * cannot underflow, and the result is congruent to -in mod p. */
246 out[0] = two60p44m12 - in[0];
247 out[1] = two60m52m4 - in[1];
248 out[2] = two60m28m4 - in[2];
249 out[3] = two60m4 - in[3];
250 out[4] = two60m4 - in[4];
251 out[5] = two60m4 - in[5];
252 out[6] = two60m4 - in[6];
256 * felem_diff64 subtracts |in| from |out|
258 * in[i] < 2^60 - 2^52 - 2^4
260 * out[i] < out_orig[i] + 2^60 + 2^44
262 static void felem_diff64(felem out, const felem in)
265 * In order to prevent underflow, we add a multiple of p before subtracting.
266 * Use telescopic sums to represent 2^12 * p redundantly with each limb
267 * of the form 2^60 + ...
/*
 * Same telescoped 2^12 * p constants as felem_neg (names encode values,
 * e.g. two60p44m12 = 2^60 + 2^44 - 2^12).  NOTE(review): most initialiser
 * continuation lines are elided by the extraction — verify upstream.
 */
270 static const limb two60m52m4 = (((limb) 1) << 60)
273 static const limb two60p44m12 = (((limb) 1) << 60)
275 - (((limb) 1) << 12);
276 static const limb two60m28m4 = (((limb) 1) << 60)
279 static const limb two60m4 = (((limb) 1) << 60)
/* out += (2^12 * p) - in: subtraction without underflow, result congruent
 * to out - in mod p.  Each limb grows by at most 2^60 + 2^44. */
282 out[0] += two60p44m12 - in[0];
283 out[1] += two60m52m4 - in[1];
284 out[2] += two60m28m4 - in[2];
285 out[3] += two60m4 - in[3];
286 out[4] += two60m4 - in[4];
287 out[5] += two60m4 - in[5];
288 out[6] += two60m4 - in[6];
293 * out[i] < out_orig[i] + 2^64 + 2^48
/*
 * felem_diff_128_64: subtract a 64-bit-limb element from a 128-bit-limb
 * accumulator (out -= in), adding 2^16 * p first to prevent underflow.
 */
295 static void felem_diff_128_64(widefelem out, const felem in)
298 * In order to prevent underflow, we add a multiple of p before subtracting.
299 * Use telescopic sums to represent 2^16 * p redundantly with each limb
300 * of the form 2^64 + ...
/* Names encode values, e.g. two64m56m8 = 2^64 - 2^56 - 2^8. */
303 static const widelimb two64m56m8 = (((widelimb) 1) << 64)
304 - (((widelimb) 1) << 56)
305 - (((widelimb) 1) << 8);
306 static const widelimb two64m32m8 = (((widelimb) 1) << 64)
307 - (((widelimb) 1) << 32)
308 - (((widelimb) 1) << 8);
309 static const widelimb two64m8 = (((widelimb) 1) << 64)
310 - (((widelimb) 1) << 8);
311 static const widelimb two64p48m16 = (((widelimb) 1) << 64)
312 + (((widelimb) 1) << 48)
313 - (((widelimb) 1) << 16);
/* Add the telescoped 2^16 * p limb-wise.  NOTE(review): the additions for
 * limbs 3..6 (+= two64m8) are elided here (numbering jumps 318 -> 324). */
316 out[0] += two64p48m16;
317 out[1] += two64m56m8;
318 out[2] += two64m32m8;
/* Then subtract |in|; loop body is on an elided line. */
324 for (i = 0; i < NLIMBS; i++)
329 * in[i] < 2^127 - 2^119 - 2^71
330 * out[i] < out_orig[i] + 2^127 + 2^111
/*
 * felem_diff128: out -= in over full 13-limb wide elements, adding
 * 2^415 * p first so no limb can underflow.
 */
332 static void felem_diff128(widefelem out, const widefelem in)
335 * In order to prevent underflow, we add a multiple of p before subtracting.
336 * Use telescopic sums to represent 2^415 * p redundantly with each limb
337 * of the form 2^127 + ...
/* Names encode values, e.g. two127m71 = 2^127 - 2^71. */
340 static const widelimb two127 = ((widelimb) 1) << 127;
341 static const widelimb two127m71 = (((widelimb) 1) << 127)
342 - (((widelimb) 1) << 71);
343 static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
344 + (((widelimb) 1) << 111)
345 - (((widelimb) 1) << 79)
346 - (((widelimb) 1) << 71);
347 static const widelimb two127m119m71 = (((widelimb) 1) << 127)
348 - (((widelimb) 1) << 119)
349 - (((widelimb) 1) << 71);
350 static const widelimb two127m95m71 = (((widelimb) 1) << 127)
351 - (((widelimb) 1) << 95)
352 - (((widelimb) 1) << 71);
/* Add the telescoped 2^415 * p limb-wise.  NOTE(review): the additions
 * for limbs 0..5 and 9 are elided (numbering jumps 352 -> 361, 363 -> 365);
 * only limbs 6..8 and 10..12 are visible — verify upstream. */
361 out[6] += two127p111m79m71;
362 out[7] += two127m119m71;
363 out[8] += two127m95m71;
365 out[10] += two127m71;
366 out[11] += two127m71;
367 out[12] += two127m71;
/* Then subtract |in|; loop body is on an elided line. */
369 for (i = 0; i < 2*NLIMBS-1; i++)
/*
 * felem_square_ref: schoolbook squaring, out = in^2 as 13 unreduced wide
 * limbs (out[k] = sum over i+j==k of in[i]*in[j]).  Symmetric cross terms
 * in[i]*in[j] + in[j]*in[i] are folded by precomputing inx2 = 2*in, so each
 * product appears once.  Reference (portable) implementation; may be
 * replaced by an asm version via the dispatch below.
 */
373 static void felem_square_ref(widefelem out, const felem in)
376 felem_scalar(inx2, in, 2);
378 out[0] = ((uint128_t) in[0]) * in[0];
380 out[1] = ((uint128_t) in[0]) * inx2[1];
382 out[2] = ((uint128_t) in[0]) * inx2[2]
383 + ((uint128_t) in[1]) * in[1];
385 out[3] = ((uint128_t) in[0]) * inx2[3]
386 + ((uint128_t) in[1]) * inx2[2];
388 out[4] = ((uint128_t) in[0]) * inx2[4]
389 + ((uint128_t) in[1]) * inx2[3]
390 + ((uint128_t) in[2]) * in[2];
392 out[5] = ((uint128_t) in[0]) * inx2[5]
393 + ((uint128_t) in[1]) * inx2[4]
394 + ((uint128_t) in[2]) * inx2[3];
396 out[6] = ((uint128_t) in[0]) * inx2[6]
397 + ((uint128_t) in[1]) * inx2[5]
398 + ((uint128_t) in[2]) * inx2[4]
399 + ((uint128_t) in[3]) * in[3];
401 out[7] = ((uint128_t) in[1]) * inx2[6]
402 + ((uint128_t) in[2]) * inx2[5]
403 + ((uint128_t) in[3]) * inx2[4];
405 out[8] = ((uint128_t) in[2]) * inx2[6]
406 + ((uint128_t) in[3]) * inx2[5]
407 + ((uint128_t) in[4]) * in[4];
409 out[9] = ((uint128_t) in[3]) * inx2[6]
410 + ((uint128_t) in[4]) * inx2[5];
412 out[10] = ((uint128_t) in[4]) * inx2[6]
413 + ((uint128_t) in[5]) * in[5];
415 out[11] = ((uint128_t) in[5]) * inx2[6];
417 out[12] = ((uint128_t) in[6]) * in[6];
/*
 * felem_mul_ref: schoolbook multiplication, out = in1 * in2 as 13 unreduced
 * wide limbs (out[k] = sum over i+j==k of in1[i]*in2[j]).  Reference
 * (portable) implementation; may be replaced by asm via the dispatch below.
 */
420 static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
422 out[0] = ((uint128_t) in1[0]) * in2[0];
424 out[1] = ((uint128_t) in1[0]) * in2[1]
425 + ((uint128_t) in1[1]) * in2[0];
427 out[2] = ((uint128_t) in1[0]) * in2[2]
428 + ((uint128_t) in1[1]) * in2[1]
429 + ((uint128_t) in1[2]) * in2[0];
431 out[3] = ((uint128_t) in1[0]) * in2[3]
432 + ((uint128_t) in1[1]) * in2[2]
433 + ((uint128_t) in1[2]) * in2[1]
434 + ((uint128_t) in1[3]) * in2[0];
436 out[4] = ((uint128_t) in1[0]) * in2[4]
437 + ((uint128_t) in1[1]) * in2[3]
438 + ((uint128_t) in1[2]) * in2[2]
439 + ((uint128_t) in1[3]) * in2[1]
440 + ((uint128_t) in1[4]) * in2[0];
442 out[5] = ((uint128_t) in1[0]) * in2[5]
443 + ((uint128_t) in1[1]) * in2[4]
444 + ((uint128_t) in1[2]) * in2[3]
445 + ((uint128_t) in1[3]) * in2[2]
446 + ((uint128_t) in1[4]) * in2[1]
447 + ((uint128_t) in1[5]) * in2[0];
449 out[6] = ((uint128_t) in1[0]) * in2[6]
450 + ((uint128_t) in1[1]) * in2[5]
451 + ((uint128_t) in1[2]) * in2[4]
452 + ((uint128_t) in1[3]) * in2[3]
453 + ((uint128_t) in1[4]) * in2[2]
454 + ((uint128_t) in1[5]) * in2[1]
455 + ((uint128_t) in1[6]) * in2[0];
457 out[7] = ((uint128_t) in1[1]) * in2[6]
458 + ((uint128_t) in1[2]) * in2[5]
459 + ((uint128_t) in1[3]) * in2[4]
460 + ((uint128_t) in1[4]) * in2[3]
461 + ((uint128_t) in1[5]) * in2[2]
462 + ((uint128_t) in1[6]) * in2[1];
464 out[8] = ((uint128_t) in1[2]) * in2[6]
465 + ((uint128_t) in1[3]) * in2[5]
466 + ((uint128_t) in1[4]) * in2[4]
467 + ((uint128_t) in1[5]) * in2[3]
468 + ((uint128_t) in1[6]) * in2[2];
470 out[9] = ((uint128_t) in1[3]) * in2[6]
471 + ((uint128_t) in1[4]) * in2[5]
472 + ((uint128_t) in1[5]) * in2[4]
473 + ((uint128_t) in1[6]) * in2[3];
475 out[10] = ((uint128_t) in1[4]) * in2[6]
476 + ((uint128_t) in1[5]) * in2[5]
477 + ((uint128_t) in1[6]) * in2[4];
479 out[11] = ((uint128_t) in1[5]) * in2[6]
480 + ((uint128_t) in1[6]) * in2[5];
482 out[12] = ((uint128_t) in1[6]) * in2[6];
486 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487 * in[i] < 2^128 - 2^125
488 * out[i] < 2^56 for i < 6,
491 * The technique in use here stems from the format of the prime modulus:
492 * P384 = 2^384 - delta
494 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496 * coefficients are still quite large, and so we repeatedly apply this
497 * technique on high-order bits in order to guarantee the desired bounds on
498 * the size of our output.
500 * The three phases of elimination are as follows:
501 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
502 * [2]: Y = 2^8 (acc[8] | acc[7])
503 * [3]: Y = 2^48 (acc[6] >> 48)
504 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
/*
 * NOTE(review): this is carefully bounded carry/elimination code; the
 * extraction has dropped several lines throughout (e.g. the additions of
 * the two124m68 constant to acc[3..6], the `temp = acc[6] >> 48` style
 * assignment before phase [3], and the final copy-out loop body).  Do not
 * modify without cross-checking the complete upstream function.
 */
506 static void felem_reduce(felem out, const widefelem in)
509 * In order to prevent underflow, we add a multiple of p before subtracting.
510 * Use telescopic sums to represent 2^76 * p redundantly with each limb
511 * of the form 2^124 + ...
/* Names encode values, e.g. two124m68 = 2^124 - 2^68. */
513 static const widelimb two124m68 = (((widelimb) 1) << 124)
514 - (((widelimb) 1) << 68);
515 static const widelimb two124m116m68 = (((widelimb) 1) << 124)
516 - (((widelimb) 1) << 116)
517 - (((widelimb) 1) << 68);
518 static const widelimb two124p108m76 = (((widelimb) 1) << 124)
519 + (((widelimb) 1) << 108)
520 - (((widelimb) 1) << 76);
521 static const widelimb two124m92m68 = (((widelimb) 1) << 124)
522 - (((widelimb) 1) << 92)
523 - (((widelimb) 1) << 68);
524 widelimb temp, acc[9];
/* Work on the low nine wide limbs; limbs 9..12 are folded in below. */
527 memcpy(acc, in, sizeof(widelimb) * 9);
/* Add the telescoped 2^76 * p so later subtractions cannot underflow.
 * NOTE(review): only acc[0..2] additions are visible; acc[3..6] lines
 * appear to be elided (numbering jumps 531 -> 537). */
529 acc[0] += two124p108m76;
530 acc[1] += two124m116m68;
531 acc[2] += two124m92m68;
537 /* [1]: Eliminate in[9], ..., in[12] */
/* Each block below substitutes 2^384 * in[k] with delta * in[k], where
 * delta = 2^128 + 2^96 - 2^32 + 1, split across the 56-bit limb grid. */
538 acc[8] += in[12] >> 32;
539 acc[7] += (in[12] & 0xffffffff) << 24;
540 acc[7] += in[12] >> 8;
541 acc[6] += (in[12] & 0xff) << 48;
542 acc[6] -= in[12] >> 16;
543 acc[5] -= (in[12] & 0xffff) << 40;
544 acc[6] += in[12] >> 48;
545 acc[5] += (in[12] & 0xffffffffffff) << 8;
547 acc[7] += in[11] >> 32;
548 acc[6] += (in[11] & 0xffffffff) << 24;
549 acc[6] += in[11] >> 8;
550 acc[5] += (in[11] & 0xff) << 48;
551 acc[5] -= in[11] >> 16;
552 acc[4] -= (in[11] & 0xffff) << 40;
553 acc[5] += in[11] >> 48;
554 acc[4] += (in[11] & 0xffffffffffff) << 8;
556 acc[6] += in[10] >> 32;
557 acc[5] += (in[10] & 0xffffffff) << 24;
558 acc[5] += in[10] >> 8;
559 acc[4] += (in[10] & 0xff) << 48;
560 acc[4] -= in[10] >> 16;
561 acc[3] -= (in[10] & 0xffff) << 40;
562 acc[4] += in[10] >> 48;
563 acc[3] += (in[10] & 0xffffffffffff) << 8;
565 acc[5] += in[9] >> 32;
566 acc[4] += (in[9] & 0xffffffff) << 24;
567 acc[4] += in[9] >> 8;
568 acc[3] += (in[9] & 0xff) << 48;
569 acc[3] -= in[9] >> 16;
570 acc[2] -= (in[9] & 0xffff) << 40;
571 acc[3] += in[9] >> 48;
572 acc[2] += (in[9] & 0xffffffffffff) << 8;
575 * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576 * well as the contributions made from eliminating higher limbs.
577 * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578 * acc[8] < in[8] + 2^96
580 acc[4] += acc[8] >> 32;
581 acc[3] += (acc[8] & 0xffffffff) << 24;
582 acc[3] += acc[8] >> 8;
583 acc[2] += (acc[8] & 0xff) << 48;
584 acc[2] -= acc[8] >> 16;
585 acc[1] -= (acc[8] & 0xffff) << 40;
586 acc[2] += acc[8] >> 48;
587 acc[1] += (acc[8] & 0xffffffffffff) << 8;
589 acc[3] += acc[7] >> 32;
590 acc[2] += (acc[7] & 0xffffffff) << 24;
591 acc[2] += acc[7] >> 8;
592 acc[1] += (acc[7] & 0xff) << 48;
593 acc[1] -= acc[7] >> 16;
594 acc[0] -= (acc[7] & 0xffff) << 40;
595 acc[1] += acc[7] >> 48;
596 acc[0] += (acc[7] & 0xffffffffffff) << 8;
599 * acc[k] < in[k] + 2^124 + 2^121
601 * < 2^128, for k <= 6
606 * This has the effect of ensuring that these more significant limbs
607 * will be small in value after eliminating high bits from acc[6].
/* Partial carry chain 4 -> 5 -> 6 before phase [3]. */
609 acc[5] += acc[4] >> 56;
610 acc[4] &= 0x00ffffffffffffff;
612 acc[6] += acc[5] >> 56;
613 acc[5] &= 0x00ffffffffffffff;
616 * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
621 /* [3]: Eliminate high bits of acc[6] */
/* NOTE(review): the assignment of |temp| from the high bits of acc[6]
 * is on an elided line (numbering jumps 623 -> 627) — verify upstream. */
623 acc[6] &= 0x0000ffffffffffff;
627 acc[3] += temp >> 40;
628 acc[2] += (temp & 0xffffffffff) << 16;
629 acc[2] += temp >> 16;
630 acc[1] += (temp & 0xffff) << 40;
631 acc[1] -= temp >> 24;
632 acc[0] -= (temp & 0xffffff) << 32;
636 * acc[k] < acc_old[k] + 2^64 + 2^56
637 * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
640 /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641 acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
642 acc[0] &= 0x00ffffffffffffff;
644 acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645 acc[1] &= 0x00ffffffffffffff;
647 acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648 acc[2] &= 0x00ffffffffffffff;
651 * acc[k] < acc_old[k] + 2^72 + 2^16
652 * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
657 acc[4] += acc[3] >> 56; /*-
658 * acc[4] < acc_old[4] + 2^72 + 2^16
659 * < 2^72 + 2^56 + 2^16
661 acc[3] &= 0x00ffffffffffffff;
663 acc[5] += acc[4] >> 56; /*-
664 * acc[5] < acc_old[5] + 2^16 + 1
667 acc[4] &= 0x00ffffffffffffff;
669 acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */
670 acc[5] &= 0x00ffffffffffffff;
/* Copy the reduced limbs out; loop body is on an elided line. */
672 for (i = 0; i < NLIMBS; i++)
/*
 * Runtime dispatch between the portable reference implementations and
 * platform asm (currently PPC64 with MADD300 + AltiVec).  When
 * ECP_NISTP384_ASM is defined, felem_square/felem_mul resolve through
 * function pointers that start at lazy-initialising wrappers; otherwise
 * they alias the _ref implementations directly.
 * NOTE(review): several lines ("# else", "# endif", the felem_select call
 * inside the wrappers, closing braces) are elided by the extraction.
 */
676 #if defined(ECP_NISTP384_ASM)
677 static void felem_square_wrapper(widefelem out, const felem in);
678 static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
/* Function pointers start at the wrappers, which select the best
 * implementation on first use and then re-dispatch. */
680 static void (*felem_square_p)(widefelem out, const felem in) =
681 felem_square_wrapper;
682 static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
/* asm implementations, defined elsewhere in the build */
685 void p384_felem_square(widefelem out, const felem in);
686 void p384_felem_mul(widefelem out, const felem in1, const felem in2);
688 # if defined(_ARCH_PPC64)
689 # include "crypto/ppc_arch.h"
/* Pick asm when the CPU supports it, else fall back to the reference code. */
692 static void felem_select(void)
694 # if defined(_ARCH_PPC64)
695 if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
696 felem_square_p = p384_felem_square;
697 felem_mul_p = p384_felem_mul;
/* default: unconditionally select the reference implementations */
704 felem_square_p = felem_square_ref;
705 felem_mul_p = felem_mul_ref;
/* First-call wrappers: elided lines presumably invoke felem_select()
 * before forwarding — verify upstream. */
708 static void felem_square_wrapper(widefelem out, const felem in)
711 felem_square_p(out, in);
714 static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
717 felem_mul_p(out, in1, in2);
720 # define felem_square felem_square_p
721 # define felem_mul felem_mul_p
723 # define felem_square felem_square_ref
724 # define felem_mul felem_mul_ref
/* Convenience: out = in^2 fully reduced (square into a wide temp, reduce). */
727 static ossl_inline void felem_square_reduce(felem out, const felem in)
731 felem_square(tmp, in);
732 felem_reduce(out, tmp);
/* Convenience: out = in1 * in2 fully reduced. */
735 static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
739 felem_mul(tmp, in1, in2);
740 felem_reduce(out, tmp);
744 * felem_inv calculates |out| = |in|^{-1}
746 * Based on Fermat's Little Theorem:
748 * a^{p-1} = 1 (mod p)
749 * a^{p-2} = a^{-1} (mod p)
/*
 * Fixed addition chain computing in^(p-2) mod p via repeated
 * square-and-multiply.  The trailing comments track which exponent bits
 * (powers of two) have been accumulated after each step; intermediate
 * products are cached in ftmp2..ftmp6 for reuse later in the chain.
 * Constant-time: the sequence of operations is independent of |in|.
 */
751 static void felem_inv(felem out, const felem in)
753 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
756 felem_square_reduce(ftmp, in); /* 2^1 */
757 felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */
758 felem_assign(ftmp2, ftmp);
760 felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */
761 felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */
762 felem_assign(ftmp3, ftmp);
764 for (i = 0; i < 3; i++)
765 felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
766 felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767 felem_assign(ftmp4, ftmp);
769 for (i = 0; i < 6; i++)
770 felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
771 felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
773 for (i = 0; i < 3; i++)
774 felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
775 felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
776 felem_assign(ftmp5, ftmp);
778 for (i = 0; i < 15; i++)
779 felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
780 felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
781 felem_assign(ftmp6, ftmp);
783 for (i = 0; i < 30; i++)
784 felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
785 felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
786 felem_assign(ftmp4, ftmp);
788 for (i = 0; i < 60; i++)
789 felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
790 felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
791 felem_assign(ftmp4, ftmp);
793 for (i = 0; i < 120; i++)
794 felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */
795 felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
797 for (i = 0; i < 15; i++)
798 felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
799 felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
801 for (i = 0; i < 31; i++)
802 felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
803 felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
805 for (i = 0; i < 2; i++)
806 felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807 felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
809 for (i = 0; i < 94; i++)
810 felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811 felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
813 for (i = 0; i < 2; i++)
814 felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815 felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
817 memcpy(out, ftmp, sizeof(felem));
821 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
822 * and 0 otherwise. We know that field elements are reduced to
823 * 0 < in < 2p, so we only need to check two cases:
824 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825 * in[k] < 2^56, k < 6
/*
 * Constant-time: both candidate representations (0 and p itself, spelled
 * out limb-by-limb below) are tested with branch-free mask arithmetic.
 */
828 static limb felem_is_zero(const felem in)
/* zero = all-ones iff every limb is 0 (x-1 >> 63 trick on the OR). */
832 zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
833 zero = ((int64_t) (zero) - 1) >> 63;
/* p384 = all-ones iff |in| equals the limb representation of p. */
834 p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
835 | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
836 | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
837 | (in[6] ^ 0xffffffffffff);
838 p384 = ((int64_t) (p384) - 1) >> 63;
840 return (zero | p384);
/* Boolean (0/1) view of felem_is_zero, for use as a generic callback. */
843 static int felem_is_zero_int(const void *in)
845 return (int)(felem_is_zero(in) & ((limb) 1));
849 * felem_contract converts |in| to its unique, minimal representation.
850 * Assume we've removed all redundant bits.
852 * in[k] < 2^56, k < 6
/*
 * Constant-time canonicalisation: subtract p (via branch-free masks) when
 * p <= |in| < 2p so the output is the unique value in [0, p).
 * NOTE(review): the extraction has elided several statements here — e.g.
 * the "Case 1" subtraction using |a| (numbering jumps 869 -> 875), the
 * tmp[0]/tmp[1] negative-coefficient fixups (879 -> 886 and 958 -> 964),
 * and the subtraction applied after the first |a| at 930 -> 935.  Do not
 * modify without the complete upstream function.
 */
855 static void felem_contract(felem out, const felem in)
857 static const int64_t two56 = ((limb) 1) << 56;
860 * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861 * Perform two successive, idempotent subtractions to reduce if |in| >= p.
/* tmp: signed working copy (limbs may go transiently negative);
 * cond[]: branch-free comparison masks; a: combined condition mask. */
864 int64_t tmp[NLIMBS], cond[5], a;
867 memcpy(tmp, in, sizeof(felem));
869 /* Case 1: a = 1 iff |in| >= 2^384 */
/* Keep only the low 48 bits of the top limb after handling overflow. */
875 tmp[6] &= 0x0000ffffffffffff;
878 * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879 * non-zero, so we only need one step
886 /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887 tmp[2] += tmp[1] >> 56;
888 tmp[1] &= 0x00ffffffffffffff;
890 tmp[3] += tmp[2] >> 56;
891 tmp[2] &= 0x00ffffffffffffff;
893 tmp[4] += tmp[3] >> 56;
894 tmp[3] &= 0x00ffffffffffffff;
896 tmp[5] += tmp[4] >> 56;
897 tmp[4] &= 0x00ffffffffffffff;
899 tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
900 tmp[5] &= 0x00ffffffffffffff;
903 * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
906 /* 0 iff (2^129..2^383) are all one */
907 cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
908 /* 0 iff 2^128 bit is one */
909 cond[1] = (tmp[2] | ~0x00000000010000) + 1;
910 /* 0 iff (2^96..2^127) bits are all one */
911 cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
912 /* 0 iff (2^32..2^95) bits are all zero */
913 cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
914 /* 0 iff (2^0..2^31) bits are all one */
915 cond[4] = (tmp[0] | 0xffffff00000000) + 1;
918 * In effect, invert our conditions, so that 0 values become all 1's,
919 * any non-zero value in the low-order 56 bits becomes all 0's
921 for (i = 0; i < 5; i++)
922 cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
925 * The condition for determining whether in is greater than our
926 * prime is given by the following condition.
929 /* First subtract 2^384 - 2^129 cheaply */
930 a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
/* Mask-select: clear the high limbs only when a is all-ones. */
935 tmp[2] &= ~a | 0x0000000001ffff;
938 * Subtract 2^128 - 2^96 by
939 * means of disjoint cases.
942 /* subtract 2^128 if that bit is present, and add 2^96 */
943 a = cond[0] & cond[1];
944 tmp[2] &= ~a | 0xfffffffffeffff;
945 tmp[1] += a & ((int64_t) 1 << 40);
947 /* otherwise, clear bits 2^127 .. 2^96 */
948 a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
949 tmp[2] &= ~a | 0xffffffffff0000;
950 tmp[1] &= ~a | 0x0000ffffffffff;
952 /* finally, subtract the last 2^32 - 1 */
953 a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
954 tmp[0] += a & (-((int64_t) 1 << 32) + 1);
957 * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958 * non-zero, so we only need one step
964 /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965 tmp[2] += tmp[1] >> 56;
966 tmp[1] &= 0x00ffffffffffffff;
968 tmp[3] += tmp[2] >> 56;
969 tmp[2] &= 0x00ffffffffffffff;
971 tmp[4] += tmp[3] >> 56;
972 tmp[3] &= 0x00ffffffffffffff;
974 tmp[5] += tmp[4] >> 56;
975 tmp[4] &= 0x00ffffffffffffff;
977 tmp[6] += tmp[5] >> 56;
978 tmp[5] &= 0x00ffffffffffffff;
980 memcpy(out, tmp, sizeof(felem));
987 * Building on top of the field operations we have the operations on the
988 * elliptic curve group itself. Points on the curve are represented in Jacobian
993 * point_double calculates 2*(x_in, y_in, z_in)
995 * The method is taken from:
996 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
998 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999 * while x_out == y_in is not (maybe this works, but it's not tested).
/*
 * Jacobian doubling (EFD dbl-2001-b), exploiting a = -3.  The trailing
 * comments track per-limb magnitude bounds so that every felem_* call
 * stays within its documented input range.  NOTE(review): the "static
 * void" return-type line of the signature (orig. line 1001) is elided by
 * the extraction, as are some closing braces.
 */
1002 point_double(felem x_out, felem y_out, felem z_out,
1003 const felem x_in, const felem y_in, const felem z_in)
1005 widefelem tmp, tmp2;
1006 felem delta, gamma, beta, alpha, ftmp, ftmp2;
/* Copies of x_in so x_out may alias x_in in the formulas below. */
1008 felem_assign(ftmp, x_in);
1009 felem_assign(ftmp2, x_in);
1012 felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */
1015 felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */
1017 /* beta = x*gamma */
1018 felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */
1020 /* alpha = 3*(x-delta)*(x+delta) */
1021 felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022 felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */
1023 felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */
1024 felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1026 /* x' = alpha^2 - 8*beta */
1027 felem_square(tmp, alpha); /* tmp[i] < 2^115 */
1028 felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */
1029 felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */
1030 felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031 felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1033 /* z' = (y + z)^2 - gamma - delta */
/* delta is repurposed to hold gamma+delta so one wide subtraction covers
 * both terms.  NOTE(review): only one felem_diff_128_64 is visible; if
 * upstream subtracts gamma and delta separately, a line is elided here. */
1034 felem_sum64(delta, gamma); /* delta[i] < 2^57 */
1035 felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */
1036 felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */
1037 felem_square(tmp, ftmp); /* tmp[i] < 2^115 */
1038 felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039 felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */
1041 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1042 felem_scalar64(beta, 4); /* beta[i] < 2^58 */
1043 felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */
1044 felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */
1045 felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */
1046 felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */
1047 felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048 felem_reduce(y_out, tmp); /* tmp[i] < 2^56 */
1051 /* copy_conditional copies in to out iff mask is all ones. */
/* Branch-free conditional move: mask must be 0 or all-ones; with mask
 * all-ones the XOR swap-select writes in[i], with 0 it leaves out[i]. */
1052 static void copy_conditional(felem out, const felem in, limb mask)
1056 for (i = 0; i < NLIMBS; i++)
1057 out[i] ^= mask & (in[i] ^ out[i]);
1061 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1063 * The method is taken from
1064 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1067 * This function includes a branch for checking whether the two input points
1068 * are equal (while not equal to the point at infinity). See comment below
/*
 * point_add: (x3, y3, z3) = (x1, y1, z1) + (x2, y2, z2) in Jacobian
 * coordinates.  When |mixed| is nonzero the second point is affine
 * (z2 = 1, or z2 = 0 for the point at infinity) and the z2-dependent
 * squarings/multiplications are replaced by cheap assignments.
 * The trailing bound comments track the maximum limb magnitude so that
 * no intermediate value overflows a widefelem limb before reduction.
 */
1071 static void point_add(felem x3, felem y3, felem z3,
1072 const felem x1, const felem y1, const felem z1,
1073 const int mixed, const felem x2, const felem y2,
1076 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1077 widefelem tmp, tmp2;
1078 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
/*
 * Point-at-infinity flags are computed up front; they feed both the
 * points_equal test and the constant-time copy_conditional fix-ups at
 * the end of the function.
 */
1081 z1_is_zero = felem_is_zero(z1);
1082 z2_is_zero = felem_is_zero(z2);
1084 /* ftmp = z1z1 = z1**2 */
1085 felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */
1088 /* ftmp2 = z2z2 = z2**2 */
1089 felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1091 /* u1 = ftmp3 = x1*z2z2 */
1092 felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1094 /* ftmp5 = z1 + z2 */
1095 felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */
1096 felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */
1098 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1099 felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1100 felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101 felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102 felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1104 /* ftmp2 = z2 * z2z2 */
1105 felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1107 /* s1 = ftmp6 = y1 * z2**3 */
1108 felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
/* Mixed-addition shortcut: with z2 = 1 the products above collapse. */
1111 * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1114 /* u1 = ftmp3 = x1*z2z2 */
1115 felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */
1117 /* ftmp5 = 2*z1z2 */
1118 felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */
1120 /* s1 = ftmp6 = y1 * z2**3 */
1121 felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */
1123 /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
/* u2 = x2*z1z1 */
1126 felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */
1128 /* h = ftmp4 = u2 - u1 */
1129 felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130 felem_reduce(ftmp4, tmp); /* ftmp4[i] < 2^56 */
1132 x_equal = felem_is_zero(ftmp4);
1134 /* z_out = ftmp5 * h */
1135 felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */
1137 /* ftmp = z1 * z1z1 */
1138 felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */
1140 /* s2 = tmp = y2 * z1**3 */
1141 felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */
1143 /* r = ftmp5 = (s2 - s1)*2 */
1144 felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145 felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1146 y_equal = felem_is_zero(ftmp5);
1147 felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */
1150 * The formulae are incorrect if the points are equal, in affine coordinates
1151 * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1154 * We use bitwise operations to avoid potential side-channels introduced by
1155 * the short-circuiting behaviour of boolean operators.
1157 * The special case of either point being the point at infinity (z1 and/or
1158 * z2 are zero), is handled separately later on in this function, so we
1159 * avoid jumping to point_double here in those special cases.
1161 * Notice the comment below on the implications of this branching for timing
1162 * leaks and why it is considered practically irrelevant.
1164 points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1168 * This is obviously not constant-time but it will almost-never happen
/* Equal (finite) points: the addition formulae degenerate, so double. */
1171 point_double(x3, y3, z3, x1, y1, z1);
1175 /* I = ftmp = (2h)**2 */
1176 felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */
1177 felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */
1178 felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */
1180 /* J = ftmp2 = h * I */
1181 felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1183 /* V = ftmp4 = U1 * I */
1184 felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1186 /* x_out = r**2 - J - 2V */
1187 felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1188 felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189 felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */
1190 felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */
1191 felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192 felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1194 /* y_out = r(V-x_out) - 2 * s1 * J */
1195 felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196 felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */
1197 felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1198 felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */
1199 felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200 felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */
/*
 * Constant-time handling of the point at infinity: if either input is
 * infinity, the result is the other input (selected without branching).
 */
1202 copy_conditional(x_out, x2, z1_is_zero);
1203 copy_conditional(x_out, x1, z2_is_zero);
1204 copy_conditional(y_out, y2, z1_is_zero);
1205 copy_conditional(y_out, y1, z2_is_zero);
1206 copy_conditional(z_out, z2, z1_is_zero);
1207 copy_conditional(z_out, z1, z2_is_zero);
1208 felem_assign(x3, x_out);
1209 felem_assign(y3, y_out);
1210 felem_assign(z3, z_out);
1214 * Base point pre computation
1215 * --------------------------
1217 * Two different sorts of precomputed tables are used in the following code.
1218 * Each contains various points on the curve, where each point is three field
1219 * elements (x, y, z).
1221 * For the base point table, z is usually 1 (0 for the point at infinity).
1222 * This table has 16 elements:
1223 * index | bits | point
1224 * ------+---------+------------------------------
1227 * 2 | 0 0 1 0 | 2^95G
1228 * 3 | 0 0 1 1 | (2^95 + 1)G
1229 * 4 | 0 1 0 0 | 2^190G
1230 * 5 | 0 1 0 1 | (2^190 + 1)G
1231 * 6 | 0 1 1 0 | (2^190 + 2^95)G
1232 * 7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1233 * 8 | 1 0 0 0 | 2^285G
1234 * 9 | 1 0 0 1 | (2^285 + 1)G
1235 * 10 | 1 0 1 0 | (2^285 + 2^95)G
1236 * 11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1237 * 12 | 1 1 0 0 | (2^285 + 2^190)G
1238 * 13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1239 * 14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1240 * 15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1242 * The reason for this is so that we can clock bits into four different
1243 * locations when doing simple scalar multiplies against the base point.
1245 * Tables for other points have table[i] = iG for i in 0 .. 16.
1248 /* gmul is the table of precomputed base points */
/*
 * Each entry is a point (X, Y, Z): Z = 1 for real points and all-zero for
 * the point at infinity at index 0.  Each coordinate is stored as seven
 * little-endian limbs of at most 56 bits (see the felem layout described
 * at the top of this file), so {1, 0, 0, 0, 0, 0, 0} encodes the value 1.
 */
1249 static const felem gmul[16][3] = {
1250 {{0, 0, 0, 0, 0, 0, 0},
1251 {0, 0, 0, 0, 0, 0, 0},
1252 {0, 0, 0, 0, 0, 0, 0}},
1253 {{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1254 0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1255 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1256 0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1257 {1, 0, 0, 0, 0, 0, 0}},
1258 {{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1259 0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1260 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1261 0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1262 {1, 0, 0, 0, 0, 0, 0}},
1263 {{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1264 0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1265 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1266 0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1267 {1, 0, 0, 0, 0, 0, 0}},
1268 {{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1269 0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1270 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1271 0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1272 {1, 0, 0, 0, 0, 0, 0}},
1273 {{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1274 0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1275 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1276 0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1277 {1, 0, 0, 0, 0, 0, 0}},
1278 {{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1279 0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1280 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1281 0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1282 {1, 0, 0, 0, 0, 0, 0}},
1283 {{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1284 0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1285 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1286 0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1287 {1, 0, 0, 0, 0, 0, 0}},
1288 {{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1289 0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1290 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1291 0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1292 {1, 0, 0, 0, 0, 0, 0}},
1293 {{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1294 0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1295 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1296 0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1297 {1, 0, 0, 0, 0, 0, 0}},
1298 {{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1299 0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1300 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1301 0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1302 {1, 0, 0, 0, 0, 0, 0}},
1303 {{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1304 0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1305 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1306 0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1307 {1, 0, 0, 0, 0, 0, 0}},
1308 {{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1309 0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1310 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1311 0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1312 {1, 0, 0, 0, 0, 0, 0}},
1313 {{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1314 0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1315 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1316 0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1317 {1, 0, 0, 0, 0, 0, 0}},
1318 {{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1319 0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1320 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1321 0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1322 {1, 0, 0, 0, 0, 0, 0}},
1323 {{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1324 0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1325 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1326 0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1327 {1, 0, 0, 0, 0, 0, 0}}
1331 * select_point selects the |idx|th point from a precomputation table and
1334 * pre_comp below is of the size provided in |size|.
/*
 * Constant-time table lookup: every entry of the table is read and
 * combined under a mask, so the memory access pattern is independent of
 * the (secret) index |idx|.
 */
1336 static void select_point(const limb idx, unsigned int size,
1337 const felem pre_comp[][3], felem out[3])
1340 limb *outlimbs = &out[0][0];
/* Start from zero; the masked ORs below accumulate the selected entry. */
1342 memset(out, 0, sizeof(*out) * 3);
1344 for (i = 0; i < size; i++) {
1345 const limb *inlimbs = &pre_comp[i][0][0];
1346 limb mask = i ^ idx; /* zero iff i == idx; NOTE(review): expanded to an all-ones/all-zeros mask before use -- confirm against full source */
1353 for (j = 0; j < NLIMBS * 3; j++)
1354 outlimbs[j] |= inlimbs[j] & mask;
1358 /* get_bit returns the |i|th bit in |in| */
1359 static char get_bit(const felem_bytearray in, int i)
1361 if (i < 0 || i >= 384)
1363 return (in[i >> 3] >> (i & 7)) & 1;
1367 * Interleaved point multiplication using precomputed point multiples: The
1368 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1369 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370 * generator, using certain (large) precomputed multiples in g_pre_comp.
1371 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1373 static void batch_mul(felem x_out, felem y_out, felem z_out,
1374 const felem_bytearray scalars[],
1375 const unsigned int num_points, const u8 *g_scalar,
1376 const int mixed, const felem pre_comp[][17][3],
1377 const felem g_pre_comp[16][3])
1380 unsigned int num, gen_mul = (g_scalar != NULL);
1381 felem nq[3], tmp[4];
1385 /* set nq to the point at infinity */
1386 memset(nq, 0, sizeof(nq));
1389 * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390 * of the generator (last quarter of rounds) and additions of other
1391 * points multiples (every 5th round).
1393 skip = 1; /* save two point operations in the first
1395 for (i = (num_points ? 380 : 98); i >= 0; --i) {
/* double once per iteration (accumulator is in nq) */
1398 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1400 /* add multiples of the generator */
1401 if (gen_mul && (i <= 98)) {
/*
 * Comb method: one bit from each 96-bit quarter of the scalar,
 * matching the (2^285, 2^190, 2^95, 1)G layout of g_pre_comp.
 */
1402 bits = get_bit(g_scalar, i + 285) << 3;
1404 bits |= get_bit(g_scalar, i + 190) << 2;
1405 bits |= get_bit(g_scalar, i + 95) << 1;
1406 bits |= get_bit(g_scalar, i);
1408 /* select the point to add, in constant time */
1409 select_point(bits, 16, g_pre_comp, tmp);
1411 /* The 1 argument below is for "mixed" */
1412 point_add(nq[0], nq[1], nq[2],
1413 nq[0], nq[1], nq[2], 1,
1414 tmp[0], tmp[1], tmp[2]);
/* first addition: nq was infinity, so just copy the selected point */
1416 memcpy(nq, tmp, 3 * sizeof(felem));
1421 /* do other additions every 5 doublings */
1422 if (num_points && (i % 5 == 0)) {
1423 /* loop over all scalars */
1424 for (num = 0; num < num_points; ++num) {
/*
 * Signed 5-bit window: bits i-1 .. i+4 are recoded into a
 * sign and a digit in [0, 16] by
 * ossl_ec_GFp_nistp_recode_scalar_bits below.
 */
1425 bits = get_bit(scalars[num], i + 4) << 5;
1426 bits |= get_bit(scalars[num], i + 3) << 4;
1427 bits |= get_bit(scalars[num], i + 2) << 3;
1428 bits |= get_bit(scalars[num], i + 1) << 2;
1429 bits |= get_bit(scalars[num], i) << 1;
1430 bits |= get_bit(scalars[num], i - 1);
1431 ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1434 * select the point to add or subtract, in constant time
1436 select_point(digit, 17, pre_comp[num], tmp);
1437 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
/* conditionally negate Y, without branching on the secret sign */
1439 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1442 point_add(nq[0], nq[1], nq[2],
1443 nq[0], nq[1], nq[2], mixed,
1444 tmp[0], tmp[1], tmp[2]);
1446 memcpy(nq, tmp, 3 * sizeof(felem));
1452 felem_assign(x_out, nq[0]);
1453 felem_assign(y_out, nq[1]);
1454 felem_assign(z_out, nq[2]);
1457 /* Precomputation for the group generator. */
1458 struct nistp384_pre_comp_st {
/* comb table of generator multiples, same layout as the static gmul[] */
1459 felem g_pre_comp[16][3];
/* reference count; the struct is shared between EC_GROUP copies */
1460 CRYPTO_REF_COUNT references;
/*
 * Returns the EC_METHOD vtable for this constant-time P-384
 * implementation.  Most entries delegate to the generic "simple"
 * implementations; the nistp384-specific entries override scalar
 * multiplication, curve setup and affine-coordinate retrieval.
 */
1463 const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1465 static const EC_METHOD ret = {
1466 EC_FLAGS_DEFAULT_OCT,
1467 NID_X9_62_prime_field,
1468 ossl_ec_GFp_nistp384_group_init,
1469 ossl_ec_GFp_simple_group_finish,
1470 ossl_ec_GFp_simple_group_clear_finish,
1471 ossl_ec_GFp_nist_group_copy,
1472 ossl_ec_GFp_nistp384_group_set_curve,
1473 ossl_ec_GFp_simple_group_get_curve,
1474 ossl_ec_GFp_simple_group_get_degree,
1475 ossl_ec_group_simple_order_bits,
1476 ossl_ec_GFp_simple_group_check_discriminant,
1477 ossl_ec_GFp_simple_point_init,
1478 ossl_ec_GFp_simple_point_finish,
1479 ossl_ec_GFp_simple_point_clear_finish,
1480 ossl_ec_GFp_simple_point_copy,
1481 ossl_ec_GFp_simple_point_set_to_infinity,
1482 ossl_ec_GFp_simple_point_set_affine_coordinates,
1483 ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1484 0, /* point_set_compressed_coordinates */
1487 ossl_ec_GFp_simple_add,
1488 ossl_ec_GFp_simple_dbl,
1489 ossl_ec_GFp_simple_invert,
1490 ossl_ec_GFp_simple_is_at_infinity,
1491 ossl_ec_GFp_simple_is_on_curve,
1492 ossl_ec_GFp_simple_cmp,
1493 ossl_ec_GFp_simple_make_affine,
1494 ossl_ec_GFp_simple_points_make_affine,
1495 ossl_ec_GFp_nistp384_points_mul,
1496 ossl_ec_GFp_nistp384_precompute_mult,
1497 ossl_ec_GFp_nistp384_have_precompute_mult,
1498 ossl_ec_GFp_nist_field_mul,
1499 ossl_ec_GFp_nist_field_sqr,
1501 ossl_ec_GFp_simple_field_inv,
1502 0, /* field_encode */
1503 0, /* field_decode */
1504 0, /* field_set_to_one */
1505 ossl_ec_key_simple_priv2oct,
1506 ossl_ec_key_simple_oct2priv,
1507 0, /* set private */
1508 ossl_ec_key_simple_generate_key,
1509 ossl_ec_key_simple_check_key,
1510 ossl_ec_key_simple_generate_public_key,
1513 ossl_ecdh_simple_compute_key,
1514 ossl_ecdsa_simple_sign_setup,
1515 ossl_ecdsa_simple_sign_sig,
1516 ossl_ecdsa_simple_verify_sig,
1517 0, /* field_inverse_mod_ord */
1518 0, /* blind_coordinates */
1520 0, /* ladder_step */
1527 /******************************************************************************/
1529 * FUNCTIONS TO MANAGE PRECOMPUTATION
/*
 * Allocates a zeroed precomputation record with its reference count
 * initialised to 1.  Returns NULL on allocation or refcount failure.
 */
1532 static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1534 NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1539 if (!CRYPTO_NEW_REF(&ret->references, 1)) {
/*
 * "Duplicates" a precomputation record by bumping its reference count;
 * no deep copy is made (the table is immutable once built).
 */
1546 NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1551 CRYPTO_UP_REF(&p->references, &i);
/*
 * Drops one reference to |p|; the record (and its refcount) is released
 * only when the count reaches zero.  Safe to call concurrently with dup.
 */
1555 void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1562 CRYPTO_DOWN_REF(&p->references, &i);
1563 REF_PRINT_COUNT("ossl_ec_nistp384", p);
1566 REF_ASSERT_ISNT(i < 0);
1568 CRYPTO_FREE_REF(&p->references);
1572 /******************************************************************************/
1574 * OPENSSL EC_METHOD FUNCTIONS
/*
 * Group initialisation: delegates to the generic init, then records that
 * P-384 has a = -3 so the specialised doubling formulae may be used.
 */
1577 int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1581 ret = ossl_ec_GFp_simple_group_init(group);
1582 group->a_is_minus3 = 1;
/*
 * Installs the curve parameters, but only if they are exactly the P-384
 * parameters this specialised implementation was built for; any other
 * (p, a, b) is rejected with EC_R_WRONG_CURVE_PARAMETERS.
 */
1586 int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1587 const BIGNUM *a, const BIGNUM *b,
1591 BIGNUM *curve_p, *curve_a, *curve_b;
1593 BN_CTX *new_ctx = NULL;
/* caller may pass ctx == NULL; then a temporary BN_CTX is created */
1596 ctx = new_ctx = BN_CTX_new();
/* BN_CTX_get never fails before an earlier failure, so checking only
 * the last result covers all three (standard BN_CTX idiom) */
1602 curve_p = BN_CTX_get(ctx);
1603 curve_a = BN_CTX_get(ctx);
1604 curve_b = BN_CTX_get(ctx);
1605 if (curve_b == NULL)
1607 BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1608 BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1609 BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1610 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1611 ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
/* use the fast NIST reduction for this field */
1614 group->field_mod_func = BN_nist_mod_384;
1615 ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1619 BN_CTX_free(new_ctx);
1625 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1628 int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1629 const EC_POINT *point,
1630 BIGNUM *x, BIGNUM *y,
1633 felem z1, z2, x_in, y_in, x_out, y_out;
/* the point at infinity has no affine representation */
1636 if (EC_POINT_is_at_infinity(group, point)) {
1637 ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1640 if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1641 (!BN_to_felem(z1, point->Z)))
/* x' = X / Z^2.  NOTE(review): z2 is expected to hold 1/Z here (field
 * inversion performed above) -- confirm against the full source. */
1644 felem_square(tmp, z2);
1645 felem_reduce(z1, tmp);
1646 felem_mul(tmp, x_in, z1);
1647 felem_reduce(x_in, tmp);
/* felem_contract produces the unique minimal representative mod p */
1648 felem_contract(x_out, x_in);
1650 if (!felem_to_BN(x, x_out)) {
1651 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
/* y' = Y / Z^3 = Y * (1/Z)^2 * (1/Z) */
1655 felem_mul(tmp, z1, z2);
1656 felem_reduce(z1, tmp);
1657 felem_mul(tmp, y_in, z1);
1658 felem_reduce(y_in, tmp);
1659 felem_contract(y_out, y_in);
1661 if (!felem_to_BN(y, y_out)) {
1662 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1669 /* points below is of size |num|, and tmp_felems is of size |num+1| */
/*
 * Batch-converts |num| Jacobian points to affine (Z = 1) form using the
 * shared Montgomery-trick helper; the felem_* callbacks supply the
 * field arithmetic it needs.
 */
1670 static void make_points_affine(size_t num, felem points[][3],
1674 * Runs in constant time, unless an input is the point at infinity (which
1675 * normally shouldn't happen).
1677 ossl_ec_GFp_nistp_points_make_affine_internal(num,
1681 (void (*)(void *))felem_one,
1683 (void (*)(void *, const void *))
1685 (void (*)(void *, const void *))
1686 felem_square_reduce,
1687 (void (*)(void *, const void *, const void*))
1689 (void (*)(void *, const void *))
1691 (void (*)(void *, const void *))
1696 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1697 * values. The result is stored in r (r can equal one of the inputs).
/*
 * Computes r = scalar*G + sum(scalars[i]*points[i]) using the
 * constant-time batch_mul machinery.  Returns 1 on success, 0 on error.
 */
1699 int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1700 const BIGNUM *scalar, size_t num,
1701 const EC_POINT *points[],
1702 const BIGNUM *scalars[], BN_CTX *ctx)
1707 BIGNUM *x, *y, *z, *tmp_scalar;
1708 felem_bytearray g_secret;
1709 felem_bytearray *secrets = NULL;
1710 felem (*pre_comp)[17][3] = NULL;
1711 felem *tmp_felems = NULL;
1714 int have_pre_comp = 0;
1715 size_t num_points = num;
1716 felem x_in, y_in, z_in, x_out, y_out, z_out;
1717 NISTP384_PRE_COMP *pre = NULL;
1718 felem(*g_pre_comp)[3] = NULL;
1719 EC_POINT *generator = NULL;
1720 const EC_POINT *p = NULL;
1721 const BIGNUM *p_scalar = NULL;
/* BN_CTX_get idiom: only the last result needs a NULL check */
1724 x = BN_CTX_get(ctx);
1725 y = BN_CTX_get(ctx);
1726 z = BN_CTX_get(ctx);
1727 tmp_scalar = BN_CTX_get(ctx);
1728 if (tmp_scalar == NULL)
1731 if (scalar != NULL) {
1732 pre = group->pre_comp.nistp384;
1734 /* we have precomputation, try to use it */
1735 g_pre_comp = &pre->g_pre_comp[0];
1737 /* try to use the standard precomputation */
1738 g_pre_comp = (felem(*)[3]) gmul;
1739 generator = EC_POINT_new(group);
1740 if (generator == NULL)
1742 /* get the generator from precomputation */
1743 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1744 !felem_to_BN(y, g_pre_comp[1][1]) ||
1745 !felem_to_BN(z, g_pre_comp[1][2])) {
1746 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1749 if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
/* only use the table if it really matches this group's generator */
1753 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1754 /* precomputation matches generator */
1758 * we don't have valid precomputation: treat the generator as a
1764 if (num_points > 0) {
1765 if (num_points >= 2) {
1767 * unless we precompute multiples for just one point, converting
1768 * those into affine form is time well spent
1772 secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1773 pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
/* tmp_felems is only needed for the mixed (affine) path */
1776 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1777 if ((secrets == NULL) || (pre_comp == NULL)
1778 || (mixed && (tmp_felems == NULL)))
1782 * we treat NULL scalars as 0, and NULL points as points at infinity,
1783 * i.e., they contribute nothing to the linear combination
1785 for (i = 0; i < num_points; ++i) {
1788 * we didn't have a valid precomputation, so we pick the
1791 p = EC_GROUP_get0_generator(group);
1794 /* the i^th point */
1796 p_scalar = scalars[i];
1798 if (p_scalar != NULL && p != NULL) {
1799 /* reduce scalar to 0 <= scalar < 2^384 */
1800 if ((BN_num_bits(p_scalar) > 384)
1801 || (BN_is_negative(p_scalar))) {
1803 * this is an unusual input, and we don't guarantee
1806 if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1807 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
/* little-endian, zero-padded serialisation of the scalar */
1810 num_bytes = BN_bn2lebinpad(tmp_scalar,
1811 secrets[i], sizeof(secrets[i]));
1813 num_bytes = BN_bn2lebinpad(p_scalar,
1814 secrets[i], sizeof(secrets[i]));
1816 if (num_bytes < 0) {
1817 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1820 /* precompute multiples */
1821 if ((!BN_to_felem(x_out, p->X)) ||
1822 (!BN_to_felem(y_out, p->Y)) ||
1823 (!BN_to_felem(z_out, p->Z)))
1825 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1826 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1827 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
/* odd j: (j-1)P + P; even j: 2 * (j/2)P */
1828 for (j = 2; j <= 16; ++j) {
1830 point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1831 pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0,
1832 pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1834 point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1835 pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1841 make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1844 /* the scalar for the generator */
1845 if (scalar != NULL && have_pre_comp) {
1846 memset(g_secret, 0, sizeof(g_secret));
1847 /* reduce scalar to 0 <= scalar < 2^384 */
1848 if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1850 * this is an unusual input, and we don't guarantee
1853 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1854 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1857 num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1859 num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1861 /* do the multiplication with generator precomputation */
1862 batch_mul(x_out, y_out, z_out,
1863 (const felem_bytearray(*))secrets, num_points,
1865 mixed, (const felem(*)[17][3])pre_comp,
1866 (const felem(*)[3])g_pre_comp);
1868 /* do the multiplication without generator precomputation */
1869 batch_mul(x_out, y_out, z_out,
1870 (const felem_bytearray(*))secrets, num_points,
1871 NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1873 /* reduce the output to its unique minimal representation */
1874 felem_contract(x_in, x_out);
1875 felem_contract(y_in, y_out);
1876 felem_contract(z_in, z_out);
1877 if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1878 (!felem_to_BN(z, z_in))) {
1879 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1882 ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
/* cleanup path: frees are safe on NULL */
1887 EC_POINT_free(generator);
1888 OPENSSL_free(secrets);
1889 OPENSSL_free(pre_comp);
1890 OPENSSL_free(tmp_felems);
/*
 * Builds (or copies) the 16-entry comb table of generator multiples and
 * attaches it to |group|.  If the group's generator is the standard
 * P-384 generator, the static gmul table is reused; otherwise the table
 * is computed from scratch.  Returns 1 on success, 0 on error.
 */
1894 int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1897 NISTP384_PRE_COMP *pre = NULL;
1900 EC_POINT *generator = NULL;
/* scratch space for make_points_affine(15, ...) below (needs num + 1) */
1901 felem tmp_felems[16];
1903 BN_CTX *new_ctx = NULL;
1906 /* throw away old precomputation */
1907 EC_pre_comp_free(group);
/* caller may pass ctx == NULL; then a temporary BN_CTX is created */
1911 ctx = new_ctx = BN_CTX_new();
1917 x = BN_CTX_get(ctx);
1918 y = BN_CTX_get(ctx);
1921 /* get the generator */
1922 if (group->generator == NULL)
1924 generator = EC_POINT_new(group);
1925 if (generator == NULL)
/* nistp384_curve_params[3], [4] are the standard generator's (x, y) */
1927 BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1928 BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1929 if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1931 if ((pre = nistp384_pre_comp_new()) == NULL)
1934 * if the generator is the standard one, use built-in precomputation
1936 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1937 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
/* non-standard generator: start from G itself at index 1 */
1940 if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
1941 (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
1942 (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1944 /* compute 2^95*G, 2^190*G, 2^285*G */
/* each step: 95 doublings of the previous power (1 + 94 below) */
1945 for (i = 1; i <= 4; i <<= 1) {
1946 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1947 pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1948 for (j = 0; j < 94; ++j) {
1949 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1950 pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1953 /* g_pre_comp[0] is the point at infinity */
1954 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1955 /* the remaining multiples */
1956 /* 2^95*G + 2^190*G */
1957 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2],
1958 pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0,
1959 pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1960 /* 2^95*G + 2^285*G */
1961 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1962 pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1963 pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1964 /* 2^190*G + 2^285*G */
1965 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1966 pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1967 pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]);
1968 /* 2^95*G + 2^190*G + 2^285*G */
1969 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1970 pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1971 pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1972 for (i = 1; i < 8; ++i) {
1973 /* odd multiples: add G */
1974 point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1975 pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
1976 pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]);
/* normalise entries 1..15 to affine form for the mixed-add fast path */
1978 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
/* transfer ownership of |pre| to the group */
1981 SETPRECOMP(group, nistp384, pre);
1986 EC_POINT_free(generator);
1988 BN_CTX_free(new_ctx);
/* error path: drop the reference we created (no-op when pre == NULL) */
1990 ossl_ec_nistp384_pre_comp_free(pre);
/* Reports whether generator precomputation is attached to |group|. */
1994 int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
1996 return HAVEPRECOMP(group, nistp384);