- * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2) */
-
-/* This function is not entirely constant-time:
- * it includes a branch for checking whether the two input points are equal,
- * (while not equal to the point at infinity).
- * This case never happens during single point multiplication,
- * so there is no timing leak for ECDH or ECDSA signing. */
-static void point_add(fslice x3[4], fslice y3[4], fslice z3[4],
- const fslice x1[4], const fslice y1[4], const fslice z1[4],
- const fslice x2[4], const fslice y2[4], const fslice z2[4])
- {
- fslice ftmp[4], ftmp2[4], ftmp3[4], ftmp4[4], ftmp5[4];
- uint128_t tmp[7], tmp2[7];
- fslice z1_is_zero, z2_is_zero, x_equal, y_equal;
-
- /* ftmp = z1^2 */
- felem_square(tmp, z1);
- felem_reduce(ftmp, tmp);
-
- /* ftmp2 = z2^2 */
- felem_square(tmp, z2);
- felem_reduce(ftmp2, tmp);
-
- /* ftmp3 = z1^3 */
- felem_mul(tmp, ftmp, z1);
- felem_reduce(ftmp3, tmp);
-
- /* ftmp4 = z2^3 */
- felem_mul(tmp, ftmp2, z2);
- felem_reduce(ftmp4, tmp);
-
- /* ftmp3 = z1^3*y2 */
- felem_mul(tmp, ftmp3, y2);
- /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
-
- /* ftmp4 = z2^3*y1 */
- felem_mul(tmp2, ftmp4, y1);
- felem_reduce(ftmp4, tmp2);
-
- /* ftmp3 = z1^3*y2 - z2^3*y1 */
- felem_diff_128_64(tmp, ftmp4);
- /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
- felem_reduce(ftmp3, tmp);
-
- /* ftmp = z1^2*x2 */
- felem_mul(tmp, ftmp, x2);
- /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
-
- /* ftmp2 =z2^2*x1 */
- felem_mul(tmp2, ftmp2, x1);
- felem_reduce(ftmp2, tmp2);
-
- /* ftmp = z1^2*x2 - z2^2*x1 */
- felem_diff128(tmp, tmp2);
- /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
- felem_reduce(ftmp, tmp);
-
- /* the formulae are incorrect if the points are equal
- * so we check for this and do doubling if this happens */
- x_equal = felem_is_zero(ftmp);
- y_equal = felem_is_zero(ftmp3);
- z1_is_zero = felem_is_zero(z1);
- z2_is_zero = felem_is_zero(z2);
- /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
- if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
- {
- point_double(x3, y3, z3, x1, y1, z1);
- return;
- }
-
- /* ftmp5 = z1*z2 */
- felem_mul(tmp, z1, z2);
- felem_reduce(ftmp5, tmp);
-
- /* z3 = (z1^2*x2 - z2^2*x1)*(z1*z2) */
- felem_mul(tmp, ftmp, ftmp5);
- felem_reduce(z3, tmp);
-
- /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
- memcpy(ftmp5, ftmp, 4 * sizeof(fslice));
- felem_square(tmp, ftmp);
- felem_reduce(ftmp, tmp);
-
- /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
- felem_mul(tmp, ftmp, ftmp5);
- felem_reduce(ftmp5, tmp);
-
- /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
- felem_mul(tmp, ftmp2, ftmp);
- felem_reduce(ftmp2, tmp);
-
- /* ftmp4 = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
- felem_mul(tmp, ftmp4, ftmp5);
- /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
-
- /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
- felem_square(tmp2, ftmp3);
- /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
-
- /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
- felem_diff_128_64(tmp2, ftmp5);
- /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
-
- /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
- memcpy(ftmp5, ftmp2, 4 * sizeof(fslice));
- felem_scalar64(ftmp5, 2);
- /* ftmp5[i] < 2 * 2^57 = 2^58 */
-
- /* x3 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
- 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
- felem_diff_128_64(tmp2, ftmp5);
- /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
- felem_reduce(x3, tmp2);
-
- /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x3 */
- felem_diff64(ftmp2, x3);
- /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
-
- /* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x3) */
- felem_mul(tmp2, ftmp3, ftmp2);
- /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
-
- /* y3 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x3) -
- z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
- felem_diff128(tmp2, tmp);
- /* tmp2[i] < 2^118 + 2^120 < 2^121 */
- felem_reduce(y3, tmp2);
-
- /* the result (x3, y3, z3) is incorrect if one of the inputs is the
- * point at infinity, so we need to check for this separately */
-
- /* if point 1 is at infinity, copy point 2 to output, and vice versa */
- copy_conditional(x3, x2, 4, z1_is_zero);
- copy_conditional(x3, x1, 4, z2_is_zero);
- copy_conditional(y3, y2, 4, z1_is_zero);
- copy_conditional(y3, y1, 4, z2_is_zero);
- copy_conditional(z3, z2, 4, z1_is_zero);
- copy_conditional(z3, z1, 4, z2_is_zero);
- }
-
-/* Select a point from an array of 16 precomputed point multiples,
- * in constant time: for bits = {b_0, b_1, b_2, b_3}, return the point
- * pre_comp[8*b_3 + 4*b_2 + 2*b_1 + b_0] */
-static void select_point(const fslice bits[4], const fslice pre_comp[16][3][4],
- fslice out[12])
- {
- fslice tmp[5][12];
- select_conditional(tmp[0], pre_comp[7][0], pre_comp[15][0], 12, bits[3]);
- select_conditional(tmp[1], pre_comp[3][0], pre_comp[11][0], 12, bits[3]);
- select_conditional(tmp[2], tmp[1], tmp[0], 12, bits[2]);
- select_conditional(tmp[0], pre_comp[5][0], pre_comp[13][0], 12, bits[3]);
- select_conditional(tmp[1], pre_comp[1][0], pre_comp[9][0], 12, bits[3]);
- select_conditional(tmp[3], tmp[1], tmp[0], 12, bits[2]);
- select_conditional(tmp[4], tmp[3], tmp[2], 12, bits[1]);
- select_conditional(tmp[0], pre_comp[6][0], pre_comp[14][0], 12, bits[3]);
- select_conditional(tmp[1], pre_comp[2][0], pre_comp[10][0], 12, bits[3]);
- select_conditional(tmp[2], tmp[1], tmp[0], 12, bits[2]);
- select_conditional(tmp[0], pre_comp[4][0], pre_comp[12][0], 12, bits[3]);
- select_conditional(tmp[1], pre_comp[0][0], pre_comp[8][0], 12, bits[3]);
- select_conditional(tmp[3], tmp[1], tmp[0], 12, bits[2]);
- select_conditional(tmp[1], tmp[3], tmp[2], 12, bits[1]);
- select_conditional(out, tmp[1], tmp[4], 12, bits[0]);
- }
-
-/* Interleaved point multiplication using precomputed point multiples:
- * The small point multiples 0*P, 1*P, ..., 15*P are in pre_comp[],
- * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
- * of the generator, using certain (large) precomputed multiples in g_pre_comp.
- * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
-static void batch_mul(fslice x_out[4], fslice y_out[4], fslice z_out[4],
- const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
- const fslice pre_comp[][16][3][4], const fslice g_pre_comp[16][3][4])
- {
- unsigned i, j, num;
- unsigned gen_mul = (g_scalar != NULL);
- fslice nq[12], nqt[12], tmp[12];
- fslice bits[4];
- u8 byte;
-
- /* set nq to the point at infinity */
- memset(nq, 0, 12 * sizeof(fslice));
-
- /* Loop over all scalars msb-to-lsb, 4 bits at a time: for each nibble,
- * double 4 times, then add the precomputed point multiples.
- * If we are also adding multiples of the generator, then interleave
- * these additions with the last 56 doublings. */
- for (i = (num_points ? 28 : 7); i > 0; --i)
- {
- for (j = 0; j < 8; ++j)
- {
- /* double once */
- point_double(nq, nq+4, nq+8, nq, nq+4, nq+8);
- /* add multiples of the generator */
- if ((gen_mul) && (i <= 7))
- {
- bits[3] = (g_scalar[i+20] >> (7-j)) & 1;
- bits[2] = (g_scalar[i+13] >> (7-j)) & 1;
- bits[1] = (g_scalar[i+6] >> (7-j)) & 1;
- bits[0] = (g_scalar[i-1] >> (7-j)) & 1;
- /* select the point to add, in constant time */
- select_point(bits, g_pre_comp, tmp);
- memcpy(nqt, nq, 12 * sizeof(fslice));
- point_add(nq, nq+4, nq+8, nqt, nqt+4, nqt+8,
- tmp, tmp+4, tmp+8);
- }
- /* do an addition after every 4 doublings */
- if (j % 4 == 3)
- {
- /* loop over all scalars */
- for (num = 0; num < num_points; ++num)
- {
- byte = scalars[num][i-1];
- bits[3] = (byte >> (10-j)) & 1;
- bits[2] = (byte >> (9-j)) & 1;
- bits[1] = (byte >> (8-j)) & 1;
- bits[0] = (byte >> (7-j)) & 1;
- /* select the point to add */
- select_point(bits,
- pre_comp[num], tmp);
- memcpy(nqt, nq, 12 * sizeof(fslice));
- point_add(nq, nq+4, nq+8, nqt, nqt+4,
- nqt+8, tmp, tmp+4, tmp+8);
- }
- }
- }
- }
- memcpy(x_out, nq, 4 * sizeof(fslice));
- memcpy(y_out, nq+4, 4 * sizeof(fslice));
- memcpy(z_out, nq+8, 4 * sizeof(fslice));
- }
+ * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
+ *
+ * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
+ */
+
+/*
+ * This function is not entirely constant-time: it includes a branch for
+ * checking whether the two input points are equal, (while not equal to the
+ * point at infinity). This case never happens during single point
+ * multiplication, so there is no timing leak for ECDH or ECDSA signing.
+ */
+static void point_add(felem x3, felem y3, felem z3,
+ const felem x1, const felem y1, const felem z1,
+ const int mixed, const felem x2, const felem y2,
+ const felem z2)
+{
+ felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
+ widefelem tmp, tmp2;
+ limb z1_is_zero, z2_is_zero, x_equal, y_equal;
+
+ if (!mixed) {
+ /* ftmp2 = z2^2 */
+ felem_square(tmp, z2);
+ felem_reduce(ftmp2, tmp);
+
+ /* ftmp4 = z2^3 */
+ felem_mul(tmp, ftmp2, z2);
+ felem_reduce(ftmp4, tmp);
+
+ /* ftmp4 = z2^3*y1 */
+ felem_mul(tmp2, ftmp4, y1);
+ felem_reduce(ftmp4, tmp2);
+
+ /* ftmp2 = z2^2*x1 */
+ felem_mul(tmp2, ftmp2, x1);
+ felem_reduce(ftmp2, tmp2);
+ } else {
+ /*
+ * We'll assume z2 = 1 (special case z2 = 0 is handled later)
+ */
+
+ /* ftmp4 = z2^3*y1 */
+ felem_assign(ftmp4, y1);
+
+ /* ftmp2 = z2^2*x1 */
+ felem_assign(ftmp2, x1);
+ }
+
+ /* ftmp = z1^2 */
+ felem_square(tmp, z1);
+ felem_reduce(ftmp, tmp);
+
+ /* ftmp3 = z1^3 */
+ felem_mul(tmp, ftmp, z1);
+ felem_reduce(ftmp3, tmp);
+
+ /* tmp = z1^3*y2 */
+ felem_mul(tmp, ftmp3, y2);
+ /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+
+ /* ftmp3 = z1^3*y2 - z2^3*y1 */
+ felem_diff_128_64(tmp, ftmp4);
+ /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
+ felem_reduce(ftmp3, tmp);
+
+ /* tmp = z1^2*x2 */
+ felem_mul(tmp, ftmp, x2);
+ /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+
+ /* ftmp = z1^2*x2 - z2^2*x1 */
+ felem_diff_128_64(tmp, ftmp2);
+ /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
+ felem_reduce(ftmp, tmp);
+
+ /*
+ * the formulae are incorrect if the points are equal so we check for
+ * this and do doubling if this happens
+ */
+ x_equal = felem_is_zero(ftmp);
+ y_equal = felem_is_zero(ftmp3);
+ z1_is_zero = felem_is_zero(z1);
+ z2_is_zero = felem_is_zero(z2);
+ /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
+ if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
+ point_double(x3, y3, z3, x1, y1, z1);
+ return;
+ }
+
+ /* ftmp5 = z1*z2 */
+ if (!mixed) {
+ felem_mul(tmp, z1, z2);
+ felem_reduce(ftmp5, tmp);
+ } else {
+ /* special case z2 = 0 is handled later */
+ felem_assign(ftmp5, z1);
+ }
+
+ /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
+ felem_mul(tmp, ftmp, ftmp5);
+ felem_reduce(z_out, tmp);
+
+ /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
+ felem_assign(ftmp5, ftmp);
+ felem_square(tmp, ftmp);
+ felem_reduce(ftmp, tmp);
+
+ /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
+ felem_mul(tmp, ftmp, ftmp5);
+ felem_reduce(ftmp5, tmp);
+
+ /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
+ felem_mul(tmp, ftmp2, ftmp);
+ felem_reduce(ftmp2, tmp);
+
+ /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
+ felem_mul(tmp, ftmp4, ftmp5);
+ /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+
+ /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
+ felem_square(tmp2, ftmp3);
+ /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
+
+ /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
+ felem_diff_128_64(tmp2, ftmp5);
+ /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
+
+ /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
+ felem_assign(ftmp5, ftmp2);
+ felem_scalar(ftmp5, 2);
+ /* ftmp5[i] < 2 * 2^57 = 2^58 */
+
+ /*-
+ * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
+ * 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
+ */
+ felem_diff_128_64(tmp2, ftmp5);
+ /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
+ felem_reduce(x_out, tmp2);
+
+ /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
+ felem_diff(ftmp2, x_out);
+ /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
+
+ /*
+ * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
+ */
+ felem_mul(tmp2, ftmp3, ftmp2);
+ /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
+
+ /*-
+ * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
+ * z2^3*y1*(z1^2*x2 - z2^2*x1)^3
+ */
+ widefelem_diff(tmp2, tmp);
+ /* tmp2[i] < 2^118 + 2^120 < 2^121 */
+ felem_reduce(y_out, tmp2);
+
+ /*
+ * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
+ * the point at infinity, so we need to check for this separately
+ */
+
+ /*
+ * if point 1 is at infinity, copy point 2 to output, and vice versa
+ */
+ copy_conditional(x_out, x2, z1_is_zero);
+ copy_conditional(x_out, x1, z2_is_zero);
+ copy_conditional(y_out, y2, z1_is_zero);
+ copy_conditional(y_out, y1, z2_is_zero);
+ copy_conditional(z_out, z2, z1_is_zero);
+ copy_conditional(z_out, z1, z2_is_zero);
+ felem_assign(x3, x_out);
+ felem_assign(y3, y_out);
+ felem_assign(z3, z_out);
+}
+
+/*
+ * select_point selects the |idx|th point from a precomputation table and
+ * copies it to out.
+ * The pre_comp array argument should be size of |size| argument
+ */
+static void select_point(const u64 idx, unsigned int size,
+ const felem pre_comp[][3], felem out[3])
+{
+ unsigned i, j;
+ limb *outlimbs = &out[0][0];
+
+ memset(out, 0, sizeof(*out) * 3);
+ for (i = 0; i < size; i++) {
+ const limb *inlimbs = &pre_comp[i][0][0];
+ u64 mask = i ^ idx;
+ mask |= mask >> 4;
+ mask |= mask >> 2;
+ mask |= mask >> 1;
+ mask &= 1;
+ mask--;
+ for (j = 0; j < 4 * 3; j++)
+ outlimbs[j] |= inlimbs[j] & mask;
+ }
+}
+
+/* get_bit returns the |i|th bit in |in| */
+static char get_bit(const felem_bytearray in, unsigned i)
+{
+ if (i >= 224)
+ return 0;
+ return (in[i >> 3] >> (i & 7)) & 1;
+}
+
+/*
+ * Interleaved point multiplication using precomputed point multiples: The
+ * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
+ * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
+ * generator, using certain (large) precomputed multiples in g_pre_comp.
+ * Output point (X, Y, Z) is stored in x_out, y_out, z_out
+ */
+static void batch_mul(felem x_out, felem y_out, felem z_out,
+ const felem_bytearray scalars[],
+ const unsigned num_points, const u8 *g_scalar,
+ const int mixed, const felem pre_comp[][17][3],
+ const felem g_pre_comp[2][16][3])
+{
+ int i, skip;
+ unsigned num;
+ unsigned gen_mul = (g_scalar != NULL);
+ felem nq[3], tmp[4];
+ u64 bits;
+ u8 sign, digit;
+
+ /* set nq to the point at infinity */
+ memset(nq, 0, sizeof(nq));
+
+ /*
+ * Loop over all scalars msb-to-lsb, interleaving additions of multiples
+ * of the generator (two in each of the last 28 rounds) and additions of
+ * other points multiples (every 5th round).
+ */
+ skip = 1; /* save two point operations in the first
+ * round */
+ for (i = (num_points ? 220 : 27); i >= 0; --i) {
+ /* double */
+ if (!skip)
+ point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+
+ /* add multiples of the generator */
+ if (gen_mul && (i <= 27)) {
+ /* first, look 28 bits upwards */
+ bits = get_bit(g_scalar, i + 196) << 3;
+ bits |= get_bit(g_scalar, i + 140) << 2;
+ bits |= get_bit(g_scalar, i + 84) << 1;
+ bits |= get_bit(g_scalar, i + 28);
+ /* select the point to add, in constant time */
+ select_point(bits, 16, g_pre_comp[1], tmp);
+
+ if (!skip) {
+ /* value 1 below is argument for "mixed" */
+ point_add(nq[0], nq[1], nq[2],
+ nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
+ } else {
+ memcpy(nq, tmp, 3 * sizeof(felem));
+ skip = 0;
+ }
+
+ /* second, look at the current position */
+ bits = get_bit(g_scalar, i + 168) << 3;
+ bits |= get_bit(g_scalar, i + 112) << 2;
+ bits |= get_bit(g_scalar, i + 56) << 1;
+ bits |= get_bit(g_scalar, i);
+ /* select the point to add, in constant time */
+ select_point(bits, 16, g_pre_comp[0], tmp);
+ point_add(nq[0], nq[1], nq[2],
+ nq[0], nq[1], nq[2],
+ 1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
+ }
+
+ /* do other additions every 5 doublings */
+ if (num_points && (i % 5 == 0)) {
+ /* loop over all scalars */
+ for (num = 0; num < num_points; ++num) {
+ bits = get_bit(scalars[num], i + 4) << 5;
+ bits |= get_bit(scalars[num], i + 3) << 4;
+ bits |= get_bit(scalars[num], i + 2) << 3;
+ bits |= get_bit(scalars[num], i + 1) << 2;
+ bits |= get_bit(scalars[num], i) << 1;
+ bits |= get_bit(scalars[num], i - 1);
+ ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
+
+ /* select the point to add or subtract */
+ select_point(digit, 17, pre_comp[num], tmp);
+ felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
+ * point */
+ copy_conditional(tmp[1], tmp[3], sign);
+
+ if (!skip) {
+ point_add(nq[0], nq[1], nq[2],
+ nq[0], nq[1], nq[2],
+ mixed, tmp[0], tmp[1], tmp[2]);
+ } else {
+ memcpy(nq, tmp, 3 * sizeof(felem));
+ skip = 0;
+ }
+ }
+ }
+ }
+ felem_assign(x_out, nq[0]);
+ felem_assign(y_out, nq[1]);
+ felem_assign(z_out, nq[2]);
+}