/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */
/*
 * smlal: signed multiply-accumulate into a 64-bit accumulator:
 *   *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b
 *
 * NOTE(review): this chunk appears to have lost its preprocessor
 * conditionals -- the ARM "smlal" inline-asm path and the portable-C
 * fallback (the final statement) are both visible here, but in the full
 * file they should be alternatives selected by #if/#else.  Confirm
 * against the complete source before editing.
 */
15 static inline void __attribute__ ((gnu_inline, always_inline))
16 smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
/* Split the 64-bit accumulator into the lo/hi register pair SMLAL updates. */
20 uint32_t lo = *acc, hi = (*acc) >> 32;
22 __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
23 : [lo]"+&r"(lo), [hi]"+&r"(hi)
24 : [a]"r"(a), [b]"r"(b));
/* Recombine the register pair into the 64-bit accumulator. */
27 *acc = lo + (((uint64_t)hi) << 32);
/* Portable fallback: sign-extend both operands to 64 bits, then accumulate. */
29 *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
/*
 * smlal2: like smlal but the second operand is doubled:
 *   *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(2 * b)
 * Used for the cross terms when squaring (each a[i]*a[j], i != j,
 * appears twice).
 *
 * NOTE(review): asm path and C fallback are both visible -- the
 * selecting #if/#else directives seem to be lost from this extraction.
 */
33 static inline void __attribute__ ((gnu_inline, always_inline))
34 smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
37 uint32_t lo = *acc, hi = (*acc) >> 32;
/* The doubling happens in the operand expression (2 * b), not in the asm. */
39 __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
40 : [lo]"+&r"(lo), [hi]"+&r"(hi)
41 : [a]"r"(a), [b]"r"(2 * b));
45 *acc = lo + (((uint64_t)hi) << 32);
/* Portable fallback; b * 2 wraps mod 2^32 before the signed widening,
 * matching what the asm path feeds to SMLAL. */
47 *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
/*
 * smull: signed multiply, overwriting (not accumulating into) the
 * 64-bit accumulator:
 *   *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b
 *
 * NOTE(review): the declarations of lo/hi for the asm path are not
 * visible in this extraction, and the asm path and C fallback both
 * appear without their selecting #if/#else -- consult the full source.
 */
51 static inline void __attribute__ ((gnu_inline, always_inline))
52 smull(uint64_t *acc, const uint32_t a, const uint32_t b)
/* "=&r" (write-only, early-clobber) because SMULL only writes lo/hi. */
57 __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
58 : [lo]"=&r"(lo), [hi]"=&r"(hi)
59 : [a]"r"(a), [b]"r"(b));
61 *acc = lo + (((uint64_t)hi) << 32);
/* Portable fallback: plain assignment, discarding the previous *acc. */
63 *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
/*
 * smull2: like smull but with the second operand doubled:
 *   *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(2 * b)
 *
 * NOTE(review): __volatile__ is commented out here, unlike the sibling
 * helpers -- presumably intentional (the asm has no side effects beyond
 * its outputs), but worth confirming against the full source.  As with
 * the other helpers, the asm path and C fallback both appear without
 * their selecting preprocessor conditionals.
 */
67 static inline void __attribute__ ((gnu_inline, always_inline))
68 smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
73 __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
74 : [lo]"=&r"(lo), [hi]"=&r"(hi)
75 : [a]"r"(2*b));
77 *acc = lo + (((uint64_t)hi) << 32);
79 *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
/*
 * gf_mul: field multiplication cs = as * bs.
 *
 * Operands are 16 limbs of 28 bits each (mask = 2^28 - 1 below).  The
 * precomputed aa[i] = a[i] + a[i+8] and bm[i] = b[i] - b[i+8] are the
 * classic Karatsuba-style half sums/differences -- consistent with
 * reduction modulo the Goldilocks prime 2^448 - 2^224 - 1, where the
 * "golden ratio" 2^224 lets the high half fold back cheaply.  TODO
 * confirm against the full curve448 sources.
 *
 * NOTE(review): this extraction is incomplete -- declarations such as
 * "int i; uint32_t ax, bx;", closing braces, and several lines that
 * seed accum0/accum2 from the saved carries (accumC0/accumC1) between
 * rounds are missing, and original-file line numbers are fused onto
 * each line.  Do not treat this text as compilable; the comments below
 * describe only what is visible.
 */
83 void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
86 const uint32_t *a = as->limb, *b = bs->limb;
87 uint32_t *c = cs->limb;
89 uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
/* Each limb holds a radix-2^28 digit. */
90 uint32_t mask = (1 << 28) - 1;
92 uint32_t aa[8], bm[8];
/* Karatsuba precompute: sums of a's halves, differences of b's halves. */
95 for (i = 0; i < 8; i++) {
96 aa[i] = a[i] + a[i + 8];
97 bm[i] = b[i] - b[i + 8];
/*
 * Round 1: accumulate everything feeding output digits 0,1 (accum0/2)
 * and 8,9 (accum1/3).  The ax/bx assignments cache the most recently
 * loaded operand so each array element is read once and reused by the
 * interleaved even/odd accumulator pair.
 */
103 smull(&accum1, ax = aa[1], bx = b[15]);
104 smull(&accum3, ax = aa[2], bx);
105 smlal(&accum1, ax, bx = b[14]);
106 smlal(&accum3, ax = aa[3], bx);
107 smlal(&accum1, ax, bx = b[13]);
108 smlal(&accum3, ax = aa[4], bx);
109 smlal(&accum1, ax, bx = b[12]);
110 smlal(&accum3, ax = aa[5], bx);
111 smlal(&accum1, ax, bx = b[11]);
112 smlal(&accum3, ax = aa[6], bx);
113 smlal(&accum1, ax, bx = b[10]);
114 smlal(&accum3, ax = aa[7], bx);
115 smlal(&accum1, ax, bx = b[9]);
/* (lines seeding accum0/accum2 for this round are missing from view) */
121 smlal(&accum2, ax = aa[0], bx);
122 smlal(&accum0, ax, bx = b[8]);
123 smlal(&accum2, ax = aa[1], bx);
/* High-half of a times high digits of b. */
125 smlal(&accum0, ax = a[9], bx = b[7]);
126 smlal(&accum2, ax = a[10], bx);
127 smlal(&accum0, ax, bx = b[6]);
128 smlal(&accum2, ax = a[11], bx);
129 smlal(&accum0, ax, bx = b[5]);
130 smlal(&accum2, ax = a[12], bx);
131 smlal(&accum0, ax, bx = b[4]);
132 smlal(&accum2, ax = a[13], bx);
133 smlal(&accum0, ax, bx = b[3]);
134 smlal(&accum2, ax = a[14], bx);
135 smlal(&accum0, ax, bx = b[2]);
136 smlal(&accum2, ax = a[15], bx);
137 smlal(&accum0, ax, bx = b[1]);
142 smlal(&accum3, ax = a[8], bx);
143 smlal(&accum1, ax, bx = b[0]);
144 smlal(&accum3, ax = a[9], bx);
/* Low-half of a times the bm differences. */
146 smlal(&accum1, ax = a[1], bx = bm[7]);
147 smlal(&accum3, ax = a[2], bx);
148 smlal(&accum1, ax, bx = bm[6]);
149 smlal(&accum3, ax = a[3], bx);
150 smlal(&accum1, ax, bx = bm[5]);
151 smlal(&accum3, ax = a[4], bx);
152 smlal(&accum1, ax, bx = bm[4]);
153 smlal(&accum3, ax = a[5], bx);
154 smlal(&accum1, ax, bx = bm[3]);
155 smlal(&accum3, ax = a[6], bx);
156 smlal(&accum1, ax, bx = bm[2]);
157 smlal(&accum3, ax = a[7], bx);
158 smlal(&accum1, ax, bx = bm[1]);
161 smlal(&accum2, ax = a[0], bx);
162 smlal(&accum0, ax, bx = bm[0]);
163 smlal(&accum2, ax = a[1], bx);
/* Propagate 28-bit carries within the round, emit four output digits,
 * and save the outgoing carries for the next round. */
165 accum2 += accum0 >> 28;
166 accum3 += accum1 >> 28;
168 c[0] = ((uint32_t)(accum0)) & mask;
169 c[1] = ((uint32_t)(accum2)) & mask;
170 c[8] = ((uint32_t)(accum1)) & mask;
171 c[9] = ((uint32_t)(accum3)) & mask;
173 accumC0 = accum2 >> 28;
174 accumC1 = accum3 >> 28;
/* Round 2: output digits 2,3 and 10,11 (same interleaving pattern,
 * operand indices shifted by one digit pair). */
178 smull(&accum1, ax = aa[3], bx = b[15]);
179 smull(&accum3, ax = aa[4], bx);
180 smlal(&accum1, ax, bx = b[14]);
181 smlal(&accum3, ax = aa[5], bx);
182 smlal(&accum1, ax, bx = b[13]);
183 smlal(&accum3, ax = aa[6], bx);
184 smlal(&accum1, ax, bx = b[12]);
185 smlal(&accum3, ax = aa[7], bx);
186 smlal(&accum1, ax, bx = b[11]);
192 smlal(&accum2, ax = aa[0], bx);
193 smlal(&accum0, ax, bx = b[10]);
194 smlal(&accum2, ax = aa[1], bx);
195 smlal(&accum0, ax, bx = b[9]);
196 smlal(&accum2, ax = aa[2], bx);
197 smlal(&accum0, ax, bx = b[8]);
198 smlal(&accum2, ax = aa[3], bx);
200 smlal(&accum0, ax = a[11], bx = b[7]);
201 smlal(&accum2, ax = a[12], bx);
202 smlal(&accum0, ax, bx = b[6]);
203 smlal(&accum2, ax = a[13], bx);
204 smlal(&accum0, ax, bx = b[5]);
205 smlal(&accum2, ax = a[14], bx);
206 smlal(&accum0, ax, bx = b[4]);
207 smlal(&accum2, ax = a[15], bx);
208 smlal(&accum0, ax, bx = b[3]);
213 smlal(&accum3, ax = a[8], bx);
214 smlal(&accum1, ax, bx = b[2]);
215 smlal(&accum3, ax = a[9], bx);
216 smlal(&accum1, ax, bx = b[1]);
217 smlal(&accum3, ax = a[10], bx);
218 smlal(&accum1, ax, bx = b[0]);
219 smlal(&accum3, ax = a[11], bx);
221 smlal(&accum1, ax = a[3], bx = bm[7]);
222 smlal(&accum3, ax = a[4], bx);
223 smlal(&accum1, ax, bx = bm[6]);
224 smlal(&accum3, ax = a[5], bx);
225 smlal(&accum1, ax, bx = bm[5]);
226 smlal(&accum3, ax = a[6], bx);
227 smlal(&accum1, ax, bx = bm[4]);
228 smlal(&accum3, ax = a[7], bx);
229 smlal(&accum1, ax, bx = bm[3]);
232 smlal(&accum2, ax = a[0], bx);
233 smlal(&accum0, ax, bx = bm[2]);
234 smlal(&accum2, ax = a[1], bx);
235 smlal(&accum0, ax, bx = bm[1]);
236 smlal(&accum2, ax = a[2], bx);
237 smlal(&accum0, ax, bx = bm[0]);
238 smlal(&accum2, ax = a[3], bx);
/* Carry/store for round 2. */
242 accum2 += accum0 >> 28;
243 accum3 += accum1 >> 28;
245 c[2] = ((uint32_t)(accum0)) & mask;
246 c[3] = ((uint32_t)(accum2)) & mask;
247 c[10] = ((uint32_t)(accum1)) & mask;
248 c[11] = ((uint32_t)(accum3)) & mask;
250 accumC0 = accum2 >> 28;
251 accumC1 = accum3 >> 28;
/* Round 3: output digits 4,5 and 12,13. */
256 smull(&accum1, ax = aa[5], bx = b[15]);
257 smull(&accum3, ax = aa[6], bx);
258 smlal(&accum1, ax, bx = b[14]);
259 smlal(&accum3, ax = aa[7], bx);
260 smlal(&accum1, ax, bx = b[13]);
267 smlal(&accum2, ax = aa[0], bx);
268 smlal(&accum0, ax, bx = b[12]);
269 smlal(&accum2, ax = aa[1], bx);
270 smlal(&accum0, ax, bx = b[11]);
271 smlal(&accum2, ax = aa[2], bx);
272 smlal(&accum0, ax, bx = b[10]);
273 smlal(&accum2, ax = aa[3], bx);
274 smlal(&accum0, ax, bx = b[9]);
275 smlal(&accum2, ax = aa[4], bx);
276 smlal(&accum0, ax, bx = b[8]);
277 smlal(&accum2, ax = aa[5], bx);
279 smlal(&accum0, ax = a[13], bx = b[7]);
280 smlal(&accum2, ax = a[14], bx);
281 smlal(&accum0, ax, bx = b[6]);
282 smlal(&accum2, ax = a[15], bx);
283 smlal(&accum0, ax, bx = b[5]);
289 smlal(&accum3, ax = a[8], bx);
290 smlal(&accum1, ax, bx = b[4]);
291 smlal(&accum3, ax = a[9], bx);
292 smlal(&accum1, ax, bx = b[3]);
293 smlal(&accum3, ax = a[10], bx);
294 smlal(&accum1, ax, bx = b[2]);
295 smlal(&accum3, ax = a[11], bx);
296 smlal(&accum1, ax, bx = b[1]);
297 smlal(&accum3, ax = a[12], bx);
298 smlal(&accum1, ax, bx = b[0]);
299 smlal(&accum3, ax = a[13], bx);
301 smlal(&accum1, ax = a[5], bx = bm[7]);
302 smlal(&accum3, ax = a[6], bx);
303 smlal(&accum1, ax, bx = bm[6]);
304 smlal(&accum3, ax = a[7], bx);
305 smlal(&accum1, ax, bx = bm[5]);
309 smlal(&accum2, ax = a[0], bx);
310 smlal(&accum0, ax, bx = bm[4]);
311 smlal(&accum2, ax = a[1], bx);
312 smlal(&accum0, ax, bx = bm[3]);
313 smlal(&accum2, ax = a[2], bx);
314 smlal(&accum0, ax, bx = bm[2]);
315 smlal(&accum2, ax = a[3], bx);
316 smlal(&accum0, ax, bx = bm[1]);
317 smlal(&accum2, ax = a[4], bx);
318 smlal(&accum0, ax, bx = bm[0]);
319 smlal(&accum2, ax = a[5], bx);
/* Carry/store for round 3. */
323 accum2 += accum0 >> 28;
324 accum3 += accum1 >> 28;
326 c[4] = ((uint32_t)(accum0)) & mask;
327 c[5] = ((uint32_t)(accum2)) & mask;
328 c[12] = ((uint32_t)(accum1)) & mask;
329 c[13] = ((uint32_t)(accum3)) & mask;
331 accumC0 = accum2 >> 28;
332 accumC1 = accum3 >> 28;
/* Round 4: output digits 6,7 and 14,15. */
337 smull(&accum1, ax = aa[7], bx = b[15]);
342 smull(&accum2, ax = aa[0], bx);
343 smlal(&accum0, ax, bx = b[14]);
344 smlal(&accum2, ax = aa[1], bx);
345 smlal(&accum0, ax, bx = b[13]);
346 smlal(&accum2, ax = aa[2], bx);
347 smlal(&accum0, ax, bx = b[12]);
348 smlal(&accum2, ax = aa[3], bx);
349 smlal(&accum0, ax, bx = b[11]);
350 smlal(&accum2, ax = aa[4], bx);
351 smlal(&accum0, ax, bx = b[10]);
352 smlal(&accum2, ax = aa[5], bx);
353 smlal(&accum0, ax, bx = b[9]);
354 smlal(&accum2, ax = aa[6], bx);
355 smlal(&accum0, ax, bx = b[8]);
356 smlal(&accum2, ax = aa[7], bx);
358 smlal(&accum0, ax = a[15], bx = b[7]);
364 smlal(&accum3, ax = a[8], bx);
365 smlal(&accum1, ax, bx = b[6]);
366 smlal(&accum3, ax = a[9], bx);
367 smlal(&accum1, ax, bx = b[5]);
368 smlal(&accum3, ax = a[10], bx);
369 smlal(&accum1, ax, bx = b[4]);
370 smlal(&accum3, ax = a[11], bx);
371 smlal(&accum1, ax, bx = b[3]);
372 smlal(&accum3, ax = a[12], bx);
373 smlal(&accum1, ax, bx = b[2]);
374 smlal(&accum3, ax = a[13], bx);
375 smlal(&accum1, ax, bx = b[1]);
376 smlal(&accum3, ax = a[14], bx);
377 smlal(&accum1, ax, bx = b[0]);
378 smlal(&accum3, ax = a[15], bx);
380 smlal(&accum1, ax = a[7], bx = bm[7]);
384 smlal(&accum2, ax = a[0], bx);
385 smlal(&accum0, ax, bx = bm[6]);
386 smlal(&accum2, ax = a[1], bx);
387 smlal(&accum0, ax, bx = bm[5]);
388 smlal(&accum2, ax = a[2], bx);
389 smlal(&accum0, ax, bx = bm[4]);
390 smlal(&accum2, ax = a[3], bx);
391 smlal(&accum0, ax, bx = bm[3]);
392 smlal(&accum2, ax = a[4], bx);
393 smlal(&accum0, ax, bx = bm[2]);
394 smlal(&accum2, ax = a[5], bx);
395 smlal(&accum0, ax, bx = bm[1]);
396 smlal(&accum2, ax = a[6], bx);
397 smlal(&accum0, ax, bx = bm[0]);
398 smlal(&accum2, ax = a[7], bx);
/* Carry/store for round 4. */
402 accum2 += accum0 >> 28;
403 accum3 += accum1 >> 28;
405 c[6] = ((uint32_t)(accum0)) & mask;
406 c[7] = ((uint32_t)(accum2)) & mask;
407 c[14] = ((uint32_t)(accum1)) & mask;
408 c[15] = ((uint32_t)(accum3)) & mask;
/* Final wraparound: carries out of digit 7 feed digit 8, and carries
 * out of digit 15 fold back into digits 0/1 -- consistent with
 * reduction mod 2^448 - 2^224 - 1 (TODO confirm); lines between the
 * stores below are missing from this view. */
410 accum0 = accum2 >> 28;
411 accum1 = accum3 >> 28;
417 c[8] = ((uint32_t)(accum0)) & mask;
418 c[0] = ((uint32_t)(accum1)) & mask;
422 c[9] += ((uint32_t)(accum0));
423 c[1] += ((uint32_t)(accum1));
/*
 * gf_sqr: field squaring cs = as^2.
 *
 * Same 16x28-bit limb layout as gf_mul, specialized for squaring:
 * cross terms a[i]*a[j] (i != j) use the doubling helpers
 * smull2/smlal2, while diagonal terms use plain smlal(&acc, ax, ax).
 * bm[i] = a[i] - a[i+8] plays the role Karatsuba's half-difference
 * plays in gf_mul -- TODO confirm against the full curve448 sources.
 *
 * NOTE(review): this extraction is incomplete -- declarations (i, ax,
 * bx, bm[]), braces, the assignments that set "tmp" before the
 * "accum3 = tmp - accum2" / "accum1 = tmp - accum0" lines, and the
 * carry-seeding between rounds are all missing, and original-file line
 * numbers are fused onto each line.  Comments below describe only what
 * is visible.
 */
426 void gf_sqr(gf_s * __restrict__ cs, const gf as)
428 const uint32_t *a = as->limb;
429 uint32_t *c = cs->limb;
431 uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
432 uint32_t mask = (1 << 28) - 1;
/* Precompute half differences (bm[] declaration not visible here). */
437 for (i = 0; i < 8; i++) {
438 bm[i] = a[i] - a[i + 8];
/* Round 1: digits 0,1 and 8,9.  Doubled cross terms, single diagonals. */
444 smull2(&accum1, ax = a[9], bx = a[15]);
445 smull2(&accum3, ax = a[10], bx);
446 smlal2(&accum1, ax, bx = a[14]);
447 smlal2(&accum3, ax = a[11], bx);
448 smlal2(&accum1, ax, bx = a[13]);
449 smlal2(&accum3, ax = a[12], bx);
/* Diagonal term a[12]^2 (not doubled). */
450 smlal(&accum1, ax, ax);
456 smlal2(&accum2, ax = a[8], a[9]);
457 smlal(&accum0, ax, ax);
459 smlal2(&accum0, ax = a[1], bx = a[7]);
460 smlal2(&accum2, ax = a[2], bx);
461 smlal2(&accum0, ax, bx = a[6]);
462 smlal2(&accum2, ax = a[3], bx);
463 smlal2(&accum0, ax, bx = a[5]);
464 smlal2(&accum2, ax = a[4], bx);
465 smlal(&accum0, ax, ax);
470 smlal2(&accum3, ax = a[0], bx = a[1]);
471 smlal(&accum1, ax, ax);
/* Cross terms of the bm differences. */
478 smlal2(&accum1, ax = bm[1], bx = bm[7]);
479 smlal2(&accum3, ax = bm[2], bx);
480 smlal2(&accum1, ax, bx = bm[6]);
481 smlal2(&accum3, ax = bm[3], bx);
482 smlal2(&accum1, ax, bx = bm[5]);
483 smlal2(&accum3, ax = bm[4], bx);
484 smlal(&accum1, ax, ax);
487 smlal2(&accum2, ax = bm[0], bx = bm[1]);
488 smlal(&accum0, ax, ax);
/* The assignments that set "tmp" before these subtractions are missing
 * from this view -- verify against the full file. */
491 accum3 = tmp - accum2;
494 accum1 = tmp - accum0;
/* Carry/store for round 1; save outgoing carries. */
497 accum2 += accum0 >> 28;
498 accum3 += accum1 >> 28;
500 c[0] = ((uint32_t)(accum0)) & mask;
501 c[1] = ((uint32_t)(accum2)) & mask;
502 c[8] = ((uint32_t)(accum1)) & mask;
503 c[9] = ((uint32_t)(accum3)) & mask;
505 accumC0 = accum2 >> 28;
506 accumC1 = accum3 >> 28;
/* Round 2: digits 2,3 and 10,11. */
510 smull2(&accum1, ax = a[11], bx = a[15]);
511 smull2(&accum3, ax = a[12], bx);
512 smlal2(&accum1, ax, bx = a[14]);
513 smlal2(&accum3, ax = a[13], bx);
514 smlal(&accum1, ax, ax);
520 smlal2(&accum2, ax = a[8], bx = a[11]);
521 smlal2(&accum0, ax, bx = a[10]);
522 smlal2(&accum2, ax = a[9], bx);
523 smlal(&accum0, ax, ax);
525 smlal2(&accum0, ax = a[3], bx = a[7]);
526 smlal2(&accum2, ax = a[4], bx);
527 smlal2(&accum0, ax, bx = a[6]);
528 smlal2(&accum2, ax = a[5], bx);
529 smlal(&accum0, ax, ax);
534 smlal2(&accum3, ax = a[0], bx = a[3]);
535 smlal2(&accum1, ax, bx = a[2]);
536 smlal2(&accum3, ax = a[1], bx);
537 smlal(&accum1, ax, ax);
544 smlal2(&accum1, ax = bm[3], bx = bm[7]);
545 smlal2(&accum3, ax = bm[4], bx);
546 smlal2(&accum1, ax, bx = bm[6]);
547 smlal2(&accum3, ax = bm[5], bx);
548 smlal(&accum1, ax, ax);
551 smlal2(&accum2, ax = bm[0], bx = bm[3]);
552 smlal2(&accum0, ax, bx = bm[2]);
553 smlal2(&accum2, ax = bm[1], bx);
554 smlal(&accum0, ax, ax);
/* "tmp" assignments missing from view, as in round 1. */
557 accum3 = tmp - accum2;
560 accum1 = tmp - accum0;
/* Carry/store for round 2. */
565 accum2 += accum0 >> 28;
566 accum3 += accum1 >> 28;
568 c[2] = ((uint32_t)(accum0)) & mask;
569 c[3] = ((uint32_t)(accum2)) & mask;
570 c[10] = ((uint32_t)(accum1)) & mask;
571 c[11] = ((uint32_t)(accum3)) & mask;
573 accumC0 = accum2 >> 28;
574 accumC1 = accum3 >> 28;
/* Round 3: digits 4,5 and 12,13. */
579 smull2(&accum1, ax = a[13], bx = a[15]);
580 smull2(&accum3, ax = a[14], bx);
581 smlal(&accum1, ax, ax);
588 smlal2(&accum2, ax = a[8], bx = a[13]);
589 smlal2(&accum0, ax, bx = a[12]);
590 smlal2(&accum2, ax = a[9], bx);
591 smlal2(&accum0, ax, bx = a[11]);
592 smlal2(&accum2, ax = a[10], bx);
593 smlal(&accum0, ax, ax);
595 smlal2(&accum0, ax = a[5], bx = a[7]);
596 smlal2(&accum2, ax = a[6], bx);
597 smlal(&accum0, ax, ax);
603 smlal2(&accum3, ax = a[0], bx = a[5]);
604 smlal2(&accum1, ax, bx = a[4]);
605 smlal2(&accum3, ax = a[1], bx);
606 smlal2(&accum1, ax, bx = a[3]);
607 smlal2(&accum3, ax = a[2], bx);
608 smlal(&accum1, ax, ax);
615 smlal2(&accum1, ax = bm[5], bx = bm[7]);
616 smlal2(&accum3, ax = bm[6], bx);
617 smlal(&accum1, ax, ax);
621 smlal2(&accum2, ax = bm[0], bx = bm[5]);
622 smlal2(&accum0, ax, bx = bm[4]);
623 smlal2(&accum2, ax = bm[1], bx);
624 smlal2(&accum0, ax, bx = bm[3]);
625 smlal2(&accum2, ax = bm[2], bx);
626 smlal(&accum0, ax, ax);
/* "tmp" assignments missing from view. */
629 accum3 = tmp - accum2;
632 accum1 = tmp - accum0;
/* Carry/store for round 3. */
637 accum2 += accum0 >> 28;
638 accum3 += accum1 >> 28;
640 c[4] = ((uint32_t)(accum0)) & mask;
641 c[5] = ((uint32_t)(accum2)) & mask;
642 c[12] = ((uint32_t)(accum1)) & mask;
643 c[13] = ((uint32_t)(accum3)) & mask;
645 accumC0 = accum2 >> 28;
646 accumC1 = accum3 >> 28;
/* Round 4: digits 6,7 and 14,15.  a[15]^2 is a diagonal, so plain smull. */
651 smull(&accum1, ax = a[15], bx = a[15]);
656 smull2(&accum2, ax = a[8], bx);
657 smlal2(&accum0, ax, bx = a[14]);
658 smlal2(&accum2, ax = a[9], bx);
659 smlal2(&accum0, ax, bx = a[13]);
660 smlal2(&accum2, ax = a[10], bx);
661 smlal2(&accum0, ax, bx = a[12]);
662 smlal2(&accum2, ax = a[11], bx);
663 smlal(&accum0, ax, ax);
665 smlal(&accum0, ax = a[7], bx = a[7]);
671 smlal2(&accum3, ax = a[0], bx);
672 smlal2(&accum1, ax, bx = a[6]);
673 smlal2(&accum3, ax = a[1], bx);
674 smlal2(&accum1, ax, bx = a[5]);
675 smlal2(&accum3, ax = a[2], bx);
676 smlal2(&accum1, ax, bx = a[4]);
677 smlal2(&accum3, ax = a[3], bx);
678 smlal(&accum1, ax, ax);
/* bx here is carried over from lines missing in this view. */
686 smlal(&accum1, bx, bx);
690 smlal2(&accum2, ax = bm[0], bx);
691 smlal2(&accum0, ax, bx = bm[6]);
692 smlal2(&accum2, ax = bm[1], bx);
693 smlal2(&accum0, ax, bx = bm[5]);
694 smlal2(&accum2, ax = bm[2], bx);
695 smlal2(&accum0, ax, bx = bm[4]);
696 smlal2(&accum2, ax = bm[3], bx);
697 smlal(&accum0, ax, ax);
/* "tmp" assignments missing from view. */
700 accum3 = tmp - accum2;
703 accum1 = tmp - accum0;
/* Carry/store for round 4. */
708 accum2 += accum0 >> 28;
709 accum3 += accum1 >> 28;
711 c[6] = ((uint32_t)(accum0)) & mask;
712 c[7] = ((uint32_t)(accum2)) & mask;
713 c[14] = ((uint32_t)(accum1)) & mask;
714 c[15] = ((uint32_t)(accum3)) & mask;
/* Final wraparound, mirroring gf_mul: carry out of digit 7 into 8/9,
 * carry out of digit 15 folded back into 0/1. */
716 accum0 = accum2 >> 28;
717 accum1 = accum3 >> 28;
723 c[8] = ((uint32_t)(accum0)) & mask;
724 c[0] = ((uint32_t)(accum1)) & mask;
728 c[9] += ((uint32_t)(accum0));
729 c[1] += ((uint32_t)(accum1));
/*
 * gf_mulw_unsigned: multiply a field element by a small unsigned scalar,
 * cs = as * b, processing the low half (c[i]) and high half (c[i+8])
 * limb streams in parallel with two 64-bit accumulators.
 *
 * NOTE(review): this extraction is incomplete -- the lines that load
 * c0/c8/n0/n8 from a[], the "i" loop counter updates, the ">>= 28"
 * carry shifts between stores, and the widemul() definition are all
 * missing, and original-file line numbers are fused onto each line.
 * The visible pattern alternates n0/n8 and c0/c8 operand pairs,
 * presumably a software-pipelined unroll over limb pairs -- verify
 * against the full source.
 */
732 void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
/* 28-bit digit mask, as in gf_mul/gf_sqr. */
734 uint32_t mask = (1ull << 28) - 1;
735 const uint32_t *a = as->limb;
736 uint32_t *c = cs->limb;
737 uint64_t accum0, accum8;
739 uint32_t c0, c8, n0, n8;
/* First limb pair: full (overwriting) widening multiplies. */
745 accum0 = widemul(b, c0);
746 accum8 = widemul(b, c8);
748 c[0] = accum0 & mask;
750 c[8] = accum8 & mask;
/* Unrolled pipeline: multiply-accumulate b into the running carries,
 * emit one low-half and one high-half digit per step.  (Loads of the
 * next limbs and the intervening carry shifts are not visible here.) */
757 smlal(&accum0, b, n0);
758 smlal(&accum8, b, n8);
760 c[i] = accum0 & mask;
762 c[i + 8] = accum8 & mask;
769 smlal(&accum0, b, c0);
770 smlal(&accum8, b, c8);
772 c[i] = accum0 & mask;
774 c[i + 8] = accum8 & mask;
781 smlal(&accum0, b, n0);
782 smlal(&accum8, b, n8);
784 c[i] = accum0 & mask;
786 c[i + 8] = accum8 & mask;
793 smlal(&accum0, b, c0);
794 smlal(&accum8, b, c8);
796 c[i] = accum0 & mask;
798 c[i + 8] = accum8 & mask;
805 smlal(&accum0, b, n0);
806 smlal(&accum8, b, n8);
808 c[i] = accum0 & mask;
810 c[i + 8] = accum8 & mask;
817 smlal(&accum0, b, c0);
818 smlal(&accum8, b, c8);
820 c[i] = accum0 & mask;
822 c[i + 8] = accum8 & mask;
829 smlal(&accum0, b, n0);
830 smlal(&accum8, b, n8);
832 c[i] = accum0 & mask;
834 c[i + 8] = accum8 & mask;
/* Final carry handling: fold the high-half carry into c[8]/c[9], and
 * wrap the remaining carry back into c[0]/c[1] (lines between the two
 * store pairs are missing from this view). */
839 accum0 += accum8 + c[8];
840 c[8] = accum0 & mask;
841 c[9] += accum0 >> 28;
844 c[0] = accum8 & mask;
845 c[1] += accum8 >> 28;