/*
 * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */
/*
 * Signed multiply-accumulate: *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b.
 *
 * Operands are carried in uint32_t but interpreted as signed 32-bit values
 * (the field code feeds in differences such as b[i] - b[i+8]).  On
 * little-endian ARM this maps to a single SMLAL instruction accumulating
 * into the lo:hi register pair; elsewhere the portable C fallback is used.
 */
static inline void __attribute__ ((gnu_inline, always_inline))
smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo = *acc, hi = (*acc) >> 32;

    __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]"
                         : [lo] "+&r"(lo), [hi] "+&r"(hi)
                         : [a] "r"(a), [b] "r"(b));

    *acc = lo + (((uint64_t)hi) << 32);
#else
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
#endif
}
/*
 * Signed multiply-accumulate with doubled second operand:
 * *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(2*b).
 *
 * The doubling is folded into the operand before the single SMLAL on ARM.
 * 2*b wraps mod 2^32, which is correct here because b is a small (well
 * under 31-bit) field value in all call sites.
 */
static inline void __attribute__ ((gnu_inline, always_inline))
smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo = *acc, hi = (*acc) >> 32;

    __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]"
                         : [lo] "+&r"(lo), [hi] "+&r"(hi)
                         : [a] "r"(a), [b] "r"(2 * b));

    *acc = lo + (((uint64_t)hi) << 32);
#else
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
#endif
}
/*
 * Signed multiply (no accumulate):
 * *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b.
 *
 * Used to start a fresh accumulator column; maps to a single SMULL on
 * little-endian ARM, with write-only ("=&r") output operands.
 */
static inline void __attribute__ ((gnu_inline, always_inline))
smull(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo, hi;

    __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]"
                         : [lo] "=&r"(lo), [hi] "=&r"(hi)
                         : [a] "r"(a), [b] "r"(b));

    *acc = lo + (((uint64_t)hi) << 32);
#else
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
#endif
}
/*
 * Signed multiply with doubled second operand (no accumulate):
 * *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(2*b).
 *
 * Not __volatile__: both outputs are consumed via lo/hi, so the compiler
 * cannot discard the asm, and letting it be moved/CSE'd is safe.
 */
static inline void __attribute__ ((gnu_inline, always_inline))
smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo, hi;

    __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
                              : [lo] "=&r"(lo), [hi] "=&r"(hi)
                              : [a] "r"(a), [b] "r"(2 * b));

    *acc = lo + (((uint64_t)hi) << 32);
#else
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
#endif
}
/*
 * gf_mul: field multiplication cs = as * bs modulo the Ed448
 * "golden-ratio" prime p = 2^448 - 2^224 - 1; field elements are 16 limbs
 * of 28 bits (radix 2^28) in ->limb[].
 *
 * Method visible below: Karatsuba-style split at 2^224 using
 * aa[i] = a[i] + a[i+8] and bm[i] = b[i] - b[i+8], then column-wise
 * accumulation with the SMLAL helpers.  Each round emits two low-half
 * limbs (accum0 -> c[2k], accum2 -> c[2k+1]) and two high-half limbs
 * (accum1 -> c[2k+8], accum3 -> c[2k+9]), then parks the 28-bit carries
 * in accumC0/accumC1 for the next round.  The `ax = ...` / `bx = ...`
 * assignments inside the calls cache an operand that the following call
 * reuses, mirroring ARM register allocation.
 *
 * NOTE(review): this listing is an incomplete extraction -- the opening
 * brace, the declarations of i/ax/bx, loop- and function-closing braces,
 * the inter-round accumulator reloads from accumC0/accumC1, and part of
 * the final carry folding are not visible here.  Restore them from the
 * upstream source before compiling; do not build this fragment as-is.
 */
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
    /* accum0/accum2: even/odd low-half limbs; accum1/accum3: high half. */
    uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
    uint32_t mask = (1 << 28) - 1;      /* 28-bit limb mask */
    uint32_t aa[8], bm[8];

    /* Karatsuba combine terms: sums on the a side, differences on b. */
    for (i = 0; i < 8; i++) {
        aa[i] = a[i] + a[i + 8];
        bm[i] = b[i] - b[i + 8];

    /* Round 1: columns for c[0], c[1] (low half) and c[8], c[9] (high).
     * NOTE(review): the set-up of accum2 before its first smlal below is
     * among the elided lines. */
    smull(&accum1, ax = aa[1], bx = b[15]);
    smull(&accum3, ax = aa[2], bx);
    smlal(&accum1, ax, bx = b[14]);
    smlal(&accum3, ax = aa[3], bx);
    smlal(&accum1, ax, bx = b[13]);
    smlal(&accum3, ax = aa[4], bx);
    smlal(&accum1, ax, bx = b[12]);
    smlal(&accum3, ax = aa[5], bx);
    smlal(&accum1, ax, bx = b[11]);
    smlal(&accum3, ax = aa[6], bx);
    smlal(&accum1, ax, bx = b[10]);
    smlal(&accum3, ax = aa[7], bx);
    smlal(&accum1, ax, bx = b[9]);
    smlal(&accum2, ax = aa[0], bx);
    smlal(&accum0, ax, bx = b[8]);
    smlal(&accum2, ax = aa[1], bx);
    smlal(&accum0, ax = a[9], bx = b[7]);
    smlal(&accum2, ax = a[10], bx);
    smlal(&accum0, ax, bx = b[6]);
    smlal(&accum2, ax = a[11], bx);
    smlal(&accum0, ax, bx = b[5]);
    smlal(&accum2, ax = a[12], bx);
    smlal(&accum0, ax, bx = b[4]);
    smlal(&accum2, ax = a[13], bx);
    smlal(&accum0, ax, bx = b[3]);
    smlal(&accum2, ax = a[14], bx);
    smlal(&accum0, ax, bx = b[2]);
    smlal(&accum2, ax = a[15], bx);
    smlal(&accum0, ax, bx = b[1]);
    smlal(&accum3, ax = a[8], bx);
    smlal(&accum1, ax, bx = b[0]);
    smlal(&accum3, ax = a[9], bx);
    smlal(&accum1, ax = a[1], bx = bm[7]);
    smlal(&accum3, ax = a[2], bx);
    smlal(&accum1, ax, bx = bm[6]);
    smlal(&accum3, ax = a[3], bx);
    smlal(&accum1, ax, bx = bm[5]);
    smlal(&accum3, ax = a[4], bx);
    smlal(&accum1, ax, bx = bm[4]);
    smlal(&accum3, ax = a[5], bx);
    smlal(&accum1, ax, bx = bm[3]);
    smlal(&accum3, ax = a[6], bx);
    smlal(&accum1, ax, bx = bm[2]);
    smlal(&accum3, ax = a[7], bx);
    smlal(&accum1, ax, bx = bm[1]);
    smlal(&accum2, ax = a[0], bx);
    smlal(&accum0, ax, bx = bm[0]);
    smlal(&accum2, ax = a[1], bx);

    /* Propagate 28-bit carries within the round, emit four limbs, and
     * save the round's carry-outs. */
    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[0] = ((uint32_t)(accum0)) & mask;
    c[1] = ((uint32_t)(accum2)) & mask;
    c[8] = ((uint32_t)(accum1)) & mask;
    c[9] = ((uint32_t)(accum3)) & mask;

    accumC0 = accum2 >> 28;
    accumC1 = accum3 >> 28;

    /* Round 2: columns for c[2], c[3], c[10], c[11].
     * NOTE(review): the reloads of accum0..accum3 from accumC0/accumC1
     * for this and the following rounds are elided in this listing. */
    smull(&accum1, ax = aa[3], bx = b[15]);
    smull(&accum3, ax = aa[4], bx);
    smlal(&accum1, ax, bx = b[14]);
    smlal(&accum3, ax = aa[5], bx);
    smlal(&accum1, ax, bx = b[13]);
    smlal(&accum3, ax = aa[6], bx);
    smlal(&accum1, ax, bx = b[12]);
    smlal(&accum3, ax = aa[7], bx);
    smlal(&accum1, ax, bx = b[11]);
    smlal(&accum2, ax = aa[0], bx);
    smlal(&accum0, ax, bx = b[10]);
    smlal(&accum2, ax = aa[1], bx);
    smlal(&accum0, ax, bx = b[9]);
    smlal(&accum2, ax = aa[2], bx);
    smlal(&accum0, ax, bx = b[8]);
    smlal(&accum2, ax = aa[3], bx);
    smlal(&accum0, ax = a[11], bx = b[7]);
    smlal(&accum2, ax = a[12], bx);
    smlal(&accum0, ax, bx = b[6]);
    smlal(&accum2, ax = a[13], bx);
    smlal(&accum0, ax, bx = b[5]);
    smlal(&accum2, ax = a[14], bx);
    smlal(&accum0, ax, bx = b[4]);
    smlal(&accum2, ax = a[15], bx);
    smlal(&accum0, ax, bx = b[3]);
    smlal(&accum3, ax = a[8], bx);
    smlal(&accum1, ax, bx = b[2]);
    smlal(&accum3, ax = a[9], bx);
    smlal(&accum1, ax, bx = b[1]);
    smlal(&accum3, ax = a[10], bx);
    smlal(&accum1, ax, bx = b[0]);
    smlal(&accum3, ax = a[11], bx);
    smlal(&accum1, ax = a[3], bx = bm[7]);
    smlal(&accum3, ax = a[4], bx);
    smlal(&accum1, ax, bx = bm[6]);
    smlal(&accum3, ax = a[5], bx);
    smlal(&accum1, ax, bx = bm[5]);
    smlal(&accum3, ax = a[6], bx);
    smlal(&accum1, ax, bx = bm[4]);
    smlal(&accum3, ax = a[7], bx);
    smlal(&accum1, ax, bx = bm[3]);
    smlal(&accum2, ax = a[0], bx);
    smlal(&accum0, ax, bx = bm[2]);
    smlal(&accum2, ax = a[1], bx);
    smlal(&accum0, ax, bx = bm[1]);
    smlal(&accum2, ax = a[2], bx);
    smlal(&accum0, ax, bx = bm[0]);
    smlal(&accum2, ax = a[3], bx);

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[2] = ((uint32_t)(accum0)) & mask;
    c[3] = ((uint32_t)(accum2)) & mask;
    c[10] = ((uint32_t)(accum1)) & mask;
    c[11] = ((uint32_t)(accum3)) & mask;

    accumC0 = accum2 >> 28;
    accumC1 = accum3 >> 28;

    /* Round 3: columns for c[4], c[5], c[12], c[13]. */
    smull(&accum1, ax = aa[5], bx = b[15]);
    smull(&accum3, ax = aa[6], bx);
    smlal(&accum1, ax, bx = b[14]);
    smlal(&accum3, ax = aa[7], bx);
    smlal(&accum1, ax, bx = b[13]);
    smlal(&accum2, ax = aa[0], bx);
    smlal(&accum0, ax, bx = b[12]);
    smlal(&accum2, ax = aa[1], bx);
    smlal(&accum0, ax, bx = b[11]);
    smlal(&accum2, ax = aa[2], bx);
    smlal(&accum0, ax, bx = b[10]);
    smlal(&accum2, ax = aa[3], bx);
    smlal(&accum0, ax, bx = b[9]);
    smlal(&accum2, ax = aa[4], bx);
    smlal(&accum0, ax, bx = b[8]);
    smlal(&accum2, ax = aa[5], bx);
    smlal(&accum0, ax = a[13], bx = b[7]);
    smlal(&accum2, ax = a[14], bx);
    smlal(&accum0, ax, bx = b[6]);
    smlal(&accum2, ax = a[15], bx);
    smlal(&accum0, ax, bx = b[5]);
    smlal(&accum3, ax = a[8], bx);
    smlal(&accum1, ax, bx = b[4]);
    smlal(&accum3, ax = a[9], bx);
    smlal(&accum1, ax, bx = b[3]);
    smlal(&accum3, ax = a[10], bx);
    smlal(&accum1, ax, bx = b[2]);
    smlal(&accum3, ax = a[11], bx);
    smlal(&accum1, ax, bx = b[1]);
    smlal(&accum3, ax = a[12], bx);
    smlal(&accum1, ax, bx = b[0]);
    smlal(&accum3, ax = a[13], bx);
    smlal(&accum1, ax = a[5], bx = bm[7]);
    smlal(&accum3, ax = a[6], bx);
    smlal(&accum1, ax, bx = bm[6]);
    smlal(&accum3, ax = a[7], bx);
    smlal(&accum1, ax, bx = bm[5]);
    smlal(&accum2, ax = a[0], bx);
    smlal(&accum0, ax, bx = bm[4]);
    smlal(&accum2, ax = a[1], bx);
    smlal(&accum0, ax, bx = bm[3]);
    smlal(&accum2, ax = a[2], bx);
    smlal(&accum0, ax, bx = bm[2]);
    smlal(&accum2, ax = a[3], bx);
    smlal(&accum0, ax, bx = bm[1]);
    smlal(&accum2, ax = a[4], bx);
    smlal(&accum0, ax, bx = bm[0]);
    smlal(&accum2, ax = a[5], bx);

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[4] = ((uint32_t)(accum0)) & mask;
    c[5] = ((uint32_t)(accum2)) & mask;
    c[12] = ((uint32_t)(accum1)) & mask;
    c[13] = ((uint32_t)(accum3)) & mask;

    accumC0 = accum2 >> 28;
    accumC1 = accum3 >> 28;

    /* Round 4: columns for c[6], c[7], c[14], c[15]. */
    smull(&accum1, ax = aa[7], bx = b[15]);
    smull(&accum2, ax = aa[0], bx);
    smlal(&accum0, ax, bx = b[14]);
    smlal(&accum2, ax = aa[1], bx);
    smlal(&accum0, ax, bx = b[13]);
    smlal(&accum2, ax = aa[2], bx);
    smlal(&accum0, ax, bx = b[12]);
    smlal(&accum2, ax = aa[3], bx);
    smlal(&accum0, ax, bx = b[11]);
    smlal(&accum2, ax = aa[4], bx);
    smlal(&accum0, ax, bx = b[10]);
    smlal(&accum2, ax = aa[5], bx);
    smlal(&accum0, ax, bx = b[9]);
    smlal(&accum2, ax = aa[6], bx);
    smlal(&accum0, ax, bx = b[8]);
    smlal(&accum2, ax = aa[7], bx);
    smlal(&accum0, ax = a[15], bx = b[7]);
    smlal(&accum3, ax = a[8], bx);
    smlal(&accum1, ax, bx = b[6]);
    smlal(&accum3, ax = a[9], bx);
    smlal(&accum1, ax, bx = b[5]);
    smlal(&accum3, ax = a[10], bx);
    smlal(&accum1, ax, bx = b[4]);
    smlal(&accum3, ax = a[11], bx);
    smlal(&accum1, ax, bx = b[3]);
    smlal(&accum3, ax = a[12], bx);
    smlal(&accum1, ax, bx = b[2]);
    smlal(&accum3, ax = a[13], bx);
    smlal(&accum1, ax, bx = b[1]);
    smlal(&accum3, ax = a[14], bx);
    smlal(&accum1, ax, bx = b[0]);
    smlal(&accum3, ax = a[15], bx);
    smlal(&accum1, ax = a[7], bx = bm[7]);
    smlal(&accum2, ax = a[0], bx);
    smlal(&accum0, ax, bx = bm[6]);
    smlal(&accum2, ax = a[1], bx);
    smlal(&accum0, ax, bx = bm[5]);
    smlal(&accum2, ax = a[2], bx);
    smlal(&accum0, ax, bx = bm[4]);
    smlal(&accum2, ax = a[3], bx);
    smlal(&accum0, ax, bx = bm[3]);
    smlal(&accum2, ax = a[4], bx);
    smlal(&accum0, ax, bx = bm[2]);
    smlal(&accum2, ax = a[5], bx);
    smlal(&accum0, ax, bx = bm[1]);
    smlal(&accum2, ax = a[6], bx);
    smlal(&accum0, ax, bx = bm[0]);
    smlal(&accum2, ax = a[7], bx);

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[6] = ((uint32_t)(accum0)) & mask;
    c[7] = ((uint32_t)(accum2)) & mask;
    c[14] = ((uint32_t)(accum1)) & mask;
    c[15] = ((uint32_t)(accum3)) & mask;

    /* Final wrap-around: a carry of weight 2^448 equals 2^224 + 1 mod p,
     * so the top carries re-enter at limbs 8 and 0.
     * NOTE(review): the statements between these stores (folding the
     * previously written c[8]/c[0] and shifting accum0/accum1 by 28
     * before the c[9]/c[1] adds) appear to be elided in this listing. */
    accum0 = accum2 >> 28;
    accum1 = accum3 >> 28;

    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;

    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
/*
 * gf_sqr: field squaring cs = as^2 modulo p = 2^448 - 2^224 - 1, same
 * limb layout as gf_mul (16 x 28-bit limbs).
 *
 * Squaring specialisation of the gf_mul column walk: cross products
 * a[i]*a[j] (i != j) appear twice and use the doubled helpers
 * smull2/smlal2, while square terms use `smlal(&acc, ax, ax)`.  Only the
 * difference vector bm[i] = a[i] - a[i+8] is needed (the aa sums of
 * gf_mul are recovered via the `tmp` arithmetic between rounds).
 *
 * NOTE(review): this listing is an incomplete extraction -- the opening
 * brace, the declarations of bm[8]/i/ax/bx, closing braces, the `tmp = ...`
 * assignments that precede each `accum3 = tmp - accum2;` /
 * `accum1 = tmp - accum0;` pair, and the inter-round accumulator reloads
 * from accumC0/accumC1 are not visible here.  Restore them from the
 * upstream source before compiling; do not build this fragment as-is.
 */
void gf_sqr(gf_s * __restrict__ cs, const gf as)
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;
    uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
    uint32_t mask = (1 << 28) - 1;      /* 28-bit limb mask */

    /* Difference of the two halves, used for the Karatsuba middle part. */
    for (i = 0; i < 8; i++) {
        bm[i] = a[i] - a[i + 8];

    /* Round 1: columns for c[0], c[1], c[8], c[9]. */
    smull2(&accum1, ax = a[9], bx = a[15]);
    smull2(&accum3, ax = a[10], bx);
    smlal2(&accum1, ax, bx = a[14]);
    smlal2(&accum3, ax = a[11], bx);
    smlal2(&accum1, ax, bx = a[13]);
    smlal2(&accum3, ax = a[12], bx);
    smlal(&accum1, ax, ax);             /* square term a[12]^2 */
    smlal2(&accum2, ax = a[8], a[9]);
    smlal(&accum0, ax, ax);             /* square term a[8]^2 */
    smlal2(&accum0, ax = a[1], bx = a[7]);
    smlal2(&accum2, ax = a[2], bx);
    smlal2(&accum0, ax, bx = a[6]);
    smlal2(&accum2, ax = a[3], bx);
    smlal2(&accum0, ax, bx = a[5]);
    smlal2(&accum2, ax = a[4], bx);
    smlal(&accum0, ax, ax);             /* square term a[4]^2 */
    smlal2(&accum3, ax = a[0], bx = a[1]);
    smlal(&accum1, ax, ax);             /* square term a[0]^2 */
    smlal2(&accum1, ax = bm[1], bx = bm[7]);
    smlal2(&accum3, ax = bm[2], bx);
    smlal2(&accum1, ax, bx = bm[6]);
    smlal2(&accum3, ax = bm[3], bx);
    smlal2(&accum1, ax, bx = bm[5]);
    smlal2(&accum3, ax = bm[4], bx);
    smlal(&accum1, ax, ax);             /* square term bm[4]^2 */
    smlal2(&accum2, ax = bm[0], bx = bm[1]);
    smlal(&accum0, ax, ax);             /* square term bm[0]^2 */

    /* NOTE(review): each `tmp - ...` below relies on an elided
     * `tmp = ...` assignment immediately before it. */
    accum3 = tmp - accum2;
    accum1 = tmp - accum0;

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[0] = ((uint32_t)(accum0)) & mask;
    c[1] = ((uint32_t)(accum2)) & mask;
    c[8] = ((uint32_t)(accum1)) & mask;
    c[9] = ((uint32_t)(accum3)) & mask;

    accumC0 = accum2 >> 28;
    accumC1 = accum3 >> 28;

    /* Round 2: columns for c[2], c[3], c[10], c[11]. */
    smull2(&accum1, ax = a[11], bx = a[15]);
    smull2(&accum3, ax = a[12], bx);
    smlal2(&accum1, ax, bx = a[14]);
    smlal2(&accum3, ax = a[13], bx);
    smlal(&accum1, ax, ax);             /* a[13]^2 */
    smlal2(&accum2, ax = a[8], bx = a[11]);
    smlal2(&accum0, ax, bx = a[10]);
    smlal2(&accum2, ax = a[9], bx);
    smlal(&accum0, ax, ax);             /* a[9]^2 */
    smlal2(&accum0, ax = a[3], bx = a[7]);
    smlal2(&accum2, ax = a[4], bx);
    smlal2(&accum0, ax, bx = a[6]);
    smlal2(&accum2, ax = a[5], bx);
    smlal(&accum0, ax, ax);             /* a[5]^2 */
    smlal2(&accum3, ax = a[0], bx = a[3]);
    smlal2(&accum1, ax, bx = a[2]);
    smlal2(&accum3, ax = a[1], bx);
    smlal(&accum1, ax, ax);             /* a[1]^2 */
    smlal2(&accum1, ax = bm[3], bx = bm[7]);
    smlal2(&accum3, ax = bm[4], bx);
    smlal2(&accum1, ax, bx = bm[6]);
    smlal2(&accum3, ax = bm[5], bx);
    smlal(&accum1, ax, ax);             /* bm[5]^2 */
    smlal2(&accum2, ax = bm[0], bx = bm[3]);
    smlal2(&accum0, ax, bx = bm[2]);
    smlal2(&accum2, ax = bm[1], bx);
    smlal(&accum0, ax, ax);             /* bm[1]^2 */

    accum3 = tmp - accum2;
    accum1 = tmp - accum0;

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[2] = ((uint32_t)(accum0)) & mask;
    c[3] = ((uint32_t)(accum2)) & mask;
    c[10] = ((uint32_t)(accum1)) & mask;
    c[11] = ((uint32_t)(accum3)) & mask;

    accumC0 = accum2 >> 28;
    accumC1 = accum3 >> 28;

    /* Round 3: columns for c[4], c[5], c[12], c[13]. */
    smull2(&accum1, ax = a[13], bx = a[15]);
    smull2(&accum3, ax = a[14], bx);
    smlal(&accum1, ax, ax);             /* a[14]^2 */
    smlal2(&accum2, ax = a[8], bx = a[13]);
    smlal2(&accum0, ax, bx = a[12]);
    smlal2(&accum2, ax = a[9], bx);
    smlal2(&accum0, ax, bx = a[11]);
    smlal2(&accum2, ax = a[10], bx);
    smlal(&accum0, ax, ax);             /* a[10]^2 */
    smlal2(&accum0, ax = a[5], bx = a[7]);
    smlal2(&accum2, ax = a[6], bx);
    smlal(&accum0, ax, ax);             /* a[6]^2 */
    smlal2(&accum3, ax = a[0], bx = a[5]);
    smlal2(&accum1, ax, bx = a[4]);
    smlal2(&accum3, ax = a[1], bx);
    smlal2(&accum1, ax, bx = a[3]);
    smlal2(&accum3, ax = a[2], bx);
    smlal(&accum1, ax, ax);             /* a[2]^2 */
    smlal2(&accum1, ax = bm[5], bx = bm[7]);
    smlal2(&accum3, ax = bm[6], bx);
    smlal(&accum1, ax, ax);             /* bm[6]^2 */
    smlal2(&accum2, ax = bm[0], bx = bm[5]);
    smlal2(&accum0, ax, bx = bm[4]);
    smlal2(&accum2, ax = bm[1], bx);
    smlal2(&accum0, ax, bx = bm[3]);
    smlal2(&accum2, ax = bm[2], bx);
    smlal(&accum0, ax, ax);             /* bm[2]^2 */

    accum3 = tmp - accum2;
    accum1 = tmp - accum0;

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[4] = ((uint32_t)(accum0)) & mask;
    c[5] = ((uint32_t)(accum2)) & mask;
    c[12] = ((uint32_t)(accum1)) & mask;
    c[13] = ((uint32_t)(accum3)) & mask;

    accumC0 = accum2 >> 28;
    accumC1 = accum3 >> 28;

    /* Round 4: columns for c[6], c[7], c[14], c[15]. */
    smull(&accum1, ax = a[15], bx = a[15]);     /* a[15]^2, not doubled */
    smull2(&accum2, ax = a[8], bx);
    smlal2(&accum0, ax, bx = a[14]);
    smlal2(&accum2, ax = a[9], bx);
    smlal2(&accum0, ax, bx = a[13]);
    smlal2(&accum2, ax = a[10], bx);
    smlal2(&accum0, ax, bx = a[12]);
    smlal2(&accum2, ax = a[11], bx);
    smlal(&accum0, ax, ax);             /* a[11]^2 */
    smlal(&accum0, ax = a[7], bx = a[7]);       /* a[7]^2 */
    smlal2(&accum3, ax = a[0], bx);
    smlal2(&accum1, ax, bx = a[6]);
    smlal2(&accum3, ax = a[1], bx);
    smlal2(&accum1, ax, bx = a[5]);
    smlal2(&accum3, ax = a[2], bx);
    smlal2(&accum1, ax, bx = a[4]);
    smlal2(&accum3, ax = a[3], bx);
    smlal(&accum1, ax, ax);             /* a[3]^2 */
    /* NOTE(review): the assignment that sets bx (presumably bx = bm[7])
     * before this square term is elided in this listing. */
    smlal(&accum1, bx, bx);
    smlal2(&accum2, ax = bm[0], bx);
    smlal2(&accum0, ax, bx = bm[6]);
    smlal2(&accum2, ax = bm[1], bx);
    smlal2(&accum0, ax, bx = bm[5]);
    smlal2(&accum2, ax = bm[2], bx);
    smlal2(&accum0, ax, bx = bm[4]);
    smlal2(&accum2, ax = bm[3], bx);
    smlal(&accum0, ax, ax);             /* bm[3]^2 */

    accum3 = tmp - accum2;
    accum1 = tmp - accum0;

    accum2 += accum0 >> 28;
    accum3 += accum1 >> 28;

    c[6] = ((uint32_t)(accum0)) & mask;
    c[7] = ((uint32_t)(accum2)) & mask;
    c[14] = ((uint32_t)(accum1)) & mask;
    c[15] = ((uint32_t)(accum3)) & mask;

    /* Final wrap-around, same shape as gf_mul: 2^448 = 2^224 + 1 mod p.
     * NOTE(review): the fold of the previously written c[8]/c[0] and the
     * 28-bit shifts before the c[9]/c[1] adds appear to be elided here. */
    accum0 = accum2 >> 28;
    accum1 = accum3 >> 28;

    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;

    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
729 void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
731 uint32_t mask = (1ull << 28) - 1;
734 const uint32_t *a = as->limb;
735 uint32_t *c = cs->limb;
737 uint64_t accum0, accum8;
741 uint32_t c0, c8, n0, n8;
744 accum0 = widemul(b, c0);
745 accum8 = widemul(b, c8);
747 c[0] = accum0 & mask;
749 c[8] = accum8 & mask;
756 smlal(&accum0, b, n0);
757 smlal(&accum8, b, n8);
759 c[i] = accum0 & mask;
761 c[i + 8] = accum8 & mask;
768 smlal(&accum0, b, c0);
769 smlal(&accum8, b, c8);
771 c[i] = accum0 & mask;
773 c[i + 8] = accum8 & mask;
780 smlal(&accum0, b, n0);
781 smlal(&accum8, b, n8);
783 c[i] = accum0 & mask;
785 c[i + 8] = accum8 & mask;
792 smlal(&accum0, b, c0);
793 smlal(&accum8, b, c8);
795 c[i] = accum0 & mask;
797 c[i + 8] = accum8 & mask;
804 smlal(&accum0, b, n0);
805 smlal(&accum8, b, n8);
807 c[i] = accum0 & mask;
809 c[i + 8] = accum8 & mask;
816 smlal(&accum0, b, c0);
817 smlal(&accum8, b, c8);
819 c[i] = accum0 & mask;
821 c[i + 8] = accum8 & mask;
828 smlal(&accum0, b, n0);
829 smlal(&accum8, b, n8);
831 c[i] = accum0 & mask;
833 c[i + 8] = accum8 & mask;
838 accum0 += accum8 + c[8];
839 c[8] = accum0 & mask;
840 c[9] += accum0 >> 28;
843 c[0] = accum8 & mask;
844 c[1] += accum8 >> 28;