crypto/bn/bn_asm.c

   1 /*
   2  * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 #include <assert.h>
  11 #include <openssl/crypto.h>
  12 #include "internal/cryptlib.h"
  13 #include "bn_local.h"
  14
  15 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  16
  17 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  18                           BN_ULONG w)
  19 {
  20     BN_ULONG c1 = 0;
  21
  22     assert(num >= 0);
  23     if (num <= 0)
  24         return c1;
  25
  26 # ifndef OPENSSL_SMALL_FOOTPRINT
  27     while (num & ~3) {
  28         mul_add(rp[0], ap[0], w, c1);
  29         mul_add(rp[1], ap[1], w, c1);
  30         mul_add(rp[2], ap[2], w, c1);
  31         mul_add(rp[3], ap[3], w, c1);
  32         ap += 4;
  33         rp += 4;
  34         num -= 4;
  35     }
  36 # endif
  37     while (num) {
  38         mul_add(rp[0], ap[0], w, c1);
  39         ap++;
  40         rp++;
  41         num--;
  42     }
  43
  44     return c1;
  45 }
  46
  47 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  48 {
  49     BN_ULONG c1 = 0;
  50
  51     assert(num >= 0);
  52     if (num <= 0)
  53         return c1;
  54
  55 # ifndef OPENSSL_SMALL_FOOTPRINT
  56     while (num & ~3) {
  57         mul(rp[0], ap[0], w, c1);
  58         mul(rp[1], ap[1], w, c1);
  59         mul(rp[2], ap[2], w, c1);
  60         mul(rp[3], ap[3], w, c1);
  61         ap += 4;
  62         rp += 4;
  63         num -= 4;
  64     }
  65 # endif
  66     while (num) {
  67         mul(rp[0], ap[0], w, c1);
  68         ap++;
  69         rp++;
  70         num--;
  71     }
  72     return c1;
  73 }
  74
  75 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  76 {
  77     assert(n >= 0);
  78     if (n <= 0)
  79         return;
  80
  81 # ifndef OPENSSL_SMALL_FOOTPRINT
  82     while (n & ~3) {
  83         sqr(r[0], r[1], a[0]);
  84         sqr(r[2], r[3], a[1]);
  85         sqr(r[4], r[5], a[2]);
  86         sqr(r[6], r[7], a[3]);
  87         a += 4;
  88         r += 8;
  89         n -= 4;
  90     }
  91 # endif
  92     while (n) {
  93         sqr(r[0], r[1], a[0]);
  94         a++;
  95         r += 2;
  96         n--;
  97     }
  98 }
  99
 100 #else                           /* !(defined(BN_LLONG) ||
 101                                  * defined(BN_UMULT_HIGH)) */
 102
 103 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 104                           BN_ULONG w)
 105 {
 106     BN_ULONG c = 0;
 107     BN_ULONG bl, bh;
 108
 109     assert(num >= 0);
 110     if (num <= 0)
 111         return (BN_ULONG)0;
 112
 113     bl = LBITS(w);
 114     bh = HBITS(w);
 115
 116 # ifndef OPENSSL_SMALL_FOOTPRINT
 117     while (num & ~3) {
 118         mul_add(rp[0], ap[0], bl, bh, c);
 119         mul_add(rp[1], ap[1], bl, bh, c);
 120         mul_add(rp[2], ap[2], bl, bh, c);
 121         mul_add(rp[3], ap[3], bl, bh, c);
 122         ap += 4;
 123         rp += 4;
 124         num -= 4;
 125     }
 126 # endif
 127     while (num) {
 128         mul_add(rp[0], ap[0], bl, bh, c);
 129         ap++;
 130         rp++;
 131         num--;
 132     }
 133     return c;
 134 }
 135
 136 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 137 {
 138     BN_ULONG carry = 0;
 139     BN_ULONG bl, bh;
 140
 141     assert(num >= 0);
 142     if (num <= 0)
 143         return (BN_ULONG)0;
 144
 145     bl = LBITS(w);
 146     bh = HBITS(w);
 147
 148 # ifndef OPENSSL_SMALL_FOOTPRINT
 149     while (num & ~3) {
 150         mul(rp[0], ap[0], bl, bh, carry);
 151         mul(rp[1], ap[1], bl, bh, carry);
 152         mul(rp[2], ap[2], bl, bh, carry);
 153         mul(rp[3], ap[3], bl, bh, carry);
 154         ap += 4;
 155         rp += 4;
 156         num -= 4;
 157     }
 158 # endif
 159     while (num) {
 160         mul(rp[0], ap[0], bl, bh, carry);
 161         ap++;
 162         rp++;
 163         num--;
 164     }
 165     return carry;
 166 }
 167
 168 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 169 {
 170     assert(n >= 0);
 171     if (n <= 0)
 172         return;
 173
 174 # ifndef OPENSSL_SMALL_FOOTPRINT
 175     while (n & ~3) {
 176         sqr64(r[0], r[1], a[0]);
 177         sqr64(r[2], r[3], a[1]);
 178         sqr64(r[4], r[5], a[2]);
 179         sqr64(r[6], r[7], a[3]);
 180         a += 4;
 181         r += 8;
 182         n -= 4;
 183     }
 184 # endif
 185     while (n) {
 186         sqr64(r[0], r[1], a[0]);
 187         a++;
 188         r += 2;
 189         n--;
 190     }
 191 }
 192
 193 #endif                          /* !(defined(BN_LLONG) ||
 194                                  * defined(BN_UMULT_HIGH)) */
 195
 196 #if defined(BN_LLONG) && defined(BN_DIV2W)
 197
 198 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 199 {
 200     return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
 201 }
 202
 203 #else
 204
 205 /* Divide h,l by d and return the result. */
 206 /* I need to test this some more :-( */
 207 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 208 {
 209     BN_ULONG dh, dl, q, ret = 0, th, tl, t;
 210     int i, count = 2;
 211
 212     if (d == 0)
 213         return BN_MASK2;
 214
 215     i = BN_num_bits_word(d);
 216     assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
 217
 218     i = BN_BITS2 - i;
 219     if (h >= d)
 220         h -= d;
 221
 222     if (i) {
 223         d <<= i;
 224         h = (h << i) | (l >> (BN_BITS2 - i));
 225         l <<= i;
 226     }
 227     dh = (d & BN_MASK2h) >> BN_BITS4;
 228     dl = (d & BN_MASK2l);
 229     for (;;) {
 230         if ((h >> BN_BITS4) == dh)
 231             q = BN_MASK2l;
 232         else
 233             q = h / dh;
 234
 235         th = q * dh;
 236         tl = dl * q;
 237         for (;;) {
 238             t = h - th;
 239             if ((t & BN_MASK2h) ||
 240                 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
 241                 break;
 242             q--;
 243             th -= dh;
 244             tl -= dl;
 245         }
 246         t = (tl >> BN_BITS4);
 247         tl = (tl << BN_BITS4) & BN_MASK2h;
 248         th += t;
 249
 250         if (l < tl)
 251             th++;
 252         l -= tl;
 253         if (h < th) {
 254             h += d;
 255             q--;
 256         }
 257         h -= th;
 258
 259         if (--count == 0)
 260             break;
 261
 262         ret = q << BN_BITS4;
 263         h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
 264         l = (l & BN_MASK2l) << BN_BITS4;
 265     }
 266     ret |= q;
 267     return ret;
 268 }
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
 270
 271 #ifdef BN_LLONG
 272 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 273                       int n)
 274 {
 275     BN_ULLONG ll = 0;
 276
 277     assert(n >= 0);
 278     if (n <= 0)
 279         return (BN_ULONG)0;
 280
 281 # ifndef OPENSSL_SMALL_FOOTPRINT
 282     while (n & ~3) {
 283         ll += (BN_ULLONG) a[0] + b[0];
 284         r[0] = (BN_ULONG)ll & BN_MASK2;
 285         ll >>= BN_BITS2;
 286         ll += (BN_ULLONG) a[1] + b[1];
 287         r[1] = (BN_ULONG)ll & BN_MASK2;
 288         ll >>= BN_BITS2;
 289         ll += (BN_ULLONG) a[2] + b[2];
 290         r[2] = (BN_ULONG)ll & BN_MASK2;
 291         ll >>= BN_BITS2;
 292         ll += (BN_ULLONG) a[3] + b[3];
 293         r[3] = (BN_ULONG)ll & BN_MASK2;
 294         ll >>= BN_BITS2;
 295         a += 4;
 296         b += 4;
 297         r += 4;
 298         n -= 4;
 299     }
 300 # endif
 301     while (n) {
 302         ll += (BN_ULLONG) a[0] + b[0];
 303         r[0] = (BN_ULONG)ll & BN_MASK2;
 304         ll >>= BN_BITS2;
 305         a++;
 306         b++;
 307         r++;
 308         n--;
 309     }
 310     return (BN_ULONG)ll;
 311 }
 312 #else                           /* !BN_LLONG */
 313 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 314                       int n)
 315 {
 316     BN_ULONG c, l, t;
 317
 318     assert(n >= 0);
 319     if (n <= 0)
 320         return (BN_ULONG)0;
 321
 322     c = 0;
 323 # ifndef OPENSSL_SMALL_FOOTPRINT
 324     while (n & ~3) {
 325         t = a[0];
 326         t = (t + c) & BN_MASK2;
 327         c = (t < c);
 328         l = (t + b[0]) & BN_MASK2;
 329         c += (l < t);
 330         r[0] = l;
 331         t = a[1];
 332         t = (t + c) & BN_MASK2;
 333         c = (t < c);
 334         l = (t + b[1]) & BN_MASK2;
 335         c += (l < t);
 336         r[1] = l;
 337         t = a[2];
 338         t = (t + c) & BN_MASK2;
 339         c = (t < c);
 340         l = (t + b[2]) & BN_MASK2;
 341         c += (l < t);
 342         r[2] = l;
 343         t = a[3];
 344         t = (t + c) & BN_MASK2;
 345         c = (t < c);
 346         l = (t + b[3]) & BN_MASK2;
 347         c += (l < t);
 348         r[3] = l;
 349         a += 4;
 350         b += 4;
 351         r += 4;
 352         n -= 4;
 353     }
 354 # endif
 355     while (n) {
 356         t = a[0];
 357         t = (t + c) & BN_MASK2;
 358         c = (t < c);
 359         l = (t + b[0]) & BN_MASK2;
 360         c += (l < t);
 361         r[0] = l;
 362         a++;
 363         b++;
 364         r++;
 365         n--;
 366     }
 367     return (BN_ULONG)c;
 368 }
 369 #endif                          /* !BN_LLONG */
 370
 371 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 372                       int n)
 373 {
 374     BN_ULONG t1, t2;
 375     int c = 0;
 376
 377     assert(n >= 0);
 378     if (n <= 0)
 379         return (BN_ULONG)0;
 380
 381 #ifndef OPENSSL_SMALL_FOOTPRINT
 382     while (n & ~3) {
 383         t1 = a[0];
 384         t2 = (t1 - c) & BN_MASK2;
 385         c  = (t2 > t1);
 386         t1 = b[0];
 387         t1 = (t2 - t1) & BN_MASK2;
 388         r[0] = t1;
 389         c += (t1 > t2);
 390         t1 = a[1];
 391         t2 = (t1 - c) & BN_MASK2;
 392         c  = (t2 > t1);
 393         t1 = b[1];
 394         t1 = (t2 - t1) & BN_MASK2;
 395         r[1] = t1;
 396         c += (t1 > t2);
 397         t1 = a[2];
 398         t2 = (t1 - c) & BN_MASK2;
 399         c  = (t2 > t1);
 400         t1 = b[2];
 401         t1 = (t2 - t1) & BN_MASK2;
 402         r[2] = t1;
 403         c += (t1 > t2);
 404         t1 = a[3];
 405         t2 = (t1 - c) & BN_MASK2;
 406         c  = (t2 > t1);
 407         t1 = b[3];
 408         t1 = (t2 - t1) & BN_MASK2;
 409         r[3] = t1;
 410         c += (t1 > t2);
 411         a += 4;
 412         b += 4;
 413         r += 4;
 414         n -= 4;
 415     }
 416 #endif
 417     while (n) {
 418         t1 = a[0];
 419         t2 = (t1 - c) & BN_MASK2;
 420         c  = (t2 > t1);
 421         t1 = b[0];
 422         t1 = (t2 - t1) & BN_MASK2;
 423         r[0] = t1;
 424         c += (t1 > t2);
 425         a++;
 426         b++;
 427         r++;
 428         n--;
 429     }
 430     return c;
 431 }
 432
 433 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 434
 435 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 436 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 437 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
 442
 443 # ifdef BN_LLONG
 444 /*
 445  * Keep in mind that additions to multiplication result can not
 446  * overflow, because its high half cannot be all-ones.
 447  */
 448 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 449         BN_ULONG hi;                            \
 450         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 451         t += c0;                /* no carry */  \
 452         c0 = (BN_ULONG)Lw(t);                   \
 453         hi = (BN_ULONG)Hw(t);                   \
 454         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 455         } while(0)
 456
 457 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 458         BN_ULONG hi;                            \
 459         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 460         BN_ULLONG tt = t+c0;    /* no carry */  \
 461         c0 = (BN_ULONG)Lw(tt);                  \
 462         hi = (BN_ULONG)Hw(tt);                  \
 463         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 464         t += c0;                /* no carry */  \
 465         c0 = (BN_ULONG)Lw(t);                   \
 466         hi = (BN_ULONG)Hw(t);                   \
 467         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 468         } while(0)
 469
 470 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 471         BN_ULONG hi;                            \
 472         BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
 473         t += c0;                /* no carry */  \
 474         c0 = (BN_ULONG)Lw(t);                   \
 475         hi = (BN_ULONG)Hw(t);                   \
 476         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 477         } while(0)
 478
 479 #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 480         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 481
 482 # elif defined(BN_UMULT_LOHI)
 483 /*
 484  * Keep in mind that additions to hi can not overflow, because
 485  * the high word of a multiplication result cannot be all-ones.
 486  */
 487 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 488         BN_ULONG ta = (a), tb = (b);            \
 489         BN_ULONG lo, hi;                        \
 490         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 491         c0 += lo; hi += (c0<lo);                \
 492         c1 += hi; c2 += (c1<hi);                \
 493         } while(0)
 494
 495 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 496         BN_ULONG ta = (a), tb = (b);            \
 497         BN_ULONG lo, hi, tt;                    \
 498         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 499         c0 += lo; tt = hi + (c0<lo);            \
 500         c1 += tt; c2 += (c1<tt);                \
 501         c0 += lo; hi += (c0<lo);                \
 502         c1 += hi; c2 += (c1<hi);                \
 503         } while(0)
 504
 505 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 506         BN_ULONG ta = (a)[i];                   \
 507         BN_ULONG lo, hi;                        \
 508         BN_UMULT_LOHI(lo,hi,ta,ta);             \
 509         c0 += lo; hi += (c0<lo);                \
 510         c1 += hi; c2 += (c1<hi);                \
 511         } while(0)
 512
 513 #  define sqr_add_c2(a,i,j,c0,c1,c2)    \
 514         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 515
 516 # elif defined(BN_UMULT_HIGH)
 517 /*
 518  * Keep in mind that additions to hi can not overflow, because
 519  * the high word of a multiplication result cannot be all-ones.
 520  */
 521 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 522         BN_ULONG ta = (a), tb = (b);            \
 523         BN_ULONG lo = ta * tb;                  \
 524         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 525         c0 += lo; hi += (c0<lo);                \
 526         c1 += hi; c2 += (c1<hi);                \
 527         } while(0)
 528
 529 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 530         BN_ULONG ta = (a), tb = (b), tt;        \
 531         BN_ULONG lo = ta * tb;                  \
 532         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 533         c0 += lo; tt = hi + (c0<lo);            \
 534         c1 += tt; c2 += (c1<tt);                \
 535         c0 += lo; hi += (c0<lo);                \
 536         c1 += hi; c2 += (c1<hi);                \
 537         } while(0)
 538
 539 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 540         BN_ULONG ta = (a)[i];                   \
 541         BN_ULONG lo = ta * ta;                  \
 542         BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
 543         c0 += lo; hi += (c0<lo);                \
 544         c1 += hi; c2 += (c1<hi);                \
 545         } while(0)
 546
 547 #  define sqr_add_c2(a,i,j,c0,c1,c2)      \
 548         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 549
 550 # else                          /* !BN_LLONG */
 551 /*
 552  * Keep in mind that additions to hi can not overflow, because
 553  * the high word of a multiplication result cannot be all-ones.
 554  */
 555 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 556         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 557         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 558         mul64(lo,hi,bl,bh);                     \
 559         c0 = (c0+lo)&BN_MASK2; hi += (c0<lo);   \
 560         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 561         } while(0)
 562
 563 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 564         BN_ULONG tt;                            \
 565         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 566         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 567         mul64(lo,hi,bl,bh);                     \
 568         tt = hi;                                \
 569         c0 = (c0+lo)&BN_MASK2; tt += (c0<lo);   \
 570         c1 = (c1+tt)&BN_MASK2; c2 += (c1<tt);   \
 571         c0 = (c0+lo)&BN_MASK2; hi += (c0<lo);   \
 572         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 573         } while(0)
 574
 575 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 576         BN_ULONG lo, hi;                        \
 577         sqr64(lo,hi,(a)[i]);                    \
 578         c0 = (c0+lo)&BN_MASK2; hi += (c0<lo);   \
 579         c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
 580         } while(0)
 581
 582 #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 583         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 584 # endif                         /* !BN_LLONG */
 585
 586 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 587 {
 588     BN_ULONG c1, c2, c3;
 589
 590     c1 = 0;
 591     c2 = 0;
 592     c3 = 0;
 593     mul_add_c(a[0], b[0], c1, c2, c3);
 594     r[0] = c1;
 595     c1 = 0;
 596     mul_add_c(a[0], b[1], c2, c3, c1);
 597     mul_add_c(a[1], b[0], c2, c3, c1);
 598     r[1] = c2;
 599     c2 = 0;
 600     mul_add_c(a[2], b[0], c3, c1, c2);
 601     mul_add_c(a[1], b[1], c3, c1, c2);
 602     mul_add_c(a[0], b[2], c3, c1, c2);
 603     r[2] = c3;
 604     c3 = 0;
 605     mul_add_c(a[0], b[3], c1, c2, c3);
 606     mul_add_c(a[1], b[2], c1, c2, c3);
 607     mul_add_c(a[2], b[1], c1, c2, c3);
 608     mul_add_c(a[3], b[0], c1, c2, c3);
 609     r[3] = c1;
 610     c1 = 0;
 611     mul_add_c(a[4], b[0], c2, c3, c1);
 612     mul_add_c(a[3], b[1], c2, c3, c1);
 613     mul_add_c(a[2], b[2], c2, c3, c1);
 614     mul_add_c(a[1], b[3], c2, c3, c1);
 615     mul_add_c(a[0], b[4], c2, c3, c1);
 616     r[4] = c2;
 617     c2 = 0;
 618     mul_add_c(a[0], b[5], c3, c1, c2);
 619     mul_add_c(a[1], b[4], c3, c1, c2);
 620     mul_add_c(a[2], b[3], c3, c1, c2);
 621     mul_add_c(a[3], b[2], c3, c1, c2);
 622     mul_add_c(a[4], b[1], c3, c1, c2);
 623     mul_add_c(a[5], b[0], c3, c1, c2);
 624     r[5] = c3;
 625     c3 = 0;
 626     mul_add_c(a[6], b[0], c1, c2, c3);
 627     mul_add_c(a[5], b[1], c1, c2, c3);
 628     mul_add_c(a[4], b[2], c1, c2, c3);
 629     mul_add_c(a[3], b[3], c1, c2, c3);
 630     mul_add_c(a[2], b[4], c1, c2, c3);
 631     mul_add_c(a[1], b[5], c1, c2, c3);
 632     mul_add_c(a[0], b[6], c1, c2, c3);
 633     r[6] = c1;
 634     c1 = 0;
 635     mul_add_c(a[0], b[7], c2, c3, c1);
 636     mul_add_c(a[1], b[6], c2, c3, c1);
 637     mul_add_c(a[2], b[5], c2, c3, c1);
 638     mul_add_c(a[3], b[4], c2, c3, c1);
 639     mul_add_c(a[4], b[3], c2, c3, c1);
 640     mul_add_c(a[5], b[2], c2, c3, c1);
 641     mul_add_c(a[6], b[1], c2, c3, c1);
 642     mul_add_c(a[7], b[0], c2, c3, c1);
 643     r[7] = c2;
 644     c2 = 0;
 645     mul_add_c(a[7], b[1], c3, c1, c2);
 646     mul_add_c(a[6], b[2], c3, c1, c2);
 647     mul_add_c(a[5], b[3], c3, c1, c2);
 648     mul_add_c(a[4], b[4], c3, c1, c2);
 649     mul_add_c(a[3], b[5], c3, c1, c2);
 650     mul_add_c(a[2], b[6], c3, c1, c2);
 651     mul_add_c(a[1], b[7], c3, c1, c2);
 652     r[8] = c3;
 653     c3 = 0;
 654     mul_add_c(a[2], b[7], c1, c2, c3);
 655     mul_add_c(a[3], b[6], c1, c2, c3);
 656     mul_add_c(a[4], b[5], c1, c2, c3);
 657     mul_add_c(a[5], b[4], c1, c2, c3);
 658     mul_add_c(a[6], b[3], c1, c2, c3);
 659     mul_add_c(a[7], b[2], c1, c2, c3);
 660     r[9] = c1;
 661     c1 = 0;
 662     mul_add_c(a[7], b[3], c2, c3, c1);
 663     mul_add_c(a[6], b[4], c2, c3, c1);
 664     mul_add_c(a[5], b[5], c2, c3, c1);
 665     mul_add_c(a[4], b[6], c2, c3, c1);
 666     mul_add_c(a[3], b[7], c2, c3, c1);
 667     r[10] = c2;
 668     c2 = 0;
 669     mul_add_c(a[4], b[7], c3, c1, c2);
 670     mul_add_c(a[5], b[6], c3, c1, c2);
 671     mul_add_c(a[6], b[5], c3, c1, c2);
 672     mul_add_c(a[7], b[4], c3, c1, c2);
 673     r[11] = c3;
 674     c3 = 0;
 675     mul_add_c(a[7], b[5], c1, c2, c3);
 676     mul_add_c(a[6], b[6], c1, c2, c3);
 677     mul_add_c(a[5], b[7], c1, c2, c3);
 678     r[12] = c1;
 679     c1 = 0;
 680     mul_add_c(a[6], b[7], c2, c3, c1);
 681     mul_add_c(a[7], b[6], c2, c3, c1);
 682     r[13] = c2;
 683     c2 = 0;
 684     mul_add_c(a[7], b[7], c3, c1, c2);
 685     r[14] = c3;
 686     r[15] = c1;
 687 }
 688
 689 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 690 {
 691     BN_ULONG c1, c2, c3;
 692
 693     c1 = 0;
 694     c2 = 0;
 695     c3 = 0;
 696     mul_add_c(a[0], b[0], c1, c2, c3);
 697     r[0] = c1;
 698     c1 = 0;
 699     mul_add_c(a[0], b[1], c2, c3, c1);
 700     mul_add_c(a[1], b[0], c2, c3, c1);
 701     r[1] = c2;
 702     c2 = 0;
 703     mul_add_c(a[2], b[0], c3, c1, c2);
 704     mul_add_c(a[1], b[1], c3, c1, c2);
 705     mul_add_c(a[0], b[2], c3, c1, c2);
 706     r[2] = c3;
 707     c3 = 0;
 708     mul_add_c(a[0], b[3], c1, c2, c3);
 709     mul_add_c(a[1], b[2], c1, c2, c3);
 710     mul_add_c(a[2], b[1], c1, c2, c3);
 711     mul_add_c(a[3], b[0], c1, c2, c3);
 712     r[3] = c1;
 713     c1 = 0;
 714     mul_add_c(a[3], b[1], c2, c3, c1);
 715     mul_add_c(a[2], b[2], c2, c3, c1);
 716     mul_add_c(a[1], b[3], c2, c3, c1);
 717     r[4] = c2;
 718     c2 = 0;
 719     mul_add_c(a[2], b[3], c3, c1, c2);
 720     mul_add_c(a[3], b[2], c3, c1, c2);
 721     r[5] = c3;
 722     c3 = 0;
 723     mul_add_c(a[3], b[3], c1, c2, c3);
 724     r[6] = c1;
 725     r[7] = c2;
 726 }
 727
 728 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 729 {
 730     BN_ULONG c1, c2, c3;
 731
 732     c1 = 0;
 733     c2 = 0;
 734     c3 = 0;
 735     sqr_add_c(a, 0, c1, c2, c3);
 736     r[0] = c1;
 737     c1 = 0;
 738     sqr_add_c2(a, 1, 0, c2, c3, c1);
 739     r[1] = c2;
 740     c2 = 0;
 741     sqr_add_c(a, 1, c3, c1, c2);
 742     sqr_add_c2(a, 2, 0, c3, c1, c2);
 743     r[2] = c3;
 744     c3 = 0;
 745     sqr_add_c2(a, 3, 0, c1, c2, c3);
 746     sqr_add_c2(a, 2, 1, c1, c2, c3);
 747     r[3] = c1;
 748     c1 = 0;
 749     sqr_add_c(a, 2, c2, c3, c1);
 750     sqr_add_c2(a, 3, 1, c2, c3, c1);
 751     sqr_add_c2(a, 4, 0, c2, c3, c1);
 752     r[4] = c2;
 753     c2 = 0;
 754     sqr_add_c2(a, 5, 0, c3, c1, c2);
 755     sqr_add_c2(a, 4, 1, c3, c1, c2);
 756     sqr_add_c2(a, 3, 2, c3, c1, c2);
 757     r[5] = c3;
 758     c3 = 0;
 759     sqr_add_c(a, 3, c1, c2, c3);
 760     sqr_add_c2(a, 4, 2, c1, c2, c3);
 761     sqr_add_c2(a, 5, 1, c1, c2, c3);
 762     sqr_add_c2(a, 6, 0, c1, c2, c3);
 763     r[6] = c1;
 764     c1 = 0;
 765     sqr_add_c2(a, 7, 0, c2, c3, c1);
 766     sqr_add_c2(a, 6, 1, c2, c3, c1);
 767     sqr_add_c2(a, 5, 2, c2, c3, c1);
 768     sqr_add_c2(a, 4, 3, c2, c3, c1);
 769     r[7] = c2;
 770     c2 = 0;
 771     sqr_add_c(a, 4, c3, c1, c2);
 772     sqr_add_c2(a, 5, 3, c3, c1, c2);
 773     sqr_add_c2(a, 6, 2, c3, c1, c2);
 774     sqr_add_c2(a, 7, 1, c3, c1, c2);
 775     r[8] = c3;
 776     c3 = 0;
 777     sqr_add_c2(a, 7, 2, c1, c2, c3);
 778     sqr_add_c2(a, 6, 3, c1, c2, c3);
 779     sqr_add_c2(a, 5, 4, c1, c2, c3);
 780     r[9] = c1;
 781     c1 = 0;
 782     sqr_add_c(a, 5, c2, c3, c1);
 783     sqr_add_c2(a, 6, 4, c2, c3, c1);
 784     sqr_add_c2(a, 7, 3, c2, c3, c1);
 785     r[10] = c2;
 786     c2 = 0;
 787     sqr_add_c2(a, 7, 4, c3, c1, c2);
 788     sqr_add_c2(a, 6, 5, c3, c1, c2);
 789     r[11] = c3;
 790     c3 = 0;
 791     sqr_add_c(a, 6, c1, c2, c3);
 792     sqr_add_c2(a, 7, 5, c1, c2, c3);
 793     r[12] = c1;
 794     c1 = 0;
 795     sqr_add_c2(a, 7, 6, c2, c3, c1);
 796     r[13] = c2;
 797     c2 = 0;
 798     sqr_add_c(a, 7, c3, c1, c2);
 799     r[14] = c3;
 800     r[15] = c1;
 801 }
 802
 803 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 804 {
 805     BN_ULONG c1, c2, c3;
 806
 807     c1 = 0;
 808     c2 = 0;
 809     c3 = 0;
 810     sqr_add_c(a, 0, c1, c2, c3);
 811     r[0] = c1;
 812     c1 = 0;
 813     sqr_add_c2(a, 1, 0, c2, c3, c1);
 814     r[1] = c2;
 815     c2 = 0;
 816     sqr_add_c(a, 1, c3, c1, c2);
 817     sqr_add_c2(a, 2, 0, c3, c1, c2);
 818     r[2] = c3;
 819     c3 = 0;
 820     sqr_add_c2(a, 3, 0, c1, c2, c3);
 821     sqr_add_c2(a, 2, 1, c1, c2, c3);
 822     r[3] = c1;
 823     c1 = 0;
 824     sqr_add_c(a, 2, c2, c3, c1);
 825     sqr_add_c2(a, 3, 1, c2, c3, c1);
 826     r[4] = c2;
 827     c2 = 0;
 828     sqr_add_c2(a, 3, 2, c3, c1, c2);
 829     r[5] = c3;
 830     c3 = 0;
 831     sqr_add_c(a, 3, c1, c2, c3);
 832     r[6] = c1;
 833     r[7] = c2;
 834 }
 835
 836 # ifdef OPENSSL_NO_ASM
 837 #  ifdef OPENSSL_BN_ASM_MONT
 838 #   include <alloca.h>
 839 /*
 840  * This is essentially reference implementation, which may or may not
 841  * result in performance improvement. E.g. on IA-32 this routine was
 842  * observed to give 40% faster rsa1024 private key operations and 10%
 843  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 844  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 845  * reference implementation, one to be used as starting point for
 846  * platform-specific assembler. Mentioned numbers apply to compiler
 847  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 848  * can vary not only from platform to platform, but even for compiler
 849  * versions. Assembler vs. assembler improvement coefficients can
 850  * [and are known to] differ and are to be documented elsewhere.
 851  */
 852 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 853                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 854 {
 855     BN_ULONG c0, c1, ml, *tp, n0;
 856 #   ifdef mul64
 857     BN_ULONG mh;
 858 #   endif
 859     volatile BN_ULONG *vp;
 860     int i = 0, j;
 861
 862 #   if 0                        /* template for platform-specific
 863                                  * implementation */
 864     if (ap == bp)
 865         return bn_sqr_mont(rp, ap, np, n0p, num);
 866 #   endif
 867     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
 868
 869     n0 = *n0p;
 870
 871     c0 = 0;
 872     ml = bp[0];
 873 #   ifdef mul64
 874     mh = HBITS(ml);
 875     ml = LBITS(ml);
 876     for (j = 0; j < num; ++j)
 877         mul(tp[j], ap[j], ml, mh, c0);
 878 #   else
 879     for (j = 0; j < num; ++j)
 880         mul(tp[j], ap[j], ml, c0);
 881 #   endif
 882
 883     tp[num] = c0;
 884     tp[num + 1] = 0;
 885     goto enter;
 886
 887     for (i = 0; i < num; i++) {
 888         c0 = 0;
 889         ml = bp[i];
 890 #   ifdef mul64
 891         mh = HBITS(ml);
 892         ml = LBITS(ml);
 893         for (j = 0; j < num; ++j)
 894             mul_add(tp[j], ap[j], ml, mh, c0);
 895 #   else
 896         for (j = 0; j < num; ++j)
 897             mul_add(tp[j], ap[j], ml, c0);
 898 #   endif
 899         c1 = (tp[num] + c0) & BN_MASK2;
 900         tp[num] = c1;
 901         tp[num + 1] = (c1 < c0 ? 1 : 0);
 902  enter:
 903         c1 = tp[0];
 904         ml = (c1 * n0) & BN_MASK2;
 905         c0 = 0;
 906 #   ifdef mul64
 907         mh = HBITS(ml);
 908         ml = LBITS(ml);
 909         mul_add(c1, np[0], ml, mh, c0);
 910 #   else
 911         mul_add(c1, ml, np[0], c0);
 912 #   endif
 913         for (j = 1; j < num; j++) {
 914             c1 = tp[j];
 915 #   ifdef mul64
 916             mul_add(c1, np[j], ml, mh, c0);
 917 #   else
 918             mul_add(c1, ml, np[j], c0);
 919 #   endif
 920             tp[j - 1] = c1 & BN_MASK2;
 921         }
 922         c1 = (tp[num] + c0) & BN_MASK2;
 923         tp[num - 1] = c1;
 924         tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
 925     }
 926
 927     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
 928         c0 = bn_sub_words(rp, tp, np, num);
 929         if (tp[num] != 0 || c0 == 0) {
 930             for (i = 0; i < num + 2; i++)
 931                 vp[i] = 0;
 932             return 1;
 933         }
 934     }
 935     for (i = 0; i < num; i++)
 936         rp[i] = tp[i], vp[i] = 0;
 937     vp[num] = 0;
 938     vp[num + 1] = 0;
 939     return 1;
 940 }
 941 #  else
 942 /*
 943  * Return value of 0 indicates that multiplication/convolution was not
 944  * performed to signal the caller to fall down to alternative/original
 945  * code-path.
 946  */
 947 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 948                 const BN_ULONG *np, const BN_ULONG *n0, int num)
 949 {
 950     return 0;
 951 }
 952 #  endif                        /* OPENSSL_BN_ASM_MONT */
 953 # endif
 954
 955 #else                           /* !BN_MUL_COMBA */
 956
 957 /* hmm... is it faster just to do a multiply? */
 958 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 959 {
 960     BN_ULONG t[8];
 961     bn_sqr_normal(r, a, 4, t);
 962 }
 963
 964 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 965 {
 966     BN_ULONG t[16];
 967     bn_sqr_normal(r, a, 8, t);
 968 }
 969
 970 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 971 {
 972     r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
 973     r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
 974     r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
 975     r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
 976 }
 977
 978 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 979 {
 980     r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
 981     r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
 982     r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
 983     r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
 984     r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
 985     r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
 986     r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
 987     r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
 988 }
 989
 990 # ifdef OPENSSL_NO_ASM
 991 #  ifdef OPENSSL_BN_ASM_MONT
 992 #   include <alloca.h>
 993 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 994                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 995 {
 996     BN_ULONG c0, c1, *tp, n0 = *n0p;
 997     volatile BN_ULONG *vp;
 998     int i = 0, j;
 999
1000     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1001
1002     for (i = 0; i <= num; i++)
1003         tp[i] = 0;
1004
1005     for (i = 0; i < num; i++) {
1006         c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1007         c1 = (tp[num] + c0) & BN_MASK2;
1008         tp[num] = c1;
1009         tp[num + 1] = (c1 < c0 ? 1 : 0);
1010
1011         c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1012         c1 = (tp[num] + c0) & BN_MASK2;
1013         tp[num] = c1;
1014         tp[num + 1] += (c1 < c0 ? 1 : 0);
1015         for (j = 0; j <= num; j++)
1016             tp[j] = tp[j + 1];
1017     }
1018
1019     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1020         c0 = bn_sub_words(rp, tp, np, num);
1021         if (tp[num] != 0 || c0 == 0) {
1022             for (i = 0; i < num + 2; i++)
1023                 vp[i] = 0;
1024             return 1;
1025         }
1026     }
1027     for (i = 0; i < num; i++)
1028         rp[i] = tp[i], vp[i] = 0;
1029     vp[num] = 0;
1030     vp[num + 1] = 0;
1031     return 1;
1032 }
1033 #  else
1034 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1035                 const BN_ULONG *np, const BN_ULONG *n0, int num)
1036 {
1037     return 0;
1038 }
1039 #  endif                        /* OPENSSL_BN_ASM_MONT */
1040 # endif
1041
1042 #endif                          /* !BN_MUL_COMBA */